diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 1931.7947387695312, + "epoch": 0.0016, + "grad_norm": 0.20221363008022308, + "kl": 0.0006256103515625, + "learning_rate": 1.5873015873015872e-08, + "loss": 0.0395, + "num_tokens": 234085.0, + "reward": 1.93798166513443, + "reward_std": 1.0481289327144623, + "rewards/accuracy_reward": 0.11607143003493547, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.9558388292789459, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0032, + "grad_norm": 0.17630808055400848, + "kl": 0.0005741119384765625, + "learning_rate": 3.1746031746031744e-08, + "loss": 0.0, + "num_tokens": 484713.0, + "reward": 1.611539900302887, + "reward_std": 0.894216388463974, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 0.7186827063560486, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0048, + "grad_norm": 0.1734064668416977, + "kl": 0.000579833984375, + "learning_rate": 4.7619047619047613e-08, + "loss": 0.0, + "num_tokens": 734627.0, + "reward": 1.5392849445343018, + "reward_std": 1.0640218257904053, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8839285969734192, + "rewards/tag_count_reward": 0.655356302857399, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.7589721679688, + "epoch": 0.0064, + "grad_norm": 0.1862727254629135, + "kl": 0.000598907470703125, + "learning_rate": 6.349206349206349e-08, + "loss": 0.0142, + "num_tokens": 982652.0, + "reward": 1.8086180686950684, + "reward_std": 0.816057026386261, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9017857313156128, + "rewards/tag_count_reward": 0.7818323373794556, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 1963.9019165039062, + "epoch": 0.008, + "grad_norm": 0.20643271505832672, + "kl": 0.0006732940673828125, + "learning_rate": 7.936507936507936e-08, + "loss": 0.0342, + "num_tokens": 1222489.0, + "reward": 1.8883721828460693, + "reward_std": 0.7550848424434662, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.9464285969734192, + "rewards/tag_count_reward": 0.8973005712032318, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 2003.9464721679688, + "epoch": 0.0096, + "grad_norm": 0.18964679539203644, + "kl": 0.000659942626953125, + "learning_rate": 9.523809523809523e-08, + "loss": 0.0104, + "num_tokens": 1467511.0, + "reward": 1.6734029650688171, + "reward_std": 0.7824555337429047, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 0.7180458307266235, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 1952.9464721679688, + "epoch": 0.0112, + "grad_norm": 0.18615683913230896, + "kl": 0.000621795654296875, + "learning_rate": 1.111111111111111e-07, + "loss": 0.0164, + "num_tokens": 1705225.0, + "reward": 2.0677093267440796, + "reward_std": 0.7639530301094055, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 1.112352043390274, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 1849.1250610351562, + "epoch": 0.0128, + "grad_norm": 0.19185127317905426, + "kl": 0.000579833984375, + "learning_rate": 1.2698412698412698e-07, + "loss": 0.0031, + "num_tokens": 1932865.0, + "reward": 2.161295175552368, + "reward_std": 0.9222905039787292, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 1.2148663997650146, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 2026.1607666015625, + "epoch": 0.0144, + "grad_norm": 0.2118251472711563, + "kl": 0.00070953369140625, + "learning_rate": 1.4285714285714285e-07, + "loss": -0.0026, + "num_tokens": 2182433.0, + "reward": 1.890566885471344, + "reward_std": 0.8773335218429565, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.845923900604248, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 2021.294677734375, + "epoch": 0.016, + "grad_norm": 0.1781444400548935, + "kl": 0.0006313323974609375, + "learning_rate": 1.5873015873015872e-07, + "loss": 0.0019, + "num_tokens": 2428880.0, + "reward": 1.6018391847610474, + "reward_std": 0.8075584471225739, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.6196962594985962, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 1885.6608276367188, + "epoch": 0.0176, + "grad_norm": 0.1895192265510559, + "kl": 0.000614166259765625, + "learning_rate": 1.7460317460317458e-07, + "loss": 0.004, + "num_tokens": 2659954.0, + "reward": 1.9622865915298462, + "reward_std": 0.9196446239948273, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 0.9176436960697174, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 2031.02685546875, + "epoch": 0.0192, + "grad_norm": 0.183857262134552, + "kl": 0.000640869140625, + "learning_rate": 1.9047619047619045e-07, + "loss": -0.0029, + "num_tokens": 2912013.0, + "reward": 1.6477259993553162, + "reward_std": 0.7232430875301361, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.7191544771194458, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.357177734375, + "epoch": 0.0208, + "grad_norm": 0.19473284482955933, + "kl": 0.0006561279296875, + "learning_rate": 2.0634920634920632e-07, + "loss": -0.0012, + "num_tokens": 3157207.0, + "reward": 1.8293968439102173, + "reward_std": 1.0389590859413147, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.8829682469367981, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.5803833007812, + "epoch": 0.0224, + "grad_norm": 0.19564709067344666, + "kl": 0.0006618499755859375, + "learning_rate": 2.222222222222222e-07, + "loss": 0.0013, + "num_tokens": 3406640.0, + "reward": 1.5986675024032593, + "reward_std": 0.8608555197715759, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857313156128, + "rewards/tag_count_reward": 0.6968817412853241, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 1989.9375610351562, + "epoch": 0.024, + "grad_norm": 0.18532443046569824, + "kl": 0.00067138671875, + "learning_rate": 2.3809523809523806e-07, + "loss": -0.0011, + "num_tokens": 3650709.0, + "reward": 1.9418567419052124, + "reward_std": 0.7777000963687897, + "rewards/accuracy_reward": 0.1964285746216774, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.8436424136161804, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0256, + "grad_norm": 0.18705320358276367, + "kl": 0.000644683837890625, + "learning_rate": 2.5396825396825396e-07, + "loss": 0.0, + "num_tokens": 3902163.0, + "reward": 1.6161985993385315, + "reward_std": 0.6961945593357086, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.6786984801292419, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.3214721679688, + "epoch": 0.0272, + "grad_norm": 0.1878025233745575, + "kl": 0.000690460205078125, + "learning_rate": 2.698412698412698e-07, + "loss": 0.0083, + "num_tokens": 4143839.0, + "reward": 1.6606950163841248, + "reward_std": 0.756260484457016, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142984867096, + "rewards/tag_count_reward": 0.749980628490448, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 1878.2501220703125, + "epoch": 0.0288, + "grad_norm": 0.2041172832250595, + "kl": 0.0006504058837890625, + "learning_rate": 2.857142857142857e-07, + "loss": -0.0002, + "num_tokens": 4373579.0, + "reward": 1.9688191413879395, + "reward_std": 0.717673122882843, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 1.0402476489543915, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 2018.0803833007812, + "epoch": 0.0304, + "grad_norm": 0.18298229575157166, + "kl": 0.0005626678466796875, + "learning_rate": 3.0158730158730156e-07, + "loss": 0.0077, + "num_tokens": 4621206.0, + "reward": 1.6859977841377258, + "reward_std": 1.0936334431171417, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 0.7931405305862427, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 1799.9197387695312, + "epoch": 0.032, + "grad_norm": 0.2321448177099228, + "kl": 0.000652313232421875, + "learning_rate": 3.1746031746031743e-07, + "loss": 0.0256, + "num_tokens": 4844959.0, + "reward": 2.240412950515747, + "reward_std": 1.0117415189743042, + "rewards/accuracy_reward": 0.2589285746216774, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 1.0707700848579407, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 1925.8750610351562, + "epoch": 0.0336, + "grad_norm": 0.19921426475048065, + "kl": 0.0006580352783203125, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0011, + "num_tokens": 5083351.0, + "reward": 1.8359440565109253, + "reward_std": 1.0148980617523193, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 0.9788012802600861, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 1938.0447387695312, + "epoch": 0.0352, + "grad_norm": 0.20125266909599304, + "kl": 0.00061798095703125, + "learning_rate": 3.4920634920634917e-07, + "loss": 0.0222, + "num_tokens": 5320586.0, + "reward": 1.9609485268592834, + "reward_std": 0.7480472922325134, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.9430912435054779, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.02685546875, + "epoch": 0.0368, + "grad_norm": 0.20395107567310333, + "kl": 0.0007190704345703125, + "learning_rate": 3.6507936507936504e-07, + "loss": -0.0048, + "num_tokens": 5556321.0, + "reward": 1.8211841583251953, + "reward_std": 1.0986509323120117, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 0.8211839646100998, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 1969.65185546875, + "epoch": 0.0384, + "grad_norm": 0.18154564499855042, + "kl": 0.0005970001220703125, + "learning_rate": 3.809523809523809e-07, + "loss": -0.0081, + "num_tokens": 5797096.0, + "reward": 2.07223117351532, + "reward_std": 0.8915455937385559, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 1.1079451441764832, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 1954.58935546875, + "epoch": 0.04, + "grad_norm": 0.20226912200450897, + "kl": 0.0006580352783203125, + "learning_rate": 3.968253968253968e-07, + "loss": 0.0155, + "num_tokens": 6035820.0, + "reward": 1.9015146493911743, + "reward_std": 1.0711874663829803, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.972942978143692, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 1893.8482666015625, + "epoch": 0.0416, + "grad_norm": 0.21135295927524567, + "kl": 0.0006046295166015625, + "learning_rate": 4.1269841269841265e-07, + "loss": 0.0277, + "num_tokens": 6265935.0, + "reward": 1.694935142993927, + "reward_std": 1.1835195422172546, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 0.8377922177314758, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0432, + "grad_norm": 0.18777894973754883, + "kl": 0.000637054443359375, + "learning_rate": 4.285714285714285e-07, + "loss": 0.0, + "num_tokens": 6516913.0, + "reward": 1.6720025539398193, + "reward_std": 0.9006729125976562, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 0.7255738377571106, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 1899.0357666015625, + "epoch": 0.0448, + "grad_norm": 0.19071584939956665, + "kl": 0.0006084442138671875, + "learning_rate": 4.444444444444444e-07, + "loss": -0.0015, + "num_tokens": 6753251.0, + "reward": 1.8601660132408142, + "reward_std": 0.6736527979373932, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 0.9405229985713959, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 1978.5179443359375, + "epoch": 0.0464, + "grad_norm": 0.18578658998012543, + "kl": 0.0006427764892578125, + "learning_rate": 4.6031746031746025e-07, + "loss": 0.0131, + "num_tokens": 6995551.0, + "reward": 2.0168097019195557, + "reward_std": 0.869392067193985, + "rewards/accuracy_reward": 0.14285713993012905, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.9364522695541382, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.794677734375, + "epoch": 0.048, + "grad_norm": 0.21278461813926697, + "kl": 0.0007839202880859375, + "learning_rate": 4.761904761904761e-07, + "loss": 0.0007, + "num_tokens": 7248256.0, + "reward": 1.6645175218582153, + "reward_std": 0.9781303107738495, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.7627317905426025, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0496, + "grad_norm": 0.1852043718099594, + "kl": 0.0006084442138671875, + "learning_rate": 4.92063492063492e-07, + "loss": 0.0, + "num_tokens": 7498884.0, + "reward": 1.6939566135406494, + "reward_std": 0.9078642427921295, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.7921708226203918, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 1921.2053833007812, + "epoch": 0.0512, + "grad_norm": 0.2176561951637268, + "kl": 0.000640869140625, + "learning_rate": 5.079365079365079e-07, + "loss": 0.0281, + "num_tokens": 7733379.0, + "reward": 1.97622150182724, + "reward_std": 0.9426892399787903, + "rewards/accuracy_reward": 0.0595238097012043, + "rewards/format_reward": 0.9017857313156128, + "rewards/tag_count_reward": 1.0297927260398865, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 2012.0001220703125, + "epoch": 0.0528, + "grad_norm": 0.19185538589954376, + "kl": 0.0006732940673828125, + "learning_rate": 5.238095238095238e-07, + "loss": 0.0039, + "num_tokens": 7980675.0, + "reward": 1.8059584498405457, + "reward_std": 0.6661086976528168, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142984867096, + "rewards/tag_count_reward": 0.8952441513538361, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.294677734375, + "epoch": 0.0544, + "grad_norm": 0.1801755130290985, + "kl": 0.0006198883056640625, + "learning_rate": 5.396825396825396e-07, + "loss": 0.004, + "num_tokens": 8227752.0, + "reward": 1.7206667065620422, + "reward_std": 0.9755340814590454, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 0.8188808560371399, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.4554443359375, + "epoch": 0.056, + "grad_norm": 0.1840222179889679, + "kl": 0.0005970001220703125, + "learning_rate": 5.555555555555555e-07, + "loss": 0.003, + "num_tokens": 8477087.0, + "reward": 1.8552265167236328, + "reward_std": 0.8474010527133942, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.9087979197502136, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.2947387695312, + "epoch": 0.0576, + "grad_norm": 0.20413252711296082, + "kl": 0.0006999969482421875, + "learning_rate": 5.714285714285714e-07, + "loss": -0.0063, + "num_tokens": 8725830.0, + "reward": 1.7541846632957458, + "reward_std": 0.9167749285697937, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 0.7541846036911011, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.4464721679688, + "epoch": 0.0592, + "grad_norm": 0.19186584651470184, + "kl": 0.000598907470703125, + "learning_rate": 5.873015873015873e-07, + "loss": 0.0018, + "num_tokens": 8973120.0, + "reward": 1.5263110995292664, + "reward_std": 1.0249995589256287, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 0.570953980088234, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.6161499023438, + "epoch": 0.0608, + "grad_norm": 0.1711588203907013, + "kl": 0.0006084442138671875, + "learning_rate": 6.031746031746031e-07, + "loss": 0.0031, + "num_tokens": 9223341.0, + "reward": 1.6887956261634827, + "reward_std": 0.8009350895881653, + "rewards/accuracy_reward": 0.12499999720603228, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 0.6441525816917419, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.4285888671875, + "epoch": 0.0624, + "grad_norm": 0.18240338563919067, + "kl": 0.000637054443359375, + "learning_rate": 6.19047619047619e-07, + "loss": 0.0064, + "num_tokens": 9468627.0, + "reward": 1.448964238166809, + "reward_std": 0.9788670837879181, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.5650356262922287, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.0536499023438, + "epoch": 0.064, + "grad_norm": 0.1836073100566864, + "kl": 0.000637054443359375, + "learning_rate": 6.349206349206349e-07, + "loss": 0.0029, + "num_tokens": 9717595.0, + "reward": 1.6074355244636536, + "reward_std": 0.9641588926315308, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8660714030265808, + "rewards/tag_count_reward": 0.7324354499578476, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.9375610351562, + "epoch": 0.0656, + "grad_norm": 0.2034454047679901, + "kl": 0.000705718994140625, + "learning_rate": 6.507936507936507e-07, + "loss": 0.0019, + "num_tokens": 9966914.0, + "reward": 1.5452317595481873, + "reward_std": 0.8603548109531403, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142984867096, + "rewards/tag_count_reward": 0.6345174014568329, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 1870.2322387695312, + "epoch": 0.0672, + "grad_norm": 0.17887958884239197, + "kl": 0.000598907470703125, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0011, + "num_tokens": 10196764.0, + "reward": 1.9205936193466187, + "reward_std": 1.0858833193778992, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.8482142984867096, + "rewards/tag_count_reward": 1.0009506940841675, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 1998.6428833007812, + "epoch": 0.0688, + "grad_norm": 0.18684324622154236, + "kl": 0.0006542205810546875, + "learning_rate": 6.825396825396826e-07, + "loss": 0.0001, + "num_tokens": 10437664.0, + "reward": 1.6057739853858948, + "reward_std": 0.7890215814113617, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.6950596272945404, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.0704, + "grad_norm": 0.197466179728508, + "kl": 0.00069427490234375, + "learning_rate": 6.984126984126983e-07, + "loss": 0.0, + "num_tokens": 10686808.0, + "reward": 1.5753554105758667, + "reward_std": 0.49380502104759216, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9464285969734192, + "rewards/tag_count_reward": 0.6289267539978027, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 1979.71435546875, + "epoch": 0.072, + "grad_norm": 0.18728844821453094, + "kl": 0.0006389617919921875, + "learning_rate": 7.142857142857143e-07, + "loss": 0.0077, + "num_tokens": 10926666.0, + "reward": 1.909911870956421, + "reward_std": 0.688534140586853, + "rewards/accuracy_reward": 0.11607143003493547, + "rewards/format_reward": 0.9464285969734192, + "rewards/tag_count_reward": 0.8474116325378418, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 2031.8036499023438, + "epoch": 0.0736, + "grad_norm": 0.18801996111869812, + "kl": 0.0006694793701171875, + "learning_rate": 7.301587301587301e-07, + "loss": 0.0096, + "num_tokens": 11172120.0, + "reward": 1.7619670033454895, + "reward_std": 0.9705714583396912, + "rewards/accuracy_reward": 0.1160714291036129, + "rewards/format_reward": 0.9017857313156128, + "rewards/tag_count_reward": 0.7441096603870392, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 2012.7500610351562, + "epoch": 0.0752, + "grad_norm": 0.1785956621170044, + "kl": 0.00057220458984375, + "learning_rate": 7.46031746031746e-07, + "loss": 0.011, + "num_tokens": 11416308.0, + "reward": 1.822909414768219, + "reward_std": 0.9432471692562103, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.9032664597034454, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 2022.4375610351562, + "epoch": 0.0768, + "grad_norm": 0.17894971370697021, + "kl": 0.0006275177001953125, + "learning_rate": 7.619047619047618e-07, + "loss": 0.0053, + "num_tokens": 11663205.0, + "reward": 1.98964262008667, + "reward_std": 0.6732676327228546, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 1.0432139039039612, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 1930.1250610351562, + "epoch": 0.0784, + "grad_norm": 0.20506837964057922, + "kl": 0.0006160736083984375, + "learning_rate": 7.777777777777778e-07, + "loss": 0.0227, + "num_tokens": 11899931.0, + "reward": 2.049280047416687, + "reward_std": 1.0347035825252533, + "rewards/accuracy_reward": 0.2053571417927742, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.9153512418270111, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 1884.4554443359375, + "epoch": 0.08, + "grad_norm": 0.19870403409004211, + "kl": 0.0006313323974609375, + "learning_rate": 7.936507936507936e-07, + "loss": 0.0242, + "num_tokens": 12130660.0, + "reward": 1.940011978149414, + "reward_std": 0.9366481304168701, + "rewards/accuracy_reward": 0.2142857164144516, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 0.8507262766361237, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 2029.4285888671875, + "epoch": 0.0816, + "grad_norm": 0.18944840133190155, + "kl": 0.000701904296875, + "learning_rate": 8.095238095238095e-07, + "loss": 0.0079, + "num_tokens": 12377192.0, + "reward": 1.8336897492408752, + "reward_std": 1.1141316294670105, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 0.9319039583206177, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.6162109375, + "epoch": 0.0832, + "grad_norm": 0.1917104870080948, + "kl": 0.0006542205810546875, + "learning_rate": 8.253968253968253e-07, + "loss": 0.0112, + "num_tokens": 12623647.0, + "reward": 1.7883602976799011, + "reward_std": 0.9723587334156036, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 0.859788715839386, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 2028.8572387695312, + "epoch": 0.0848, + "grad_norm": 0.18266935646533966, + "kl": 0.0006351470947265625, + "learning_rate": 8.412698412698413e-07, + "loss": 0.0023, + "num_tokens": 12870759.0, + "reward": 1.8882185816764832, + "reward_std": 0.7862816154956818, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.959646999835968, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 1942.9285888671875, + "epoch": 0.0864, + "grad_norm": 0.1978449821472168, + "kl": 0.00067138671875, + "learning_rate": 8.57142857142857e-07, + "loss": -0.0186, + "num_tokens": 13105853.0, + "reward": 1.6764647364616394, + "reward_std": 1.065181016921997, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 0.8371789753437042, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.2232666015625, + "epoch": 0.088, + "grad_norm": 0.18327437341213226, + "kl": 0.0006732940673828125, + "learning_rate": 8.73015873015873e-07, + "loss": 0.0007, + "num_tokens": 13353118.0, + "reward": 1.6873248219490051, + "reward_std": 1.1574660539627075, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.7408962249755859, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.90185546875, + "epoch": 0.0896, + "grad_norm": 0.19516612589359283, + "kl": 0.00066375732421875, + "learning_rate": 8.888888888888888e-07, + "loss": 0.0125, + "num_tokens": 13601859.0, + "reward": 1.4990279078483582, + "reward_std": 0.8056567907333374, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.5883135199546814, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.7678833007812, + "epoch": 0.0912, + "grad_norm": 0.18653278052806854, + "kl": 0.0006580352783203125, + "learning_rate": 9.047619047619047e-07, + "loss": 0.0002, + "num_tokens": 13851383.0, + "reward": 1.4730042815208435, + "reward_std": 1.0120689272880554, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.6069327592849731, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.2500610351562, + "epoch": 0.0928, + "grad_norm": 0.19008979201316833, + "kl": 0.000640869140625, + "learning_rate": 9.206349206349205e-07, + "loss": 0.012, + "num_tokens": 14093051.0, + "reward": 2.01254940032959, + "reward_std": 0.8032801449298859, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.9553571343421936, + "rewards/tag_count_reward": 0.9679064452648163, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.9464721679688, + "epoch": 0.0944, + "grad_norm": 0.178494393825531, + "kl": 0.000614166259765625, + "learning_rate": 9.365079365079365e-07, + "loss": 0.001, + "num_tokens": 14339655.0, + "reward": 1.9087846875190735, + "reward_std": 0.7311671376228333, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 0.9266417920589447, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.1697387695312, + "epoch": 0.096, + "grad_norm": 0.18219903111457825, + "kl": 0.000614166259765625, + "learning_rate": 9.523809523809522e-07, + "loss": 0.0095, + "num_tokens": 14583442.0, + "reward": 1.8292317986488342, + "reward_std": 1.1740328073501587, + "rewards/accuracy_reward": 0.1071428619325161, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 0.864946037530899, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.857177734375, + "epoch": 0.0976, + "grad_norm": 0.19832941889762878, + "kl": 0.000701904296875, + "learning_rate": 9.682539682539682e-07, + "loss": 0.0006, + "num_tokens": 14833060.0, + "reward": 1.639484941959381, + "reward_std": 0.8469606339931488, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 0.7198420166969299, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.2144165039062, + "epoch": 0.0992, + "grad_norm": 0.18930822610855103, + "kl": 0.000667572021484375, + "learning_rate": 9.84126984126984e-07, + "loss": 0.0008, + "num_tokens": 15079792.0, + "reward": 1.7284113764762878, + "reward_std": 0.8725647926330566, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 0.7998397648334503, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1008, + "grad_norm": 0.2171279788017273, + "kl": 0.0007953643798828125, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 15329622.0, + "reward": 1.6952548027038574, + "reward_std": 0.5612336248159409, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.955357164144516, + "rewards/tag_count_reward": 0.7398976683616638, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 1783.4464721679688, + "epoch": 0.1024, + "grad_norm": 0.1978946328163147, + "kl": 0.00064849853515625, + "learning_rate": 9.999929691391713e-07, + "loss": -0.0099, + "num_tokens": 15547792.0, + "reward": 1.9876956343650818, + "reward_std": 0.9232241809368134, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 1.0769811868667603, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 1913.6250610351562, + "epoch": 0.104, + "grad_norm": 0.19778388738632202, + "kl": 0.0006618499755859375, + "learning_rate": 9.999718767763874e-07, + "loss": 0.0231, + "num_tokens": 15782502.0, + "reward": 2.0121395587921143, + "reward_std": 0.8027443587779999, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 0.9674965143203735, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 1961.3483276367188, + "epoch": 0.1056, + "grad_norm": 0.19835074245929718, + "kl": 0.000698089599609375, + "learning_rate": 9.99936723570748e-07, + "loss": -0.0042, + "num_tokens": 16023831.0, + "reward": 1.8285565972328186, + "reward_std": 0.45500020682811737, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.955357164144516, + "rewards/tag_count_reward": 0.8731992542743683, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.5803833007812, + "epoch": 0.1072, + "grad_norm": 0.16382135450839996, + "kl": 0.0005397796630859375, + "learning_rate": 9.998875106207303e-07, + "loss": 0.0067, + "num_tokens": 16273446.0, + "reward": 1.6499149203300476, + "reward_std": 0.9288487434387207, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.7213434875011444, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1088, + "grad_norm": 0.19687965512275696, + "kl": 0.0006542205810546875, + "learning_rate": 9.998242394641538e-07, + "loss": 0.0, + "num_tokens": 16519902.0, + "reward": 1.7142491340637207, + "reward_std": 0.862162858247757, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.8124634027481079, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 1951.3929443359375, + "epoch": 0.1104, + "grad_norm": 0.1926136612892151, + "kl": 0.00067901611328125, + "learning_rate": 9.997469120781325e-07, + "loss": 0.0078, + "num_tokens": 16760158.0, + "reward": 2.1412118673324585, + "reward_std": 0.7864842116832733, + "rewards/accuracy_reward": 0.214285708963871, + "rewards/format_reward": 0.955357164144516, + "rewards/tag_count_reward": 0.9715688526630402, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.5625610351562, + "epoch": 0.112, + "grad_norm": 0.16935071349143982, + "kl": 0.0006256103515625, + "learning_rate": 9.996555308790137e-07, + "loss": 0.0108, + "num_tokens": 17004311.0, + "reward": 1.8509339690208435, + "reward_std": 0.9214654266834259, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.9134339094161987, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 1959.9107666015625, + "epoch": 0.1136, + "grad_norm": 0.21051207184791565, + "kl": 0.000705718994140625, + "learning_rate": 9.995500987223014e-07, + "loss": 0.0107, + "num_tokens": 17245255.0, + "reward": 1.7619197964668274, + "reward_std": 0.8635795712471008, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 0.8422768115997314, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 1954.2858276367188, + "epoch": 0.1152, + "grad_norm": 0.20449475944042206, + "kl": 0.0006389617919921875, + "learning_rate": 9.994306189025687e-07, + "loss": 0.0223, + "num_tokens": 17483805.0, + "reward": 1.8450915217399597, + "reward_std": 0.5589276105165482, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 0.8808056712150574, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1168, + "grad_norm": 0.17351263761520386, + "kl": 0.0006313323974609375, + "learning_rate": 9.992970951533528e-07, + "loss": 0.0, + "num_tokens": 17734965.0, + "reward": 1.6268017292022705, + "reward_std": 0.935045450925827, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.6893016397953033, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.794677734375, + "epoch": 0.1184, + "grad_norm": 0.19078053534030914, + "kl": 0.0007381439208984375, + "learning_rate": 9.991495316470405e-07, + "loss": 0.0001, + "num_tokens": 17981510.0, + "reward": 1.853630542755127, + "reward_std": 0.7228849232196808, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.862558901309967, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.1161499023438, + "epoch": 0.12, + "grad_norm": 0.1787819266319275, + "kl": 0.00063323974609375, + "learning_rate": 9.989879329947353e-07, + "loss": 0.0072, + "num_tokens": 18229673.0, + "reward": 2.1175267696380615, + "reward_std": 1.2206133008003235, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.0639551877975464, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.52685546875, + "epoch": 0.1216, + "grad_norm": 0.20822736620903015, + "kl": 0.0007877349853515625, + "learning_rate": 9.988123042461165e-07, + "loss": 0.0039, + "num_tokens": 18476230.0, + "reward": 1.7025582790374756, + "reward_std": 0.8648377656936646, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.7739867568016052, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.0357666015625, + "epoch": 0.1232, + "grad_norm": 0.20402787625789642, + "kl": 0.0008068084716796875, + "learning_rate": 9.98622650889278e-07, + "loss": 0.0052, + "num_tokens": 18723194.0, + "reward": 1.7542842030525208, + "reward_std": 0.804975152015686, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.79892697930336, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.02685546875, + "epoch": 0.1248, + "grad_norm": 0.17983591556549072, + "kl": 0.000682830810546875, + "learning_rate": 9.984189788505596e-07, + "loss": 0.001, + "num_tokens": 18964893.0, + "reward": 2.1779526472091675, + "reward_std": 0.920309841632843, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.1868810653686523, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1264, + "grad_norm": 0.19380465149879456, + "kl": 0.00069427490234375, + "learning_rate": 9.9820129449436e-07, + "loss": 0.0, + "num_tokens": 19216333.0, + "reward": 1.6939796209335327, + "reward_std": 1.0346761345863342, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 0.8189795911312103, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 1933.107177734375, + "epoch": 0.128, + "grad_norm": 0.2029217779636383, + "kl": 0.0007572174072265625, + "learning_rate": 9.97969604622939e-07, + "loss": 0.0011, + "num_tokens": 19452651.0, + "reward": 1.777596652507782, + "reward_std": 0.6555261313915253, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9464285969734192, + "rewards/tag_count_reward": 0.7061679065227509, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 1959.21435546875, + "epoch": 0.1296, + "grad_norm": 0.18850286304950714, + "kl": 0.0006427764892578125, + "learning_rate": 9.97723916476204e-07, + "loss": 0.0028, + "num_tokens": 19691613.0, + "reward": 1.8187804222106934, + "reward_std": 0.7906266748905182, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 0.800923228263855, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.9554443359375, + "epoch": 0.1312, + "grad_norm": 0.1903940588235855, + "kl": 0.0006885528564453125, + "learning_rate": 9.974642377314851e-07, + "loss": 0.0238, + "num_tokens": 19936160.0, + "reward": 1.8783518075942993, + "reward_std": 1.3667024970054626, + "rewards/accuracy_reward": 0.1428571417927742, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 0.8872801661491394, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.6250610351562, + "epoch": 0.1328, + "grad_norm": 0.1846151351928711, + "kl": 0.00074005126953125, + "learning_rate": 9.971905765032935e-07, + "loss": -0.0003, + "num_tokens": 20182280.0, + "reward": 1.8320837020874023, + "reward_std": 0.8542994260787964, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 0.9124407470226288, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 1962.1964721679688, + "epoch": 0.1344, + "grad_norm": 0.19544626772403717, + "kl": 0.0007495880126953125, + "learning_rate": 9.969029413430694e-07, + "loss": -0.0096, + "num_tokens": 20421674.0, + "reward": 2.1109968423843384, + "reward_std": 1.0384481251239777, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.164568305015564, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.136, + "grad_norm": 0.19759157299995422, + "kl": 0.000782012939453125, + "learning_rate": 9.966013412389143e-07, + "loss": 0.0, + "num_tokens": 20670076.0, + "reward": 1.8557525277137756, + "reward_std": 0.7097788453102112, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 0.9093239009380341, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.0625610351562, + "epoch": 0.1376, + "grad_norm": 0.19634370505809784, + "kl": 0.0007171630859375, + "learning_rate": 9.962857856153094e-07, + "loss": 0.006, + "num_tokens": 20915783.0, + "reward": 1.9900143146514893, + "reward_std": 0.6862541139125824, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9464285671710968, + "rewards/tag_count_reward": 1.0346570014953613, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 1956.1876220703125, + "epoch": 0.1392, + "grad_norm": 0.19186896085739136, + "kl": 0.000705718994140625, + "learning_rate": 9.959562843328222e-07, + "loss": 0.0014, + "num_tokens": 21156114.0, + "reward": 1.8994413614273071, + "reward_std": 0.9937511384487152, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 1.015512764453888, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 1999.4732666015625, + "epoch": 0.1408, + "grad_norm": 0.19095157086849213, + "kl": 0.000751495361328125, + "learning_rate": 9.956128476877983e-07, + "loss": 0.0072, + "num_tokens": 21401041.0, + "reward": 2.1542248129844666, + "reward_std": 0.898104339838028, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 1.2345817387104034, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 2006.1161499023438, + "epoch": 0.1424, + "grad_norm": 0.18644173443317413, + "kl": 0.00077056884765625, + "learning_rate": 9.952554864120382e-07, + "loss": 0.0072, + "num_tokens": 21645802.0, + "reward": 1.9318454265594482, + "reward_std": 0.9257108867168427, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 0.9854167997837067, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 2023.0535888671875, + "epoch": 0.144, + "grad_norm": 0.19187797605991364, + "kl": 0.0007266998291015625, + "learning_rate": 9.948842116724641e-07, + "loss": -0.0059, + "num_tokens": 21889604.0, + "reward": 2.16513592004776, + "reward_std": 0.889827311038971, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 1.1294215619564056, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1456, + "grad_norm": 0.18406300246715546, + "kl": 0.000774383544921875, + "learning_rate": 9.944990350707691e-07, + "loss": 0.0, + "num_tokens": 22139252.0, + "reward": 1.6987809538841248, + "reward_std": 0.9749361276626587, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142984867096, + "rewards/tag_count_reward": 0.7880666553974152, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.8839721679688, + "epoch": 0.1472, + "grad_norm": 0.18649983406066895, + "kl": 0.000782012939453125, + "learning_rate": 9.940999686430562e-07, + "loss": -0.0071, + "num_tokens": 22383819.0, + "reward": 2.0155146718025208, + "reward_std": 0.7496342062950134, + "rewards/accuracy_reward": 0.1339285671710968, + "rewards/format_reward": 0.9285714328289032, + "rewards/tag_count_reward": 0.9530145823955536, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.544677734375, + "epoch": 0.1488, + "grad_norm": 0.18544988334178925, + "kl": 0.000743865966796875, + "learning_rate": 9.936870248594607e-07, + "loss": 0.0009, + "num_tokens": 22632786.0, + "reward": 2.028047561645508, + "reward_std": 0.9547284841537476, + "rewards/accuracy_reward": 0.08928571362048388, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 1.0816189050674438, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.0537109375, + "epoch": 0.1504, + "grad_norm": 0.21311891078948975, + "kl": 0.0008392333984375, + "learning_rate": 9.932602166237614e-07, + "loss": 0.0126, + "num_tokens": 22883560.0, + "reward": 1.8055466413497925, + "reward_std": 0.9914458692073822, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.830357164144516, + "rewards/tag_count_reward": 0.9484035968780518, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.8839721679688, + "epoch": 0.152, + "grad_norm": 0.20109416544437408, + "kl": 0.0009441375732421875, + "learning_rate": 9.928195572729781e-07, + "loss": 0.0013, + "num_tokens": 23134945.0, + "reward": 1.9409413933753967, + "reward_std": 0.8371607959270477, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.9017857313156128, + "rewards/tag_count_reward": 0.9855841100215912, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 2042.4285888671875, + "epoch": 0.1536, + "grad_norm": 0.1908876597881317, + "kl": 0.000850677490234375, + "learning_rate": 9.923650605769528e-07, + "loss": 0.0013, + "num_tokens": 23381729.0, + "reward": 1.9653971791267395, + "reward_std": 0.8384782671928406, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 0.9743256270885468, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.4911499023438, + "epoch": 0.1552, + "grad_norm": 0.17845752835273743, + "kl": 0.000732421875, + "learning_rate": 9.918967407379211e-07, + "loss": 0.0068, + "num_tokens": 23629080.0, + "reward": 2.079575538635254, + "reward_std": 0.8202497959136963, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.0349326133728027, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 1885.3482666015625, + "epoch": 0.1568, + "grad_norm": 0.21199046075344086, + "kl": 0.00089263916015625, + "learning_rate": 9.914146123900682e-07, + "loss": -0.0224, + "num_tokens": 23859419.0, + "reward": 1.8016860485076904, + "reward_std": 1.1776442527770996, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.857142835855484, + "rewards/tag_count_reward": 0.8195430040359497, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 1926.1787109375, + "epoch": 0.1584, + "grad_norm": 0.17472998797893524, + "kl": 0.000682830810546875, + "learning_rate": 9.90918690599071e-07, + "loss": 0.0107, + "num_tokens": 24100043.0, + "reward": 2.283388137817383, + "reward_std": 0.9286086559295654, + "rewards/accuracy_reward": 0.2053571417927742, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.1941022872924805, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.16, + "grad_norm": 0.19071215391159058, + "kl": 0.000873565673828125, + "learning_rate": 9.904089908616276e-07, + "loss": 0.0, + "num_tokens": 24351553.0, + "reward": 1.8189749717712402, + "reward_std": 0.9319272637367249, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.908260703086853, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.857177734375, + "epoch": 0.1616, + "grad_norm": 0.1905428171157837, + "kl": 0.0008449554443359375, + "learning_rate": 9.898855291049734e-07, + "loss": 0.0078, + "num_tokens": 24597475.0, + "reward": 1.6177130937576294, + "reward_std": 1.1110000014305115, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 0.7159273326396942, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 1974.8214721679688, + "epoch": 0.1632, + "grad_norm": 0.18952271342277527, + "kl": 0.000858306884765625, + "learning_rate": 9.893483216863826e-07, + "loss": -0.0014, + "num_tokens": 24838171.0, + "reward": 2.3227990865707397, + "reward_std": 1.150947391986847, + "rewards/accuracy_reward": 0.2321428507566452, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.206727534532547, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.5982666015625, + "epoch": 0.1648, + "grad_norm": 0.1971505880355835, + "kl": 0.0009288787841796875, + "learning_rate": 9.887973853926581e-07, + "loss": 0.0078, + "num_tokens": 25082720.0, + "reward": 2.0358253121376038, + "reward_std": 1.084350049495697, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 1.1161824464797974, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.1664, + "grad_norm": 0.18338799476623535, + "kl": 0.0007991790771484375, + "learning_rate": 9.88232737439606e-07, + "loss": 0.0, + "num_tokens": 25335910.0, + "reward": 1.647301197052002, + "reward_std": 0.7568677067756653, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 0.7455154061317444, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 2026.6785888671875, + "epoch": 0.168, + "grad_norm": 0.19427067041397095, + "kl": 0.0008678436279296875, + "learning_rate": 9.876543954714983e-07, + "loss": 0.0054, + "num_tokens": 25585340.0, + "reward": 1.790745496749878, + "reward_std": 0.8796463310718536, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 0.8978882133960724, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 2004.7501220703125, + "epoch": 0.1696, + "grad_norm": 0.18772822618484497, + "kl": 0.0008449554443359375, + "learning_rate": 9.87062377560521e-07, + "loss": 0.0005, + "num_tokens": 25828632.0, + "reward": 1.9568280577659607, + "reward_std": 0.9918583333492279, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 0.9746851623058319, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 1892.1161499023438, + "epoch": 0.1712, + "grad_norm": 0.1886899620294571, + "kl": 0.0008525848388671875, + "learning_rate": 9.864567022062098e-07, + "loss": 0.0076, + "num_tokens": 26059687.0, + "reward": 1.9226229190826416, + "reward_std": 0.8764113485813141, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 1.0119085013866425, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 1961.7411499023438, + "epoch": 0.1728, + "grad_norm": 0.18055196106433868, + "kl": 0.00091552734375, + "learning_rate": 9.858373883348724e-07, + "loss": 0.0159, + "num_tokens": 26301466.0, + "reward": 2.3169023990631104, + "reward_std": 1.1062219142913818, + "rewards/accuracy_reward": 0.09821428824216127, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 1.3169023990631104, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.0803833007812, + "epoch": 0.1744, + "grad_norm": 0.19035661220550537, + "kl": 0.00095367431640625, + "learning_rate": 9.852044552989955e-07, + "loss": 0.0002, + "num_tokens": 26551011.0, + "reward": 2.0909934639930725, + "reward_std": 0.9729891419410706, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 1.1713505387306213, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 1991.8750610351562, + "epoch": 0.176, + "grad_norm": 0.1867968887090683, + "kl": 0.00091552734375, + "learning_rate": 9.84557922876642e-07, + "loss": 0.0183, + "num_tokens": 26794093.0, + "reward": 1.8264098763465881, + "reward_std": 1.1650634407997131, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.8214285969734192, + "rewards/tag_count_reward": 0.9424812197685242, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 1956.1786499023438, + "epoch": 0.1776, + "grad_norm": 0.18876078724861145, + "kl": 0.00102996826171875, + "learning_rate": 9.83897811270832e-07, + "loss": 0.0233, + "num_tokens": 27031707.0, + "reward": 1.9083644151687622, + "reward_std": 1.0403705835342407, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 1.0422929227352142, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 1776.5982666015625, + "epoch": 0.1792, + "grad_norm": 0.210955411195755, + "kl": 0.001026153564453125, + "learning_rate": 9.83224141108911e-07, + "loss": -0.0013, + "num_tokens": 27249390.0, + "reward": 2.241396903991699, + "reward_std": 1.0636254847049713, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.2413968741893768, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 1868.3125610351562, + "epoch": 0.1808, + "grad_norm": 0.21245959401130676, + "kl": 0.00115966796875, + "learning_rate": 9.825369334419066e-07, + "loss": 0.0415, + "num_tokens": 27479361.0, + "reward": 2.5927335023880005, + "reward_std": 1.1143870949745178, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.5213046669960022, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 1981.2411499023438, + "epoch": 0.1824, + "grad_norm": 0.18423087894916534, + "kl": 0.001007080078125, + "learning_rate": 9.818362097438694e-07, + "loss": 0.0023, + "num_tokens": 27718228.0, + "reward": 2.209169387817383, + "reward_std": 1.0043182671070099, + "rewards/accuracy_reward": 0.1339285671710968, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.1913121938705444, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.08935546875, + "epoch": 0.184, + "grad_norm": 0.18483412265777588, + "kl": 0.00112152099609375, + "learning_rate": 9.811219919112036e-07, + "loss": 0.0102, + "num_tokens": 27966094.0, + "reward": 2.403584122657776, + "reward_std": 1.085727870464325, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 1.4303699135780334, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 1974.6964721679688, + "epoch": 0.1856, + "grad_norm": 0.18976955115795135, + "kl": 0.00099945068359375, + "learning_rate": 9.803943022619808e-07, + "loss": 0.0085, + "num_tokens": 28211298.0, + "reward": 1.6214704513549805, + "reward_std": 1.1240004301071167, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 0.8178989589214325, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 1901.5536499023438, + "epoch": 0.1872, + "grad_norm": 0.20297886431217194, + "kl": 0.00109100341796875, + "learning_rate": 9.796531635352437e-07, + "loss": 0.0191, + "num_tokens": 28442962.0, + "reward": 2.561613082885742, + "reward_std": 0.7503717541694641, + "rewards/accuracy_reward": 0.13392857182770967, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 1.4901841878890991, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 1998.5714721679688, + "epoch": 0.1888, + "grad_norm": 0.19414573907852173, + "kl": 0.0010986328125, + "learning_rate": 9.788985988902959e-07, + "loss": 0.0058, + "num_tokens": 28688516.0, + "reward": 2.0081594586372375, + "reward_std": 1.2013514041900635, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 1.0528023540973663, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 1944.90185546875, + "epoch": 0.1904, + "grad_norm": 0.17990875244140625, + "kl": 0.001155853271484375, + "learning_rate": 9.781306319059776e-07, + "loss": 0.0092, + "num_tokens": 28923593.0, + "reward": 2.521070122718811, + "reward_std": 1.2792147994041443, + "rewards/accuracy_reward": 0.1428571417927742, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 1.5032129883766174, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.4910888671875, + "epoch": 0.192, + "grad_norm": 0.17426860332489014, + "kl": 0.001094818115234375, + "learning_rate": 9.773492865799279e-07, + "loss": 0.0001, + "num_tokens": 29167220.0, + "reward": 2.052501618862152, + "reward_std": 1.0305362343788147, + "rewards/accuracy_reward": 0.14285714365541935, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 1.034644365310669, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.884033203125, + "epoch": 0.1936, + "grad_norm": 0.1981295943260193, + "kl": 0.001369476318359375, + "learning_rate": 9.765545873278378e-07, + "loss": -0.0003, + "num_tokens": 29416771.0, + "reward": 1.8538218140602112, + "reward_std": 1.021961897611618, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 0.9966788291931152, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.7857666015625, + "epoch": 0.1952, + "grad_norm": 0.18369273841381073, + "kl": 0.001190185546875, + "learning_rate": 9.757465589826837e-07, + "loss": 0.0054, + "num_tokens": 29666185.0, + "reward": 2.02440345287323, + "reward_std": 0.863814115524292, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 1.12261763215065, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 1983.6161499023438, + "epoch": 0.1968, + "grad_norm": 0.2096128612756729, + "kl": 0.00131988525390625, + "learning_rate": 9.749252267939538e-07, + "loss": -0.0006, + "num_tokens": 29910372.0, + "reward": 2.2691932916641235, + "reward_std": 1.0254777073860168, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 1.2513360381126404, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 1900.3929443359375, + "epoch": 0.1984, + "grad_norm": 0.20099978148937225, + "kl": 0.001506805419921875, + "learning_rate": 9.740906164268588e-07, + "loss": -0.0117, + "num_tokens": 30144244.0, + "reward": 2.5708857774734497, + "reward_std": 1.450139045715332, + "rewards/accuracy_reward": 0.22321428824216127, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 1.5708854794502258, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.9375610351562, + "epoch": 0.2, + "grad_norm": 0.19147999584674835, + "kl": 0.0013885498046875, + "learning_rate": 9.73242753961529e-07, + "loss": 0.0207, + "num_tokens": 30389111.0, + "reward": 2.003620207309723, + "reward_std": 1.0822959542274475, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 1.1554058194160461, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 1852.1786499023438, + "epoch": 0.2016, + "grad_norm": 0.199344202876091, + "kl": 0.00145721435546875, + "learning_rate": 9.723816658921996e-07, + "loss": -0.0208, + "num_tokens": 30618759.0, + "reward": 2.454930543899536, + "reward_std": 1.176861047744751, + "rewards/accuracy_reward": 0.2232142835855484, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 1.3924305438995361, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.0178833007812, + "epoch": 0.2032, + "grad_norm": 0.2023949772119522, + "kl": 0.00140380859375, + "learning_rate": 9.715073791263836e-07, + "loss": 0.0014, + "num_tokens": 30863761.0, + "reward": 2.175950288772583, + "reward_std": 0.9374763369560242, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.919642835855484, + "rewards/tag_count_reward": 1.2563072443008423, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.08935546875, + "epoch": 0.2048, + "grad_norm": 0.1900157481431961, + "kl": 0.001369476318359375, + "learning_rate": 9.7061992098403e-07, + "loss": 0.0088, + "num_tokens": 31103745.0, + "reward": 2.1340863704681396, + "reward_std": 1.0493548214435577, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.2412291765213013, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.9554443359375, + "epoch": 0.2064, + "grad_norm": 0.19164487719535828, + "kl": 0.0013275146484375, + "learning_rate": 9.697193191966714e-07, + "loss": 0.0049, + "num_tokens": 31344386.0, + "reward": 2.493199348449707, + "reward_std": 1.0826833248138428, + "rewards/accuracy_reward": 0.2053571492433548, + "rewards/format_reward": 0.9107142686843872, + "rewards/tag_count_reward": 1.3771279454231262, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 1880.8839721679688, + "epoch": 0.208, + "grad_norm": 0.19880811870098114, + "kl": 0.001373291015625, + "learning_rate": 9.688056019065553e-07, + "loss": 0.0098, + "num_tokens": 31575051.0, + "reward": 2.196740508079529, + "reward_std": 1.0288796126842499, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 1.3485261797904968, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.1428833007812, + "epoch": 0.2096, + "grad_norm": 0.1983546018600464, + "kl": 0.00139617919921875, + "learning_rate": 9.678787976657668e-07, + "loss": 0.0012, + "num_tokens": 31828999.0, + "reward": 2.028841197490692, + "reward_std": 0.9513568580150604, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 1.127055436372757, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.794677734375, + "epoch": 0.2112, + "grad_norm": 0.18948617577552795, + "kl": 0.0014190673828125, + "learning_rate": 9.669389354353352e-07, + "loss": 0.0004, + "num_tokens": 32080206.0, + "reward": 2.069916009902954, + "reward_std": 1.1423694491386414, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.857142835855484, + "rewards/tag_count_reward": 1.1413444876670837, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 1785.5625610351562, + "epoch": 0.2128, + "grad_norm": 0.22348791360855103, + "kl": 0.00185394287109375, + "learning_rate": 9.6598604458433e-07, + "loss": -0.012, + "num_tokens": 32300447.0, + "reward": 2.845059037208557, + "reward_std": 1.2160727381706238, + "rewards/accuracy_reward": 0.2678571492433548, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 1.711130440235138, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 1996.3839721679688, + "epoch": 0.2144, + "grad_norm": 0.1856621950864792, + "kl": 0.001827239990234375, + "learning_rate": 9.650201548889417e-07, + "loss": 0.008, + "num_tokens": 32544636.0, + "reward": 2.450540781021118, + "reward_std": 1.164226770401001, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 1.557683527469635, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.6785888671875, + "epoch": 0.216, + "grad_norm": 0.19564266502857208, + "kl": 0.001651763916015625, + "learning_rate": 9.640412965315527e-07, + "loss": 0.0092, + "num_tokens": 32792372.0, + "reward": 2.4906792044639587, + "reward_std": 0.9858261048793793, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 1.446036159992218, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.2500610351562, + "epoch": 0.2176, + "grad_norm": 0.2004474252462387, + "kl": 0.00160980224609375, + "learning_rate": 9.63049500099794e-07, + "loss": 0.0052, + "num_tokens": 33042370.0, + "reward": 1.851273536682129, + "reward_std": 1.1549388766288757, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 1.0477019846439362, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 1795.33935546875, + "epoch": 0.2192, + "grad_norm": 0.21786148846149445, + "kl": 0.001674652099609375, + "learning_rate": 9.620447965855881e-07, + "loss": 0.0039, + "num_tokens": 33266030.0, + "reward": 2.5075560808181763, + "reward_std": 1.0102073848247528, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 1.5075559616088867, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 1984.9733276367188, + "epoch": 0.2208, + "grad_norm": 0.1828126609325409, + "kl": 0.0018310546875, + "learning_rate": 9.610272173841823e-07, + "loss": 0.0136, + "num_tokens": 33507191.0, + "reward": 2.0787981748580933, + "reward_std": 1.0666296780109406, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.8214285969734192, + "rewards/tag_count_reward": 1.1859408915042877, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.6428833007812, + "epoch": 0.2224, + "grad_norm": 0.17558911442756653, + "kl": 0.00196075439453125, + "learning_rate": 9.599967942931663e-07, + "loss": 0.0004, + "num_tokens": 33754069.0, + "reward": 2.5989197492599487, + "reward_std": 1.1930701732635498, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 1.7596338987350464, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.982177734375, + "epoch": 0.224, + "grad_norm": 0.19782274961471558, + "kl": 0.00167083740234375, + "learning_rate": 9.5895355951148e-07, + "loss": -0.0019, + "num_tokens": 34006431.0, + "reward": 1.9635185599327087, + "reward_std": 1.244540274143219, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 1.1420899033546448, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 1968.4019165039062, + "epoch": 0.2256, + "grad_norm": 0.1838637888431549, + "kl": 0.0018157958984375, + "learning_rate": 9.578975456384054e-07, + "loss": 0.0194, + "num_tokens": 34245792.0, + "reward": 2.6848630905151367, + "reward_std": 1.297763168811798, + "rewards/accuracy_reward": 0.2142857164144516, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 1.6402199864387512, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 2004.3929443359375, + "epoch": 0.2272, + "grad_norm": 0.18045230209827423, + "kl": 0.001949310302734375, + "learning_rate": 9.568287856725497e-07, + "loss": 0.0223, + "num_tokens": 34489506.0, + "reward": 2.3562939167022705, + "reward_std": 1.3892123699188232, + "rewards/accuracy_reward": 0.09523809887468815, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 1.4991509318351746, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.2288, + "grad_norm": 0.19414274394512177, + "kl": 0.00191497802734375, + "learning_rate": 9.55747313010813e-07, + "loss": 0.0001, + "num_tokens": 34738426.0, + "reward": 2.0270004272460938, + "reward_std": 1.3003739714622498, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 1.2502146363258362, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.732177734375, + "epoch": 0.2304, + "grad_norm": 0.19759587943553925, + "kl": 0.00206756591796875, + "learning_rate": 9.54653161447346e-07, + "loss": 0.0009, + "num_tokens": 34986140.0, + "reward": 2.4790889024734497, + "reward_std": 1.3382359147071838, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 1.6219459176063538, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 1936.71435546875, + "epoch": 0.232, + "grad_norm": 0.17966888844966888, + "kl": 0.00174713134765625, + "learning_rate": 9.535463651724919e-07, + "loss": 0.0092, + "num_tokens": 35224122.0, + "reward": 2.536754608154297, + "reward_std": 0.9390050172805786, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.9196428656578064, + "rewards/tag_count_reward": 1.4921115040779114, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 1955.8572387695312, + "epoch": 0.2336, + "grad_norm": 0.20333611965179443, + "kl": 0.00191497802734375, + "learning_rate": 9.524269587717203e-07, + "loss": -0.0028, + "num_tokens": 35464780.0, + "reward": 2.645366907119751, + "reward_std": 0.9237397015094757, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.9375, + "rewards/tag_count_reward": 1.5917953848838806, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 2009.3035888671875, + "epoch": 0.2352, + "grad_norm": 0.18414156138896942, + "kl": 0.0019989013671875, + "learning_rate": 9.512949772245449e-07, + "loss": 0.0092, + "num_tokens": 35709856.0, + "reward": 2.5110520124435425, + "reward_std": 1.0912683308124542, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 1.6806948781013489, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.9107666015625, + "epoch": 0.2368, + "grad_norm": 0.17920136451721191, + "kl": 0.00206756591796875, + "learning_rate": 9.501504559034311e-07, + "loss": -0.0084, + "num_tokens": 35945984.0, + "reward": 2.347660541534424, + "reward_std": 1.3760259747505188, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 1.535160481929779, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 1984.1697387695312, + "epoch": 0.2384, + "grad_norm": 0.20252735912799835, + "kl": 0.00228118896484375, + "learning_rate": 9.489934305726907e-07, + "loss": 0.007, + "num_tokens": 36188567.0, + "reward": 2.7513363361358643, + "reward_std": 1.1908563375473022, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 1.9031217694282532, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 1941.7679443359375, + "epoch": 0.24, + "grad_norm": 0.21546600759029388, + "kl": 0.0027008056640625, + "learning_rate": 9.478239373873638e-07, + "loss": 0.0204, + "num_tokens": 36426289.0, + "reward": 2.639132857322693, + "reward_std": 1.474111020565033, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 1.7819899916648865, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.669677734375, + "epoch": 0.2416, + "grad_norm": 0.1785498708486557, + "kl": 0.00222015380859375, + "learning_rate": 9.466420128920899e-07, + "loss": 0.0011, + "num_tokens": 36675928.0, + "reward": 2.408002257347107, + "reward_std": 1.450328767299652, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 1.6133593320846558, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.2432, + "grad_norm": 0.20229867100715637, + "kl": 0.0024871826171875, + "learning_rate": 9.454476940199651e-07, + "loss": 0.0001, + "num_tokens": 36926472.0, + "reward": 2.4465404748916626, + "reward_std": 1.1793063879013062, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 1.6340402960777283, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 1985.419677734375, + "epoch": 0.2448, + "grad_norm": 0.19798661768436432, + "kl": 0.001953125, + "learning_rate": 9.442410180913881e-07, + "loss": -0.0067, + "num_tokens": 37170749.0, + "reward": 2.2579965591430664, + "reward_std": 0.9552808701992035, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.3651392459869385, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 1898.6250610351562, + "epoch": 0.2464, + "grad_norm": 0.1807425171136856, + "kl": 0.00264739990234375, + "learning_rate": 9.430220228128951e-07, + "loss": -0.0013, + "num_tokens": 37401595.0, + "reward": 2.90531063079834, + "reward_std": 1.2447484731674194, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.1106674671173096, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 2041.169677734375, + "epoch": 0.248, + "grad_norm": 0.19562363624572754, + "kl": 0.0025787353515625, + "learning_rate": 9.417907462759797e-07, + "loss": 0.0036, + "num_tokens": 37650940.0, + "reward": 2.6775286197662354, + "reward_std": 1.1324981451034546, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8928571343421936, + "rewards/tag_count_reward": 1.7846713662147522, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 1977.134033203125, + "epoch": 0.2496, + "grad_norm": 0.18219788372516632, + "kl": 0.00234222412109375, + "learning_rate": 9.40547226955904e-07, + "loss": 0.0122, + "num_tokens": 37889473.0, + "reward": 2.662699818611145, + "reward_std": 1.511205792427063, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 1.7519855499267578, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.71435546875, + "epoch": 0.2512, + "grad_norm": 0.1782846599817276, + "kl": 0.00238037109375, + "learning_rate": 9.39291503710496e-07, + "loss": 0.0031, + "num_tokens": 38136667.0, + "reward": 2.5118582248687744, + "reward_std": 1.346319317817688, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 1.6815008521080017, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.2528, + "grad_norm": 0.18420282006263733, + "kl": 0.002094268798828125, + "learning_rate": 9.380236157789353e-07, + "loss": 0.0001, + "num_tokens": 38385923.0, + "reward": 2.1016871333122253, + "reward_std": 1.216222882270813, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 1.2802583575248718, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 2009.7054443359375, + "epoch": 0.2544, + "grad_norm": 0.17066477239131927, + "kl": 0.00237274169921875, + "learning_rate": 9.367436027805265e-07, + "loss": 0.0022, + "num_tokens": 38630876.0, + "reward": 2.7085288763046265, + "reward_std": 1.3586682081222534, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 1.8960288166999817, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 1857.3036499023438, + "epoch": 0.256, + "grad_norm": 0.19021914899349213, + "kl": 0.00286865234375, + "learning_rate": 9.354515047134619e-07, + "loss": -0.0038, + "num_tokens": 38856702.0, + "reward": 2.7227373123168945, + "reward_std": 1.3205369114875793, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 1.8120229840278625, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 1987.0089721679688, + "epoch": 0.2576, + "grad_norm": 0.18428273499011993, + "kl": 0.00257110595703125, + "learning_rate": 9.341473619535713e-07, + "loss": -0.0035, + "num_tokens": 39097713.0, + "reward": 2.737136721611023, + "reward_std": 1.2027007937431335, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 1.8264224529266357, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 2016.3929443359375, + "epoch": 0.2592, + "grad_norm": 0.1778215765953064, + "kl": 0.0028076171875, + "learning_rate": 9.328312152530602e-07, + "loss": 0.0122, + "num_tokens": 39342211.0, + "reward": 2.7062898874282837, + "reward_std": 1.308540165424347, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 1.8045040965080261, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 1935.2947387695312, + "epoch": 0.2608, + "grad_norm": 0.1787494271993637, + "kl": 0.0029449462890625, + "learning_rate": 9.315031057392365e-07, + "loss": -0.0026, + "num_tokens": 39578830.0, + "reward": 2.8691320419311523, + "reward_std": 1.3374991416931152, + "rewards/accuracy_reward": 0.1517857126891613, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 1.9494889378547668, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.58935546875, + "epoch": 0.2624, + "grad_norm": 0.19417735934257507, + "kl": 0.002532958984375, + "learning_rate": 9.301630749132254e-07, + "loss": -0.001, + "num_tokens": 39823574.0, + "reward": 2.2719223499298096, + "reward_std": 1.1875560879707336, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 1.477279245853424, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.6339721679688, + "epoch": 0.264, + "grad_norm": 0.17820978164672852, + "kl": 0.00286865234375, + "learning_rate": 9.288111646486724e-07, + "loss": 0.0101, + "num_tokens": 40068379.0, + "reward": 2.7197179794311523, + "reward_std": 1.4722105264663696, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 1.8625749945640564, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.0804443359375, + "epoch": 0.2656, + "grad_norm": 0.18839386105537415, + "kl": 0.0032196044921875, + "learning_rate": 9.274474171904349e-07, + "loss": 0.0073, + "num_tokens": 40311036.0, + "reward": 3.046691060066223, + "reward_std": 1.483076810836792, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.260976552963257, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 2010.5714721679688, + "epoch": 0.2672, + "grad_norm": 0.1758975386619568, + "kl": 0.0027313232421875, + "learning_rate": 9.260718751532621e-07, + "loss": -0.0043, + "num_tokens": 40556870.0, + "reward": 2.582813799381256, + "reward_std": 1.3750835061073303, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 1.6899564862251282, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 1824.52685546875, + "epoch": 0.2688, + "grad_norm": 0.19979457557201385, + "kl": 0.00313568115234375, + "learning_rate": 9.246845815204636e-07, + "loss": -0.0158, + "num_tokens": 40783323.0, + "reward": 2.8962193727493286, + "reward_std": 1.3149868845939636, + "rewards/accuracy_reward": 0.2410714253783226, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 1.8962193727493286, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.9375610351562, + "epoch": 0.2704, + "grad_norm": 0.19459320604801178, + "kl": 0.00322723388671875, + "learning_rate": 9.232855796425658e-07, + "loss": -0.0003, + "num_tokens": 41031760.0, + "reward": 2.6312978267669678, + "reward_std": 1.4839596152305603, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 1.8009403944015503, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 1931.5894165039062, + "epoch": 0.272, + "grad_norm": 0.1898786425590515, + "kl": 0.0029144287109375, + "learning_rate": 9.218749132359577e-07, + "loss": 0.0072, + "num_tokens": 41269980.0, + "reward": 2.7799973487854004, + "reward_std": 1.328313410282135, + "rewards/accuracy_reward": 0.16071428824216127, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 1.8067830801010132, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.2410888671875, + "epoch": 0.2736, + "grad_norm": 0.19318215548992157, + "kl": 0.0027618408203125, + "learning_rate": 9.204526263815244e-07, + "loss": 0.002, + "num_tokens": 41511787.0, + "reward": 2.5259042978286743, + "reward_std": 1.3321889638900757, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.7589285969734192, + "rewards/tag_count_reward": 1.6776897311210632, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 1921.1787109375, + "epoch": 0.2752, + "grad_norm": 0.18998001515865326, + "kl": 0.0031280517578125, + "learning_rate": 9.190187635232703e-07, + "loss": -0.0198, + "num_tokens": 41748673.0, + "reward": 2.5858575105667114, + "reward_std": 1.225433111190796, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 1.7376431226730347, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 1820.1876220703125, + "epoch": 0.2768, + "grad_norm": 0.20242632925510406, + "kl": 0.0032958984375, + "learning_rate": 9.175733694669292e-07, + "loss": 0.0037, + "num_tokens": 41973380.0, + "reward": 3.3245655298233032, + "reward_std": 1.2330602407455444, + "rewards/accuracy_reward": 0.2232142835855484, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.2799227237701416, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.294677734375, + "epoch": 0.2784, + "grad_norm": 0.20798489451408386, + "kl": 0.00347137451171875, + "learning_rate": 9.161164893785657e-07, + "loss": 0.004, + "num_tokens": 42220779.0, + "reward": 2.4281100034713745, + "reward_std": 1.2546451687812805, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 1.651324212551117, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 2004.96435546875, + "epoch": 0.28, + "grad_norm": 0.2021005004644394, + "kl": 0.00333404541015625, + "learning_rate": 9.146481687831627e-07, + "loss": 0.007, + "num_tokens": 42466937.0, + "reward": 2.724056601524353, + "reward_std": 1.4038877487182617, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.8482142984867096, + "rewards/tag_count_reward": 1.7954849004745483, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.419677734375, + "epoch": 0.2816, + "grad_norm": 0.20509660243988037, + "kl": 0.0036163330078125, + "learning_rate": 9.131684535631987e-07, + "loss": 0.0011, + "num_tokens": 42718074.0, + "reward": 2.577963352203369, + "reward_std": 1.4450840950012207, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 1.8279631733894348, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 2009.357177734375, + "epoch": 0.2832, + "grad_norm": 0.18589240312576294, + "kl": 0.00353240966796875, + "learning_rate": 9.116773899572154e-07, + "loss": 0.0066, + "num_tokens": 42961490.0, + "reward": 2.9963479042053223, + "reward_std": 1.3788501024246216, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.0945619344711304, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.0357666015625, + "epoch": 0.2848, + "grad_norm": 0.19317328929901123, + "kl": 0.00345611572265625, + "learning_rate": 9.101750245583711e-07, + "loss": 0.0124, + "num_tokens": 43204786.0, + "reward": 2.498630404472351, + "reward_std": 1.3702949285507202, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 1.757558822631836, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 1955.3214721679688, + "epoch": 0.2864, + "grad_norm": 0.18019506335258484, + "kl": 0.00348663330078125, + "learning_rate": 9.086614043129865e-07, + "loss": -0.0025, + "num_tokens": 43444586.0, + "reward": 2.7696298360824585, + "reward_std": 1.3129031658172607, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 1.9928439855575562, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.9732666015625, + "epoch": 0.288, + "grad_norm": 0.19656753540039062, + "kl": 0.003570556640625, + "learning_rate": 9.071365765190756e-07, + "loss": 0.0001, + "num_tokens": 43694021.0, + "reward": 2.3433433771133423, + "reward_std": 1.3831621408462524, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 1.6290575861930847, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.02685546875, + "epoch": 0.2896, + "grad_norm": 0.19680939614772797, + "kl": 0.0041656494140625, + "learning_rate": 9.056005888248699e-07, + "loss": 0.0009, + "num_tokens": 43942258.0, + "reward": 2.8307249546051025, + "reward_std": 1.3184887766838074, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.0360819697380066, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 2007.6161499023438, + "epoch": 0.2912, + "grad_norm": 0.1729881763458252, + "kl": 0.00382232666015625, + "learning_rate": 9.040534892273281e-07, + "loss": 0.0049, + "num_tokens": 44186095.0, + "reward": 2.850070357322693, + "reward_std": 1.390480637550354, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.0464988350868225, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.4910888671875, + "epoch": 0.2928, + "grad_norm": 0.19782409071922302, + "kl": 0.00379180908203125, + "learning_rate": 9.024953260706365e-07, + "loss": 0.0025, + "num_tokens": 44436918.0, + "reward": 2.2910720109939575, + "reward_std": 1.4133784174919128, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 1.5767860412597656, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 1975.759033203125, + "epoch": 0.2944, + "grad_norm": 0.16854418814182281, + "kl": 0.0042724609375, + "learning_rate": 9.009261480446988e-07, + "loss": 0.0051, + "num_tokens": 44674695.0, + "reward": 2.8301870822906494, + "reward_std": 1.5645469427108765, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.6160714328289032, + "rewards/tag_count_reward": 2.0980440974235535, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 1984.1428833007812, + "epoch": 0.296, + "grad_norm": 0.19186586141586304, + "kl": 0.004364013671875, + "learning_rate": 8.993460041836142e-07, + "loss": 0.0024, + "num_tokens": 44920775.0, + "reward": 2.868486762046814, + "reward_std": 1.4330458641052246, + "rewards/accuracy_reward": 0.14285713993012905, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.020272195339203, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 2004.2679443359375, + "epoch": 0.2976, + "grad_norm": 0.19371843338012695, + "kl": 0.00433349609375, + "learning_rate": 8.977549438641452e-07, + "loss": 0.0079, + "num_tokens": 45166939.0, + "reward": 3.3741060495376587, + "reward_std": 1.3059431314468384, + "rewards/accuracy_reward": 0.12500000186264515, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.4455344676971436, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.1964721679688, + "epoch": 0.2992, + "grad_norm": 0.18221555650234222, + "kl": 0.004364013671875, + "learning_rate": 8.96153016804175e-07, + "loss": 0.0007, + "num_tokens": 45416021.0, + "reward": 2.6063982248306274, + "reward_std": 1.3395533561706543, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 1.936755120754242, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.3008, + "grad_norm": 0.1893230527639389, + "kl": 0.0038604736328125, + "learning_rate": 8.94540273061153e-07, + "loss": 0.0002, + "num_tokens": 45668679.0, + "reward": 2.1663967967033386, + "reward_std": 1.363420009613037, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.6517857015132904, + "rewards/tag_count_reward": 1.505682349205017, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 1806.419677734375, + "epoch": 0.3024, + "grad_norm": 0.21161283552646637, + "kl": 0.004364013671875, + "learning_rate": 8.929167630305322e-07, + "loss": 0.017, + "num_tokens": 45891508.0, + "reward": 3.3130438327789307, + "reward_std": 1.243873804807663, + "rewards/accuracy_reward": 0.12499999720603228, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 2.339829444885254, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.4107666015625, + "epoch": 0.304, + "grad_norm": 0.17927788197994232, + "kl": 0.004852294921875, + "learning_rate": 8.912825374441926e-07, + "loss": 0.005, + "num_tokens": 46135616.0, + "reward": 2.8110532760620117, + "reward_std": 1.4588611721992493, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.0967674255371094, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 1942.4732666015625, + "epoch": 0.3056, + "grad_norm": 0.190276101231575, + "kl": 0.0048065185546875, + "learning_rate": 8.896376473688572e-07, + "loss": -0.0061, + "num_tokens": 46371387.0, + "reward": 3.383459210395813, + "reward_std": 1.3481693863868713, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.4638161659240723, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.3035888671875, + "epoch": 0.3072, + "grad_norm": 0.18835295736789703, + "kl": 0.0042724609375, + "learning_rate": 8.87982144204496e-07, + "loss": 0.0023, + "num_tokens": 46618381.0, + "reward": 2.691133499145508, + "reward_std": 1.3572381138801575, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 1.932204782962799, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.3088, + "grad_norm": 0.20698517560958862, + "kl": 0.00426483154296875, + "learning_rate": 8.863160796827192e-07, + "loss": 0.0002, + "num_tokens": 46867343.0, + "reward": 2.306548833847046, + "reward_std": 1.4258261322975159, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 1.583334505558014, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 1980.4732666015625, + "epoch": 0.3104, + "grad_norm": 0.18495045602321625, + "kl": 0.004730224609375, + "learning_rate": 8.846395058651617e-07, + "loss": 0.0086, + "num_tokens": 47112466.0, + "reward": 3.0054091215133667, + "reward_std": 1.362310767173767, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.6607142984867096, + "rewards/tag_count_reward": 2.2196946144104004, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 1978.8035888671875, + "epoch": 0.312, + "grad_norm": 0.17692245543003082, + "kl": 0.0047454833984375, + "learning_rate": 8.829524751418549e-07, + "loss": 0.0027, + "num_tokens": 47354504.0, + "reward": 3.101409077644348, + "reward_std": 1.5126274228096008, + "rewards/accuracy_reward": 0.2321428507566452, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.1371230483055115, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.3304443359375, + "epoch": 0.3136, + "grad_norm": 0.18617282807826996, + "kl": 0.00469970703125, + "learning_rate": 8.812550402295912e-07, + "loss": 0.0083, + "num_tokens": 47600913.0, + "reward": 3.144892692565918, + "reward_std": 1.5188844203948975, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.2341784238815308, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.357177734375, + "epoch": 0.3152, + "grad_norm": 0.16865494847297668, + "kl": 0.00439453125, + "learning_rate": 8.795472541702759e-07, + "loss": -0.0045, + "num_tokens": 47847045.0, + "reward": 2.8418890237808228, + "reward_std": 1.3128314018249512, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.1097460985183716, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.83935546875, + "epoch": 0.3168, + "grad_norm": 0.1837884783744812, + "kl": 0.00445556640625, + "learning_rate": 8.77829170329269e-07, + "loss": 0.0002, + "num_tokens": 48095107.0, + "reward": 2.6048643589019775, + "reward_std": 1.4826075434684753, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 1.8816499710083008, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.5625610351562, + "epoch": 0.3184, + "grad_norm": 0.18343526124954224, + "kl": 0.0044403076171875, + "learning_rate": 8.761008423937193e-07, + "loss": 0.0034, + "num_tokens": 48343572.0, + "reward": 2.373244524002075, + "reward_std": 1.5314162373542786, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.6607142984867096, + "rewards/tag_count_reward": 1.6857443451881409, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.884033203125, + "epoch": 0.32, + "grad_norm": 0.1762218028306961, + "kl": 0.004913330078125, + "learning_rate": 8.74362324370885e-07, + "loss": 0.0065, + "num_tokens": 48589973.0, + "reward": 2.9657039642333984, + "reward_std": 1.5113163590431213, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.224632501602173, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.3660888671875, + "epoch": 0.3216, + "grad_norm": 0.17960911989212036, + "kl": 0.0044403076171875, + "learning_rate": 8.726136705864476e-07, + "loss": 0.0011, + "num_tokens": 48841636.0, + "reward": 3.0598315000534058, + "reward_std": 1.3100690245628357, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.193759799003601, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.4553833007812, + "epoch": 0.3232, + "grad_norm": 0.18656551837921143, + "kl": 0.00389862060546875, + "learning_rate": 8.70854935682813e-07, + "loss": 0.0014, + "num_tokens": 49089417.0, + "reward": 2.470710277557373, + "reward_std": 1.3912266492843628, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 1.7117815613746643, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 1955.3483276367188, + "epoch": 0.3248, + "grad_norm": 0.18237388134002686, + "kl": 0.0045318603515625, + "learning_rate": 8.690861746174052e-07, + "loss": 0.0047, + "num_tokens": 49330242.0, + "reward": 3.3656238317489624, + "reward_std": 1.3094004392623901, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 2.4549094438552856, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.5982666015625, + "epoch": 0.3264, + "grad_norm": 0.1730743944644928, + "kl": 0.0045166015625, + "learning_rate": 8.673074426609479e-07, + "loss": -0.001, + "num_tokens": 49575365.0, + "reward": 2.8676246404647827, + "reward_std": 1.448303759098053, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 1.9926244616508484, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 1971.1429443359375, + "epoch": 0.328, + "grad_norm": 0.1951436996459961, + "kl": 0.004730224609375, + "learning_rate": 8.655187953957385e-07, + "loss": 0.0044, + "num_tokens": 49814543.0, + "reward": 3.2238101959228516, + "reward_std": 1.2805925011634827, + "rewards/accuracy_reward": 0.1964285746216774, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.223810076713562, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 1995.3750610351562, + "epoch": 0.3296, + "grad_norm": 0.19258084893226624, + "kl": 0.0050506591796875, + "learning_rate": 8.6372028871391e-07, + "loss": 0.0077, + "num_tokens": 50056043.0, + "reward": 3.147221088409424, + "reward_std": 1.4533080458641052, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.3257923126220703, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 1948.0804443359375, + "epoch": 0.3312, + "grad_norm": 0.18906666338443756, + "kl": 0.0048980712890625, + "learning_rate": 8.619119788156856e-07, + "loss": 0.0079, + "num_tokens": 50294878.0, + "reward": 2.940903425216675, + "reward_std": 1.4975730776786804, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.1551889181137085, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.52685546875, + "epoch": 0.3328, + "grad_norm": 0.19643443822860718, + "kl": 0.00432586669921875, + "learning_rate": 8.600939222076218e-07, + "loss": 0.0007, + "num_tokens": 50540189.0, + "reward": 2.8533190488815308, + "reward_std": 1.1489585041999817, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.031890392303467, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 1855.919677734375, + "epoch": 0.3344, + "grad_norm": 0.19533519446849823, + "kl": 0.004791259765625, + "learning_rate": 8.58266175700843e-07, + "loss": -0.0155, + "num_tokens": 50766980.0, + "reward": 3.152464747428894, + "reward_std": 1.357027530670166, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 2.286393105983734, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.336, + "grad_norm": 0.17954601347446442, + "kl": 0.0045318603515625, + "learning_rate": 8.564287964092662e-07, + "loss": 0.0002, + "num_tokens": 51017580.0, + "reward": 2.6247535943984985, + "reward_std": 1.3123067617416382, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 1.8211821913719177, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 1929.3214721679688, + "epoch": 0.3376, + "grad_norm": 0.19151780009269714, + "kl": 0.00537109375, + "learning_rate": 8.545818417478162e-07, + "loss": -0.0047, + "num_tokens": 51251570.0, + "reward": 3.1753450632095337, + "reward_std": 1.365180253982544, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.6785714328289032, + "rewards/tag_count_reward": 2.336059272289276, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 1968.6697387695312, + "epoch": 0.3392, + "grad_norm": 0.18628749251365662, + "kl": 0.004486083984375, + "learning_rate": 8.527253694306317e-07, + "loss": 0.0021, + "num_tokens": 51492207.0, + "reward": 2.8268449306488037, + "reward_std": 1.3861590027809143, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.112558960914612, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 1899.15185546875, + "epoch": 0.3408, + "grad_norm": 0.1829199343919754, + "kl": 0.005035400390625, + "learning_rate": 8.508594374692614e-07, + "loss": -0.0102, + "num_tokens": 51723350.0, + "reward": 2.9703420400619507, + "reward_std": 1.3894107937812805, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6785714328289032, + "rewards/tag_count_reward": 2.291770577430725, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.6964721679688, + "epoch": 0.3424, + "grad_norm": 0.17923219501972198, + "kl": 0.0047149658203125, + "learning_rate": 8.489841041708517e-07, + "loss": 0.0013, + "num_tokens": 51971060.0, + "reward": 2.7653404474258423, + "reward_std": 1.3720880150794983, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.0421260595321655, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 2008.3035888671875, + "epoch": 0.344, + "grad_norm": 0.20746184885501862, + "kl": 0.0051116943359375, + "learning_rate": 8.470994281363246e-07, + "loss": -0.0009, + "num_tokens": 52218936.0, + "reward": 2.425147533416748, + "reward_std": 1.5362690091133118, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6160714030265808, + "rewards/tag_count_reward": 1.809075951576233, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 2021.919677734375, + "epoch": 0.3456, + "grad_norm": 0.18841612339019775, + "kl": 0.005157470703125, + "learning_rate": 8.452054682585464e-07, + "loss": 0.0018, + "num_tokens": 52464417.0, + "reward": 2.455490231513977, + "reward_std": 1.5965161323547363, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.6428571343421936, + "rewards/tag_count_reward": 1.7590615153312683, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 1920.0804443359375, + "epoch": 0.3472, + "grad_norm": 0.20596633851528168, + "kl": 0.004669189453125, + "learning_rate": 8.433022837204872e-07, + "loss": 0.0175, + "num_tokens": 52698660.0, + "reward": 3.174674153327942, + "reward_std": 1.5698022246360779, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.156816840171814, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.1250610351562, + "epoch": 0.3488, + "grad_norm": 0.19105574488639832, + "kl": 0.0046234130859375, + "learning_rate": 8.413899339933723e-07, + "loss": 0.0025, + "num_tokens": 52947860.0, + "reward": 2.6989803314208984, + "reward_std": 1.3166117668151855, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 1.9043374061584473, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 1982.8483276367188, + "epoch": 0.3504, + "grad_norm": 0.18312574923038483, + "kl": 0.004791259765625, + "learning_rate": 8.394684788348224e-07, + "loss": 0.0145, + "num_tokens": 53190813.0, + "reward": 3.2730246782302856, + "reward_std": 1.620066523551941, + "rewards/accuracy_reward": 0.1428571455180645, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.3712387084960938, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 1948.0982666015625, + "epoch": 0.352, + "grad_norm": 0.20302268862724304, + "kl": 0.0054473876953125, + "learning_rate": 8.375379782869884e-07, + "loss": 0.0184, + "num_tokens": 53427830.0, + "reward": 2.9743258953094482, + "reward_std": 1.5162199139595032, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.135040044784546, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.02685546875, + "epoch": 0.3536, + "grad_norm": 0.1985490769147873, + "kl": 0.0052642822265625, + "learning_rate": 8.35598492674673e-07, + "loss": 0.0038, + "num_tokens": 53680225.0, + "reward": 2.8662047386169434, + "reward_std": 1.555790364742279, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.0804905891418457, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.21435546875, + "epoch": 0.3552, + "grad_norm": 0.20105598866939545, + "kl": 0.004638671875, + "learning_rate": 8.336500826034468e-07, + "loss": 0.0089, + "num_tokens": 53924717.0, + "reward": 2.892626404762268, + "reward_std": 1.465615153312683, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.169412136077881, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.65185546875, + "epoch": 0.3568, + "grad_norm": 0.19225962460041046, + "kl": 0.004425048828125, + "learning_rate": 8.316928089577546e-07, + "loss": 0.0005, + "num_tokens": 54172898.0, + "reward": 2.624147057533264, + "reward_std": 1.278559923171997, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 1.8295040130615234, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 2000.5625610351562, + "epoch": 0.3584, + "grad_norm": 0.1756444275379181, + "kl": 0.00457763671875, + "learning_rate": 8.297267328990128e-07, + "loss": 0.0097, + "num_tokens": 54418325.0, + "reward": 2.7878557443618774, + "reward_std": 1.4472312331199646, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 1.9664268493652344, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 2030.8036499023438, + "epoch": 0.36, + "grad_norm": 0.1926804780960083, + "kl": 0.004974365234375, + "learning_rate": 8.27751915863697e-07, + "loss": 0.0002, + "num_tokens": 54675189.0, + "reward": 2.8549516201019287, + "reward_std": 1.5149240493774414, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.078165829181671, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.7589721679688, + "epoch": 0.3616, + "grad_norm": 0.19321848452091217, + "kl": 0.004913330078125, + "learning_rate": 8.257684195614243e-07, + "loss": 0.0045, + "num_tokens": 54923242.0, + "reward": 3.177859902381897, + "reward_std": 1.443784236907959, + "rewards/accuracy_reward": 0.12499999720603228, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.2939311265945435, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 1692.02685546875, + "epoch": 0.3632, + "grad_norm": 0.20032915472984314, + "kl": 0.0052337646484375, + "learning_rate": 8.237763059730231e-07, + "loss": -0.0011, + "num_tokens": 55131411.0, + "reward": 3.6385324001312256, + "reward_std": 1.4031717777252197, + "rewards/accuracy_reward": 0.1696428507566452, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.701032042503357, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 1982.0714721679688, + "epoch": 0.3648, + "grad_norm": 0.19999070465564728, + "kl": 0.005279541015625, + "learning_rate": 8.217756373485975e-07, + "loss": 0.0071, + "num_tokens": 55374641.0, + "reward": 3.0676087141036987, + "reward_std": 1.3920780420303345, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.2283228039741516, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.40185546875, + "epoch": 0.3664, + "grad_norm": 0.18938691914081573, + "kl": 0.0048980712890625, + "learning_rate": 8.197664762055816e-07, + "loss": 0.0124, + "num_tokens": 55617964.0, + "reward": 2.5855820178985596, + "reward_std": 1.524293839931488, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 1.7909390330314636, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.2857666015625, + "epoch": 0.368, + "grad_norm": 0.1814538985490799, + "kl": 0.004913330078125, + "learning_rate": 8.177488853267858e-07, + "loss": -0.0106, + "num_tokens": 55860882.0, + "reward": 2.6684749126434326, + "reward_std": 1.3504092693328857, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 1.9006178975105286, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.3696, + "grad_norm": 0.19926764070987701, + "kl": 0.0051116943359375, + "learning_rate": 8.157229277584356e-07, + "loss": 0.0002, + "num_tokens": 56113582.0, + "reward": 2.8791091442108154, + "reward_std": 1.3986335396766663, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.0933948159217834, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.7500610351562, + "epoch": 0.3712, + "grad_norm": 0.19891411066055298, + "kl": 0.005523681640625, + "learning_rate": 8.136886668082005e-07, + "loss": 0.0168, + "num_tokens": 56358344.0, + "reward": 3.5349191427230835, + "reward_std": 1.3189321160316467, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.534919023513794, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 1898.27685546875, + "epoch": 0.3728, + "grad_norm": 0.20571205019950867, + "kl": 0.0047760009765625, + "learning_rate": 8.116461660432166e-07, + "loss": 0.0026, + "num_tokens": 56593575.0, + "reward": 2.986366033554077, + "reward_std": 1.323690116405487, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 2.245294451713562, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 1973.571533203125, + "epoch": 0.3744, + "grad_norm": 0.20184798538684845, + "kl": 0.0054473876953125, + "learning_rate": 8.095954892881005e-07, + "loss": 0.0134, + "num_tokens": 56833277.0, + "reward": 2.884376645088196, + "reward_std": 1.607332706451416, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.6517857313156128, + "rewards/tag_count_reward": 2.134376585483551, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 2010.21435546875, + "epoch": 0.376, + "grad_norm": 0.1921064555644989, + "kl": 0.005218505859375, + "learning_rate": 8.075367006229535e-07, + "loss": 0.0091, + "num_tokens": 57080331.0, + "reward": 3.17162823677063, + "reward_std": 1.39780855178833, + "rewards/accuracy_reward": 0.2321428582072258, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.162699580192566, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 1873.1161499023438, + "epoch": 0.3776, + "grad_norm": 0.18873317539691925, + "kl": 0.00543212890625, + "learning_rate": 8.054698643813603e-07, + "loss": -0.0013, + "num_tokens": 57310238.0, + "reward": 3.1395496129989624, + "reward_std": 1.2801710367202759, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.669642835855484, + "rewards/tag_count_reward": 2.3449066877365112, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 1945.8036499023438, + "epoch": 0.3792, + "grad_norm": 0.1878959983587265, + "kl": 0.0053863525390625, + "learning_rate": 8.033950451483787e-07, + "loss": -0.0288, + "num_tokens": 57547418.0, + "reward": 2.878166913986206, + "reward_std": 1.3352981209754944, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.208523988723755, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 1933.7054443359375, + "epoch": 0.3808, + "grad_norm": 0.1873466968536377, + "kl": 0.0050048828125, + "learning_rate": 8.01312307758521e-07, + "loss": -0.0064, + "num_tokens": 57783677.0, + "reward": 3.090421199798584, + "reward_std": 1.3635611534118652, + "rewards/accuracy_reward": 0.1517857126891613, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.224349617958069, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 1959.6785888671875, + "epoch": 0.3824, + "grad_norm": 0.1905670017004013, + "kl": 0.005401611328125, + "learning_rate": 7.992217172937283e-07, + "loss": -0.0146, + "num_tokens": 58020927.0, + "reward": 3.0559548139572144, + "reward_std": 1.4843246936798096, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.261311888694763, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.8839721679688, + "epoch": 0.384, + "grad_norm": 0.18875561654567719, + "kl": 0.004913330078125, + "learning_rate": 7.971233390813365e-07, + "loss": -0.001, + "num_tokens": 58265816.0, + "reward": 3.1196141242980957, + "reward_std": 1.3198893666267395, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 2.2356855273246765, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 1970.3572387695312, + "epoch": 0.3856, + "grad_norm": 0.1774861067533493, + "kl": 0.0048370361328125, + "learning_rate": 7.950172386920353e-07, + "loss": 0.0198, + "num_tokens": 58507146.0, + "reward": 3.172262668609619, + "reward_std": 1.4735772609710693, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.3329769372940063, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.446533203125, + "epoch": 0.3872, + "grad_norm": 0.19256854057312012, + "kl": 0.0054473876953125, + "learning_rate": 7.929034819378191e-07, + "loss": -0.0005, + "num_tokens": 58756018.0, + "reward": 3.0550564527511597, + "reward_std": 1.402879536151886, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.287199020385742, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.919677734375, + "epoch": 0.3888, + "grad_norm": 0.18801355361938477, + "kl": 0.0054779052734375, + "learning_rate": 7.907821348699303e-07, + "loss": -0.0059, + "num_tokens": 59001135.0, + "reward": 2.916814684867859, + "reward_std": 1.2081127762794495, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.211457371711731, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 2033.044677734375, + "epoch": 0.3904, + "grad_norm": 0.1874251812696457, + "kl": 0.00453948974609375, + "learning_rate": 7.886532637767957e-07, + "loss": 0.0024, + "num_tokens": 59248394.0, + "reward": 2.7856675386428833, + "reward_std": 1.3530530333518982, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 1.955310344696045, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 2001.3929443359375, + "epoch": 0.392, + "grad_norm": 0.1773170828819275, + "kl": 0.0053253173828125, + "learning_rate": 7.86516935181955e-07, + "loss": 0.0057, + "num_tokens": 59493466.0, + "reward": 3.2287521362304688, + "reward_std": 1.6988569498062134, + "rewards/accuracy_reward": 0.1160714291036129, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.3626805543899536, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 1753.732177734375, + "epoch": 0.3936, + "grad_norm": 0.20985598862171173, + "kl": 0.0057220458984375, + "learning_rate": 7.84373215841981e-07, + "loss": -0.0106, + "num_tokens": 59707328.0, + "reward": 3.353125810623169, + "reward_std": 1.3706098794937134, + "rewards/accuracy_reward": 0.25, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.397768259048462, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 1925.1250610351562, + "epoch": 0.3952, + "grad_norm": 0.1996774673461914, + "kl": 0.0045623779296875, + "learning_rate": 7.822221727443958e-07, + "loss": -0.0062, + "num_tokens": 59942612.0, + "reward": 2.858483910560608, + "reward_std": 1.3655762076377869, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.0549123287200928, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 1898.2947387695312, + "epoch": 0.3968, + "grad_norm": 0.18544811010360718, + "kl": 0.0049591064453125, + "learning_rate": 7.800638731055755e-07, + "loss": -0.0118, + "num_tokens": 60175507.0, + "reward": 3.3319300413131714, + "reward_std": 1.1426488161087036, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.3855013847351074, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 1991.294677734375, + "epoch": 0.3984, + "grad_norm": 0.17491687834262848, + "kl": 0.0047760009765625, + "learning_rate": 7.778983843686506e-07, + "loss": 0.0088, + "num_tokens": 60418734.0, + "reward": 3.134061813354492, + "reward_std": 1.3991496562957764, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.285847306251526, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 1918.40185546875, + "epoch": 0.4, + "grad_norm": 0.20697081089019775, + "kl": 0.0049285888671875, + "learning_rate": 7.757257742013988e-07, + "loss": -0.0022, + "num_tokens": 60658375.0, + "reward": 3.238408327102661, + "reward_std": 1.4190102219581604, + "rewards/accuracy_reward": 0.2410714253783226, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.2026939392089844, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.6964721679688, + "epoch": 0.4016, + "grad_norm": 0.18666158616542816, + "kl": 0.0048675537109375, + "learning_rate": 7.735461104941297e-07, + "loss": 0.0042, + "num_tokens": 60900723.0, + "reward": 2.711184501647949, + "reward_std": 1.3956908583641052, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 1.961184322834015, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.607177734375, + "epoch": 0.4032, + "grad_norm": 0.21047449111938477, + "kl": 0.00555419921875, + "learning_rate": 7.71359461357564e-07, + "loss": 0.0033, + "num_tokens": 61148577.0, + "reward": 3.0383453369140625, + "reward_std": 1.5368072390556335, + "rewards/accuracy_reward": 0.15178572107106447, + "rewards/format_reward": 0.6607142686843872, + "rewards/tag_count_reward": 2.26155948638916, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.6607666015625, + "epoch": 0.4048, + "grad_norm": 0.17982521653175354, + "kl": 0.004364013671875, + "learning_rate": 7.691658951207056e-07, + "loss": 0.0033, + "num_tokens": 61391019.0, + "reward": 3.016200542449951, + "reward_std": 1.3868630528450012, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.1947717666625977, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 1786.2678833007812, + "epoch": 0.4064, + "grad_norm": 0.21953041851520538, + "kl": 0.0062713623046875, + "learning_rate": 7.66965480328705e-07, + "loss": 0.0136, + "num_tokens": 61615343.0, + "reward": 3.144799590110779, + "reward_std": 1.3934525847434998, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.6428571343421936, + "rewards/tag_count_reward": 2.493013858795166, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 1825.5625610351562, + "epoch": 0.408, + "grad_norm": 0.20141878724098206, + "kl": 0.0050048828125, + "learning_rate": 7.647582857407184e-07, + "loss": -0.0061, + "num_tokens": 61839028.0, + "reward": 3.120089054107666, + "reward_std": 1.3458682298660278, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.218303143978119, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.3839721679688, + "epoch": 0.4096, + "grad_norm": 0.17724959552288055, + "kl": 0.004669189453125, + "learning_rate": 7.625443803277591e-07, + "loss": -0.0125, + "num_tokens": 62073417.0, + "reward": 3.2354369163513184, + "reward_std": 1.2268921732902527, + "rewards/accuracy_reward": 0.2053571417927742, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.2086511850357056, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 1973.1876220703125, + "epoch": 0.4112, + "grad_norm": 0.1881449818611145, + "kl": 0.0050506591796875, + "learning_rate": 7.603238332705419e-07, + "loss": 0.0155, + "num_tokens": 62315764.0, + "reward": 3.2619768381118774, + "reward_std": 1.5180407166481018, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.494119644165039, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 1789.9107666015625, + "epoch": 0.4128, + "grad_norm": 0.21762605011463165, + "kl": 0.005218505859375, + "learning_rate": 7.580967139573219e-07, + "loss": -0.0074, + "num_tokens": 62534252.0, + "reward": 3.1847121715545654, + "reward_std": 1.3734914064407349, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 2.3275691270828247, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.0535888671875, + "epoch": 0.4144, + "grad_norm": 0.18776056170463562, + "kl": 0.0049285888671875, + "learning_rate": 7.558630919817251e-07, + "loss": 0.0047, + "num_tokens": 62778558.0, + "reward": 3.1045215129852295, + "reward_std": 1.33121919631958, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.2205928564071655, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 1913.5625610351562, + "epoch": 0.416, + "grad_norm": 0.20927421748638153, + "kl": 0.0044708251953125, + "learning_rate": 7.536230371405751e-07, + "loss": -0.0124, + "num_tokens": 63013639.0, + "reward": 2.858312249183655, + "reward_std": 1.2387158274650574, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.04581218957901, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 2041.7857666015625, + "epoch": 0.4176, + "grad_norm": 0.18277142941951752, + "kl": 0.0047454833984375, + "learning_rate": 7.513766194317109e-07, + "loss": 0.0022, + "num_tokens": 63264271.0, + "reward": 2.95268452167511, + "reward_std": 1.5351403951644897, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.1669700145721436, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.8660888671875, + "epoch": 0.4192, + "grad_norm": 0.1891564577817917, + "kl": 0.004669189453125, + "learning_rate": 7.491239090518006e-07, + "loss": -0.0004, + "num_tokens": 63516662.0, + "reward": 2.4479382038116455, + "reward_std": 1.3744735717773438, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 1.724723994731903, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.9732666015625, + "epoch": 0.4208, + "grad_norm": 0.18977634608745575, + "kl": 0.0051727294921875, + "learning_rate": 7.468649763941472e-07, + "loss": 0.0065, + "num_tokens": 63766461.0, + "reward": 2.955631136894226, + "reward_std": 1.3882953524589539, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.205631136894226, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 1951.9732666015625, + "epoch": 0.4224, + "grad_norm": 0.2063266783952713, + "kl": 0.004364013671875, + "learning_rate": 7.445998920464889e-07, + "loss": 0.0089, + "num_tokens": 64011500.0, + "reward": 2.715405225753784, + "reward_std": 1.4910256266593933, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 1.876119613647461, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 1953.3750610351562, + "epoch": 0.424, + "grad_norm": 0.19136229157447815, + "kl": 0.0047760009765625, + "learning_rate": 7.423287267887941e-07, + "loss": 0.0019, + "num_tokens": 64256192.0, + "reward": 2.7753779888153076, + "reward_std": 1.3352714776992798, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 2.034306526184082, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 1903.5982666015625, + "epoch": 0.4256, + "grad_norm": 0.20534822344779968, + "kl": 0.00460052490234375, + "learning_rate": 7.400515515910485e-07, + "loss": -0.0113, + "num_tokens": 64490031.0, + "reward": 2.9298150539398193, + "reward_std": 1.1629910469055176, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.857142835855484, + "rewards/tag_count_reward": 2.072671890258789, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 1854.08935546875, + "epoch": 0.4272, + "grad_norm": 0.19483095407485962, + "kl": 0.004852294921875, + "learning_rate": 7.377684376110383e-07, + "loss": 0.0203, + "num_tokens": 64715945.0, + "reward": 2.997569441795349, + "reward_std": 1.3354122042655945, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.2297122478485107, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.8482666015625, + "epoch": 0.4288, + "grad_norm": 0.1912766695022583, + "kl": 0.0049285888671875, + "learning_rate": 7.354794561921263e-07, + "loss": 0.0027, + "num_tokens": 64963854.0, + "reward": 2.9332973957061768, + "reward_std": 1.3259008526802063, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.1832971572875977, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 1975.8482666015625, + "epoch": 0.4304, + "grad_norm": 0.1883944869041443, + "kl": 0.0047607421875, + "learning_rate": 7.331846788610231e-07, + "loss": 0.0228, + "num_tokens": 65207017.0, + "reward": 2.7274930477142334, + "reward_std": 1.5612693428993225, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.039992928504944, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.4286499023438, + "epoch": 0.432, + "grad_norm": 0.1842639446258545, + "kl": 0.00439453125, + "learning_rate": 7.308841773255508e-07, + "loss": -0.0004, + "num_tokens": 65449573.0, + "reward": 2.9219133853912354, + "reward_std": 1.3566462397575378, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.1183416843414307, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 2028.6697387695312, + "epoch": 0.4336, + "grad_norm": 0.19256386160850525, + "kl": 0.005096435546875, + "learning_rate": 7.285780234724035e-07, + "loss": 0.005, + "num_tokens": 65697644.0, + "reward": 3.0199010372161865, + "reward_std": 1.514003574848175, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.2163294553756714, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 1952.1250610351562, + "epoch": 0.4352, + "grad_norm": 0.20819565653800964, + "kl": 0.0045623779296875, + "learning_rate": 7.262662893649e-07, + "loss": 0.0039, + "num_tokens": 65938066.0, + "reward": 2.9495275020599365, + "reward_std": 1.4245181679725647, + "rewards/accuracy_reward": 0.2232142835855484, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.0030986070632935, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 1890.3126220703125, + "epoch": 0.4368, + "grad_norm": 0.19455422461032867, + "kl": 0.0044708251953125, + "learning_rate": 7.239490472407331e-07, + "loss": 0.0096, + "num_tokens": 66169675.0, + "reward": 2.9417855739593506, + "reward_std": 1.290075957775116, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 2.004285454750061, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 1893.134033203125, + "epoch": 0.4384, + "grad_norm": 0.18601737916469574, + "kl": 0.004547119140625, + "learning_rate": 7.216263695097109e-07, + "loss": -0.0101, + "num_tokens": 66402370.0, + "reward": 3.2832823991775513, + "reward_std": 1.3122333884239197, + "rewards/accuracy_reward": 0.2678571492433548, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.2832823991775513, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 2004.3482666015625, + "epoch": 0.44, + "grad_norm": 0.17347562313079834, + "kl": 0.004364013671875, + "learning_rate": 7.19298328751495e-07, + "loss": -0.0019, + "num_tokens": 66648529.0, + "reward": 2.797078490257263, + "reward_std": 1.4458576440811157, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 1.9310069680213928, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 1965.7589721679688, + "epoch": 0.4416, + "grad_norm": 0.1826094537973404, + "kl": 0.0049591064453125, + "learning_rate": 7.169649977133327e-07, + "loss": -0.0058, + "num_tokens": 66885928.0, + "reward": 3.0118287801742554, + "reward_std": 1.4323386549949646, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.226114511489868, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.1160888671875, + "epoch": 0.4432, + "grad_norm": 0.1898445188999176, + "kl": 0.0051727294921875, + "learning_rate": 7.146264493077826e-07, + "loss": 0.0149, + "num_tokens": 67128925.0, + "reward": 3.2442198991775513, + "reward_std": 1.4192506074905396, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.3781481981277466, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.6875610351562, + "epoch": 0.4448, + "grad_norm": 0.19760717451572418, + "kl": 0.005157470703125, + "learning_rate": 7.122827566104379e-07, + "loss": 0.003, + "num_tokens": 67376802.0, + "reward": 3.028987407684326, + "reward_std": 1.3204010128974915, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.2254159450531006, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.419677734375, + "epoch": 0.4464, + "grad_norm": 0.18276847898960114, + "kl": 0.0048675537109375, + "learning_rate": 7.099339928576415e-07, + "loss": 0.0042, + "num_tokens": 67619581.0, + "reward": 3.0182182788848877, + "reward_std": 1.3150616884231567, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.669642835855484, + "rewards/tag_count_reward": 2.2950039505958557, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.607177734375, + "epoch": 0.448, + "grad_norm": 0.20669980347156525, + "kl": 0.005096435546875, + "learning_rate": 7.075802314441982e-07, + "loss": 0.0016, + "num_tokens": 67867337.0, + "reward": 2.7381601333618164, + "reward_std": 1.5319562554359436, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.669642835855484, + "rewards/tag_count_reward": 1.9703031182289124, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 2032.6428833007812, + "epoch": 0.4496, + "grad_norm": 0.19768260419368744, + "kl": 0.005096435546875, + "learning_rate": 7.052215459210809e-07, + "loss": 0.0033, + "num_tokens": 68113123.0, + "reward": 2.8451104164123535, + "reward_std": 1.4076507687568665, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.148681879043579, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 1914.2411499023438, + "epoch": 0.4512, + "grad_norm": 0.1901230365037918, + "kl": 0.00555419921875, + "learning_rate": 7.028580099931326e-07, + "loss": 0.0136, + "num_tokens": 68344066.0, + "reward": 3.609326958656311, + "reward_std": 1.4263173341751099, + "rewards/accuracy_reward": 0.1339285671710968, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.707540988922119, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 1978.7500610351562, + "epoch": 0.4528, + "grad_norm": 0.1953936070203781, + "kl": 0.0048370361328125, + "learning_rate": 7.00489697516763e-07, + "loss": -0.0015, + "num_tokens": 68589122.0, + "reward": 3.355332851409912, + "reward_std": 1.3932841420173645, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 2.5517613887786865, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 1884.9286499023438, + "epoch": 0.4544, + "grad_norm": 0.18990421295166016, + "kl": 0.0046844482421875, + "learning_rate": 6.981166824976403e-07, + "loss": -0.0041, + "num_tokens": 68818168.0, + "reward": 2.937221050262451, + "reward_std": 1.3947793245315552, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.0979349613189697, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 1857.7500610351562, + "epoch": 0.456, + "grad_norm": 0.22904610633850098, + "kl": 0.0051116943359375, + "learning_rate": 6.957390390883796e-07, + "loss": -0.014, + "num_tokens": 69046060.0, + "reward": 2.7895983457565308, + "reward_std": 1.1918574571609497, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.066383957862854, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 1997.571533203125, + "epoch": 0.4576, + "grad_norm": 0.1852007955312729, + "kl": 0.00440216064453125, + "learning_rate": 6.933568415862251e-07, + "loss": -0.0089, + "num_tokens": 69288520.0, + "reward": 2.7777276039123535, + "reward_std": 1.619453251361847, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.045584559440613, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.9464721679688, + "epoch": 0.4592, + "grad_norm": 0.1999320238828659, + "kl": 0.0054168701171875, + "learning_rate": 6.909701644307282e-07, + "loss": 0.0052, + "num_tokens": 69537308.0, + "reward": 3.050122618675232, + "reward_std": 1.5349740386009216, + "rewards/accuracy_reward": 0.1339285708963871, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.201908230781555, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.9107666015625, + "epoch": 0.4608, + "grad_norm": 0.20256422460079193, + "kl": 0.005096435546875, + "learning_rate": 6.885790822014218e-07, + "loss": -0.0001, + "num_tokens": 69788234.0, + "reward": 2.753170967102051, + "reward_std": 1.4355602264404297, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.0299564003944397, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.4822387695312, + "epoch": 0.4624, + "grad_norm": 0.18857809901237488, + "kl": 0.0048828125, + "learning_rate": 6.8618366961549e-07, + "loss": 0.0139, + "num_tokens": 70029256.0, + "reward": 3.153316378593445, + "reward_std": 1.298505187034607, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.3140305280685425, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.15185546875, + "epoch": 0.464, + "grad_norm": 0.19147846102714539, + "kl": 0.0053253173828125, + "learning_rate": 6.837840015254328e-07, + "loss": 0.0034, + "num_tokens": 70270423.0, + "reward": 3.2323691844940186, + "reward_std": 1.3145222663879395, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.3484405279159546, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.9910888671875, + "epoch": 0.4656, + "grad_norm": 0.20279374718666077, + "kl": 0.004791259765625, + "learning_rate": 6.813801529167275e-07, + "loss": 0.0002, + "num_tokens": 70523920.0, + "reward": 3.0314860343933105, + "reward_std": 1.2722330689430237, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.1386287212371826, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.4672, + "grad_norm": 0.18420623242855072, + "kl": 0.004608154296875, + "learning_rate": 6.789721989054851e-07, + "loss": 0.0002, + "num_tokens": 70772364.0, + "reward": 2.875911593437195, + "reward_std": 1.4389634728431702, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.108054280281067, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 1888.52685546875, + "epoch": 0.4688, + "grad_norm": 0.18741582334041595, + "kl": 0.0048675537109375, + "learning_rate": 6.765602147361037e-07, + "loss": 0.0261, + "num_tokens": 71003073.0, + "reward": 3.3042229413986206, + "reward_std": 1.371212661266327, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.38457989692688, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.8929443359375, + "epoch": 0.4704, + "grad_norm": 0.19396907091140747, + "kl": 0.004974365234375, + "learning_rate": 6.741442757789169e-07, + "loss": 0.0048, + "num_tokens": 71253661.0, + "reward": 2.9303698539733887, + "reward_std": 1.336452841758728, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.1892982721328735, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 2028.1428833007812, + "epoch": 0.472, + "grad_norm": 0.19250023365020752, + "kl": 0.0051727294921875, + "learning_rate": 6.717244575278381e-07, + "loss": 0.0006, + "num_tokens": 71500973.0, + "reward": 3.070032835006714, + "reward_std": 1.3798995614051819, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.2843183279037476, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.4736, + "grad_norm": 0.18445788323879242, + "kl": 0.0049896240234375, + "learning_rate": 6.693008355980021e-07, + "loss": 0.0002, + "num_tokens": 71752217.0, + "reward": 2.840848922729492, + "reward_std": 1.2793698906898499, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.1176347732543945, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 2028.4287109375, + "epoch": 0.4752, + "grad_norm": 0.18938905000686646, + "kl": 0.0053863525390625, + "learning_rate": 6.668734857234025e-07, + "loss": 0.009, + "num_tokens": 71997475.0, + "reward": 2.662877917289734, + "reward_std": 1.3100085258483887, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 1.895020604133606, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 2006.9732666015625, + "epoch": 0.4768, + "grad_norm": 0.20618362724781036, + "kl": 0.0059356689453125, + "learning_rate": 6.644424837545243e-07, + "loss": 0.0135, + "num_tokens": 72242010.0, + "reward": 3.0030598640441895, + "reward_std": 1.3937220573425293, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.226273775100708, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 1986.7501220703125, + "epoch": 0.4784, + "grad_norm": 0.18999327719211578, + "kl": 0.005279541015625, + "learning_rate": 6.620079056559743e-07, + "loss": 0.0147, + "num_tokens": 72481970.0, + "reward": 3.1716312170028687, + "reward_std": 1.4614059329032898, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.305559754371643, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 1913.9732666015625, + "epoch": 0.48, + "grad_norm": 0.1882185935974121, + "kl": 0.0052337646484375, + "learning_rate": 6.595698275041069e-07, + "loss": -0.0017, + "num_tokens": 72716215.0, + "reward": 3.0739006996154785, + "reward_std": 1.2861343622207642, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.3239004611968994, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.4732666015625, + "epoch": 0.4816, + "grad_norm": 0.18030355870723724, + "kl": 0.00439453125, + "learning_rate": 6.571283254846476e-07, + "loss": 0.0037, + "num_tokens": 72963830.0, + "reward": 2.936666250228882, + "reward_std": 1.4350295066833496, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.1241660714149475, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 1945.2858276367188, + "epoch": 0.4832, + "grad_norm": 0.19212894141674042, + "kl": 0.005035400390625, + "learning_rate": 6.546834758903114e-07, + "loss": -0.0001, + "num_tokens": 73201120.0, + "reward": 3.288641095161438, + "reward_std": 1.5466267466545105, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.3689982891082764, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 1877.7233276367188, + "epoch": 0.4848, + "grad_norm": 0.19531312584877014, + "kl": 0.0054168701171875, + "learning_rate": 6.522353551184193e-07, + "loss": 0.0275, + "num_tokens": 73428505.0, + "reward": 3.085123062133789, + "reward_std": 1.3863674402236938, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.361908793449402, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 2030.9464721679688, + "epoch": 0.4864, + "grad_norm": 0.19703370332717896, + "kl": 0.0056304931640625, + "learning_rate": 6.497840396685111e-07, + "loss": 0.0042, + "num_tokens": 73675123.0, + "reward": 2.919885993003845, + "reward_std": 1.5591852068901062, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.6517857015132904, + "rewards/tag_count_reward": 2.2234573364257812, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 2033.4375610351562, + "epoch": 0.488, + "grad_norm": 0.21490877866744995, + "kl": 0.005126953125, + "learning_rate": 6.47329606139955e-07, + "loss": 0.0069, + "num_tokens": 73919612.0, + "reward": 2.9240970611572266, + "reward_std": 1.3899248838424683, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.0848113298416138, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 1958.9911499023438, + "epoch": 0.4896, + "grad_norm": 0.18843451142311096, + "kl": 0.00537109375, + "learning_rate": 6.44872131229553e-07, + "loss": 0.0092, + "num_tokens": 74157191.0, + "reward": 3.271487832069397, + "reward_std": 1.470854640007019, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.4857733249664307, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 2029.6786499023438, + "epoch": 0.4912, + "grad_norm": 0.17970992624759674, + "kl": 0.0047760009765625, + "learning_rate": 6.424116917291458e-07, + "loss": 0.0132, + "num_tokens": 74409561.0, + "reward": 2.549420475959778, + "reward_std": 1.490764856338501, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6339285969734192, + "rewards/tag_count_reward": 1.915491759777069, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.982177734375, + "epoch": 0.4928, + "grad_norm": 0.1873333752155304, + "kl": 0.0049896240234375, + "learning_rate": 6.399483645232118e-07, + "loss": 0.0003, + "num_tokens": 74660551.0, + "reward": 2.6970221996307373, + "reward_std": 1.522648274898529, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6785714030265808, + "rewards/tag_count_reward": 2.018450677394867, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 1913.4911499023438, + "epoch": 0.4944, + "grad_norm": 0.18798880279064178, + "kl": 0.0055084228515625, + "learning_rate": 6.374822265864659e-07, + "loss": -0.0104, + "num_tokens": 74898004.0, + "reward": 3.126481056213379, + "reward_std": 1.5098605751991272, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.403266668319702, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 1852.3036499023438, + "epoch": 0.496, + "grad_norm": 0.24387024343013763, + "kl": 0.0054473876953125, + "learning_rate": 6.350133549814527e-07, + "loss": -0.0107, + "num_tokens": 75127400.0, + "reward": 3.12575626373291, + "reward_std": 1.5693780779838562, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.268613278865814, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 1951.9554443359375, + "epoch": 0.4976, + "grad_norm": 0.18329599499702454, + "kl": 0.0048828125, + "learning_rate": 6.325418268561396e-07, + "loss": -0.0107, + "num_tokens": 75364093.0, + "reward": 3.2046165466308594, + "reward_std": 1.3450925946235657, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.320688009262085, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 1952.4286499023438, + "epoch": 0.4992, + "grad_norm": 0.18232479691505432, + "kl": 0.0053253173828125, + "learning_rate": 6.300677194415059e-07, + "loss": 0.0019, + "num_tokens": 75602253.0, + "reward": 3.3948432207107544, + "reward_std": 1.5811928510665894, + "rewards/accuracy_reward": 0.20535713899880648, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.4394859075546265, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 1949.2232666015625, + "epoch": 0.5008, + "grad_norm": 0.19040940701961517, + "kl": 0.0051422119140625, + "learning_rate": 6.275911100491285e-07, + "loss": 0.0035, + "num_tokens": 75841174.0, + "reward": 3.078882336616516, + "reward_std": 1.2447201013565063, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.284239411354065, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.6161499023438, + "epoch": 0.5024, + "grad_norm": 0.16620422899723053, + "kl": 0.00482177734375, + "learning_rate": 6.251120760687678e-07, + "loss": 0.0049, + "num_tokens": 76086635.0, + "reward": 3.2574245929718018, + "reward_std": 1.330361247062683, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.3645671606063843, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 2007.571533203125, + "epoch": 0.504, + "grad_norm": 0.20670749247074127, + "kl": 0.0050201416015625, + "learning_rate": 6.226306949659474e-07, + "loss": 0.0101, + "num_tokens": 76329431.0, + "reward": 2.799281120300293, + "reward_std": 1.5193068981170654, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.0135666728019714, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 1888.9017944335938, + "epoch": 0.5056, + "grad_norm": 0.1993604153394699, + "kl": 0.0051116943359375, + "learning_rate": 6.201470442795351e-07, + "loss": 0.0142, + "num_tokens": 76559804.0, + "reward": 2.8636410236358643, + "reward_std": 1.4456069469451904, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.185069501399994, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 1960.5001220703125, + "epoch": 0.5072, + "grad_norm": 0.197881817817688, + "kl": 0.005126953125, + "learning_rate": 6.176612016193189e-07, + "loss": 0.0031, + "num_tokens": 76799764.0, + "reward": 3.3995840549468994, + "reward_std": 1.2997100949287415, + "rewards/accuracy_reward": 0.14285714365541935, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 2.4263696670532227, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 2026.4911499023438, + "epoch": 0.5088, + "grad_norm": 0.18382194638252258, + "kl": 0.0051422119140625, + "learning_rate": 6.151732446635823e-07, + "loss": 0.0053, + "num_tokens": 77046625.0, + "reward": 3.078157663345337, + "reward_std": 1.566679298877716, + "rewards/accuracy_reward": 0.11607143096625805, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.2299432158470154, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.7679443359375, + "epoch": 0.5104, + "grad_norm": 0.19238810241222382, + "kl": 0.0048980712890625, + "learning_rate": 6.126832511566769e-07, + "loss": 0.0043, + "num_tokens": 77294679.0, + "reward": 2.9504741430282593, + "reward_std": 1.4131128191947937, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.1915454864501953, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.512, + "grad_norm": 0.17691336572170258, + "kl": 0.0048828125, + "learning_rate": 6.101912989065929e-07, + "loss": 0.0002, + "num_tokens": 77540757.0, + "reward": 2.9239248037338257, + "reward_std": 1.4492520689964294, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.20071017742157, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 2041.4553833007812, + "epoch": 0.5136, + "grad_norm": 0.18865837156772614, + "kl": 0.0048675537109375, + "learning_rate": 6.07697465782528e-07, + "loss": 0.001, + "num_tokens": 77792402.0, + "reward": 2.7769583463668823, + "reward_std": 1.3331434726715088, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.053744077682495, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 2006.0714721679688, + "epoch": 0.5152, + "grad_norm": 0.16854676604270935, + "kl": 0.0051116943359375, + "learning_rate": 6.052018297124538e-07, + "loss": -0.0019, + "num_tokens": 78036066.0, + "reward": 3.0360474586486816, + "reward_std": 1.5145050287246704, + "rewards/accuracy_reward": 0.10714285634458065, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.169975757598877, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 1878.8036499023438, + "epoch": 0.5168, + "grad_norm": 0.19097816944122314, + "kl": 0.0054779052734375, + "learning_rate": 6.02704468680681e-07, + "loss": -0.0089, + "num_tokens": 78269298.0, + "reward": 3.450610399246216, + "reward_std": 1.3819818496704102, + "rewards/accuracy_reward": 0.10714285541325808, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.58453905582428, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 1829.8839721679688, + "epoch": 0.5184, + "grad_norm": 0.19469261169433594, + "kl": 0.00506591796875, + "learning_rate": 6.002054607254222e-07, + "loss": 0.0056, + "num_tokens": 78498073.0, + "reward": 3.15816867351532, + "reward_std": 1.3624765276908875, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.229596972465515, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 2031.5803833007812, + "epoch": 0.52, + "grad_norm": 0.19981543719768524, + "kl": 0.00506591796875, + "learning_rate": 5.977048839363537e-07, + "loss": 0.0009, + "num_tokens": 78747422.0, + "reward": 3.059827446937561, + "reward_std": 1.310868740081787, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.2473273277282715, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.446533203125, + "epoch": 0.5216, + "grad_norm": 0.18436074256896973, + "kl": 0.0049896240234375, + "learning_rate": 5.952028164521749e-07, + "loss": 0.0072, + "num_tokens": 78998268.0, + "reward": 2.8247079849243164, + "reward_std": 1.303443193435669, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 1.985422134399414, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 1766.607177734375, + "epoch": 0.5232, + "grad_norm": 0.20071090757846832, + "kl": 0.0051422119140625, + "learning_rate": 5.926993364581669e-07, + "loss": 0.0087, + "num_tokens": 79214804.0, + "reward": 3.298001289367676, + "reward_std": 1.3492382764816284, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 2.414072871208191, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.7857666015625, + "epoch": 0.5248, + "grad_norm": 0.18874508142471313, + "kl": 0.005523681640625, + "learning_rate": 5.901945221837495e-07, + "loss": 0.0093, + "num_tokens": 79456336.0, + "reward": 3.4187530279159546, + "reward_std": 1.5302408337593079, + "rewards/accuracy_reward": 0.1696428544819355, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.4901816844940186, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 1950.3126220703125, + "epoch": 0.5264, + "grad_norm": 0.18537086248397827, + "kl": 0.0046844482421875, + "learning_rate": 5.876884519000364e-07, + "loss": 0.0166, + "num_tokens": 79694133.0, + "reward": 2.9681973457336426, + "reward_std": 1.2943559885025024, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.1378400325775146, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.90185546875, + "epoch": 0.528, + "grad_norm": 0.1989290565252304, + "kl": 0.0050201416015625, + "learning_rate": 5.851812039173892e-07, + "loss": 0.0076, + "num_tokens": 79942986.0, + "reward": 3.084001302719116, + "reward_std": 1.4653407335281372, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.2357869148254395, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.2410888671875, + "epoch": 0.5296, + "grad_norm": 0.18793053925037384, + "kl": 0.0048828125, + "learning_rate": 5.826728565829707e-07, + "loss": 0.0017, + "num_tokens": 80193697.0, + "reward": 2.8564494848251343, + "reward_std": 1.3955742120742798, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.052877724170685, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.5312, + "grad_norm": 0.19123612344264984, + "kl": 0.004638671875, + "learning_rate": 5.801634882782969e-07, + "loss": 0.0002, + "num_tokens": 80443877.0, + "reward": 2.7979527711868286, + "reward_std": 1.2929344773292542, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.047952651977539, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 2000.7054443359375, + "epoch": 0.5328, + "grad_norm": 0.18771521747112274, + "kl": 0.0055694580078125, + "learning_rate": 5.776531774167865e-07, + "loss": 0.0037, + "num_tokens": 80688998.0, + "reward": 3.183181405067444, + "reward_std": 1.4215174913406372, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.4510385990142822, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completion_length": 1910.794677734375, + "epoch": 0.5344, + "grad_norm": 0.21091251075267792, + "kl": 0.0055694580078125, + "learning_rate": 5.75142002441312e-07, + "loss": 0.0276, + "num_tokens": 80926359.0, + "reward": 3.2268385887145996, + "reward_std": 1.3238035440444946, + "rewards/accuracy_reward": 0.24107143096625805, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.1821956038475037, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 1863.1161499023438, + "epoch": 0.536, + "grad_norm": 0.17465730011463165, + "kl": 0.0053253173828125, + "learning_rate": 5.726300418217483e-07, + "loss": 0.0212, + "num_tokens": 81157932.0, + "reward": 3.6087775230407715, + "reward_std": 1.1862934827804565, + "rewards/accuracy_reward": 0.2767857015132904, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 2.5284202098846436, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.2589721679688, + "epoch": 0.5376, + "grad_norm": 0.18666903674602509, + "kl": 0.0054931640625, + "learning_rate": 5.701173740525197e-07, + "loss": 0.0048, + "num_tokens": 81403857.0, + "reward": 3.0205955505371094, + "reward_std": 1.5118699669837952, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.2259525060653687, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 1944.58935546875, + "epoch": 0.5392, + "grad_norm": 0.19781051576137543, + "kl": 0.005645751953125, + "learning_rate": 5.676040776501478e-07, + "loss": 0.0114, + "num_tokens": 81638717.0, + "reward": 3.1699211597442627, + "reward_std": 1.5526323914527893, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.6607142686843872, + "rewards/tag_count_reward": 2.321706771850586, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 1809.0358276367188, + "epoch": 0.5408, + "grad_norm": 0.20321589708328247, + "kl": 0.0058135986328125, + "learning_rate": 5.650902311507984e-07, + "loss": 0.0112, + "num_tokens": 81862917.0, + "reward": 3.536224842071533, + "reward_std": 1.3557929396629333, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.6433675289154053, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.02685546875, + "epoch": 0.5424, + "grad_norm": 0.18814384937286377, + "kl": 0.004852294921875, + "learning_rate": 5.625759131078261e-07, + "loss": -0.0036, + "num_tokens": 82111210.0, + "reward": 3.050404191017151, + "reward_std": 1.249034583568573, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.273618221282959, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 1893.5982666015625, + "epoch": 0.544, + "grad_norm": 0.20067259669303894, + "kl": 0.005462646484375, + "learning_rate": 5.600612020893212e-07, + "loss": 0.0183, + "num_tokens": 82342725.0, + "reward": 3.4213311672210693, + "reward_std": 1.4960218667984009, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.705357164144516, + "rewards/tag_count_reward": 2.54633104801178, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 1961.21435546875, + "epoch": 0.5456, + "grad_norm": 0.20519143342971802, + "kl": 0.0054931640625, + "learning_rate": 5.575461766756536e-07, + "loss": -0.0017, + "num_tokens": 82583787.0, + "reward": 3.49313223361969, + "reward_std": 1.3561880588531494, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.68063223361969, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 1912.4732666015625, + "epoch": 0.5472, + "grad_norm": 0.1899724006652832, + "kl": 0.0048828125, + "learning_rate": 5.55030915457017e-07, + "loss": 0.0034, + "num_tokens": 82817850.0, + "reward": 3.0055044889450073, + "reward_std": 1.379512369632721, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.2197899222373962, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 1802.5536499023438, + "epoch": 0.5488, + "grad_norm": 0.19226229190826416, + "kl": 0.0052947998046875, + "learning_rate": 5.525154970309741e-07, + "loss": -0.008, + "num_tokens": 83044754.0, + "reward": 3.3275548219680786, + "reward_std": 1.4011380076408386, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.4257689714431763, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 1866.0089721679688, + "epoch": 0.5504, + "grad_norm": 0.20864298939704895, + "kl": 0.004730224609375, + "learning_rate": 5.5e-07, + "loss": 0.0058, + "num_tokens": 83273165.0, + "reward": 2.7546796798706055, + "reward_std": 1.5095359086990356, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.669642835855484, + "rewards/tag_count_reward": 1.9957510232925415, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.4107666015625, + "epoch": 0.552, + "grad_norm": 0.19302670657634735, + "kl": 0.0052032470703125, + "learning_rate": 5.474845029690258e-07, + "loss": 0.0132, + "num_tokens": 83514627.0, + "reward": 3.303720712661743, + "reward_std": 1.44356369972229, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.491220474243164, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.9285888671875, + "epoch": 0.5536, + "grad_norm": 0.17900726199150085, + "kl": 0.005157470703125, + "learning_rate": 5.44969084542983e-07, + "loss": 0.0002, + "num_tokens": 83764001.0, + "reward": 2.812817096710205, + "reward_std": 1.4069485664367676, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.0538885593414307, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.9286499023438, + "epoch": 0.5552, + "grad_norm": 0.20330828428268433, + "kl": 0.0057220458984375, + "learning_rate": 5.424538233243463e-07, + "loss": 0.0057, + "num_tokens": 84011765.0, + "reward": 3.1032931804656982, + "reward_std": 1.4880832433700562, + "rewards/accuracy_reward": 0.12499999813735485, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.2729358673095703, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 1958.8662109375, + "epoch": 0.5568, + "grad_norm": 0.19807647168636322, + "kl": 0.0048065185546875, + "learning_rate": 5.399387979106786e-07, + "loss": 0.0166, + "num_tokens": 84249946.0, + "reward": 3.130396246910095, + "reward_std": 1.5106209516525269, + "rewards/accuracy_reward": 0.13392857275903225, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.273253321647644, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.4464721679688, + "epoch": 0.5584, + "grad_norm": 0.1849880814552307, + "kl": 0.005157470703125, + "learning_rate": 5.374240868921738e-07, + "loss": 0.0001, + "num_tokens": 84497404.0, + "reward": 3.3302862644195557, + "reward_std": 1.2904528975486755, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.4374289512634277, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 1976.5447387695312, + "epoch": 0.56, + "grad_norm": 0.19814656674861908, + "kl": 0.004974365234375, + "learning_rate": 5.349097688492017e-07, + "loss": -0.0038, + "num_tokens": 84743347.0, + "reward": 3.0697375535964966, + "reward_std": 1.537265956401825, + "rewards/accuracy_reward": 0.1874999962747097, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.1054517030715942, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 1977.5803833007812, + "epoch": 0.5616, + "grad_norm": 0.19933153688907623, + "kl": 0.0053253173828125, + "learning_rate": 5.323959223498522e-07, + "loss": 0.0054, + "num_tokens": 84990428.0, + "reward": 2.9231536388397217, + "reward_std": 1.3591064810752869, + "rewards/accuracy_reward": 0.1339285671710968, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.066010534763336, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 1903.6785888671875, + "epoch": 0.5632, + "grad_norm": 0.17919033765792847, + "kl": 0.005035400390625, + "learning_rate": 5.298826259474804e-07, + "loss": 0.0114, + "num_tokens": 85222540.0, + "reward": 3.276139974594116, + "reward_std": 1.4754623770713806, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.4636396169662476, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 2022.7322387695312, + "epoch": 0.5648, + "grad_norm": 0.19957581162452698, + "kl": 0.00482177734375, + "learning_rate": 5.273699581782518e-07, + "loss": -0.0135, + "num_tokens": 85473502.0, + "reward": 2.715932846069336, + "reward_std": 1.4064764380455017, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 1.974861204624176, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 2008.5536499023438, + "epoch": 0.5664, + "grad_norm": 0.19876334071159363, + "kl": 0.005706787109375, + "learning_rate": 5.248579975586878e-07, + "loss": 0.0086, + "num_tokens": 85717612.0, + "reward": 3.3152692317962646, + "reward_std": 1.370304822921753, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.4134833812713623, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.90185546875, + "epoch": 0.568, + "grad_norm": 0.17952960729599, + "kl": 0.0054168701171875, + "learning_rate": 5.223468225832136e-07, + "loss": 0.0046, + "num_tokens": 85965807.0, + "reward": 3.142555832862854, + "reward_std": 1.4479745626449585, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.3300557136535645, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 1901.9376220703125, + "epoch": 0.5696, + "grad_norm": 0.1901482343673706, + "kl": 0.00579833984375, + "learning_rate": 5.198365117217032e-07, + "loss": -0.0138, + "num_tokens": 86199306.0, + "reward": 3.685123085975647, + "reward_std": 1.1870827674865723, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 2.7208372354507446, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.4912109375, + "epoch": 0.5712, + "grad_norm": 0.1687598079442978, + "kl": 0.005126953125, + "learning_rate": 5.173271434170293e-07, + "loss": 0.0137, + "num_tokens": 86441925.0, + "reward": 3.4336557388305664, + "reward_std": 1.4403055310249329, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.549726963043213, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.9554443359375, + "epoch": 0.5728, + "grad_norm": 0.17336024343967438, + "kl": 0.0052947998046875, + "learning_rate": 5.148187960826108e-07, + "loss": 0.0055, + "num_tokens": 86691050.0, + "reward": 3.144088387489319, + "reward_std": 1.316700041294098, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.3673025369644165, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.0178833007812, + "epoch": 0.5744, + "grad_norm": 0.17832538485527039, + "kl": 0.005706787109375, + "learning_rate": 5.123115480999637e-07, + "loss": 0.0051, + "num_tokens": 86937256.0, + "reward": 3.25367534160614, + "reward_std": 1.492504894733429, + "rewards/accuracy_reward": 0.15178571455180645, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.3251036405563354, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.2053833007812, + "epoch": 0.576, + "grad_norm": 0.19991135597229004, + "kl": 0.005889892578125, + "learning_rate": 5.098054778162505e-07, + "loss": -0.0006, + "num_tokens": 87186003.0, + "reward": 2.7348639965057373, + "reward_std": 1.47906494140625, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 1.9848636984825134, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 1969.3660888671875, + "epoch": 0.5776, + "grad_norm": 0.19183167815208435, + "kl": 0.0048370361328125, + "learning_rate": 5.073006635418332e-07, + "loss": 0.0127, + "num_tokens": 87427754.0, + "reward": 3.052048444747925, + "reward_std": 1.4555712342262268, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.2841912508010864, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.482177734375, + "epoch": 0.5792, + "grad_norm": 0.16952449083328247, + "kl": 0.0047454833984375, + "learning_rate": 5.047971835478252e-07, + "loss": 0.0008, + "num_tokens": 87678730.0, + "reward": 2.889155626296997, + "reward_std": 1.4227197766304016, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.1659411191940308, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.83935546875, + "epoch": 0.5808, + "grad_norm": 0.2043970674276352, + "kl": 0.00579833984375, + "learning_rate": 5.022951160636465e-07, + "loss": 0.0016, + "num_tokens": 87929326.0, + "reward": 3.1575881242752075, + "reward_std": 1.3543331027030945, + "rewards/accuracy_reward": 0.08035714644938707, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.362945079803467, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.5824, + "grad_norm": 0.21254591643810272, + "kl": 0.0059814453125, + "learning_rate": 4.997945392745778e-07, + "loss": 0.0002, + "num_tokens": 88180066.0, + "reward": 3.022618293762207, + "reward_std": 1.3408613204956055, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.227975368499756, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 2009.3660888671875, + "epoch": 0.584, + "grad_norm": 0.18245896697044373, + "kl": 0.0052947998046875, + "learning_rate": 4.972955313193189e-07, + "loss": 0.0003, + "num_tokens": 88425275.0, + "reward": 2.8426198959350586, + "reward_std": 1.357367217540741, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.0301197171211243, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.5856, + "grad_norm": 0.19507494568824768, + "kl": 0.005157470703125, + "learning_rate": 4.947981702875461e-07, + "loss": 0.0002, + "num_tokens": 88676547.0, + "reward": 2.9270598888397217, + "reward_std": 1.403258204460144, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.1502740383148193, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.6607666015625, + "epoch": 0.5872, + "grad_norm": 0.19825610518455505, + "kl": 0.0057220458984375, + "learning_rate": 4.923025342174718e-07, + "loss": 0.0104, + "num_tokens": 88915391.0, + "reward": 3.322133183479309, + "reward_std": 1.3307727575302124, + "rewards/accuracy_reward": 0.11607143003493547, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.4471330642700195, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 2042.5089721679688, + "epoch": 0.5888, + "grad_norm": 0.1739693284034729, + "kl": 0.005096435546875, + "learning_rate": 4.898087010934072e-07, + "loss": 0.002, + "num_tokens": 89161974.0, + "reward": 2.956283211708069, + "reward_std": 1.4695819020271301, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.233068823814392, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.3035888671875, + "epoch": 0.5904, + "grad_norm": 0.18526127934455872, + "kl": 0.0055389404296875, + "learning_rate": 4.873167488433231e-07, + "loss": 0.0051, + "num_tokens": 89406532.0, + "reward": 3.020469546318054, + "reward_std": 1.463829517364502, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.279397964477539, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.696533203125, + "epoch": 0.592, + "grad_norm": 0.19311098754405975, + "kl": 0.0049591064453125, + "learning_rate": 4.848267553364177e-07, + "loss": -0.0026, + "num_tokens": 89655250.0, + "reward": 3.496925950050354, + "reward_std": 1.241596281528473, + "rewards/accuracy_reward": 0.1696428544819355, + "rewards/format_reward": 0.8839285671710968, + "rewards/tag_count_reward": 2.4433541297912598, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 1963.857177734375, + "epoch": 0.5936, + "grad_norm": 0.18368864059448242, + "kl": 0.0057830810546875, + "learning_rate": 4.82338798380681e-07, + "loss": 0.0029, + "num_tokens": 89894368.0, + "reward": 3.3844285011291504, + "reward_std": 1.3576760292053223, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.420142650604248, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 1907.7679443359375, + "epoch": 0.5952, + "grad_norm": 0.1654559075832367, + "kl": 0.0048065185546875, + "learning_rate": 4.798529557204649e-07, + "loss": 0.0092, + "num_tokens": 90129556.0, + "reward": 3.4519050121307373, + "reward_std": 1.1426236629486084, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.567976236343384, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.3661499023438, + "epoch": 0.5968, + "grad_norm": 0.1833089143037796, + "kl": 0.0059967041015625, + "learning_rate": 4.773693050340526e-07, + "loss": 0.0052, + "num_tokens": 90374373.0, + "reward": 3.095685362815857, + "reward_std": 1.4921497702598572, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.265328109264374, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 1859.33935546875, + "epoch": 0.5984, + "grad_norm": 0.20234867930412292, + "kl": 0.005828857421875, + "learning_rate": 4.7488792393123223e-07, + "loss": 0.0115, + "num_tokens": 90602457.0, + "reward": 3.434733271598816, + "reward_std": 1.37566739320755, + "rewards/accuracy_reward": 0.12499999813735485, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.550804615020752, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.0625610351562, + "epoch": 0.6, + "grad_norm": 0.2032018005847931, + "kl": 0.0053558349609375, + "learning_rate": 4.724088899508715e-07, + "loss": 0.0073, + "num_tokens": 90850208.0, + "reward": 3.0400646924972534, + "reward_std": 1.4670978784561157, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.245421528816223, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 1991.0804443359375, + "epoch": 0.6016, + "grad_norm": 0.18128786981105804, + "kl": 0.0048980712890625, + "learning_rate": 4.6993228055849423e-07, + "loss": 0.0059, + "num_tokens": 91097891.0, + "reward": 3.3534656763076782, + "reward_std": 1.1807912588119507, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.540965437889099, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 1907.96435546875, + "epoch": 0.6032, + "grad_norm": 0.18298283219337463, + "kl": 0.0053558349609375, + "learning_rate": 4.6745817314386047e-07, + "loss": 0.0183, + "num_tokens": 91330931.0, + "reward": 3.6907273530960083, + "reward_std": 1.1614291667938232, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.6639418601989746, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 1942.40185546875, + "epoch": 0.6048, + "grad_norm": 0.18598800897598267, + "kl": 0.005615234375, + "learning_rate": 4.6498664501854736e-07, + "loss": -0.0038, + "num_tokens": 91566218.0, + "reward": 3.568502426147461, + "reward_std": 1.2938817143440247, + "rewards/accuracy_reward": 0.12499999720603228, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.6220738887786865, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 1922.5714721679688, + "epoch": 0.6064, + "grad_norm": 0.19694586098194122, + "kl": 0.0057830810546875, + "learning_rate": 4.6251777341353425e-07, + "loss": 0.0041, + "num_tokens": 91802714.0, + "reward": 3.8619067668914795, + "reward_std": 1.4072984457015991, + "rewards/accuracy_reward": 0.3303571343421936, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.7190492153167725, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 1971.02685546875, + "epoch": 0.608, + "grad_norm": 0.202091783285141, + "kl": 0.00543212890625, + "learning_rate": 4.6005163547678806e-07, + "loss": 0.0081, + "num_tokens": 92044805.0, + "reward": 3.3209285736083984, + "reward_std": 1.5686394572257996, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.4548569917678833, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 1901.4733276367188, + "epoch": 0.6096, + "grad_norm": 0.21134795248508453, + "kl": 0.00604248046875, + "learning_rate": 4.5758830827085426e-07, + "loss": 0.0044, + "num_tokens": 92280492.0, + "reward": 3.2631337642669678, + "reward_std": 1.5696486830711365, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7142857313156128, + "rewards/tag_count_reward": 2.49527645111084, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 1920.6339721679688, + "epoch": 0.6112, + "grad_norm": 0.19174407422542572, + "kl": 0.0055084228515625, + "learning_rate": 4.5512786877044695e-07, + "loss": 0.0205, + "num_tokens": 92515147.0, + "reward": 3.713483214378357, + "reward_std": 1.3700454831123352, + "rewards/accuracy_reward": 0.2142857164144516, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.713482975959778, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 1940.9733276367188, + "epoch": 0.6128, + "grad_norm": 0.19002830982208252, + "kl": 0.006103515625, + "learning_rate": 4.526703938600449e-07, + "loss": 0.0124, + "num_tokens": 92752164.0, + "reward": 3.104248523712158, + "reward_std": 1.5254728198051453, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6607142984867096, + "rewards/tag_count_reward": 2.381034016609192, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.0089721679688, + "epoch": 0.6144, + "grad_norm": 0.1817457675933838, + "kl": 0.0051727294921875, + "learning_rate": 4.502159603314888e-07, + "loss": 0.0008, + "num_tokens": 92994435.0, + "reward": 3.0157527923583984, + "reward_std": 1.2675994634628296, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.1318241357803345, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 2002.9286499023438, + "epoch": 0.616, + "grad_norm": 0.18610180914402008, + "kl": 0.0056915283203125, + "learning_rate": 4.477646448815806e-07, + "loss": -0.0022, + "num_tokens": 93236711.0, + "reward": 3.168850302696228, + "reward_std": 1.560289978981018, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.3384931087493896, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 2018.7411499023438, + "epoch": 0.6176, + "grad_norm": 0.1825464963912964, + "kl": 0.005645751953125, + "learning_rate": 4.4531652410968866e-07, + "loss": 0.0006, + "num_tokens": 93481346.0, + "reward": 3.2705390453338623, + "reward_std": 1.4516504406929016, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.3955389261245728, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.6192, + "grad_norm": 0.1808074414730072, + "kl": 0.005218505859375, + "learning_rate": 4.4287167451535235e-07, + "loss": 0.0002, + "num_tokens": 93732982.0, + "reward": 2.6982990503311157, + "reward_std": 1.441219985485077, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 1.9572274088859558, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 1833.08935546875, + "epoch": 0.6208, + "grad_norm": 0.19311031699180603, + "kl": 0.0056915283203125, + "learning_rate": 4.404301724958931e-07, + "loss": -0.0014, + "num_tokens": 93955998.0, + "reward": 3.7769627571105957, + "reward_std": 1.1764092445373535, + "rewards/accuracy_reward": 0.2678571417927742, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 2.643033981323242, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.2053833007812, + "epoch": 0.6224, + "grad_norm": 0.1780703216791153, + "kl": 0.005279541015625, + "learning_rate": 4.379920943440256e-07, + "loss": 0.0111, + "num_tokens": 94201441.0, + "reward": 3.425575375556946, + "reward_std": 1.3170424103736877, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.657718300819397, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 2008.9644165039062, + "epoch": 0.624, + "grad_norm": 0.18192249536514282, + "kl": 0.0052337646484375, + "learning_rate": 4.3555751624547577e-07, + "loss": -0.0022, + "num_tokens": 94447039.0, + "reward": 3.2452290058135986, + "reward_std": 1.2611849308013916, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.8214285969734192, + "rewards/tag_count_reward": 2.35237193107605, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 1903.5269165039062, + "epoch": 0.6256, + "grad_norm": 0.20617492496967316, + "kl": 0.0052490234375, + "learning_rate": 4.331265142765974e-07, + "loss": 0.0231, + "num_tokens": 94679918.0, + "reward": 3.7578675746917725, + "reward_std": 1.2577440738677979, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 2.7489389181137085, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 1889.9108276367188, + "epoch": 0.6272, + "grad_norm": 0.19898207485675812, + "kl": 0.0057525634765625, + "learning_rate": 4.306991644019979e-07, + "loss": 0.0062, + "num_tokens": 94911104.0, + "reward": 3.867734670639038, + "reward_std": 1.1987918019294739, + "rewards/accuracy_reward": 0.3214285746216774, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 2.689163088798523, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.33935546875, + "epoch": 0.6288, + "grad_norm": 0.19190049171447754, + "kl": 0.0053863525390625, + "learning_rate": 4.28275542472162e-07, + "loss": 0.0089, + "num_tokens": 95158340.0, + "reward": 2.7501951456069946, + "reward_std": 1.4344849586486816, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.705357164144516, + "rewards/tag_count_reward": 1.9912664890289307, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.6304, + "grad_norm": 0.1949337124824524, + "kl": 0.0055999755859375, + "learning_rate": 4.258557242210831e-07, + "loss": 0.0002, + "num_tokens": 95408464.0, + "reward": 2.951748490333557, + "reward_std": 1.450048565864563, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.2106770277023315, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.571533203125, + "epoch": 0.632, + "grad_norm": 0.17732079327106476, + "kl": 0.005645751953125, + "learning_rate": 4.2343978526389634e-07, + "loss": 0.002, + "num_tokens": 95658960.0, + "reward": 3.057677984237671, + "reward_std": 1.4699004292488098, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.1826778650283813, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.5894165039062, + "epoch": 0.6336, + "grad_norm": 0.1762060523033142, + "kl": 0.00555419921875, + "learning_rate": 4.21027801094515e-07, + "loss": 0.0022, + "num_tokens": 95905986.0, + "reward": 2.8867948055267334, + "reward_std": 1.4828120470046997, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.6785714328289032, + "rewards/tag_count_reward": 2.1992945671081543, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 1912.6428833007812, + "epoch": 0.6352, + "grad_norm": 0.21289189159870148, + "kl": 0.0056915283203125, + "learning_rate": 4.186198470832726e-07, + "loss": -0.019, + "num_tokens": 96141776.0, + "reward": 2.6750375032424927, + "reward_std": 1.3964202404022217, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 1.8982517719268799, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.6368, + "grad_norm": 0.18072207272052765, + "kl": 0.005096435546875, + "learning_rate": 4.1621599847456714e-07, + "loss": 0.0002, + "num_tokens": 96392194.0, + "reward": 2.701108694076538, + "reward_std": 1.343613624572754, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 1.9689656496047974, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.107177734375, + "epoch": 0.6384, + "grad_norm": 0.1985914260149002, + "kl": 0.005584716796875, + "learning_rate": 4.1381633038451003e-07, + "loss": 0.0131, + "num_tokens": 96633818.0, + "reward": 3.357666015625, + "reward_std": 1.2866899371147156, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.4469516277313232, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 2026.2857666015625, + "epoch": 0.64, + "grad_norm": 0.1984076350927353, + "kl": 0.0055389404296875, + "learning_rate": 4.114209177985782e-07, + "loss": 0.0028, + "num_tokens": 96880432.0, + "reward": 3.0977829694747925, + "reward_std": 1.5115153789520264, + "rewards/accuracy_reward": 0.1160714328289032, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.2317113876342773, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 2021.8482666015625, + "epoch": 0.6416, + "grad_norm": 0.19538640975952148, + "kl": 0.0063018798828125, + "learning_rate": 4.0902983556927193e-07, + "loss": 0.0028, + "num_tokens": 97123917.0, + "reward": 3.535492420196533, + "reward_std": 1.3765368461608887, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.7051349878311157, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.821533203125, + "epoch": 0.6432, + "grad_norm": 0.18938350677490234, + "kl": 0.00567626953125, + "learning_rate": 4.066431584137748e-07, + "loss": 0.0087, + "num_tokens": 97366937.0, + "reward": 3.5022947788238525, + "reward_std": 1.5938639044761658, + "rewards/accuracy_reward": 0.2410714328289032, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.5201518535614014, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 1983.65185546875, + "epoch": 0.6448, + "grad_norm": 0.17440277338027954, + "kl": 0.0048980712890625, + "learning_rate": 4.042609609116202e-07, + "loss": -0.0036, + "num_tokens": 97611030.0, + "reward": 3.2224295139312744, + "reward_std": 1.314091444015503, + "rewards/accuracy_reward": 0.14285714365541935, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.2670722007751465, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.3482666015625, + "epoch": 0.6464, + "grad_norm": 0.18399862945079803, + "kl": 0.00531005859375, + "learning_rate": 4.018833175023598e-07, + "loss": 0.0008, + "num_tokens": 97856685.0, + "reward": 3.0022189617156982, + "reward_std": 1.3115220665931702, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.216504454612732, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 2041.884033203125, + "epoch": 0.648, + "grad_norm": 0.21120086312294006, + "kl": 0.0054168701171875, + "learning_rate": 3.9951030248323705e-07, + "loss": 0.0016, + "num_tokens": 98106362.0, + "reward": 2.907158374786377, + "reward_std": 1.3235044479370117, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.103586971759796, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.7410888671875, + "epoch": 0.6496, + "grad_norm": 0.1961909830570221, + "kl": 0.0054779052734375, + "learning_rate": 3.971419900068674e-07, + "loss": 0.0089, + "num_tokens": 98358613.0, + "reward": 3.094893455505371, + "reward_std": 1.3357966542243958, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.3448933362960815, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.134033203125, + "epoch": 0.6512, + "grad_norm": 0.1960398256778717, + "kl": 0.0059661865234375, + "learning_rate": 3.9477845407891906e-07, + "loss": -0.0008, + "num_tokens": 98605070.0, + "reward": 3.150224208831787, + "reward_std": 1.325607717037201, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.373438239097595, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.8839721679688, + "epoch": 0.6528, + "grad_norm": 0.18400336802005768, + "kl": 0.0056915283203125, + "learning_rate": 3.924197685558017e-07, + "loss": 0.0002, + "num_tokens": 98856945.0, + "reward": 2.7869744300842285, + "reward_std": 1.4112587571144104, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.028045892715454, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 1899.4107666015625, + "epoch": 0.6544, + "grad_norm": 0.20461797714233398, + "kl": 0.0059814453125, + "learning_rate": 3.9006600714235846e-07, + "loss": 0.0035, + "num_tokens": 99087263.0, + "reward": 3.763200283050537, + "reward_std": 1.2916709780693054, + "rewards/accuracy_reward": 0.1964285671710968, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.745342969894409, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 2029.3035888671875, + "epoch": 0.656, + "grad_norm": 0.1990826576948166, + "kl": 0.0049591064453125, + "learning_rate": 3.8771724338956214e-07, + "loss": -0.0004, + "num_tokens": 99333053.0, + "reward": 3.092265009880066, + "reward_std": 1.245220124721527, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.1726220846176147, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 1846.482177734375, + "epoch": 0.6576, + "grad_norm": 0.18375076353549957, + "kl": 0.0054779052734375, + "learning_rate": 3.853735506922173e-07, + "loss": -0.0078, + "num_tokens": 99560425.0, + "reward": 3.3849786520004272, + "reward_std": 1.2553666234016418, + "rewards/accuracy_reward": 0.2410714328289032, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.3671212196350098, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 1971.919677734375, + "epoch": 0.6592, + "grad_norm": 0.19765444099903107, + "kl": 0.0057373046875, + "learning_rate": 3.830350022866674e-07, + "loss": 0.014, + "num_tokens": 99800264.0, + "reward": 3.174409031867981, + "reward_std": 1.4391340017318726, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.317266047000885, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.6608, + "grad_norm": 0.19754108786582947, + "kl": 0.0048828125, + "learning_rate": 3.8070167124850495e-07, + "loss": 0.0002, + "num_tokens": 100054882.0, + "reward": 2.805709719657898, + "reward_std": 1.3076648712158203, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.011066734790802, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.3839721679688, + "epoch": 0.6624, + "grad_norm": 0.19234426319599152, + "kl": 0.0057373046875, + "learning_rate": 3.7837363049028904e-07, + "loss": 0.0039, + "num_tokens": 100302809.0, + "reward": 2.9762094020843506, + "reward_std": 1.2921881079673767, + "rewards/accuracy_reward": 0.09821428544819355, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.1637094020843506, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 1946.3572387695312, + "epoch": 0.664, + "grad_norm": 0.19783206284046173, + "kl": 0.00579833984375, + "learning_rate": 3.760509527592669e-07, + "loss": 0.0236, + "num_tokens": 100542627.0, + "reward": 3.2028050422668457, + "reward_std": 1.411239206790924, + "rewards/accuracy_reward": 0.12499999813735485, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.309947967529297, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 1967.3751220703125, + "epoch": 0.6656, + "grad_norm": 0.17683227360248566, + "kl": 0.005096435546875, + "learning_rate": 3.7373371063509976e-07, + "loss": -0.0124, + "num_tokens": 100782237.0, + "reward": 3.2608145475387573, + "reward_std": 1.3170040845870972, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.341171622276306, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 1760.3572387695312, + "epoch": 0.6672, + "grad_norm": 0.20658712089061737, + "kl": 0.005615234375, + "learning_rate": 3.7142197652759654e-07, + "loss": 0.0187, + "num_tokens": 100996743.0, + "reward": 3.7096354961395264, + "reward_std": 1.3730250597000122, + "rewards/accuracy_reward": 0.16964286379516125, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.73642098903656, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 1920.509033203125, + "epoch": 0.6688, + "grad_norm": 0.20206357538700104, + "kl": 0.0053253173828125, + "learning_rate": 3.691158226744492e-07, + "loss": -0.0246, + "num_tokens": 101231174.0, + "reward": 2.9214537143707275, + "reward_std": 1.378883957862854, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.100024998188019, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 1906.2947387695312, + "epoch": 0.6704, + "grad_norm": 0.19553060829639435, + "kl": 0.0052490234375, + "learning_rate": 3.6681532113897695e-07, + "loss": 0.0074, + "num_tokens": 101464055.0, + "reward": 3.1882935762405396, + "reward_std": 1.2863349318504333, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 2.3311504125595093, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 1992.2857666015625, + "epoch": 0.672, + "grad_norm": 0.1883488893508911, + "kl": 0.005401611328125, + "learning_rate": 3.6452054380787366e-07, + "loss": 0.0148, + "num_tokens": 101710823.0, + "reward": 3.3555866479873657, + "reward_std": 1.2577791810035706, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.8660714328289032, + "rewards/tag_count_reward": 2.3734437227249146, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 1868.02685546875, + "epoch": 0.6736, + "grad_norm": 0.18497923016548157, + "kl": 0.0055084228515625, + "learning_rate": 3.622315623889619e-07, + "loss": 0.014, + "num_tokens": 101937360.0, + "reward": 3.4650295972824097, + "reward_std": 1.3894270062446594, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.554314970970154, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 1833.2232666015625, + "epoch": 0.6752, + "grad_norm": 0.19730480015277863, + "kl": 0.0055084228515625, + "learning_rate": 3.5994844840895166e-07, + "loss": -0.0106, + "num_tokens": 102161063.0, + "reward": 3.435041069984436, + "reward_std": 1.3073344230651855, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 2.5421838760375977, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 1971.21435546875, + "epoch": 0.6768, + "grad_norm": 0.20196868479251862, + "kl": 0.0057220458984375, + "learning_rate": 3.5767127321120593e-07, + "loss": 0.0056, + "num_tokens": 102401873.0, + "reward": 3.223639726638794, + "reward_std": 1.4147793650627136, + "rewards/accuracy_reward": 0.07142857182770967, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 2.3486396074295044, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 1962.357177734375, + "epoch": 0.6784, + "grad_norm": 0.19078510999679565, + "kl": 0.0056610107421875, + "learning_rate": 3.5540010795351117e-07, + "loss": 0.0101, + "num_tokens": 102638317.0, + "reward": 3.456148386001587, + "reward_std": 1.3702346682548523, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.491862416267395, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 1771.2857666015625, + "epoch": 0.68, + "grad_norm": 0.20089930295944214, + "kl": 0.006439208984375, + "learning_rate": 3.531350236058528e-07, + "loss": 0.0062, + "num_tokens": 102855825.0, + "reward": 3.773930072784424, + "reward_std": 1.449974775314331, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.836430072784424, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 1889.009033203125, + "epoch": 0.6816, + "grad_norm": 0.20441655814647675, + "kl": 0.0057525634765625, + "learning_rate": 3.5087609094819937e-07, + "loss": 0.0049, + "num_tokens": 103091208.0, + "reward": 3.4335126876831055, + "reward_std": 1.2638221979141235, + "rewards/accuracy_reward": 0.14285713993012905, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.531726837158203, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.6832, + "grad_norm": 0.17981161177158356, + "kl": 0.0051116943359375, + "learning_rate": 3.4862338056828916e-07, + "loss": 0.0002, + "num_tokens": 103338266.0, + "reward": 2.899856925010681, + "reward_std": 1.4434887170791626, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.114142417907715, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 1910.9822387695312, + "epoch": 0.6848, + "grad_norm": 0.19169668853282928, + "kl": 0.00579833984375, + "learning_rate": 3.46376962859425e-07, + "loss": 0.0208, + "num_tokens": 103571406.0, + "reward": 3.6440869569778442, + "reward_std": 1.4199631214141846, + "rewards/accuracy_reward": 0.1785714291036129, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.6887298822402954, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 1928.5536499023438, + "epoch": 0.6864, + "grad_norm": 0.17431218922138214, + "kl": 0.0054473876953125, + "learning_rate": 3.441369080182748e-07, + "loss": -0.0028, + "num_tokens": 103805366.0, + "reward": 3.537634253501892, + "reward_std": 1.377600371837616, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.725134253501892, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.857177734375, + "epoch": 0.688, + "grad_norm": 0.19302013516426086, + "kl": 0.0059967041015625, + "learning_rate": 3.4190328604267815e-07, + "loss": 0.0005, + "num_tokens": 104053766.0, + "reward": 3.1029685735702515, + "reward_std": 1.3621240854263306, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.2279685735702515, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 2001.33935546875, + "epoch": 0.6896, + "grad_norm": 0.21009165048599243, + "kl": 0.0055084228515625, + "learning_rate": 3.396761667294579e-07, + "loss": 0.0159, + "num_tokens": 104298398.0, + "reward": 2.7407108545303345, + "reward_std": 1.4287609457969666, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.0532108545303345, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 2005.3125610351562, + "epoch": 0.6912, + "grad_norm": 0.17791488766670227, + "kl": 0.005706787109375, + "learning_rate": 3.374556196722408e-07, + "loss": 0.0146, + "num_tokens": 104540521.0, + "reward": 3.606230139732361, + "reward_std": 1.3654711246490479, + "rewards/accuracy_reward": 0.1964285671710968, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 2.56158709526062, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 1968.2410888671875, + "epoch": 0.6928, + "grad_norm": 0.19957475364208221, + "kl": 0.005340576171875, + "learning_rate": 3.3524171425928155e-07, + "loss": -0.0008, + "num_tokens": 104778338.0, + "reward": 3.2961137294769287, + "reward_std": 1.2924441695213318, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.4211134910583496, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 1987.1339721679688, + "epoch": 0.6944, + "grad_norm": 0.19773735105991364, + "kl": 0.006011962890625, + "learning_rate": 3.33034519671295e-07, + "loss": 0.0034, + "num_tokens": 105021365.0, + "reward": 3.081148147583008, + "reward_std": 1.504008948802948, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6517857313156128, + "rewards/tag_count_reward": 2.429362416267395, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 1960.0089721679688, + "epoch": 0.696, + "grad_norm": 0.17906151711940765, + "kl": 0.00555419921875, + "learning_rate": 3.3083410487929444e-07, + "loss": 0.0036, + "num_tokens": 105262152.0, + "reward": 3.563599705696106, + "reward_std": 1.3764487504959106, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.688599705696106, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 1925.46435546875, + "epoch": 0.6976, + "grad_norm": 0.19912101328372955, + "kl": 0.006103515625, + "learning_rate": 3.286405386424358e-07, + "loss": 0.0082, + "num_tokens": 105499406.0, + "reward": 3.3798606395721436, + "reward_std": 1.2954837679862976, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.638788938522339, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 2006.21435546875, + "epoch": 0.6992, + "grad_norm": 0.203093484044075, + "kl": 0.006256103515625, + "learning_rate": 3.264538895058705e-07, + "loss": -0.0009, + "num_tokens": 105750940.0, + "reward": 3.043925166130066, + "reward_std": 1.3882110118865967, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.2939250469207764, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 1968.02685546875, + "epoch": 0.7008, + "grad_norm": 0.18831394612789154, + "kl": 0.005523681640625, + "learning_rate": 3.242742257986013e-07, + "loss": 0.0045, + "num_tokens": 105989699.0, + "reward": 3.0606935024261475, + "reward_std": 1.514654517173767, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.194622039794922, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 2007.6519165039062, + "epoch": 0.7024, + "grad_norm": 0.1807248741388321, + "kl": 0.0054168701171875, + "learning_rate": 3.2210161563134955e-07, + "loss": 0.0075, + "num_tokens": 106234450.0, + "reward": 3.2039501667022705, + "reward_std": 1.4991829991340637, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7589285969734192, + "rewards/tag_count_reward": 2.3914501667022705, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 1848.3125610351562, + "epoch": 0.704, + "grad_norm": 0.23598583042621613, + "kl": 0.0062255859375, + "learning_rate": 3.199361268944245e-07, + "loss": -0.0113, + "num_tokens": 106460921.0, + "reward": 3.257811665534973, + "reward_std": 1.347205936908722, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.4095972776412964, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.8036499023438, + "epoch": 0.7056, + "grad_norm": 0.18468347191810608, + "kl": 0.0056915283203125, + "learning_rate": 3.177778272556041e-07, + "loss": 0.0062, + "num_tokens": 106709903.0, + "reward": 3.1498888731002808, + "reward_std": 1.5550615191459656, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.3373886346817017, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.1429443359375, + "epoch": 0.7072, + "grad_norm": 0.18881992995738983, + "kl": 0.0055694580078125, + "learning_rate": 3.1562678415801903e-07, + "loss": 0.0084, + "num_tokens": 106956179.0, + "reward": 3.0269607305526733, + "reward_std": 1.3820400834083557, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.2680320739746094, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 1984.4733276367188, + "epoch": 0.7088, + "grad_norm": 0.18366068601608276, + "kl": 0.0054779052734375, + "learning_rate": 3.134830648180451e-07, + "loss": 0.0083, + "num_tokens": 107197872.0, + "reward": 3.172049641609192, + "reward_std": 1.4098306894302368, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.4309780597686768, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.8572387695312, + "epoch": 0.7104, + "grad_norm": 0.19104242324829102, + "kl": 0.005889892578125, + "learning_rate": 3.113467362232043e-07, + "loss": 0.0056, + "num_tokens": 107445642.0, + "reward": 3.1890716552734375, + "reward_std": 1.4772273898124695, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.349785804748535, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.712, + "grad_norm": 0.17187470197677612, + "kl": 0.0053253173828125, + "learning_rate": 3.092178651300697e-07, + "loss": 0.0002, + "num_tokens": 107693540.0, + "reward": 3.0975565910339355, + "reward_std": 1.1923416256904602, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.267199158668518, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 1956.3929443359375, + "epoch": 0.7136, + "grad_norm": 0.17438076436519623, + "kl": 0.004730224609375, + "learning_rate": 3.0709651806218107e-07, + "loss": 0.0001, + "num_tokens": 107928812.0, + "reward": 3.2405694723129272, + "reward_std": 1.2347821593284607, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 2.2584264278411865, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 1904.8214721679688, + "epoch": 0.7152, + "grad_norm": 0.19764257967472076, + "kl": 0.0061492919921875, + "learning_rate": 3.0498276130796475e-07, + "loss": 0.0019, + "num_tokens": 108164412.0, + "reward": 3.1762431859970093, + "reward_std": 1.3816869258880615, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.417314291000366, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 2024.7500610351562, + "epoch": 0.7168, + "grad_norm": 0.19265976548194885, + "kl": 0.005706787109375, + "learning_rate": 3.0287666091866354e-07, + "loss": -0.0008, + "num_tokens": 108412562.0, + "reward": 3.0607857704162598, + "reward_std": 1.3706170916557312, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.328642964363098, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.7589721679688, + "epoch": 0.7184, + "grad_norm": 0.18051329255104065, + "kl": 0.0052490234375, + "learning_rate": 3.0077828270627183e-07, + "loss": 0.0038, + "num_tokens": 108665067.0, + "reward": 2.940533757209778, + "reward_std": 1.2972338199615479, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.1816052198410034, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.77685546875, + "epoch": 0.72, + "grad_norm": 0.19165199995040894, + "kl": 0.0057830810546875, + "learning_rate": 2.9868769224147896e-07, + "loss": 0.0214, + "num_tokens": 108897274.0, + "reward": 3.634817361831665, + "reward_std": 1.3616573810577393, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.7062456607818604, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 1983.482177734375, + "epoch": 0.7216, + "grad_norm": 0.19042696058750153, + "kl": 0.00604248046875, + "learning_rate": 2.966049548516212e-07, + "loss": 0.0044, + "num_tokens": 109140620.0, + "reward": 3.277406930923462, + "reward_std": 1.4605275988578796, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.447049856185913, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.0803833007812, + "epoch": 0.7232, + "grad_norm": 0.18178941309452057, + "kl": 0.0059051513671875, + "learning_rate": 2.9453013561863956e-07, + "loss": 0.0005, + "num_tokens": 109387505.0, + "reward": 2.934882164001465, + "reward_std": 1.4374279975891113, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.2116678953170776, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.732177734375, + "epoch": 0.7248, + "grad_norm": 0.17115643620491028, + "kl": 0.00531005859375, + "learning_rate": 2.924632993770465e-07, + "loss": 0.0003, + "num_tokens": 109635751.0, + "reward": 3.12574303150177, + "reward_std": 1.2907991409301758, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.2686002254486084, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 1940.4732666015625, + "epoch": 0.7264, + "grad_norm": 0.19064931571483612, + "kl": 0.0053558349609375, + "learning_rate": 2.9040451071189933e-07, + "loss": 0.0122, + "num_tokens": 109872810.0, + "reward": 3.0217565298080444, + "reward_std": 1.2835578322410583, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.2092564702033997, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 1847.1964721679688, + "epoch": 0.728, + "grad_norm": 0.19782426953315735, + "kl": 0.0058441162109375, + "learning_rate": 2.8835383395678326e-07, + "loss": -0.0017, + "num_tokens": 110099170.0, + "reward": 3.434313178062439, + "reward_std": 1.3075591921806335, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.5771701335906982, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 1918.7947387695312, + "epoch": 0.7296, + "grad_norm": 0.2034033089876175, + "kl": 0.006103515625, + "learning_rate": 2.8631133319179943e-07, + "loss": 0.0043, + "num_tokens": 110333423.0, + "reward": 3.4216816425323486, + "reward_std": 1.4223222732543945, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.6716814041137695, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 1990.5178833007812, + "epoch": 0.7312, + "grad_norm": 0.19291247427463531, + "kl": 0.0052947998046875, + "learning_rate": 2.842770722415645e-07, + "loss": 0.012, + "num_tokens": 110580091.0, + "reward": 3.3236422538757324, + "reward_std": 1.3450376391410828, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.8482142984867096, + "rewards/tag_count_reward": 2.359356462955475, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 1982.2769165039062, + "epoch": 0.7328, + "grad_norm": 0.18507017195224762, + "kl": 0.0050811767578125, + "learning_rate": 2.8225111467321405e-07, + "loss": -0.0025, + "num_tokens": 110821552.0, + "reward": 3.4004461765289307, + "reward_std": 1.356360137462616, + "rewards/accuracy_reward": 0.2321428582072258, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.3825888633728027, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.7344, + "grad_norm": 0.19775927066802979, + "kl": 0.00543212890625, + "learning_rate": 2.8023352379441847e-07, + "loss": 0.0002, + "num_tokens": 111073062.0, + "reward": 2.8134649991989136, + "reward_std": 1.2074169516563416, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.0188220739364624, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.1339721679688, + "epoch": 0.736, + "grad_norm": 0.20942422747612, + "kl": 0.005584716796875, + "learning_rate": 2.7822436265140244e-07, + "loss": 0.0206, + "num_tokens": 111315753.0, + "reward": 2.8364397287368774, + "reward_std": 1.3892544507980347, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142686843872, + "rewards/tag_count_reward": 2.113225221633911, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 1904.3839721679688, + "epoch": 0.7376, + "grad_norm": 0.20766523480415344, + "kl": 0.0059967041015625, + "learning_rate": 2.7622369402697676e-07, + "loss": 0.0021, + "num_tokens": 111549974.0, + "reward": 3.097164511680603, + "reward_std": 1.412791669368744, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.3293073177337646, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 1901.5179443359375, + "epoch": 0.7392, + "grad_norm": 0.1890152096748352, + "kl": 0.0060882568359375, + "learning_rate": 2.742315804385757e-07, + "loss": 0.0209, + "num_tokens": 111779212.0, + "reward": 3.520388603210449, + "reward_std": 1.1873919367790222, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 2.64538836479187, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 1916.8125610351562, + "epoch": 0.7408, + "grad_norm": 0.22606436908245087, + "kl": 0.0057373046875, + "learning_rate": 2.7224808413630285e-07, + "loss": 0.0239, + "num_tokens": 112016281.0, + "reward": 2.9023056030273438, + "reward_std": 1.4708049893379211, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.2148057222366333, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 1950.7501220703125, + "epoch": 0.7424, + "grad_norm": 0.1955224871635437, + "kl": 0.005279541015625, + "learning_rate": 2.702732671009873e-07, + "loss": -0.003, + "num_tokens": 112257347.0, + "reward": 2.693434000015259, + "reward_std": 1.4321749210357666, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 1.9166482090950012, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 2016.2679443359375, + "epoch": 0.744, + "grad_norm": 0.18564175069332123, + "kl": 0.0055694580078125, + "learning_rate": 2.683071910422453e-07, + "loss": 0.0048, + "num_tokens": 112503805.0, + "reward": 2.9958345890045166, + "reward_std": 1.4481412172317505, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.1744059324264526, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.77685546875, + "epoch": 0.7456, + "grad_norm": 0.20268253982067108, + "kl": 0.005645751953125, + "learning_rate": 2.663499173965533e-07, + "loss": 0.004, + "num_tokens": 112756662.0, + "reward": 2.8543174266815186, + "reward_std": 1.3723962903022766, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.015031635761261, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 1869.9108276367188, + "epoch": 0.7472, + "grad_norm": 0.19427870213985443, + "kl": 0.0058746337890625, + "learning_rate": 2.6440150732532717e-07, + "loss": 0.0009, + "num_tokens": 112989094.0, + "reward": 3.276418447494507, + "reward_std": 1.3566447496414185, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7589285969734192, + "rewards/tag_count_reward": 2.401418447494507, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 2040.8214721679688, + "epoch": 0.7488, + "grad_norm": 0.19961531460285187, + "kl": 0.0061798095703125, + "learning_rate": 2.624620217130118e-07, + "loss": 0.0022, + "num_tokens": 113234830.0, + "reward": 3.230058789253235, + "reward_std": 1.467456877231598, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.4889872074127197, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 1960.7679443359375, + "epoch": 0.7504, + "grad_norm": 0.18365560472011566, + "kl": 0.0050506591796875, + "learning_rate": 2.605315211651776e-07, + "loss": 0.0229, + "num_tokens": 113478068.0, + "reward": 3.214093565940857, + "reward_std": 1.2719497084617615, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.2319506406784058, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 1949.8751220703125, + "epoch": 0.752, + "grad_norm": 0.18737457692623138, + "kl": 0.0053863525390625, + "learning_rate": 2.5861006600662775e-07, + "loss": 0.0007, + "num_tokens": 113714290.0, + "reward": 3.4580111503601074, + "reward_std": 1.3894099593162537, + "rewards/accuracy_reward": 0.2410714253783226, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.4312254190444946, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.9554443359375, + "epoch": 0.7536, + "grad_norm": 0.1845475435256958, + "kl": 0.00543212890625, + "learning_rate": 2.5669771627951276e-07, + "loss": -0.0029, + "num_tokens": 113960489.0, + "reward": 3.2032936811447144, + "reward_std": 1.556068778038025, + "rewards/accuracy_reward": 0.19642857182770967, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.274722099304199, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 1997.40185546875, + "epoch": 0.7552, + "grad_norm": 0.19840526580810547, + "kl": 0.0057525634765625, + "learning_rate": 2.547945317414536e-07, + "loss": 0.0096, + "num_tokens": 114207312.0, + "reward": 3.380433440208435, + "reward_std": 1.5018176436424255, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.4161475896835327, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.5179443359375, + "epoch": 0.7568, + "grad_norm": 0.19542165100574493, + "kl": 0.005279541015625, + "learning_rate": 2.5290057186367524e-07, + "loss": 0.0008, + "num_tokens": 114453070.0, + "reward": 3.2010425329208374, + "reward_std": 1.2939642071723938, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.3885424733161926, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 2000.5000610351562, + "epoch": 0.7584, + "grad_norm": 0.2062504142522812, + "kl": 0.005859375, + "learning_rate": 2.510158958291483e-07, + "loss": 0.0148, + "num_tokens": 114696334.0, + "reward": 3.3958789110183716, + "reward_std": 1.3592751026153564, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.6369502544403076, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 1981.1697387695312, + "epoch": 0.76, + "grad_norm": 0.19891047477722168, + "kl": 0.005035400390625, + "learning_rate": 2.4914056253073856e-07, + "loss": 0.027, + "num_tokens": 114938245.0, + "reward": 3.3824381828308105, + "reward_std": 1.3261027932167053, + "rewards/accuracy_reward": 0.08035714365541935, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 2.4449379444122314, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.52685546875, + "epoch": 0.7616, + "grad_norm": 0.19620952010154724, + "kl": 0.0058746337890625, + "learning_rate": 2.472746305693682e-07, + "loss": 0.0016, + "num_tokens": 115184060.0, + "reward": 3.047945022583008, + "reward_std": 1.3033799529075623, + "rewards/accuracy_reward": 0.10714285541325808, + "rewards/format_reward": 0.6785714328289032, + "rewards/tag_count_reward": 2.2622305154800415, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 1977.5000610351562, + "epoch": 0.7632, + "grad_norm": 0.1795632690191269, + "kl": 0.0056915283203125, + "learning_rate": 2.4541815825218376e-07, + "loss": 0.0151, + "num_tokens": 115426218.0, + "reward": 3.61670982837677, + "reward_std": 1.3729158639907837, + "rewards/accuracy_reward": 0.1339285746216774, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.7149240970611572, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.321533203125, + "epoch": 0.7648, + "grad_norm": 0.18105751276016235, + "kl": 0.0055999755859375, + "learning_rate": 2.4357120359073367e-07, + "loss": 0.0118, + "num_tokens": 115670092.0, + "reward": 3.125066637992859, + "reward_std": 1.4167006611824036, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.3661378622055054, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 1996.982177734375, + "epoch": 0.7664, + "grad_norm": 0.20324698090553284, + "kl": 0.005828857421875, + "learning_rate": 2.41733824299157e-07, + "loss": 0.0123, + "num_tokens": 115915566.0, + "reward": 3.0149813890457153, + "reward_std": 1.344652533531189, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.732142835855484, + "rewards/tag_count_reward": 2.282838463783264, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 1882.8036499023438, + "epoch": 0.768, + "grad_norm": 0.19364458322525024, + "kl": 0.0052642822265625, + "learning_rate": 2.3990607779237814e-07, + "loss": 0.0135, + "num_tokens": 116147454.0, + "reward": 3.8620420694351196, + "reward_std": 1.1791847944259644, + "rewards/accuracy_reward": 0.3035714328289032, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.7370415925979614, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 1920.0625610351562, + "epoch": 0.7696, + "grad_norm": 0.19176708161830902, + "kl": 0.005157470703125, + "learning_rate": 2.3808802118431427e-07, + "loss": 0.0136, + "num_tokens": 116379203.0, + "reward": 3.212235689163208, + "reward_std": 1.2428618669509888, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8660714030265808, + "rewards/tag_count_reward": 2.337235391139984, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 1782.1429443359375, + "epoch": 0.7712, + "grad_norm": 0.23938943445682526, + "kl": 0.0055389404296875, + "learning_rate": 2.362797112860899e-07, + "loss": -0.0301, + "num_tokens": 116600321.0, + "reward": 2.8932920694351196, + "reward_std": 1.2740073204040527, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.071863353252411, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 1926.2411499023438, + "epoch": 0.7728, + "grad_norm": 0.19429346919059753, + "kl": 0.005615234375, + "learning_rate": 2.3448120460426147e-07, + "loss": 0.0047, + "num_tokens": 116837592.0, + "reward": 3.5109294652938843, + "reward_std": 1.4210047125816345, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.635929226875305, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 1978.15185546875, + "epoch": 0.7744, + "grad_norm": 0.18889088928699493, + "kl": 0.0050811767578125, + "learning_rate": 2.3269255733905205e-07, + "loss": 0.0117, + "num_tokens": 117085241.0, + "reward": 2.956921100616455, + "reward_std": 1.4342970252037048, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.1890639066696167, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 2032.7144165039062, + "epoch": 0.776, + "grad_norm": 0.19146110117435455, + "kl": 0.005035400390625, + "learning_rate": 2.309138253825948e-07, + "loss": 0.0062, + "num_tokens": 117336971.0, + "reward": 3.050230026245117, + "reward_std": 1.4859873056411743, + "rewards/accuracy_reward": 0.07142857275903225, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.1663013696670532, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 1881.8304443359375, + "epoch": 0.7776, + "grad_norm": 0.18879354000091553, + "kl": 0.005645751953125, + "learning_rate": 2.2914506431718705e-07, + "loss": 0.0033, + "num_tokens": 117566048.0, + "reward": 3.608255982398987, + "reward_std": 1.3796170949935913, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.6707557439804077, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.9107666015625, + "epoch": 0.7792, + "grad_norm": 0.1887301206588745, + "kl": 0.0051116943359375, + "learning_rate": 2.2738632941355248e-07, + "loss": 0.0008, + "num_tokens": 117813782.0, + "reward": 2.897287964820862, + "reward_std": 1.2155529260635376, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 1.9955021739006042, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 1899.27685546875, + "epoch": 0.7808, + "grad_norm": 0.1913236379623413, + "kl": 0.005645751953125, + "learning_rate": 2.2563767562911504e-07, + "loss": 0.0012, + "num_tokens": 118047893.0, + "reward": 3.595033884048462, + "reward_std": 1.3825852870941162, + "rewards/accuracy_reward": 0.3035714328289032, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.4700340628623962, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.357177734375, + "epoch": 0.7824, + "grad_norm": 0.1980510652065277, + "kl": 0.006317138671875, + "learning_rate": 2.2389915760628075e-07, + "loss": 0.0069, + "num_tokens": 118293983.0, + "reward": 3.285255193710327, + "reward_std": 1.2765869498252869, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.8035714030265808, + "rewards/tag_count_reward": 2.3745408058166504, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.02685546875, + "epoch": 0.784, + "grad_norm": 0.17984379827976227, + "kl": 0.0050506591796875, + "learning_rate": 2.2217082967073087e-07, + "loss": -0.0005, + "num_tokens": 118543368.0, + "reward": 2.946680426597595, + "reward_std": 1.216102123260498, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.160965919494629, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 1883.9197387695312, + "epoch": 0.7856, + "grad_norm": 0.2042740136384964, + "kl": 0.0053863525390625, + "learning_rate": 2.2045274582972417e-07, + "loss": 0.0393, + "num_tokens": 118774233.0, + "reward": 3.4483667612075806, + "reward_std": 1.191817045211792, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.830357164144516, + "rewards/tag_count_reward": 2.501937985420227, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.4732666015625, + "epoch": 0.7872, + "grad_norm": 0.19604535400867462, + "kl": 0.005584716796875, + "learning_rate": 2.1874495977040868e-07, + "loss": 0.0012, + "num_tokens": 119027196.0, + "reward": 3.116202235221863, + "reward_std": 1.2671216130256653, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8571428656578064, + "rewards/tag_count_reward": 2.2501306533813477, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 1994.83935546875, + "epoch": 0.7888, + "grad_norm": 0.19408971071243286, + "kl": 0.005279541015625, + "learning_rate": 2.1704752485814514e-07, + "loss": -0.0029, + "num_tokens": 119270988.0, + "reward": 2.89313805103302, + "reward_std": 1.5023037195205688, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 1.9467093348503113, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 2019.107177734375, + "epoch": 0.7904, + "grad_norm": 0.18091094493865967, + "kl": 0.005523681640625, + "learning_rate": 2.153604941348384e-07, + "loss": -0.0018, + "num_tokens": 119517316.0, + "reward": 3.2437936067581177, + "reward_std": 1.3781623244285583, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.449150562286377, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 2021.071533203125, + "epoch": 0.792, + "grad_norm": 0.18328644335269928, + "kl": 0.0052337646484375, + "learning_rate": 2.1368392031728065e-07, + "loss": 0.0002, + "num_tokens": 119764732.0, + "reward": 3.0611943006515503, + "reward_std": 1.3113956451416016, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.2397656440734863, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 2030.5625610351562, + "epoch": 0.7936, + "grad_norm": 0.19279994070529938, + "kl": 0.0056304931640625, + "learning_rate": 2.1201785579550395e-07, + "loss": 0.0039, + "num_tokens": 120009459.0, + "reward": 3.1865333318710327, + "reward_std": 1.547074794769287, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.391890287399292, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 1863.5536499023438, + "epoch": 0.7952, + "grad_norm": 0.19358927011489868, + "kl": 0.005096435546875, + "learning_rate": 2.1036235263114261e-07, + "loss": -0.0193, + "num_tokens": 120236055.0, + "reward": 3.4904396533966064, + "reward_std": 1.3421818017959595, + "rewards/accuracy_reward": 0.3303571343421936, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.3386539220809937, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 2031.509033203125, + "epoch": 0.7968, + "grad_norm": 0.18164695799350739, + "kl": 0.0055694580078125, + "learning_rate": 2.087174625558073e-07, + "loss": 0.0028, + "num_tokens": 120483744.0, + "reward": 3.1548445224761963, + "reward_std": 1.428589642047882, + "rewards/accuracy_reward": 0.08928571827709675, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.2798444032669067, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.1250610351562, + "epoch": 0.7984, + "grad_norm": 0.1964939385652542, + "kl": 0.005706787109375, + "learning_rate": 2.0708323696946784e-07, + "loss": -0.0012, + "num_tokens": 120731348.0, + "reward": 2.7884483337402344, + "reward_std": 1.2616525888442993, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.6875, + "rewards/tag_count_reward": 2.0830910205841064, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 1811.3304443359375, + "epoch": 0.8, + "grad_norm": 0.1874411255121231, + "kl": 0.0064849853515625, + "learning_rate": 2.054597269388469e-07, + "loss": -0.0094, + "num_tokens": 120953817.0, + "reward": 3.3626434803009033, + "reward_std": 1.4066117405891418, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.523357629776001, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.6876220703125, + "epoch": 0.8016, + "grad_norm": 0.184902161359787, + "kl": 0.0056304931640625, + "learning_rate": 2.03846983195825e-07, + "loss": 0.0023, + "num_tokens": 121203164.0, + "reward": 3.044499635696411, + "reward_std": 1.4554401636123657, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.2855708599090576, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 1996.232177734375, + "epoch": 0.8032, + "grad_norm": 0.18576684594154358, + "kl": 0.0054473876953125, + "learning_rate": 2.022450561358548e-07, + "loss": 0.0042, + "num_tokens": 121449296.0, + "reward": 3.2111419439315796, + "reward_std": 1.4501546621322632, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.4522132873535156, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 1960.1697387695312, + "epoch": 0.8048, + "grad_norm": 0.19447900354862213, + "kl": 0.0060272216796875, + "learning_rate": 2.0065399581638573e-07, + "loss": 0.015, + "num_tokens": 121686279.0, + "reward": 3.473777174949646, + "reward_std": 1.51576167345047, + "rewards/accuracy_reward": 0.09821428917348385, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.616634249687195, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 2031.6429443359375, + "epoch": 0.8064, + "grad_norm": 0.18073509633541107, + "kl": 0.005584716796875, + "learning_rate": 1.9907385195530124e-07, + "loss": 0.0038, + "num_tokens": 121932345.0, + "reward": 2.9982718229293823, + "reward_std": 1.653907299041748, + "rewards/accuracy_reward": 0.0982142873108387, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.203628897666931, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.009033203125, + "epoch": 0.808, + "grad_norm": 0.20493066310882568, + "kl": 0.006195068359375, + "learning_rate": 1.975046739293635e-07, + "loss": 0.0022, + "num_tokens": 122183520.0, + "reward": 2.9470202922821045, + "reward_std": 1.3558412790298462, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.1613059043884277, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 1988.2053833007812, + "epoch": 0.8096, + "grad_norm": 0.18799978494644165, + "kl": 0.0056610107421875, + "learning_rate": 1.95946510772672e-07, + "loss": 0.0079, + "num_tokens": 122427633.0, + "reward": 3.3652079105377197, + "reward_std": 1.1550991535186768, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8482142984867096, + "rewards/tag_count_reward": 2.5169934034347534, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 1607.4911499023438, + "epoch": 0.8112, + "grad_norm": 0.2183038592338562, + "kl": 0.0056304931640625, + "learning_rate": 1.9439941117513007e-07, + "loss": 0.0099, + "num_tokens": 122632578.0, + "reward": 3.8952372074127197, + "reward_std": 1.3577601313591003, + "rewards/accuracy_reward": 0.3482142761349678, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.8327369689941406, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.107177734375, + "epoch": 0.8128, + "grad_norm": 0.20285974442958832, + "kl": 0.0057220458984375, + "learning_rate": 1.9286342348092445e-07, + "loss": 0.0003, + "num_tokens": 122882098.0, + "reward": 2.95181405544281, + "reward_std": 1.281922698020935, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8482142686843872, + "rewards/tag_count_reward": 2.103599727153778, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 1936.3750610351562, + "epoch": 0.8144, + "grad_norm": 0.19424667954444885, + "kl": 0.0057373046875, + "learning_rate": 1.9133859568701355e-07, + "loss": -0.02, + "num_tokens": 123120560.0, + "reward": 3.444503903388977, + "reward_std": 1.33060222864151, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.60521799325943, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 1948.6964721679688, + "epoch": 0.816, + "grad_norm": 0.19305741786956787, + "kl": 0.0058746337890625, + "learning_rate": 1.8982497544162867e-07, + "loss": 0.0043, + "num_tokens": 123367598.0, + "reward": 3.353354811668396, + "reward_std": 1.3816343545913696, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.540854573249817, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.5536499023438, + "epoch": 0.8176, + "grad_norm": 0.20285403728485107, + "kl": 0.00592041015625, + "learning_rate": 1.8832261004278461e-07, + "loss": 0.0078, + "num_tokens": 123602272.0, + "reward": 3.216193199157715, + "reward_std": 1.4055215120315552, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.323335886001587, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 1892.4733276367188, + "epoch": 0.8192, + "grad_norm": 0.19242845475673676, + "kl": 0.00653076171875, + "learning_rate": 1.8683154643680127e-07, + "loss": -0.0028, + "num_tokens": 123839625.0, + "reward": 3.2361403703689575, + "reward_std": 1.2032982110977173, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.830357164144516, + "rewards/tag_count_reward": 2.4057830572128296, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.4107666015625, + "epoch": 0.8208, + "grad_norm": 0.1699412763118744, + "kl": 0.005157470703125, + "learning_rate": 1.853518312168375e-07, + "loss": 0.0045, + "num_tokens": 124088381.0, + "reward": 3.138668179512024, + "reward_std": 1.3461718559265137, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.361882448196411, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 1866.696533203125, + "epoch": 0.8224, + "grad_norm": 0.18753986060619354, + "kl": 0.0059967041015625, + "learning_rate": 1.838835106214343e-07, + "loss": 0.0096, + "num_tokens": 124318031.0, + "reward": 3.481819987297058, + "reward_std": 1.5179959535598755, + "rewards/accuracy_reward": 0.1964285746216774, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.553248405456543, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 2025.0536499023438, + "epoch": 0.824, + "grad_norm": 0.1878945380449295, + "kl": 0.0055999755859375, + "learning_rate": 1.824266305330709e-07, + "loss": -0.0015, + "num_tokens": 124565809.0, + "reward": 3.163859724998474, + "reward_std": 1.3125908970832825, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.360288143157959, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 2042.4019165039062, + "epoch": 0.8256, + "grad_norm": 0.19435888528823853, + "kl": 0.006072998046875, + "learning_rate": 1.8098123647672987e-07, + "loss": 0.0049, + "num_tokens": 124816804.0, + "reward": 2.9803141355514526, + "reward_std": 1.3814730048179626, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.2481712102890015, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 2022.83935546875, + "epoch": 0.8272, + "grad_norm": 0.2015310823917389, + "kl": 0.005615234375, + "learning_rate": 1.7954737361847565e-07, + "loss": 0.0049, + "num_tokens": 125065174.0, + "reward": 3.1193963289260864, + "reward_std": 1.3312036395072937, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.2890390157699585, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 1987.5357666015625, + "epoch": 0.8288, + "grad_norm": 0.17983581125736237, + "kl": 0.00616455078125, + "learning_rate": 1.781250867640423e-07, + "loss": 0.0061, + "num_tokens": 125307854.0, + "reward": 3.708823561668396, + "reward_std": 1.5559759140014648, + "rewards/accuracy_reward": 0.2321428507566452, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.708823323249817, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 2007.08935546875, + "epoch": 0.8304, + "grad_norm": 0.1982608139514923, + "kl": 0.005767822265625, + "learning_rate": 1.767144203574341e-07, + "loss": 0.0082, + "num_tokens": 125552416.0, + "reward": 2.941048741340637, + "reward_std": 1.4890641570091248, + "rewards/accuracy_reward": 0.0446428582072258, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.128548562526703, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 2037.9822387695312, + "epoch": 0.832, + "grad_norm": 0.20186671614646912, + "kl": 0.0062408447265625, + "learning_rate": 1.753154184795363e-07, + "loss": 0.0037, + "num_tokens": 125799360.0, + "reward": 3.1124082803726196, + "reward_std": 1.6119695901870728, + "rewards/accuracy_reward": 0.035714286379516125, + "rewards/format_reward": 0.705357164144516, + "rewards/tag_count_reward": 2.371336817741394, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 1997.1875610351562, + "epoch": 0.8336, + "grad_norm": 0.18789102137088776, + "kl": 0.0052337646484375, + "learning_rate": 1.739281248467379e-07, + "loss": 0.012, + "num_tokens": 126046719.0, + "reward": 3.2609002590179443, + "reward_std": 1.417835533618927, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.359114408493042, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 1695.21435546875, + "epoch": 0.8352, + "grad_norm": 0.20930421352386475, + "kl": 0.0067901611328125, + "learning_rate": 1.725525828095651e-07, + "loss": -0.0492, + "num_tokens": 126254195.0, + "reward": 3.6533420085906982, + "reward_std": 1.3044832944869995, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.742627501487732, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.8368, + "grad_norm": 0.19353055953979492, + "kl": 0.0057525634765625, + "learning_rate": 1.7118883535132756e-07, + "loss": 0.0002, + "num_tokens": 126503409.0, + "reward": 3.0779024362564087, + "reward_std": 1.125401258468628, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8392857015132904, + "rewards/tag_count_reward": 2.229688048362732, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 2043.15185546875, + "epoch": 0.8384, + "grad_norm": 0.18194374442100525, + "kl": 0.0057830810546875, + "learning_rate": 1.6983692508677456e-07, + "loss": 0.0023, + "num_tokens": 126751464.0, + "reward": 3.3010823726654053, + "reward_std": 1.5149815678596497, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.452867865562439, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 2033.884033203125, + "epoch": 0.84, + "grad_norm": 0.1844831258058548, + "kl": 0.0057220458984375, + "learning_rate": 1.684968942607634e-07, + "loss": 0.0066, + "num_tokens": 126997991.0, + "reward": 3.2455042600631714, + "reward_std": 1.2465353608131409, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.424075484275818, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.77685546875, + "epoch": 0.8416, + "grad_norm": 0.2025192528963089, + "kl": 0.0066680908203125, + "learning_rate": 1.6716878474693977e-07, + "loss": 0.0114, + "num_tokens": 127244856.0, + "reward": 3.288322329521179, + "reward_std": 1.4935184121131897, + "rewards/accuracy_reward": 0.02678571455180645, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.511536478996277, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 2012.6250610351562, + "epoch": 0.8432, + "grad_norm": 0.1798221319913864, + "kl": 0.00592041015625, + "learning_rate": 1.6585263804642864e-07, + "loss": 0.0025, + "num_tokens": 127491340.0, + "reward": 3.4644246101379395, + "reward_std": 1.4006040692329407, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.625138759613037, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.1251220703125, + "epoch": 0.8448, + "grad_norm": 0.189687117934227, + "kl": 0.0054931640625, + "learning_rate": 1.6454849528653814e-07, + "loss": 0.0023, + "num_tokens": 127737894.0, + "reward": 2.9721614122390747, + "reward_std": 1.3662317395210266, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.186447024345398, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 2042.21435546875, + "epoch": 0.8464, + "grad_norm": 0.18893834948539734, + "kl": 0.00616455078125, + "learning_rate": 1.6325639721947354e-07, + "loss": 0.002, + "num_tokens": 127986068.0, + "reward": 2.8575727939605713, + "reward_std": 1.455811083316803, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6785714030265808, + "rewards/tag_count_reward": 2.1790013909339905, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 1993.0447387695312, + "epoch": 0.848, + "grad_norm": 0.2043030560016632, + "kl": 0.0059661865234375, + "learning_rate": 1.619763842210647e-07, + "loss": 0.011, + "num_tokens": 128227937.0, + "reward": 3.3528358936309814, + "reward_std": 1.334109902381897, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.4689072370529175, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.8496, + "grad_norm": 0.19278356432914734, + "kl": 0.0053558349609375, + "learning_rate": 1.60708496289504e-07, + "loss": 0.0002, + "num_tokens": 128479419.0, + "reward": 2.9217913150787354, + "reward_std": 1.246551275253296, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.1003626585006714, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 1834.7679443359375, + "epoch": 0.8512, + "grad_norm": 0.20679216086864471, + "kl": 0.0059967041015625, + "learning_rate": 1.59452773044096e-07, + "loss": -0.0112, + "num_tokens": 128709847.0, + "reward": 3.520416021347046, + "reward_std": 1.299731433391571, + "rewards/accuracy_reward": 0.13988094963133335, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.627558708190918, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.8528, + "grad_norm": 0.19247810542583466, + "kl": 0.00567626953125, + "learning_rate": 1.582092537240204e-07, + "loss": 0.0002, + "num_tokens": 128958515.0, + "reward": 3.0004868507385254, + "reward_std": 1.3707662224769592, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.2594151496887207, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 2034.2410888671875, + "epoch": 0.8544, + "grad_norm": 0.18071478605270386, + "kl": 0.005584716796875, + "learning_rate": 1.569779771871049e-07, + "loss": 0.0072, + "num_tokens": 129210696.0, + "reward": 3.0387990474700928, + "reward_std": 1.4162453413009644, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7589285969734192, + "rewards/tag_count_reward": 2.2798702120780945, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 1979.5536499023438, + "epoch": 0.856, + "grad_norm": 0.19915911555290222, + "kl": 0.00604248046875, + "learning_rate": 1.5575898190861181e-07, + "loss": 0.0061, + "num_tokens": 129452048.0, + "reward": 3.376705050468445, + "reward_std": 1.4127461910247803, + "rewards/accuracy_reward": 0.1607142835855484, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.4481334686279297, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 1878.2232666015625, + "epoch": 0.8576, + "grad_norm": 0.1918126791715622, + "kl": 0.0055389404296875, + "learning_rate": 1.5455230598003495e-07, + "loss": 0.0099, + "num_tokens": 129684809.0, + "reward": 3.5761971473693848, + "reward_std": 1.0726081728935242, + "rewards/accuracy_reward": 0.2767857015132904, + "rewards/format_reward": 0.9017857015132904, + "rewards/tag_count_reward": 2.397625684738159, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 1949.169677734375, + "epoch": 0.8592, + "grad_norm": 0.17706818878650665, + "kl": 0.00592041015625, + "learning_rate": 1.5335798710791009e-07, + "loss": 0.0002, + "num_tokens": 129920546.0, + "reward": 3.2802977561950684, + "reward_std": 1.4198022484779358, + "rewards/accuracy_reward": 0.053571430034935474, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.4677975177764893, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 1997.1339721679688, + "epoch": 0.8608, + "grad_norm": 0.20544330775737762, + "kl": 0.0058441162109375, + "learning_rate": 1.5217606261263615e-07, + "loss": 0.018, + "num_tokens": 130168935.0, + "reward": 3.1617459058761597, + "reward_std": 1.2542779445648193, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.3671029806137085, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 1729.8572387695312, + "epoch": 0.8624, + "grad_norm": 0.2146475911140442, + "kl": 0.0063323974609375, + "learning_rate": 1.5100656942730932e-07, + "loss": -0.0271, + "num_tokens": 130382573.0, + "reward": 3.146600604057312, + "reward_std": 1.4548588395118713, + "rewards/accuracy_reward": 0.13392857182770967, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.271600365638733, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 1778.0982666015625, + "epoch": 0.864, + "grad_norm": 0.20400752127170563, + "kl": 0.0062713623046875, + "learning_rate": 1.4984954409656895e-07, + "loss": -0.0054, + "num_tokens": 130601250.0, + "reward": 3.8312467336654663, + "reward_std": 1.3907028436660767, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.830357164144516, + "rewards/tag_count_reward": 2.884818196296692, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 1926.65185546875, + "epoch": 0.8656, + "grad_norm": 0.19670452177524567, + "kl": 0.0063629150390625, + "learning_rate": 1.4870502277545515e-07, + "loss": -0.0032, + "num_tokens": 130835795.0, + "reward": 3.4652657508850098, + "reward_std": 1.278001606464386, + "rewards/accuracy_reward": 0.1339285671710968, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.5099083185195923, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 1975.384033203125, + "epoch": 0.8672, + "grad_norm": 0.1920643001794815, + "kl": 0.005950927734375, + "learning_rate": 1.475730412282797e-07, + "loss": 0.0151, + "num_tokens": 131075784.0, + "reward": 3.315953850746155, + "reward_std": 1.26567804813385, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.5927393436431885, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 1923.6251220703125, + "epoch": 0.8688, + "grad_norm": 0.1884908676147461, + "kl": 0.00537109375, + "learning_rate": 1.464536348275081e-07, + "loss": -0.0122, + "num_tokens": 131309514.0, + "reward": 3.1024152040481567, + "reward_std": 1.1635739207267761, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.2720577716827393, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 1956.5894165039062, + "epoch": 0.8704, + "grad_norm": 0.1845502406358719, + "kl": 0.0057525634765625, + "learning_rate": 1.4534683855265404e-07, + "loss": 0.0003, + "num_tokens": 131549414.0, + "reward": 3.4224483966827393, + "reward_std": 1.353294014930725, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 2.574233889579773, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.509033203125, + "epoch": 0.872, + "grad_norm": 0.19549857079982758, + "kl": 0.0062408447265625, + "learning_rate": 1.4425268698918677e-07, + "loss": 0.012, + "num_tokens": 131783159.0, + "reward": 3.2711364030838013, + "reward_std": 1.4156261682510376, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.47649347782135, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 1878.0178833007812, + "epoch": 0.8736, + "grad_norm": 0.2057812362909317, + "kl": 0.006011962890625, + "learning_rate": 1.4317121432745027e-07, + "loss": -0.0077, + "num_tokens": 132012719.0, + "reward": 3.0634517669677734, + "reward_std": 1.371016263961792, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.2420228719711304, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 2007.5179443359375, + "epoch": 0.8752, + "grad_norm": 0.17245817184448242, + "kl": 0.006072998046875, + "learning_rate": 1.4210245436159454e-07, + "loss": 0.0093, + "num_tokens": 132255299.0, + "reward": 3.352210760116577, + "reward_std": 1.4375743865966797, + "rewards/accuracy_reward": 0.1964285671710968, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.3879250288009644, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 1968.232177734375, + "epoch": 0.8768, + "grad_norm": 0.1815684735774994, + "kl": 0.005889892578125, + "learning_rate": 1.4104644048851994e-07, + "loss": 0.0242, + "num_tokens": 132494235.0, + "reward": 3.3072292804718018, + "reward_std": 1.433179259300232, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7767857313156128, + "rewards/tag_count_reward": 2.5125863552093506, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.4553833007812, + "epoch": 0.8784, + "grad_norm": 0.1794120967388153, + "kl": 0.005462646484375, + "learning_rate": 1.4000320570683355e-07, + "loss": 0.0051, + "num_tokens": 132739146.0, + "reward": 3.4411338567733765, + "reward_std": 1.0911927223205566, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 2.4500621557235718, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 2030.8929443359375, + "epoch": 0.88, + "grad_norm": 0.18249621987342834, + "kl": 0.0053253173828125, + "learning_rate": 1.3897278261581771e-07, + "loss": 0.0028, + "num_tokens": 132985604.0, + "reward": 3.2859009504318237, + "reward_std": 1.3561326265335083, + "rewards/accuracy_reward": 0.05357143096625805, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.4109007120132446, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 2033.1964721679688, + "epoch": 0.8816, + "grad_norm": 0.18041563034057617, + "kl": 0.0062103271484375, + "learning_rate": 1.3795520341441192e-07, + "loss": 0.0058, + "num_tokens": 133232642.0, + "reward": 3.4603484869003296, + "reward_std": 1.331321358680725, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.5585626363754272, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.4732666015625, + "epoch": 0.8832, + "grad_norm": 0.1840829849243164, + "kl": 0.005859375, + "learning_rate": 1.3695049990020605e-07, + "loss": 0.0057, + "num_tokens": 133480929.0, + "reward": 2.963152050971985, + "reward_std": 1.400626003742218, + "rewards/accuracy_reward": 0.044642859138548374, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.1327948570251465, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.669677734375, + "epoch": 0.8848, + "grad_norm": 0.19522184133529663, + "kl": 0.0059967041015625, + "learning_rate": 1.3595870346844722e-07, + "loss": 0.01, + "num_tokens": 133732584.0, + "reward": 3.122857928276062, + "reward_std": 1.2291558384895325, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.28357195854187, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 1996.2589721679688, + "epoch": 0.8864, + "grad_norm": 0.20720477402210236, + "kl": 0.0053253173828125, + "learning_rate": 1.3497984511105828e-07, + "loss": 0.0137, + "num_tokens": 133977543.0, + "reward": 3.0356905460357666, + "reward_std": 1.3279377818107605, + "rewards/accuracy_reward": 0.1428571492433548, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.0803334712982178, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 2027.2589721679688, + "epoch": 0.888, + "grad_norm": 0.17782160639762878, + "kl": 0.0055694580078125, + "learning_rate": 1.3401395541567008e-07, + "loss": 0.0066, + "num_tokens": 134223538.0, + "reward": 3.264726758003235, + "reward_std": 1.284030020236969, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.4611552953720093, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 1797.4822387695312, + "epoch": 0.8896, + "grad_norm": 0.20841240882873535, + "kl": 0.0059051513671875, + "learning_rate": 1.3306106456466474e-07, + "loss": -0.0065, + "num_tokens": 134442202.0, + "reward": 4.075626611709595, + "reward_std": 1.209336757659912, + "rewards/accuracy_reward": 0.3392857164144516, + "rewards/format_reward": 0.875, + "rewards/tag_count_reward": 2.861340641975403, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 1903.0179443359375, + "epoch": 0.8912, + "grad_norm": 0.21418721973896027, + "kl": 0.00628662109375, + "learning_rate": 1.321212023342333e-07, + "loss": -0.001, + "num_tokens": 134672882.0, + "reward": 3.065307378768921, + "reward_std": 1.4176130890846252, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.315307378768921, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 2016.8750610351562, + "epoch": 0.8928, + "grad_norm": 0.19372744858264923, + "kl": 0.006134033203125, + "learning_rate": 1.3119439809344475e-07, + "loss": 0.0072, + "num_tokens": 134918386.0, + "reward": 3.149105191230774, + "reward_std": 1.501021683216095, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.399105191230774, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 2044.169677734375, + "epoch": 0.8944, + "grad_norm": 0.19350114464759827, + "kl": 0.006317138671875, + "learning_rate": 1.3028068080332854e-07, + "loss": 0.0037, + "num_tokens": 135165897.0, + "reward": 3.122075080871582, + "reward_std": 1.5554208755493164, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.363146424293518, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 1995.4911499023438, + "epoch": 0.896, + "grad_norm": 0.20528611540794373, + "kl": 0.0055084228515625, + "learning_rate": 1.2938007901596978e-07, + "loss": 0.0072, + "num_tokens": 135418008.0, + "reward": 3.174467444419861, + "reward_std": 1.3752895593643188, + "rewards/accuracy_reward": 0.1785714328289032, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.1744673252105713, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.071533203125, + "epoch": 0.8976, + "grad_norm": 0.19797545671463013, + "kl": 0.0055389404296875, + "learning_rate": 1.2849262087361637e-07, + "loss": 0.0017, + "num_tokens": 135668658.0, + "reward": 2.6191213130950928, + "reward_std": 1.4006128311157227, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 1.8691211938858032, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 1966.0447387695312, + "epoch": 0.8992, + "grad_norm": 0.17479199171066284, + "kl": 0.00555419921875, + "learning_rate": 1.2761833410780045e-07, + "loss": 0.0112, + "num_tokens": 135908931.0, + "reward": 3.742344856262207, + "reward_std": 1.132834792137146, + "rewards/accuracy_reward": 0.2946428582072258, + "rewards/format_reward": 0.8303571343421936, + "rewards/tag_count_reward": 2.6173447370529175, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 1964.2679443359375, + "epoch": 0.9008, + "grad_norm": 0.18675729632377625, + "kl": 0.0058135986328125, + "learning_rate": 1.26757246038471e-07, + "loss": -0.0145, + "num_tokens": 136149201.0, + "reward": 3.2416350841522217, + "reward_std": 1.3353677988052368, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.5005635023117065, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 1919.5982666015625, + "epoch": 0.9024, + "grad_norm": 0.18720659613609314, + "kl": 0.006011962890625, + "learning_rate": 1.2590938357314112e-07, + "loss": -0.0064, + "num_tokens": 136385084.0, + "reward": 3.435679316520691, + "reward_std": 1.405099868774414, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.57853627204895, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 1994.4375610351562, + "epoch": 0.904, + "grad_norm": 0.19510866701602936, + "kl": 0.0056915283203125, + "learning_rate": 1.2507477320604608e-07, + "loss": 0.0006, + "num_tokens": 136632695.0, + "reward": 3.0828466415405273, + "reward_std": 1.4187443852424622, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.2703466415405273, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 1938.3482666015625, + "epoch": 0.9056, + "grad_norm": 0.19292594492435455, + "kl": 0.006195068359375, + "learning_rate": 1.2425344101731632e-07, + "loss": -0.0017, + "num_tokens": 136869656.0, + "reward": 3.6934484243392944, + "reward_std": 1.3612673878669739, + "rewards/accuracy_reward": 0.10714285541325808, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.791662573814392, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 1959.40185546875, + "epoch": 0.9072, + "grad_norm": 0.1983197033405304, + "kl": 0.0057373046875, + "learning_rate": 1.2344541267216224e-07, + "loss": -0.0161, + "num_tokens": 137107617.0, + "reward": 3.3298003673553467, + "reward_std": 1.0859698355197906, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.4637287855148315, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.3214721679688, + "epoch": 0.9088, + "grad_norm": 0.17937511205673218, + "kl": 0.0061798095703125, + "learning_rate": 1.2265071342007196e-07, + "loss": 0.0004, + "num_tokens": 137357091.0, + "reward": 3.167315363883972, + "reward_std": 1.4310060739517212, + "rewards/accuracy_reward": 0.06250000093132257, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.3816009759902954, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 1900.3660888671875, + "epoch": 0.9104, + "grad_norm": 0.19049756228923798, + "kl": 0.005889892578125, + "learning_rate": 1.2186936809402249e-07, + "loss": -0.006, + "num_tokens": 137590330.0, + "reward": 3.2381094694137573, + "reward_std": 1.3123273849487305, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.309537887573242, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 1906.3929443359375, + "epoch": 0.912, + "grad_norm": 0.19406186044216156, + "kl": 0.006256103515625, + "learning_rate": 1.2110140110970391e-07, + "loss": 0.0116, + "num_tokens": 137823740.0, + "reward": 3.3636668920516968, + "reward_std": 1.3221712112426758, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.57795250415802, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 1909.1785888671875, + "epoch": 0.9136, + "grad_norm": 0.21337126195430756, + "kl": 0.0058135986328125, + "learning_rate": 1.2034683646475615e-07, + "loss": -0.004, + "num_tokens": 138061634.0, + "reward": 3.1373177766799927, + "reward_std": 1.310356855392456, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.75, + "rewards/tag_count_reward": 2.2623177766799927, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 2013.02685546875, + "epoch": 0.9152, + "grad_norm": 0.1914016157388687, + "kl": 0.0056304931640625, + "learning_rate": 1.1960569773801915e-07, + "loss": -0.0068, + "num_tokens": 138305951.0, + "reward": 2.863747000694275, + "reward_std": 1.4287275075912476, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.1762468814849854, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 1997.0179443359375, + "epoch": 0.9168, + "grad_norm": 0.19853228330612183, + "kl": 0.0059356689453125, + "learning_rate": 1.1887800808879631e-07, + "loss": 0.0146, + "num_tokens": 138550393.0, + "reward": 3.6593021154403687, + "reward_std": 1.3323598504066467, + "rewards/accuracy_reward": 0.1517857164144516, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.686087727546692, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 2017.4375610351562, + "epoch": 0.9184, + "grad_norm": 0.19723980128765106, + "kl": 0.0062408447265625, + "learning_rate": 1.1816379025613046e-07, + "loss": 0.0086, + "num_tokens": 138801224.0, + "reward": 2.8551533222198486, + "reward_std": 1.410047173500061, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.1587246656417847, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 2039.1875610351562, + "epoch": 0.92, + "grad_norm": 0.18896090984344482, + "kl": 0.0061492919921875, + "learning_rate": 1.1746306655809353e-07, + "loss": 0.0051, + "num_tokens": 139049829.0, + "reward": 3.023775339126587, + "reward_std": 1.5366047024726868, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.2380608916282654, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.6607666015625, + "epoch": 0.9216, + "grad_norm": 0.18459564447402954, + "kl": 0.0058135986328125, + "learning_rate": 1.1677585889108908e-07, + "loss": 0.0005, + "num_tokens": 139302253.0, + "reward": 3.2337427139282227, + "reward_std": 1.3450033068656921, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.385528326034546, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 1809.6429443359375, + "epoch": 0.9232, + "grad_norm": 0.18546897172927856, + "kl": 0.0067291259765625, + "learning_rate": 1.1610218872916806e-07, + "loss": 0.0055, + "num_tokens": 139522601.0, + "reward": 3.608990430831909, + "reward_std": 1.372089922428131, + "rewards/accuracy_reward": 0.125, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.716133236885071, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 2035.7054443359375, + "epoch": 0.9248, + "grad_norm": 0.183374285697937, + "kl": 0.00555419921875, + "learning_rate": 1.1544207712335794e-07, + "loss": 0.0012, + "num_tokens": 139768828.0, + "reward": 3.0064207315444946, + "reward_std": 1.441347897052765, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.2653492093086243, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 1905.7053833007812, + "epoch": 0.9264, + "grad_norm": 0.20621263980865479, + "kl": 0.006317138671875, + "learning_rate": 1.1479554470100446e-07, + "loss": 0.0014, + "num_tokens": 140004289.0, + "reward": 3.3837616443634033, + "reward_std": 1.2718448638916016, + "rewards/accuracy_reward": 0.130952388048172, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.508761405944824, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.5000610351562, + "epoch": 0.928, + "grad_norm": 0.18133869767189026, + "kl": 0.006500244140625, + "learning_rate": 1.1416261166512765e-07, + "loss": 0.0005, + "num_tokens": 140251599.0, + "reward": 3.5401611328125, + "reward_std": 1.1662200093269348, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.6919467449188232, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 2045.4553833007812, + "epoch": 0.9296, + "grad_norm": 0.195255309343338, + "kl": 0.0060272216796875, + "learning_rate": 1.1354329779379001e-07, + "loss": 0.0014, + "num_tokens": 140505666.0, + "reward": 3.197594165802002, + "reward_std": 1.3284226655960083, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.340451240539551, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 1995.3572387695312, + "epoch": 0.9312, + "grad_norm": 0.18633654713630676, + "kl": 0.00604248046875, + "learning_rate": 1.1293762243947903e-07, + "loss": 0.0074, + "num_tokens": 140750384.0, + "reward": 3.533870816230774, + "reward_std": 1.3958919048309326, + "rewards/accuracy_reward": 0.1696428582072258, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.560656428337097, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 2042.482177734375, + "epoch": 0.9328, + "grad_norm": 0.18621599674224854, + "kl": 0.005950927734375, + "learning_rate": 1.1234560452850176e-07, + "loss": 0.0024, + "num_tokens": 140999792.0, + "reward": 2.878381848335266, + "reward_std": 1.4884929060935974, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.1730247139930725, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 1980.1964721679688, + "epoch": 0.9344, + "grad_norm": 0.18155336380004883, + "kl": 0.0060882568359375, + "learning_rate": 1.1176726256039399e-07, + "loss": -0.0016, + "num_tokens": 141239102.0, + "reward": 2.952871322631836, + "reward_std": 1.4712472558021545, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.669642835855484, + "rewards/tag_count_reward": 2.2832283973693848, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 2026.482177734375, + "epoch": 0.936, + "grad_norm": 0.17793045938014984, + "kl": 0.0062255859375, + "learning_rate": 1.1120261460734188e-07, + "loss": 0.0059, + "num_tokens": 141484086.0, + "reward": 3.021064519882202, + "reward_std": 1.354138433933258, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.288921594619751, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.9376, + "grad_norm": 0.1904812604188919, + "kl": 0.0058441162109375, + "learning_rate": 1.1065167831361735e-07, + "loss": 0.0002, + "num_tokens": 141733496.0, + "reward": 3.0356212854385376, + "reward_std": 1.3065844774246216, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.2409781217575073, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 1922.3662109375, + "epoch": 0.9392, + "grad_norm": 0.20721037685871124, + "kl": 0.0064849853515625, + "learning_rate": 1.1011447089502658e-07, + "loss": 0.0026, + "num_tokens": 141967911.0, + "reward": 3.4084410667419434, + "reward_std": 1.5011770129203796, + "rewards/accuracy_reward": 0.15178571082651615, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 2.515583634376526, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 2011.0803833007812, + "epoch": 0.9408, + "grad_norm": 0.18446482717990875, + "kl": 0.0061798095703125, + "learning_rate": 1.0959100913837224e-07, + "loss": -0.0002, + "num_tokens": 142212360.0, + "reward": 3.1501721143722534, + "reward_std": 1.382131814956665, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7142857015132904, + "rewards/tag_count_reward": 2.435886263847351, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 2047.9375610351562, + "epoch": 0.9424, + "grad_norm": 0.17910334467887878, + "kl": 0.00616455078125, + "learning_rate": 1.0908130940092892e-07, + "loss": 0.0003, + "num_tokens": 142460965.0, + "reward": 3.1845543384552, + "reward_std": 1.2679128646850586, + "rewards/accuracy_reward": 0.026785715483129025, + "rewards/format_reward": 0.7946428656578064, + "rewards/tag_count_reward": 2.363125801086426, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 2038.2053833007812, + "epoch": 0.944, + "grad_norm": 0.17953084409236908, + "kl": 0.005950927734375, + "learning_rate": 1.0858538760993169e-07, + "loss": 0.004, + "num_tokens": 142706422.0, + "reward": 3.2399942874908447, + "reward_std": 1.3346341252326965, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.418565511703491, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 2003.1161499023438, + "epoch": 0.9456, + "grad_norm": 0.19361357390880585, + "kl": 0.0061492919921875, + "learning_rate": 1.0810325926207883e-07, + "loss": 0.0124, + "num_tokens": 142950511.0, + "reward": 3.1207356452941895, + "reward_std": 1.4924516081809998, + "rewards/accuracy_reward": 0.0892857164144516, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.3350212574005127, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.9472, + "grad_norm": 0.18471744656562805, + "kl": 0.0059051513671875, + "learning_rate": 1.0763493942304726e-07, + "loss": 0.0002, + "num_tokens": 143198857.0, + "reward": 2.993699550628662, + "reward_std": 1.156280279159546, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.830357164144516, + "rewards/tag_count_reward": 2.1633421778678894, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 1870.2679443359375, + "epoch": 0.9488, + "grad_norm": 0.19141925871372223, + "kl": 0.0060272216796875, + "learning_rate": 1.0718044272702193e-07, + "loss": -0.0173, + "num_tokens": 143427787.0, + "reward": 3.5085227489471436, + "reward_std": 1.3375614285469055, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.62459397315979, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 2008.3125610351562, + "epoch": 0.9504, + "grad_norm": 0.19998760521411896, + "kl": 0.0060272216796875, + "learning_rate": 1.0673978337623845e-07, + "loss": 0.0112, + "num_tokens": 143672094.0, + "reward": 3.211104393005371, + "reward_std": 1.5366321802139282, + "rewards/accuracy_reward": 0.0803571417927742, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.345032811164856, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 1930.1787109375, + "epoch": 0.952, + "grad_norm": 0.19523145258426666, + "kl": 0.00592041015625, + "learning_rate": 1.063129751405394e-07, + "loss": 0.0014, + "num_tokens": 143906572.0, + "reward": 3.051987886428833, + "reward_std": 1.3392629623413086, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.3198448419570923, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 1991.0179443359375, + "epoch": 0.9536, + "grad_norm": 0.18792715668678284, + "kl": 0.005859375, + "learning_rate": 1.0590003135694382e-07, + "loss": 0.0089, + "num_tokens": 144149712.0, + "reward": 3.3845818042755127, + "reward_std": 1.5075795650482178, + "rewards/accuracy_reward": 0.1964285671710968, + "rewards/format_reward": 0.794642835855484, + "rewards/tag_count_reward": 2.393510341644287, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.9552, + "grad_norm": 0.1865776777267456, + "kl": 0.0053558349609375, + "learning_rate": 1.0550096492923077e-07, + "loss": 0.0002, + "num_tokens": 144404148.0, + "reward": 3.0649664402008057, + "reward_std": 1.279529333114624, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.279252052307129, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.0625610351562, + "epoch": 0.9568, + "grad_norm": 0.17786939442157745, + "kl": 0.005889892578125, + "learning_rate": 1.0511578832753589e-07, + "loss": 0.0011, + "num_tokens": 144654041.0, + "reward": 2.8292033672332764, + "reward_std": 1.5441558957099915, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.6696428656578064, + "rewards/tag_count_reward": 2.141703248023987, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 2001.1607666015625, + "epoch": 0.9584, + "grad_norm": 0.1699460744857788, + "kl": 0.0057525634765625, + "learning_rate": 1.0474451358796166e-07, + "loss": -0.0075, + "num_tokens": 144898807.0, + "reward": 3.1888365745544434, + "reward_std": 1.4460148215293884, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.6785714030265808, + "rewards/tag_count_reward": 2.4566938877105713, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 2020.8482666015625, + "epoch": 0.96, + "grad_norm": 0.19819030165672302, + "kl": 0.0064239501953125, + "learning_rate": 1.0438715231220166e-07, + "loss": 0.0017, + "num_tokens": 145146030.0, + "reward": 3.103357195854187, + "reward_std": 1.4078664183616638, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.344428539276123, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 1912.6964721679688, + "epoch": 0.9616, + "grad_norm": 0.19154293835163116, + "kl": 0.0060882568359375, + "learning_rate": 1.0404371566717758e-07, + "loss": -0.0173, + "num_tokens": 145380524.0, + "reward": 3.256765604019165, + "reward_std": 1.530589520931244, + "rewards/accuracy_reward": 0.08928571455180645, + "rewards/format_reward": 0.7053571343421936, + "rewards/tag_count_reward": 2.462122678756714, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 1944.6608276367188, + "epoch": 0.9632, + "grad_norm": 0.1987132877111435, + "kl": 0.006317138671875, + "learning_rate": 1.037142143846905e-07, + "loss": 0.0106, + "num_tokens": 145617016.0, + "reward": 3.407961368560791, + "reward_std": 1.3827595710754395, + "rewards/accuracy_reward": 0.0714285746216774, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.568675398826599, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 1934.821533203125, + "epoch": 0.9648, + "grad_norm": 0.17651954293251038, + "kl": 0.0059051513671875, + "learning_rate": 1.0339865876108561e-07, + "loss": -0.003, + "num_tokens": 145852070.0, + "reward": 3.6048834323883057, + "reward_std": 1.429735004901886, + "rewards/accuracy_reward": 0.14285714365541935, + "rewards/format_reward": 0.767857164144516, + "rewards/tag_count_reward": 2.694169044494629, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 1889.357177734375, + "epoch": 0.9664, + "grad_norm": 0.19650956988334656, + "kl": 0.006195068359375, + "learning_rate": 1.0309705865693042e-07, + "loss": -0.0222, + "num_tokens": 146084804.0, + "reward": 3.1590899229049683, + "reward_std": 1.2867639660835266, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.3376612067222595, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 2015.65185546875, + "epoch": 0.968, + "grad_norm": 0.1821531057357788, + "kl": 0.0056915283203125, + "learning_rate": 1.0280942349670642e-07, + "loss": 0.0056, + "num_tokens": 146331767.0, + "reward": 3.34850013256073, + "reward_std": 1.3253604173660278, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.5270715951919556, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 1970.0982666015625, + "epoch": 0.9696, + "grad_norm": 0.1977119892835617, + "kl": 0.0056915283203125, + "learning_rate": 1.0253576226851488e-07, + "loss": 0.0193, + "num_tokens": 146573432.0, + "reward": 3.1449300050735474, + "reward_std": 1.265475332736969, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.3502869606018066, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.0536499023438, + "epoch": 0.9712, + "grad_norm": 0.20502740144729614, + "kl": 0.0055999755859375, + "learning_rate": 1.022760835237959e-07, + "loss": 0.001, + "num_tokens": 146825774.0, + "reward": 3.040839195251465, + "reward_std": 1.4972584247589111, + "rewards/accuracy_reward": 0.0535714291036129, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.210481882095337, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.2679443359375, + "epoch": 0.9728, + "grad_norm": 0.17513740062713623, + "kl": 0.005523681640625, + "learning_rate": 1.0203039537706106e-07, + "loss": 0.0047, + "num_tokens": 147070300.0, + "reward": 3.0925419330596924, + "reward_std": 1.4447033405303955, + "rewards/accuracy_reward": 0.0625, + "rewards/format_reward": 0.6964285671710968, + "rewards/tag_count_reward": 2.3336132764816284, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 1942.634033203125, + "epoch": 0.9744, + "grad_norm": 0.19008903205394745, + "kl": 0.0063323974609375, + "learning_rate": 1.0179870550563994e-07, + "loss": -0.0005, + "num_tokens": 147305347.0, + "reward": 3.2044488191604614, + "reward_std": 1.3913370966911316, + "rewards/accuracy_reward": 0.1160714253783226, + "rewards/format_reward": 0.7410714030265808, + "rewards/tag_count_reward": 2.3473058938980103, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 2022.0536499023438, + "epoch": 0.976, + "grad_norm": 0.19450171291828156, + "kl": 0.005950927734375, + "learning_rate": 1.0158102114944037e-07, + "loss": 0.0003, + "num_tokens": 147550843.0, + "reward": 3.1200013160705566, + "reward_std": 1.4148624539375305, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.387858033180237, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 1998.3928833007812, + "epoch": 0.9776, + "grad_norm": 0.19535484910011292, + "kl": 0.0057373046875, + "learning_rate": 1.0137734911072195e-07, + "loss": 0.0044, + "num_tokens": 147796923.0, + "reward": 3.0610857009887695, + "reward_std": 1.408657193183899, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7321428656578064, + "rewards/tag_count_reward": 2.2307283878326416, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.0982666015625, + "epoch": 0.9792, + "grad_norm": 0.17270007729530334, + "kl": 0.005645751953125, + "learning_rate": 1.0118769575388354e-07, + "loss": -0.0006, + "num_tokens": 148046806.0, + "reward": 3.226860761642456, + "reward_std": 1.2183409929275513, + "rewards/accuracy_reward": 0.0357142873108387, + "rewards/format_reward": 0.7857142984867096, + "rewards/tag_count_reward": 2.405431866645813, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 1936.7144165039062, + "epoch": 0.9808, + "grad_norm": 0.2173808515071869, + "kl": 0.0062408447265625, + "learning_rate": 1.0101206700526457e-07, + "loss": 0.0267, + "num_tokens": 148283010.0, + "reward": 3.6556992530822754, + "reward_std": 1.367648959159851, + "rewards/accuracy_reward": 0.1875, + "rewards/format_reward": 0.8214285671710968, + "rewards/tag_count_reward": 2.646770477294922, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 1992.1250610351562, + "epoch": 0.9824, + "grad_norm": 0.19206741452217102, + "kl": 0.0055389404296875, + "learning_rate": 1.0085046835295959e-07, + "loss": 0.0108, + "num_tokens": 148526162.0, + "reward": 2.975595712661743, + "reward_std": 1.2686077952384949, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8125, + "rewards/tag_count_reward": 2.1452386379241943, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 2046.4285888671875, + "epoch": 0.984, + "grad_norm": 0.1944650262594223, + "kl": 0.0061492919921875, + "learning_rate": 1.0070290484664712e-07, + "loss": 0.0012, + "num_tokens": 148775088.0, + "reward": 3.110299825668335, + "reward_std": 1.3647432923316956, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7410714328289032, + "rewards/tag_count_reward": 2.36922824382782, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 2014.8929443359375, + "epoch": 0.9856, + "grad_norm": 0.18487314879894257, + "kl": 0.0048065185546875, + "learning_rate": 1.005693810974313e-07, + "loss": 0.0042, + "num_tokens": 149023492.0, + "reward": 3.0254071950912476, + "reward_std": 1.1849504113197327, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.248621344566345, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 2012.90185546875, + "epoch": 0.9872, + "grad_norm": 0.19127902388572693, + "kl": 0.006378173828125, + "learning_rate": 1.0044990127769852e-07, + "loss": 0.01, + "num_tokens": 149266885.0, + "reward": 3.3402512073516846, + "reward_std": 1.4714823365211487, + "rewards/accuracy_reward": 0.008928571827709675, + "rewards/format_reward": 0.7857142686843872, + "rewards/tag_count_reward": 2.5456082820892334, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 1953.0358276367188, + "epoch": 0.9888, + "grad_norm": 0.17758721113204956, + "kl": 0.0058441162109375, + "learning_rate": 1.0034446912098636e-07, + "loss": 0.0102, + "num_tokens": 149504175.0, + "reward": 3.5070276260375977, + "reward_std": 1.2196210622787476, + "rewards/accuracy_reward": 0.12499999720603228, + "rewards/format_reward": 0.7589285671710968, + "rewards/tag_count_reward": 2.623098850250244, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 2036.96435546875, + "epoch": 0.9904, + "grad_norm": 0.18733817338943481, + "kl": 0.00531005859375, + "learning_rate": 1.0025308792186744e-07, + "loss": 0.0025, + "num_tokens": 149757837.0, + "reward": 2.9232553243637085, + "reward_std": 1.6057178974151611, + "rewards/accuracy_reward": 0.1071428582072258, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 2.0928980112075806, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 1799.5983276367188, + "epoch": 0.992, + "grad_norm": 0.18921758234500885, + "kl": 0.006103515625, + "learning_rate": 1.001757605358462e-07, + "loss": 0.0115, + "num_tokens": 149979090.0, + "reward": 3.4840941429138184, + "reward_std": 1.3042045831680298, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7767857015132904, + "rewards/tag_count_reward": 2.6090937852859497, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 1995.134033203125, + "epoch": 0.9936, + "grad_norm": 0.2008657306432724, + "kl": 0.0064239501953125, + "learning_rate": 1.001124893792696e-07, + "loss": 0.0031, + "num_tokens": 150222789.0, + "reward": 3.454424262046814, + "reward_std": 1.4247412085533142, + "rewards/accuracy_reward": 0.01785714365541935, + "rewards/format_reward": 0.8035714328289032, + "rewards/tag_count_reward": 2.63299560546875, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 2048.0, + "epoch": 0.9952, + "grad_norm": 0.19542710483074188, + "kl": 0.0054931640625, + "learning_rate": 1.0006327642925186e-07, + "loss": 0.0002, + "num_tokens": 150474957.0, + "reward": 2.5968751907348633, + "reward_std": 1.4759308099746704, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.7232142984867096, + "rewards/tag_count_reward": 1.8736608624458313, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 1939.2322387695312, + "epoch": 0.9968, + "grad_norm": 0.19268657267093658, + "kl": 0.0054779052734375, + "learning_rate": 1.0002812322361265e-07, + "loss": -0.0042, + "num_tokens": 150711093.0, + "reward": 3.298256754875183, + "reward_std": 1.1486346125602722, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.8392857313156128, + "rewards/tag_count_reward": 2.4589709043502808, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 1992.4375610351562, + "epoch": 0.9984, + "grad_norm": 0.2007443904876709, + "kl": 0.0061187744140625, + "learning_rate": 1.0000703086082874e-07, + "loss": 0.0092, + "num_tokens": 150957584.0, + "reward": 3.2020119428634644, + "reward_std": 1.3203019499778748, + "rewards/accuracy_reward": 0.0982142835855484, + "rewards/format_reward": 0.7678571343421936, + "rewards/tag_count_reward": 2.335940361022949, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 1842.03125, + "epoch": 1.0, + "grad_norm": 0.19545716047286987, + "kl": 0.0063323974609375, + "learning_rate": 1e-07, + "loss": 0.0185, + "num_tokens": 151178972.0, + "reward": 3.9977399110794067, + "reward_std": 1.256572186946869, + "rewards/accuracy_reward": 0.1696428544819355, + "rewards/format_reward": 0.8482142984867096, + "rewards/tag_count_reward": 2.9798827171325684, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 0.0, + "train_loss": 0.0038764608660159863, + "train_runtime": 163789.7991, + "train_samples_per_second": 0.031, + "train_steps_per_second": 0.004 + } + ], + "logging_steps": 1, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 30, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}