{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages": -2.60770320892334e-08, "completion_length": 256.0, "epoch": 0.001, "grad_norm": 3.7380807399749756, "kl": 0.0, "learning_rate": 9.989999999999999e-07, "loss": 0.0637, "reward": 0.7604166865348816, "reward_mean": 0.7604166865348816, "reward_std": 0.42027419805526733, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 0.25, "step": 1 }, { "advantages": 0.0, "completion_length": 201.0625, "epoch": 0.002, "grad_norm": 5.145930290222168, "kl": 0.00118255615234375, "learning_rate": 9.98e-07, "loss": -0.0282, "reward": 0.7708333730697632, "reward_mean": 0.7708333730697632, "reward_std": 0.7378304600715637, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.5, "step": 2 }, { "advantages": 0.0, "completion_length": 232.0, "epoch": 0.003, "grad_norm": 3.798980474472046, "kl": 0.003448486328125, "learning_rate": 9.97e-07, "loss": 0.0955, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.7253239154815674, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.8125, "step": 3 }, { "advantages": 1.862645149230957e-08, "completion_length": 245.25, "epoch": 0.004, "grad_norm": 4.136316299438477, "kl": 0.00421142578125, "learning_rate": 9.959999999999999e-07, "loss": 0.0824, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.598172664642334, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.8125, "step": 4 }, { "advantages": -1.4901161193847656e-08, "completion_length": 229.5, "epoch": 0.005, "grad_norm": 3.53371000289917, "kl": 0.00439453125, "learning_rate": 9.95e-07, "loss": 0.0099, "reward": 1.21875, "reward_mean": 1.21875, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 1.0, "step": 5 }, { "advantages": -5.960464477539063e-08, "completion_length": 191.125, "epoch": 0.006, "grad_norm": 4.162847518920898, "kl": 0.00921630859375, "learning_rate": 9.94e-07, "loss": 0.0321, "reward": 1.3020833730697632, "reward_mean": 1.3020833730697632, "reward_std": 0.41478484869003296, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 0.875, "step": 6 }, { "advantages": -1.4901161193847656e-08, "completion_length": 212.5, "epoch": 0.007, "grad_norm": 3.7386105060577393, "kl": 0.01312255859375, "learning_rate": 9.929999999999999e-07, "loss": 0.0328, "reward": 1.1041667461395264, "reward_mean": 1.1041667461395264, "reward_std": 0.349293053150177, "rewards/accuracy_reward": 0.2291666865348816, "rewards/format_reward": 0.875, "step": 7 }, { "advantages": -1.1920928955078125e-07, "completion_length": 229.5625, "epoch": 0.008, "grad_norm": 3.4274792671203613, "kl": 0.01171875, "learning_rate": 9.92e-07, "loss": -0.0102, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.28498581051826477, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 8 }, { "advantages": 0.0, "completion_length": 187.4375, "epoch": 0.009, "grad_norm": 5.677432537078857, "kl": 0.01123046875, "learning_rate": 9.91e-07, "loss": 0.1596, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.4972116947174072, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9375, "step": 9 }, { "advantages": -1.4901161193847656e-08, "completion_length": 196.25, "epoch": 0.01, "grad_norm": 4.712809085845947, "kl": 0.0247802734375, "learning_rate": 9.9e-07, "loss": -0.1289, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.37918925285339355, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 10 }, { "advantages": 0.0, "completion_length": 116.0625, "epoch": 0.011, "grad_norm": 4.270755767822266, "kl": 0.0751953125, "learning_rate": 9.89e-07, "loss": -0.0513, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 11 }, { "advantages": 7.450580596923828e-09, "completion_length": 170.0625, "epoch": 0.012, "grad_norm": 3.437450408935547, "kl": 0.04638671875, "learning_rate": 9.88e-07, "loss": -0.0187, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 12 }, { "advantages": 0.0, "completion_length": 86.5, "epoch": 0.013, "grad_norm": 4.844762802124023, "kl": 0.0419921875, "learning_rate": 9.87e-07, "loss": 0.0026, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 13 }, { "advantages": -6.332993507385254e-08, "completion_length": 186.875, "epoch": 0.014, "grad_norm": 4.118823528289795, "kl": 0.04638671875, "learning_rate": 9.86e-07, "loss": -0.0292, "reward": 1.5833333730697632, "reward_mean": 1.5833333730697632, "reward_std": 0.32946425676345825, "rewards/accuracy_reward": 0.5833333730697632, "rewards/format_reward": 1.0, "step": 14 }, { "advantages": 1.4901161193847656e-08, "completion_length": 95.75, "epoch": 0.015, "grad_norm": 4.095740795135498, "kl": 0.0703125, "learning_rate": 9.849999999999999e-07, "loss": 0.1122, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 15 }, { "advantages": -9.313225746154785e-08, "completion_length": 136.9375, "epoch": 0.016, "grad_norm": 5.639898777008057, "kl": 0.0859375, "learning_rate": 9.84e-07, "loss": -0.0948, "reward": 1.5833333730697632, "reward_mean": 1.5833333730697632, "reward_std": 0.3827785551548004, "rewards/accuracy_reward": 0.5833333134651184, "rewards/format_reward": 1.0, "step": 16 }, { "advantages": 2.2351741790771484e-08, "completion_length": 134.1875, "epoch": 0.017, "grad_norm": 5.9321980476379395, "kl": 0.0654296875, "learning_rate": 9.83e-07, "loss": -0.126, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.47921282052993774, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 17 }, { "advantages": 5.21540641784668e-08, "completion_length": 124.375, "epoch": 0.018, "grad_norm": 7.639815807342529, "kl": 0.052734375, "learning_rate": 9.819999999999999e-07, "loss": 0.0134, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.5096293687820435, "rewards/accuracy_reward": 0.3125000298023224, "rewards/format_reward": 0.9375, "step": 18 }, { "advantages": 1.4901161193847656e-08, "completion_length": 160.6875, "epoch": 0.019, "grad_norm": 4.792241096496582, "kl": 0.064453125, "learning_rate": 9.81e-07, "loss": 0.1099, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 19 }, { "advantages": 1.4901161193847656e-08, "completion_length": 161.0625, "epoch": 0.02, "grad_norm": 3.4121909141540527, "kl": 0.064453125, "learning_rate": 9.8e-07, "loss": 0.0186, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 20 }, { "advantages": -2.60770320892334e-07, "completion_length": 204.0625, "epoch": 0.021, "grad_norm": 3.9842889308929443, "kl": 0.0703125, "learning_rate": 9.789999999999999e-07, "loss": -0.0411, "reward": 1.4166667461395264, "reward_mean": 1.4166667461395264, "reward_std": 0.18292954564094543, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "step": 21 }, { "advantages": 0.0, "completion_length": 140.875, "epoch": 0.022, "grad_norm": 6.302048206329346, "kl": 0.050537109375, "learning_rate": 9.78e-07, "loss": 0.1697, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 22 }, { "advantages": -7.450580596923828e-09, "completion_length": 111.5625, "epoch": 0.023, "grad_norm": 8.212870597839355, "kl": 0.1103515625, "learning_rate": 9.77e-07, "loss": 0.1932, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.44403791427612305, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 23 }, { "advantages": -9.685754776000977e-08, "completion_length": 180.25, "epoch": 0.024, "grad_norm": 4.934231758117676, "kl": 0.0791015625, "learning_rate": 9.759999999999998e-07, "loss": 0.2501, "reward": 1.4270833730697632, "reward_mean": 1.4270833730697632, "reward_std": 0.31544241309165955, "rewards/accuracy_reward": 0.4270833432674408, "rewards/format_reward": 1.0, "step": 24 }, { "advantages": -7.82310962677002e-08, "completion_length": 117.625, "epoch": 0.025, "grad_norm": 6.088715076446533, "kl": 0.126953125, "learning_rate": 9.75e-07, "loss": -0.0576, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.36558622121810913, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 25 }, { "advantages": 7.450580596923828e-09, "completion_length": 93.5625, "epoch": 0.026, "grad_norm": 7.816601753234863, "kl": 0.1103515625, "learning_rate": 9.74e-07, "loss": 0.1863, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 26 }, { "advantages": -4.842877388000488e-08, "completion_length": 150.6875, "epoch": 0.027, "grad_norm": 5.928228378295898, "kl": 0.162109375, "learning_rate": 9.729999999999998e-07, "loss": -0.2134, "reward": 1.3645833730697632, "reward_mean": 1.3645833730697632, "reward_std": 0.28207486867904663, "rewards/accuracy_reward": 0.4270833730697632, "rewards/format_reward": 0.9375, "step": 27 }, { "advantages": -2.2351741790771484e-08, "completion_length": 98.0, "epoch": 0.028, "grad_norm": 6.595263481140137, "kl": 0.1005859375, "learning_rate": 9.72e-07, "loss": -0.0471, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2925041913986206, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 28 }, { "advantages": 1.862645149230957e-08, "completion_length": 119.75, "epoch": 0.029, "grad_norm": 4.980852127075195, "kl": 0.1279296875, "learning_rate": 9.709999999999999e-07, "loss": 0.0662, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "step": 29 }, { "advantages": 7.078051567077637e-08, "completion_length": 112.875, "epoch": 0.03, "grad_norm": 6.368246555328369, "kl": 0.166015625, "learning_rate": 9.7e-07, "loss": 0.0972, "reward": 1.5729167461395264, "reward_mean": 1.5729167461395264, "reward_std": 0.27226415276527405, "rewards/accuracy_reward": 0.5729166865348816, "rewards/format_reward": 1.0, "step": 30 }, { "advantages": -7.450580596923828e-09, "completion_length": 109.4375, "epoch": 0.031, "grad_norm": 3.693007469177246, "kl": 0.1259765625, "learning_rate": 9.69e-07, "loss": -0.0644, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 31 }, { "advantages": 2.9802322387695312e-08, "completion_length": 99.0, "epoch": 0.032, "grad_norm": 7.649048805236816, "kl": 0.150390625, "learning_rate": 9.679999999999999e-07, "loss": -0.0374, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.41912031173706055, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 32 }, { "advantages": 1.0803341865539551e-07, "completion_length": 107.25, "epoch": 0.033, "grad_norm": 7.077078819274902, "kl": 0.1416015625, "learning_rate": 9.67e-07, "loss": -0.0125, "reward": 1.4791667461395264, "reward_mean": 1.4791667461395264, "reward_std": 0.27867573499679565, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 1.0, "step": 33 }, { "advantages": -7.450580596923828e-09, "completion_length": 102.875, "epoch": 0.034, "grad_norm": 7.0495991706848145, "kl": 0.125, "learning_rate": 9.66e-07, "loss": -0.1118, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 34 }, { "advantages": -2.2351741790771484e-08, "completion_length": 102.5625, "epoch": 0.035, "grad_norm": 6.116304874420166, "kl": 0.1328125, "learning_rate": 9.649999999999999e-07, "loss": -0.023, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.4972116947174072, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.9375, "step": 35 }, { "advantages": -1.4901161193847656e-08, "completion_length": 121.9375, "epoch": 0.036, "grad_norm": 5.6247453689575195, "kl": 0.25, "learning_rate": 9.64e-07, "loss": 0.0057, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.5260357856750488, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 36 }, { "advantages": -1.862645149230957e-08, "completion_length": 114.0, "epoch": 0.037, "grad_norm": 5.942628860473633, "kl": 0.0859375, "learning_rate": 9.63e-07, "loss": 0.0132, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 37 }, { "advantages": -1.862645149230957e-08, "completion_length": 120.9375, "epoch": 0.038, "grad_norm": 6.8025312423706055, "kl": 0.09375, "learning_rate": 9.619999999999999e-07, "loss": 0.298, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 38 }, { "advantages": -3.725290298461914e-09, "completion_length": 95.0, "epoch": 0.039, "grad_norm": 4.515552520751953, "kl": 0.05712890625, "learning_rate": 9.61e-07, "loss": -0.0356, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.12400396168231964, "rewards/accuracy_reward": 0.7291666269302368, "rewards/format_reward": 1.0, "step": 39 }, { "advantages": -1.6391277313232422e-07, "completion_length": 115.6875, "epoch": 0.04, "grad_norm": 3.725029706954956, "kl": 0.0947265625, "learning_rate": 9.6e-07, "loss": 0.0472, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.08908708393573761, "rewards/accuracy_reward": 0.3333333730697632, "rewards/format_reward": 1.0, "step": 40 }, { "advantages": 0.0, "completion_length": 108.125, "epoch": 0.041, "grad_norm": 4.70515775680542, "kl": 0.11328125, "learning_rate": 9.589999999999998e-07, "loss": 0.0818, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.08908706903457642, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 41 }, { "advantages": 1.862645149230957e-07, "completion_length": 116.1875, "epoch": 0.042, "grad_norm": 6.178482532501221, "kl": 0.125, "learning_rate": 9.58e-07, "loss": -0.0352, "reward": 1.4166667461395264, "reward_mean": 1.4166667461395264, "reward_std": 0.18292953073978424, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "step": 42 }, { "advantages": -1.4901161193847656e-08, "completion_length": 114.5625, "epoch": 0.043, "grad_norm": 3.4989614486694336, "kl": 0.087890625, "learning_rate": 9.57e-07, "loss": -0.0562, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 43 }, { "advantages": -5.960464477539063e-08, "completion_length": 111.4375, "epoch": 0.044, "grad_norm": 5.46613883972168, "kl": 0.11181640625, "learning_rate": 9.559999999999998e-07, "loss": 0.0258, "reward": 1.4895833730697632, "reward_mean": 1.4895833730697632, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.4895833730697632, "rewards/format_reward": 1.0, "step": 44 }, { "advantages": -1.4901161193847656e-08, "completion_length": 145.5625, "epoch": 0.045, "grad_norm": 4.604372501373291, "kl": 0.140625, "learning_rate": 9.55e-07, "loss": -0.1122, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 45 }, { "advantages": -1.0803341865539551e-07, "completion_length": 133.6875, "epoch": 0.046, "grad_norm": 5.48823881149292, "kl": 0.08935546875, "learning_rate": 9.539999999999999e-07, "loss": 0.0234, "reward": 1.5416667461395264, "reward_mean": 1.5416667461395264, "reward_std": 0.24800795316696167, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "step": 46 }, { "advantages": -7.450580596923828e-09, "completion_length": 113.8125, "epoch": 0.047, "grad_norm": 3.9073755741119385, "kl": 0.12451171875, "learning_rate": 9.529999999999999e-07, "loss": 0.0176, "reward": 1.4583333730697632, "reward_mean": 1.4583333730697632, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.4583333730697632, "rewards/format_reward": 1.0, "step": 47 }, { "advantages": 0.0, "completion_length": 145.6875, "epoch": 0.048, "grad_norm": 5.332810878753662, "kl": 0.08740234375, "learning_rate": 9.52e-07, "loss": -0.0314, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.22201895713806152, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 48 }, { "advantages": -1.4901161193847656e-08, "completion_length": 126.875, "epoch": 0.049, "grad_norm": 5.358933925628662, "kl": 0.150390625, "learning_rate": 9.509999999999999e-07, "loss": 0.1055, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 49 }, { "advantages": 0.0, "completion_length": 138.875, "epoch": 0.05, "grad_norm": 5.692139625549316, "kl": 0.0859375, "learning_rate": 9.499999999999999e-07, "loss": -0.1248, "reward": 1.4479167461395264, "reward_mean": 1.4479167461395264, "reward_std": 0.17747542262077332, "rewards/accuracy_reward": 0.4479166865348816, "rewards/format_reward": 1.0, "step": 50 }, { "advantages": 2.60770320892334e-08, "completion_length": 146.875, "epoch": 0.051, "grad_norm": 5.381588459014893, "kl": 0.08935546875, "learning_rate": 9.489999999999999e-07, "loss": -0.129, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.41746097803115845, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 51 }, { "advantages": 2.2351741790771484e-08, "completion_length": 150.875, "epoch": 0.052, "grad_norm": 5.1832451820373535, "kl": 0.083984375, "learning_rate": 9.479999999999999e-07, "loss": -0.0529, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.2925041913986206, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 52 }, { "advantages": 0.0, "completion_length": 124.8125, "epoch": 0.053, "grad_norm": 0.0, "kl": 0.1689453125, "learning_rate": 9.469999999999999e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 53 }, { "advantages": -2.9802322387695312e-08, "completion_length": 126.0, "epoch": 0.054, "grad_norm": 6.027964115142822, "kl": 0.07421875, "learning_rate": 9.459999999999999e-07, "loss": 0.1005, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 54 }, { "advantages": -1.4901161193847656e-08, "completion_length": 144.75, "epoch": 0.055, "grad_norm": 4.921864986419678, "kl": 0.07568359375, "learning_rate": 9.45e-07, "loss": 0.0249, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.3047097325325012, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 55 }, { "advantages": -1.862645149230957e-08, "completion_length": 125.125, "epoch": 0.056, "grad_norm": 5.077033042907715, "kl": 0.0673828125, "learning_rate": 9.439999999999999e-07, "loss": -0.045, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.5260357856750488, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 56 }, { "advantages": -1.862645149230957e-08, "completion_length": 131.625, "epoch": 0.057, "grad_norm": 4.984986782073975, "kl": 0.1015625, "learning_rate": 9.429999999999999e-07, "loss": -0.0956, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 57 }, { "advantages": 4.470348358154297e-08, "completion_length": 175.25, "epoch": 0.058, "grad_norm": 3.103456974029541, "kl": 0.0810546875, "learning_rate": 9.419999999999999e-07, "loss": -0.0034, "reward": 1.2291667461395264, "reward_mean": 1.2291667461395264, "reward_std": 0.12400396913290024, "rewards/accuracy_reward": 0.2291666865348816, "rewards/format_reward": 1.0, "step": 58 }, { "advantages": 1.862645149230957e-08, "completion_length": 147.25, "epoch": 0.059, "grad_norm": 4.62039852142334, "kl": 0.07763671875, "learning_rate": 9.409999999999999e-07, "loss": 0.1301, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.37714511156082153, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 59 }, { "advantages": 1.4901161193847656e-08, "completion_length": 167.25, "epoch": 0.06, "grad_norm": 4.27726411819458, "kl": 0.111328125, "learning_rate": 9.399999999999999e-07, "loss": -0.0044, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 60 }, { "advantages": 0.0, "completion_length": 147.25, "epoch": 0.061, "grad_norm": 4.738762855529785, "kl": 0.087890625, "learning_rate": 9.389999999999999e-07, "loss": -0.0094, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.40089184045791626, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 61 }, { "advantages": 1.862645149230957e-08, "completion_length": 163.75, "epoch": 0.062, "grad_norm": 4.51692008972168, "kl": 0.07421875, "learning_rate": 9.379999999999998e-07, "loss": 0.1029, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.48037588596343994, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 62 }, { "advantages": 0.0, "completion_length": 128.5625, "epoch": 0.063, "grad_norm": 3.7537429332733154, "kl": 0.06396484375, "learning_rate": 9.37e-07, "loss": 0.0021, "reward": 1.21875, "reward_mean": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 1.0, "step": 63 }, { "advantages": -8.195638656616211e-08, "completion_length": 183.0625, "epoch": 0.064, "grad_norm": 4.70877742767334, "kl": 0.0732421875, "learning_rate": 9.36e-07, "loss": -0.0168, "reward": 1.1041667461395264, "reward_mean": 1.1041667461395264, "reward_std": 0.25392839312553406, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 1.0, "step": 64 }, { "advantages": 7.450580596923828e-09, "completion_length": 169.0625, "epoch": 0.065, "grad_norm": 2.69047212600708, "kl": 0.0927734375, "learning_rate": 9.35e-07, "loss": -0.0626, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.9375, "step": 65 }, { "advantages": -1.862645149230957e-08, "completion_length": 125.9375, "epoch": 0.066, "grad_norm": 5.199371814727783, "kl": 0.109375, "learning_rate": 9.34e-07, "loss": 0.0566, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 66 }, { "advantages": 7.078051567077637e-08, "completion_length": 161.5, "epoch": 0.067, "grad_norm": 4.959042549133301, "kl": 0.1240234375, "learning_rate": 9.33e-07, "loss": -0.0491, "reward": 1.5416667461395264, "reward_mean": 1.5416667461395264, "reward_std": 0.20198571681976318, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "step": 67 }, { "advantages": 0.0, "completion_length": 131.1875, "epoch": 0.068, "grad_norm": 0.0, "kl": 0.064453125, "learning_rate": 9.32e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 68 }, { "advantages": -5.587935447692871e-08, "completion_length": 149.3125, "epoch": 0.069, "grad_norm": 5.306145668029785, "kl": 0.0908203125, "learning_rate": 9.31e-07, "loss": 0.1358, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.384762704372406, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 1.0, "step": 69 }, { "advantages": -1.4901161193847656e-08, "completion_length": 158.8125, "epoch": 0.07, "grad_norm": 4.328370571136475, "kl": 0.107421875, "learning_rate": 9.3e-07, "loss": -0.0507, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.3204349875450134, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 70 }, { "advantages": -2.2351741790771484e-08, "completion_length": 130.5, "epoch": 0.071, "grad_norm": 5.123632431030273, "kl": 0.1259765625, "learning_rate": 9.29e-07, "loss": -0.0134, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 71 }, { "advantages": 2.9802322387695312e-08, "completion_length": 180.6875, "epoch": 0.072, "grad_norm": 2.0722601413726807, "kl": 0.0810546875, "learning_rate": 9.28e-07, "loss": -0.02, "reward": 1.4791667461395264, "reward_mean": 1.4791667461395264, "reward_std": 0.15268757939338684, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 1.0, "step": 72 }, { "advantages": 1.4901161193847656e-08, "completion_length": 138.25, "epoch": 0.073, "grad_norm": 6.854410648345947, "kl": 0.1533203125, "learning_rate": 9.27e-07, "loss": -0.1078, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.3369941711425781, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 73 }, { "advantages": 1.564621925354004e-07, "completion_length": 159.75, "epoch": 0.074, "grad_norm": 5.237288951873779, "kl": 0.11328125, "learning_rate": 9.26e-07, "loss": -0.0132, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2658637762069702, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 74 }, { "advantages": -1.0803341865539551e-07, "completion_length": 158.4375, "epoch": 0.075, "grad_norm": 4.552402496337891, "kl": 0.11328125, "learning_rate": 9.25e-07, "loss": 0.078, "reward": 1.5520833730697632, "reward_mean": 1.5520833730697632, "reward_std": 0.20653896033763885, "rewards/accuracy_reward": 0.5520833730697632, "rewards/format_reward": 1.0, "step": 75 }, { "advantages": -1.4901161193847656e-08, "completion_length": 172.8125, "epoch": 0.076, "grad_norm": 4.742386817932129, "kl": 0.146484375, "learning_rate": 9.24e-07, "loss": -0.016, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.3198433816432953, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 76 }, { "advantages": -7.450580596923828e-09, "completion_length": 125.3125, "epoch": 0.077, "grad_norm": 3.670785903930664, "kl": 0.10791015625, "learning_rate": 9.23e-07, "loss": -0.0209, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 77 }, { "advantages": 7.450580596923828e-09, "completion_length": 132.4375, "epoch": 0.078, "grad_norm": 3.141366958618164, "kl": 0.17578125, "learning_rate": 9.22e-07, "loss": 0.0153, "reward": 1.28125, "reward_mean": 1.28125, "reward_std": 0.1602174937725067, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 78 }, { "advantages": -7.450580596923828e-09, "completion_length": 155.0, "epoch": 0.079, "grad_norm": 4.80902099609375, "kl": 0.16796875, "learning_rate": 9.21e-07, "loss": 0.013, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.44403791427612305, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 79 }, { "advantages": 5.587935447692871e-08, "completion_length": 151.1875, "epoch": 0.08, "grad_norm": 2.728870391845703, "kl": 0.150390625, "learning_rate": 9.2e-07, "loss": -0.0162, "reward": 1.2291667461395264, "reward_mean": 1.2291667461395264, "reward_std": 0.08625819534063339, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 1.0, "step": 80 }, { "advantages": 0.0, "completion_length": 135.375, "epoch": 0.081, "grad_norm": 0.0, "kl": 0.140625, "learning_rate": 9.19e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 81 }, { "advantages": -1.4901161193847656e-08, "completion_length": 131.0625, "epoch": 0.082, "grad_norm": 5.101280212402344, "kl": 0.119140625, "learning_rate": 9.18e-07, "loss": 0.0076, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 82 }, { "advantages": 0.0, "completion_length": 149.8125, "epoch": 0.083, "grad_norm": 6.299161911010742, "kl": 0.1328125, "learning_rate": 9.17e-07, "loss": 0.1103, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.5430608987808228, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.9375, "step": 83 }, { "advantages": 0.0, "completion_length": 158.8125, "epoch": 0.084, "grad_norm": 5.345361232757568, "kl": 0.16015625, "learning_rate": 9.16e-07, "loss": -0.0678, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 84 }, { "advantages": 8.940696716308594e-08, "completion_length": 147.0, "epoch": 0.085, "grad_norm": 4.728163719177246, "kl": 0.1328125, "learning_rate": 9.15e-07, "loss": 0.0498, "reward": 1.5104167461395264, "reward_mean": 1.5104167461395264, "reward_std": 0.16554003953933716, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 1.0, "step": 85 }, { "advantages": 1.862645149230957e-08, "completion_length": 135.4375, "epoch": 0.086, "grad_norm": 5.456924915313721, "kl": 0.146484375, "learning_rate": 9.14e-07, "loss": -0.0519, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.3924052119255066, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 86 }, { "advantages": -3.725290298461914e-09, "completion_length": 141.875, "epoch": 0.087, "grad_norm": 3.1715574264526367, "kl": 0.2578125, "learning_rate": 9.13e-07, "loss": 0.0425, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 87 }, { "advantages": 0.0, "completion_length": 136.25, "epoch": 0.088, "grad_norm": 6.198694705963135, "kl": 0.154296875, "learning_rate": 9.12e-07, "loss": 0.148, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "step": 88 }, { "advantages": -1.1175870895385742e-08, "completion_length": 150.3125, "epoch": 0.089, "grad_norm": 5.361752510070801, "kl": 0.244140625, "learning_rate": 9.109999999999999e-07, "loss": -0.1088, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 89 }, { "advantages": 1.2665987014770508e-07, "completion_length": 162.5625, "epoch": 0.09, "grad_norm": 3.3533167839050293, "kl": 0.15625, "learning_rate": 9.1e-07, "loss": 0.0269, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 90 }, { "advantages": -1.4901161193847656e-08, "completion_length": 154.125, "epoch": 0.091, "grad_norm": 4.257230281829834, "kl": 0.138671875, "learning_rate": 9.09e-07, "loss": -0.0494, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.3369941711425781, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 91 }, { "advantages": 5.587935447692871e-08, "completion_length": 153.5625, "epoch": 0.092, "grad_norm": 3.06853985786438, "kl": 0.146484375, "learning_rate": 9.08e-07, "loss": -0.0155, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.08625819534063339, "rewards/accuracy_reward": 0.7291666269302368, "rewards/format_reward": 1.0, "step": 92 }, { "advantages": 1.4901161193847656e-08, "completion_length": 173.5625, "epoch": 0.093, "grad_norm": 4.021603584289551, "kl": 0.150390625, "learning_rate": 9.07e-07, "loss": 0.1183, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.3369941711425781, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 93 }, { "advantages": 1.4901161193847656e-08, "completion_length": 137.125, "epoch": 0.094, "grad_norm": 3.569105625152588, "kl": 0.1728515625, "learning_rate": 9.06e-07, "loss": -0.0621, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 94 }, { "advantages": -3.3527612686157227e-08, "completion_length": 151.4375, "epoch": 0.095, "grad_norm": 4.879980564117432, "kl": 0.1416015625, "learning_rate": 9.05e-07, "loss": -0.0726, "reward": 1.3958333730697632, "reward_mean": 1.3958333730697632, "reward_std": 0.3177132308483124, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 1.0, "step": 95 }, { "advantages": 1.4901161193847656e-08, "completion_length": 138.9375, "epoch": 0.096, "grad_norm": 3.0653719902038574, "kl": 0.201171875, "learning_rate": 9.039999999999999e-07, "loss": 0.0793, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 96 }, { "advantages": -2.2351741790771484e-08, "completion_length": 165.0, "epoch": 0.097, "grad_norm": 5.2285027503967285, "kl": 0.17578125, "learning_rate": 9.03e-07, "loss": -0.0091, "reward": 1.5833333730697632, "reward_mean": 1.5833333730697632, "reward_std": 0.5487886071205139, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 0.875, "step": 97 }, { "advantages": -1.862645149230957e-08, "completion_length": 187.9375, "epoch": 0.098, "grad_norm": 5.323462963104248, "kl": 0.13671875, "learning_rate": 9.02e-07, "loss": 0.1548, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.3471629321575165, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 98 }, { "advantages": -1.043081283569336e-07, "completion_length": 203.3125, "epoch": 0.099, "grad_norm": 4.417811870574951, "kl": 0.1181640625, "learning_rate": 9.01e-07, "loss": -0.0588, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.21507522463798523, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 99 }, { "advantages": 0.0, "completion_length": 205.1875, "epoch": 0.1, "grad_norm": 5.388199329376221, "kl": 0.1484375, "learning_rate": 9e-07, "loss": 0.2401, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.2486058473587036, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 100 }, { "advantages": 0.0, "completion_length": 128.9375, "epoch": 0.101, "grad_norm": 0.0, "kl": 0.13671875, "learning_rate": 8.99e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 101 }, { "advantages": -1.0803341865539551e-07, "completion_length": 199.75, "epoch": 0.102, "grad_norm": 4.06512975692749, "kl": 0.12109375, "learning_rate": 8.98e-07, "loss": 0.0036, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.33592626452445984, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 102 }, { "advantages": 1.4901161193847656e-08, "completion_length": 177.625, "epoch": 0.103, "grad_norm": 4.752602577209473, "kl": 0.140625, "learning_rate": 8.969999999999999e-07, "loss": 0.0261, "reward": 1.4583333730697632, "reward_mean": 1.4583333730697632, "reward_std": 0.27215445041656494, "rewards/accuracy_reward": 0.4583333730697632, "rewards/format_reward": 1.0, "step": 103 }, { "advantages": -7.450580596923828e-09, "completion_length": 123.625, "epoch": 0.104, "grad_norm": 5.437667369842529, "kl": 0.177734375, "learning_rate": 8.96e-07, "loss": -0.0163, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 104 }, { "advantages": 1.1175870895385742e-08, "completion_length": 162.4375, "epoch": 0.105, "grad_norm": 5.383893013000488, "kl": 0.1123046875, "learning_rate": 8.95e-07, "loss": 0.0715, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 105 }, { "advantages": -1.30385160446167e-07, "completion_length": 178.75, "epoch": 0.106, "grad_norm": 4.805429935455322, "kl": 0.142578125, "learning_rate": 8.939999999999999e-07, "loss": -0.0543, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.1451837718486786, "rewards/accuracy_reward": 0.3750000298023224, "rewards/format_reward": 1.0, "step": 106 }, { "advantages": -7.450580596923828e-09, "completion_length": 170.75, "epoch": 0.107, "grad_norm": 2.5841424465179443, "kl": 0.28125, "learning_rate": 8.93e-07, "loss": -0.0829, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 107 }, { "advantages": -5.587935447692871e-08, "completion_length": 188.6875, "epoch": 0.108, "grad_norm": 4.707062244415283, "kl": 0.15234375, "learning_rate": 8.92e-07, "loss": -0.0264, "reward": 1.6458333730697632, "reward_mean": 1.6458333730697632, "reward_std": 0.31493228673934937, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 1.0, "step": 108 }, { "advantages": -7.450580596923828e-09, "completion_length": 190.9375, "epoch": 0.109, "grad_norm": 5.0554022789001465, "kl": 0.1396484375, "learning_rate": 8.91e-07, "loss": 0.0244, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 109 }, { "advantages": 9.685754776000977e-08, "completion_length": 186.0, "epoch": 0.11, "grad_norm": 5.242109298706055, "kl": 0.142578125, "learning_rate": 8.9e-07, "loss": -0.0693, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.25392839312553406, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 110 }, { "advantages": -7.450580596923828e-08, "completion_length": 176.25, "epoch": 0.111, "grad_norm": 4.1332621574401855, "kl": 0.1494140625, "learning_rate": 8.89e-07, "loss": -0.0321, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.12400396913290024, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 111 }, { "advantages": 3.725290298461914e-09, "completion_length": 144.4375, "epoch": 0.112, "grad_norm": 3.3191065788269043, "kl": 0.15234375, "learning_rate": 8.88e-07, "loss": -0.0225, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 112 }, { "advantages": 0.0, "completion_length": 176.5, "epoch": 0.113, "grad_norm": 4.919429302215576, "kl": 0.14453125, "learning_rate": 8.869999999999999e-07, "loss": 0.0628, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 113 }, { "advantages": 7.078051567077637e-08, "completion_length": 189.375, "epoch": 0.114, "grad_norm": 4.54962682723999, "kl": 0.14453125, "learning_rate": 8.86e-07, "loss": 0.0199, "reward": 1.6041667461395264, "reward_mean": 1.6041667461395264, "reward_std": 0.33592626452445984, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 1.0, "step": 114 }, { "advantages": 1.4901161193847656e-08, "completion_length": 199.25, "epoch": 0.115, "grad_norm": 3.2728166580200195, "kl": 0.1640625, "learning_rate": 8.85e-07, "loss": -0.0123, "reward": 1.3958333730697632, "reward_mean": 1.3958333730697632, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 1.0, "step": 115 }, { "advantages": 7.450580596923828e-09, "completion_length": 192.375, "epoch": 0.116, "grad_norm": 3.095080614089966, "kl": 0.146484375, "learning_rate": 8.839999999999999e-07, "loss": -0.0267, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 116 }, { "advantages": 3.3527612686157227e-08, "completion_length": 152.625, "epoch": 0.117, "grad_norm": 5.22807502746582, "kl": 0.1669921875, "learning_rate": 8.83e-07, "loss": -0.0066, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 117 }, { "advantages": 0.0, "completion_length": 155.375, "epoch": 0.118, "grad_norm": 0.0, "kl": 0.150390625, "learning_rate": 8.82e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 118 }, { "advantages": 0.0, "completion_length": 143.375, "epoch": 0.119, "grad_norm": 0.0, "kl": 0.1943359375, "learning_rate": 8.81e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 119 }, { "advantages": 3.725290298461914e-08, "completion_length": 164.5625, "epoch": 0.12, "grad_norm": 3.923374891281128, "kl": 0.162109375, "learning_rate": 8.799999999999999e-07, "loss": -0.0003, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.26346173882484436, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 120 }, { "advantages": 3.725290298461914e-09, "completion_length": 180.0625, "epoch": 0.121, "grad_norm": 4.817902565002441, "kl": 0.169921875, "learning_rate": 8.79e-07, "loss": -0.0816, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 121 }, { "advantages": -1.4901161193847656e-08, "completion_length": 159.5625, "epoch": 0.122, "grad_norm": 3.3247320652008057, "kl": 0.3046875, "learning_rate": 8.78e-07, "loss": 0.0201, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 122 }, { "advantages": 1.4901161193847656e-08, "completion_length": 114.375, "epoch": 0.123, "grad_norm": 4.4976091384887695, "kl": 0.2001953125, "learning_rate": 8.769999999999999e-07, "loss": -0.0023, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 123 }, { "advantages": -3.203749656677246e-07, "completion_length": 151.625, "epoch": 0.124, "grad_norm": 4.727357387542725, "kl": 0.162109375, "learning_rate": 8.76e-07, "loss": -0.1441, "reward": 1.5208333730697632, "reward_mean": 1.5208333730697632, "reward_std": 0.058925580233335495, "rewards/accuracy_reward": 0.5208333134651184, "rewards/format_reward": 1.0, "step": 124 }, { "advantages": -3.725290298461914e-09, "completion_length": 154.625, "epoch": 0.125, "grad_norm": 5.586273670196533, "kl": 0.212890625, "learning_rate": 8.75e-07, "loss": -0.0295, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 125 }, { "advantages": 7.450580596923828e-09, "completion_length": 210.4375, "epoch": 0.126, "grad_norm": 3.2797603607177734, "kl": 0.154296875, "learning_rate": 8.739999999999999e-07, "loss": -0.0005, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 126 }, { "advantages": 0.0, "completion_length": 149.25, "epoch": 0.127, "grad_norm": 0.0, "kl": 0.171875, "learning_rate": 8.729999999999999e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 127 }, { "advantages": 0.0, "completion_length": 142.9375, "epoch": 0.128, "grad_norm": 0.0, "kl": 0.2158203125, "learning_rate": 8.72e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 128 }, { "advantages": 1.0058283805847168e-07, "completion_length": 214.9375, "epoch": 0.129, "grad_norm": 4.753498554229736, "kl": 0.12890625, "learning_rate": 8.71e-07, "loss": 0.0282, "reward": 1.5208333730697632, "reward_mean": 1.5208333730697632, "reward_std": 0.2298392653465271, "rewards/accuracy_reward": 0.5208333730697632, "rewards/format_reward": 1.0, "step": 129 }, { "advantages": -3.725290298461914e-08, "completion_length": 169.9375, "epoch": 0.13, "grad_norm": 2.9244163036346436, "kl": 0.146484375, "learning_rate": 8.699999999999999e-07, "loss": -0.0707, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.08908706903457642, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 130 }, { "advantages": -1.4901161193847656e-08, "completion_length": 140.1875, "epoch": 0.131, "grad_norm": 3.525092840194702, "kl": 0.15625, "learning_rate": 8.69e-07, "loss": 0.0386, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 131 }, { "advantages": 3.725290298461914e-09, "completion_length": 150.75, "epoch": 0.132, "grad_norm": 3.3508222103118896, "kl": 0.150390625, "learning_rate": 8.68e-07, "loss": -0.0317, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 132 }, { "advantages": -1.2293457984924316e-07, "completion_length": 207.0625, "epoch": 0.133, "grad_norm": 4.877673625946045, "kl": 0.15625, "learning_rate": 8.669999999999999e-07, "loss": -0.0678, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.17251640558242798, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 133 }, { "advantages": 7.450580596923828e-09, "completion_length": 114.5625, "epoch": 0.134, "grad_norm": 4.1597580909729, "kl": 0.1904296875, "learning_rate": 8.659999999999999e-07, "loss": -0.0128, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 134 }, { "advantages": 0.0, "completion_length": 171.625, "epoch": 0.135, "grad_norm": 0.0, "kl": 0.16015625, "learning_rate": 8.65e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 135 }, { "advantages": -1.862645149230957e-08, "completion_length": 193.3125, "epoch": 0.136, "grad_norm": 3.460597276687622, "kl": 0.1650390625, "learning_rate": 8.639999999999999e-07, "loss": -0.0345, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 136 }, { "advantages": -7.450580596923828e-09, "completion_length": 164.75, "epoch": 0.137, "grad_norm": 3.3782408237457275, "kl": 0.16015625, "learning_rate": 8.629999999999999e-07, "loss": 0.0302, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 137 }, { "advantages": -1.4901161193847656e-08, "completion_length": 159.6875, "epoch": 0.138, "grad_norm": 6.104968547821045, "kl": 0.162109375, "learning_rate": 8.62e-07, "loss": 0.064, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 138 }, { "advantages": 0.0, "completion_length": 204.4375, "epoch": 0.139, "grad_norm": 4.3379902839660645, "kl": 0.1484375, "learning_rate": 8.61e-07, "loss": 0.0819, "reward": 1.4791667461395264, "reward_mean": 1.4791667461395264, "reward_std": 0.3759046792984009, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 1.0, "step": 139 }, { "advantages": -6.146728992462158e-08, "completion_length": 184.6875, "epoch": 0.14, "grad_norm": 2.6453442573547363, "kl": 0.1630859375, "learning_rate": 8.599999999999999e-07, "loss": -0.0203, "reward": 1.7708333730697632, "reward_mean": 1.7708333730697632, "reward_std": 0.19795583188533783, "rewards/accuracy_reward": 0.7708333730697632, "rewards/format_reward": 1.0, "step": 140 }, { "advantages": 0.0, "completion_length": 154.0625, "epoch": 0.141, "grad_norm": 3.7319183349609375, "kl": 0.15234375, "learning_rate": 8.59e-07, "loss": -0.011, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 141 }, { "advantages": 0.0, "completion_length": 148.25, "epoch": 0.142, "grad_norm": 0.0, "kl": 0.171875, "learning_rate": 8.58e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 142 }, { "advantages": 1.1175870895385742e-08, "completion_length": 149.1875, "epoch": 0.143, "grad_norm": 3.001418113708496, "kl": 0.1650390625, "learning_rate": 8.569999999999999e-07, "loss": -0.0812, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.15268756449222565, "rewards/accuracy_reward": 0.7291667461395264, "rewards/format_reward": 1.0, "step": 143 }, { "advantages": -1.4901161193847656e-08, "completion_length": 176.3125, "epoch": 0.144, "grad_norm": 4.027692794799805, "kl": 0.1669921875, "learning_rate": 8.559999999999999e-07, "loss": 0.037, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 144 }, { "advantages": 0.0, "completion_length": 156.0, "epoch": 0.145, "grad_norm": 0.0, "kl": 0.19921875, "learning_rate": 8.55e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 145 }, { "advantages": -2.942979335784912e-07, "completion_length": 190.5625, "epoch": 0.146, "grad_norm": 4.445368766784668, "kl": 0.15625, "learning_rate": 8.539999999999999e-07, "loss": -0.0205, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.18292956054210663, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 146 }, { "advantages": 4.470348358154297e-07, "completion_length": 192.4375, "epoch": 0.147, "grad_norm": 5.451050758361816, "kl": 0.169921875, "learning_rate": 8.529999999999999e-07, "loss": -0.0488, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.117851123213768, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 147 }, { "advantages": -9.685754776000977e-08, "completion_length": 175.25, "epoch": 0.148, "grad_norm": 4.9530205726623535, "kl": 0.169921875, "learning_rate": 8.52e-07, "loss": -0.0464, "reward": 1.5416667461395264, "reward_mean": 1.5416667461395264, "reward_std": 0.20693820714950562, "rewards/accuracy_reward": 0.5416666865348816, "rewards/format_reward": 1.0, "step": 148 }, { "advantages": -5.587935447692871e-08, "completion_length": 177.75, "epoch": 0.149, "grad_norm": 6.3942551612854, "kl": 0.158203125, "learning_rate": 8.51e-07, "loss": -0.1093, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.2630349099636078, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "step": 149 }, { "advantages": 0.0, "completion_length": 198.8125, "epoch": 0.15, "grad_norm": 4.109989166259766, "kl": 0.1533203125, "learning_rate": 8.499999999999999e-07, "loss": -0.0619, "reward": 1.03125, "reward_mean": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 1.0, "step": 150 }, { "advantages": 0.0, "completion_length": 163.8125, "epoch": 0.151, "grad_norm": 0.0, "kl": 0.158203125, "learning_rate": 8.489999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 151 }, { "advantages": -7.450580596923828e-09, "completion_length": 147.3125, "epoch": 0.152, "grad_norm": 3.5985171794891357, "kl": 0.173828125, "learning_rate": 8.48e-07, "loss": 0.0055, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 152 }, { "advantages": 7.450580596923828e-09, "completion_length": 150.875, "epoch": 0.153, "grad_norm": 4.749815940856934, "kl": 0.1767578125, "learning_rate": 8.469999999999999e-07, "loss": -0.0527, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 153 }, { "advantages": 0.0, "completion_length": 165.75, "epoch": 0.154, "grad_norm": 0.0, "kl": 0.1962890625, "learning_rate": 8.459999999999999e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 154 }, { "advantages": -7.450580596923828e-09, "completion_length": 184.1875, "epoch": 0.155, "grad_norm": 5.736739635467529, "kl": 0.18359375, "learning_rate": 8.45e-07, "loss": 0.1821, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 155 }, { "advantages": 3.203749656677246e-07, "completion_length": 214.9375, "epoch": 0.156, "grad_norm": 3.542316436767578, "kl": 0.208984375, "learning_rate": 8.439999999999999e-07, "loss": 0.033, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.058925580233335495, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 156 }, { "advantages": 2.60770320892334e-08, "completion_length": 204.625, "epoch": 0.157, "grad_norm": 4.769254684448242, "kl": 0.173828125, "learning_rate": 8.429999999999999e-07, "loss": 0.0339, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.4189920723438263, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 157 }, { "advantages": 0.0, "completion_length": 178.25, "epoch": 0.158, "grad_norm": 2.834043264389038, "kl": 0.1796875, "learning_rate": 8.419999999999999e-07, "loss": 0.0343, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 158 }, { "advantages": 3.725290298461914e-09, "completion_length": 192.75, "epoch": 0.159, "grad_norm": 3.128997802734375, "kl": 0.150390625, "learning_rate": 8.41e-07, "loss": 0.008, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 159 }, { "advantages": 1.1920928955078125e-07, "completion_length": 183.6875, "epoch": 0.16, "grad_norm": 5.255495071411133, "kl": 0.171875, "learning_rate": 8.399999999999999e-07, "loss": 0.1298, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.3250930905342102, "rewards/accuracy_reward": 0.4062500298023224, "rewards/format_reward": 1.0, "step": 160 }, { "advantages": 7.450580596923828e-09, "completion_length": 192.5625, "epoch": 0.161, "grad_norm": 3.1085081100463867, "kl": 0.18359375, "learning_rate": 8.389999999999999e-07, "loss": -0.0476, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 161 }, { "advantages": 2.60770320892334e-08, "completion_length": 149.125, "epoch": 0.162, "grad_norm": 5.676258563995361, "kl": 0.208984375, "learning_rate": 8.38e-07, "loss": -0.0042, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.4189920723438263, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 162 }, { "advantages": 2.60770320892334e-08, "completion_length": 202.4375, "epoch": 0.163, "grad_norm": 3.1146128177642822, "kl": 0.19140625, "learning_rate": 8.369999999999999e-07, "loss": -0.0357, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.12400397658348083, "rewards/accuracy_reward": 0.7291666269302368, "rewards/format_reward": 1.0, "step": 163 }, { "advantages": -1.2665987014770508e-07, "completion_length": 190.375, "epoch": 0.164, "grad_norm": 4.653083324432373, "kl": 0.251953125, "learning_rate": 8.359999999999999e-07, "loss": 0.0293, "reward": 1.5104167461395264, "reward_mean": 1.5104167461395264, "reward_std": 0.1473138928413391, "rewards/accuracy_reward": 0.5104166865348816, "rewards/format_reward": 1.0, "step": 164 }, { "advantages": 2.9802322387695312e-08, "completion_length": 208.0625, "epoch": 0.165, "grad_norm": 3.3702642917633057, "kl": 0.185546875, "learning_rate": 8.349999999999999e-07, "loss": -0.001, "reward": 1.8541667461395264, "reward_mean": 1.8541667461395264, "reward_std": 0.10681165009737015, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 1.0, "step": 165 }, { "advantages": 1.862645149230957e-08, "completion_length": 166.375, "epoch": 0.166, "grad_norm": 4.143738746643066, "kl": 0.1865234375, "learning_rate": 8.34e-07, "loss": -0.0756, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "step": 166 }, { "advantages": 7.450580596923828e-09, "completion_length": 153.25, "epoch": 0.167, "grad_norm": 3.872225522994995, "kl": 0.185546875, "learning_rate": 8.329999999999999e-07, "loss": -0.0012, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 167 }, { "advantages": -4.470348358154297e-08, "completion_length": 197.3125, "epoch": 0.168, "grad_norm": 4.1173319816589355, "kl": 0.1953125, "learning_rate": 8.319999999999999e-07, "loss": 0.1437, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.15430334210395813, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 168 }, { "advantages": 7.450580596923828e-09, "completion_length": 159.75, "epoch": 0.169, "grad_norm": 2.953240156173706, "kl": 0.18359375, "learning_rate": 8.31e-07, "loss": 0.0664, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 169 }, { "advantages": 1.4901161193847656e-08, "completion_length": 153.4375, "epoch": 0.17, "grad_norm": 4.36264705657959, "kl": 0.18359375, "learning_rate": 8.299999999999999e-07, "loss": -0.0093, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 170 }, { "advantages": 7.450580596923828e-09, "completion_length": 163.1875, "epoch": 0.171, "grad_norm": 3.9977848529815674, "kl": 0.240234375, "learning_rate": 8.289999999999999e-07, "loss": 0.0138, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.24775780737400055, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 171 }, { "advantages": 3.725290298461914e-09, "completion_length": 174.875, "epoch": 0.172, "grad_norm": 4.679101467132568, "kl": 0.189453125, "learning_rate": 8.28e-07, "loss": 0.1979, "reward": 1.4166667461395264, "reward_mean": 1.4166667461395264, "reward_std": 0.34194856882095337, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 1.0, "step": 172 }, { "advantages": 7.82310962677002e-08, "completion_length": 205.75, "epoch": 0.173, "grad_norm": 5.067877292633057, "kl": 0.22265625, "learning_rate": 8.269999999999999e-07, "loss": -0.1021, "reward": 1.3854167461395264, "reward_mean": 1.3854167461395264, "reward_std": 0.30385708808898926, "rewards/accuracy_reward": 0.3854166865348816, "rewards/format_reward": 1.0, "step": 173 }, { "advantages": -7.450580596923828e-09, "completion_length": 155.3125, "epoch": 0.174, "grad_norm": 6.50193977355957, "kl": 0.2412109375, "learning_rate": 8.259999999999999e-07, "loss": 0.0323, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 174 }, { "advantages": -7.078051567077637e-08, "completion_length": 215.9375, "epoch": 0.175, "grad_norm": 4.612828731536865, "kl": 0.18359375, "learning_rate": 8.249999999999999e-07, "loss": -0.0642, "reward": 1.6458333730697632, "reward_mean": 1.6458333730697632, "reward_std": 0.35351940989494324, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 1.0, "step": 175 }, { "advantages": -7.450580596923828e-09, "completion_length": 166.5, "epoch": 0.176, "grad_norm": 5.000982761383057, "kl": 0.1982421875, "learning_rate": 8.24e-07, "loss": 0.0311, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.44403791427612305, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 176 }, { "advantages": -7.450580596923828e-09, "completion_length": 122.5625, "epoch": 0.177, "grad_norm": 4.273613929748535, "kl": 0.2578125, "learning_rate": 8.229999999999999e-07, "loss": 0.0966, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.24775780737400055, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 177 }, { "advantages": -7.078051567077637e-08, "completion_length": 191.5625, "epoch": 0.178, "grad_norm": 4.648405075073242, "kl": 0.193359375, "learning_rate": 8.219999999999999e-07, "loss": 0.0292, "reward": 1.8645833730697632, "reward_mean": 1.8645833730697632, "reward_std": 0.1746465265750885, "rewards/accuracy_reward": 0.8645833730697632, "rewards/format_reward": 1.0, "step": 178 }, { "advantages": -1.4901161193847656e-08, "completion_length": 173.9375, "epoch": 0.179, "grad_norm": 3.65451717376709, "kl": 0.2119140625, "learning_rate": 8.21e-07, "loss": -0.0106, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 179 }, { "advantages": 0.0, "completion_length": 126.375, "epoch": 0.18, "grad_norm": 5.720065116882324, "kl": 0.73828125, "learning_rate": 8.199999999999999e-07, "loss": -0.1224, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.35564959049224854, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 180 }, { "advantages": -1.4901161193847656e-08, "completion_length": 149.0, "epoch": 0.181, "grad_norm": 4.597268581390381, "kl": 0.21875, "learning_rate": 8.189999999999999e-07, "loss": 0.0604, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 181 }, { "advantages": 3.725290298461914e-09, "completion_length": 148.6875, "epoch": 0.182, "grad_norm": 3.944310188293457, "kl": 0.232421875, "learning_rate": 8.179999999999999e-07, "loss": 0.0728, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 182 }, { "advantages": -1.043081283569336e-07, "completion_length": 128.1875, "epoch": 0.183, "grad_norm": 5.491823673248291, "kl": 0.2314453125, "learning_rate": 8.169999999999999e-07, "loss": -0.0613, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.07715167105197906, "rewards/accuracy_reward": 0.7083332538604736, "rewards/format_reward": 1.0, "step": 183 }, { "advantages": 0.0, "completion_length": 141.625, "epoch": 0.184, "grad_norm": 0.0, "kl": 0.2451171875, "learning_rate": 8.159999999999999e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 184 }, { "advantages": -2.9802322387695312e-08, "completion_length": 161.125, "epoch": 0.185, "grad_norm": 5.50625467300415, "kl": 0.2421875, "learning_rate": 8.149999999999999e-07, "loss": -0.0947, "reward": 1.3958333730697632, "reward_mean": 1.3958333730697632, "reward_std": 0.43129098415374756, "rewards/accuracy_reward": 0.3958333432674408, "rewards/format_reward": 1.0, "step": 185 }, { "advantages": -3.725290298461914e-09, "completion_length": 143.0625, "epoch": 0.186, "grad_norm": 6.193937301635742, "kl": 0.271484375, "learning_rate": 8.14e-07, "loss": -0.0267, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.3061639666557312, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 186 }, { "advantages": 0.0, "completion_length": 149.125, "epoch": 0.187, "grad_norm": 0.0, "kl": 0.2490234375, "learning_rate": 8.129999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 187 }, { "advantages": 0.0, "completion_length": 111.75, "epoch": 0.188, "grad_norm": 4.042557716369629, "kl": 0.275390625, "learning_rate": 8.12e-07, "loss": -0.0301, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 188 }, { "advantages": 3.3527612686157227e-08, "completion_length": 129.0, "epoch": 0.189, "grad_norm": 5.769200325012207, "kl": 0.38671875, "learning_rate": 8.11e-07, "loss": 0.0278, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.5065323710441589, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 189 }, { "advantages": 7.450580596923828e-09, "completion_length": 194.125, "epoch": 0.19, "grad_norm": 4.864434242248535, "kl": 0.228515625, "learning_rate": 8.1e-07, "loss": 0.0537, "reward": 1.5208333730697632, "reward_mean": 1.5208333730697632, "reward_std": 0.38895100355148315, "rewards/accuracy_reward": 0.5208333730697632, "rewards/format_reward": 1.0, "step": 190 }, { "advantages": -1.4901161193847656e-08, "completion_length": 130.5, "epoch": 0.191, "grad_norm": 3.6852304935455322, "kl": 0.2490234375, "learning_rate": 8.09e-07, "loss": -0.0524, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 191 }, { "advantages": -7.450580596923828e-09, "completion_length": 158.9375, "epoch": 0.192, "grad_norm": 4.945519924163818, "kl": 0.2373046875, "learning_rate": 8.08e-07, "loss": -0.0072, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.3657589256763458, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 192 }, { "advantages": 0.0, "completion_length": 164.4375, "epoch": 0.193, "grad_norm": 0.0, "kl": 0.25, "learning_rate": 8.070000000000001e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 193 }, { "advantages": 0.0, "completion_length": 118.75, "epoch": 0.194, "grad_norm": 4.313383102416992, "kl": 0.2734375, "learning_rate": 8.06e-07, "loss": 0.0147, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.22160130739212036, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 194 }, { "advantages": 1.4901161193847656e-08, "completion_length": 156.125, "epoch": 0.195, "grad_norm": 3.259519577026367, "kl": 0.26171875, "learning_rate": 8.05e-07, "loss": -0.0172, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 195 }, { "advantages": 0.0, "completion_length": 128.1875, "epoch": 0.196, "grad_norm": 0.0, "kl": 0.2216796875, "learning_rate": 8.04e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 196 }, { "advantages": 8.195638656616211e-08, "completion_length": 156.0, "epoch": 0.197, "grad_norm": 5.883679389953613, "kl": 0.283203125, "learning_rate": 8.03e-07, "loss": -0.0908, "reward": 1.6041667461395264, "reward_mean": 1.6041667461395264, "reward_std": 0.2335786670446396, "rewards/accuracy_reward": 0.6041667461395264, "rewards/format_reward": 1.0, "step": 197 }, { "advantages": -1.4901161193847656e-08, "completion_length": 122.625, "epoch": 0.198, "grad_norm": 3.723879814147949, "kl": 0.3203125, "learning_rate": 8.02e-07, "loss": -0.0498, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 198 }, { "advantages": 1.4901161193847656e-08, "completion_length": 135.375, "epoch": 0.199, "grad_norm": 4.400403022766113, "kl": 0.30078125, "learning_rate": 8.01e-07, "loss": -0.0618, "reward": 1.4583333730697632, "reward_mean": 1.4583333730697632, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.4583333730697632, "rewards/format_reward": 1.0, "step": 199 }, { "advantages": 1.4901161193847656e-08, "completion_length": 176.5625, "epoch": 0.2, "grad_norm": 5.768075942993164, "kl": 0.2578125, "learning_rate": 8e-07, "loss": -0.0495, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 200 }, { "advantages": -7.450580596923828e-09, "completion_length": 131.25, "epoch": 0.201, "grad_norm": 5.473985195159912, "kl": 0.259765625, "learning_rate": 7.99e-07, "loss": 0.0693, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 201 }, { "advantages": -7.450580596923828e-09, "completion_length": 141.25, "epoch": 0.202, "grad_norm": 5.938058853149414, "kl": 0.26171875, "learning_rate": 7.98e-07, "loss": -0.0473, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.44478052854537964, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 202 }, { "advantages": 1.1175870895385742e-08, "completion_length": 163.875, "epoch": 0.203, "grad_norm": 5.90596342086792, "kl": 0.263671875, "learning_rate": 7.970000000000001e-07, "loss": 0.2037, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.4355512857437134, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 203 }, { "advantages": 1.825392246246338e-07, "completion_length": 126.875, "epoch": 0.204, "grad_norm": 6.201707363128662, "kl": 0.259765625, "learning_rate": 7.96e-07, "loss": -0.0593, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.3478616774082184, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 204 }, { "advantages": 0.0, "completion_length": 137.0, "epoch": 0.205, "grad_norm": 0.0, "kl": 0.2578125, "learning_rate": 7.95e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 205 }, { "advantages": 0.0, "completion_length": 130.375, "epoch": 0.206, "grad_norm": 3.8991453647613525, "kl": 0.3203125, "learning_rate": 7.94e-07, "loss": -0.0435, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 206 }, { "advantages": -7.450580596923828e-08, "completion_length": 141.1875, "epoch": 0.207, "grad_norm": 5.12335205078125, "kl": 0.28515625, "learning_rate": 7.93e-07, "loss": -0.134, "reward": 1.7708333730697632, "reward_mean": 1.7708333730697632, "reward_std": 0.12400396913290024, "rewards/accuracy_reward": 0.7708333730697632, "rewards/format_reward": 1.0, "step": 207 }, { "advantages": 0.0, "completion_length": 155.9375, "epoch": 0.208, "grad_norm": 0.0, "kl": 0.28125, "learning_rate": 7.92e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 208 }, { "advantages": 3.725290298461914e-08, "completion_length": 158.9375, "epoch": 0.209, "grad_norm": 4.06764030456543, "kl": 0.333984375, "learning_rate": 7.91e-07, "loss": 0.0656, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.10681164264678955, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 209 }, { "advantages": 0.0, "completion_length": 180.9375, "epoch": 0.21, "grad_norm": 0.0, "kl": 0.2451171875, "learning_rate": 7.9e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 210 }, { "advantages": -7.450580596923828e-09, "completion_length": 152.1875, "epoch": 0.211, "grad_norm": 3.491947650909424, "kl": 0.28515625, "learning_rate": 7.89e-07, "loss": -0.0244, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.16517186164855957, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 211 }, { "advantages": 1.2665987014770508e-07, "completion_length": 175.125, "epoch": 0.212, "grad_norm": 5.700802326202393, "kl": 0.29296875, "learning_rate": 7.88e-07, "loss": 0.0936, "reward": 1.6041667461395264, "reward_mean": 1.6041667461395264, "reward_std": 0.32618677616119385, "rewards/accuracy_reward": 0.6041666865348816, "rewards/format_reward": 1.0, "step": 212 }, { "advantages": 0.0, "completion_length": 164.0, "epoch": 0.213, "grad_norm": 4.179893493652344, "kl": 0.25390625, "learning_rate": 7.87e-07, "loss": -0.0081, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 213 }, { "advantages": 1.4901161193847656e-08, "completion_length": 137.125, "epoch": 0.214, "grad_norm": 4.17854118347168, "kl": 0.2470703125, "learning_rate": 7.86e-07, "loss": 0.0071, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 214 }, { "advantages": 1.4156103134155273e-07, "completion_length": 189.875, "epoch": 0.215, "grad_norm": 4.708441734313965, "kl": 0.30859375, "learning_rate": 7.85e-07, "loss": -0.0134, "reward": 1.4791667461395264, "reward_mean": 1.4791667461395264, "reward_std": 0.2903805673122406, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 1.0, "step": 215 }, { "advantages": 0.0, "completion_length": 132.0, "epoch": 0.216, "grad_norm": 0.0, "kl": 0.30078125, "learning_rate": 7.84e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 216 }, { "advantages": 7.450580596923828e-09, "completion_length": 145.3125, "epoch": 0.217, "grad_norm": 3.518888473510742, "kl": 0.34765625, "learning_rate": 7.83e-07, "loss": 0.0506, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 217 }, { "advantages": -5.587935447692871e-08, "completion_length": 165.9375, "epoch": 0.218, "grad_norm": 5.872474193572998, "kl": 0.28125, "learning_rate": 7.82e-07, "loss": 0.0886, "reward": 1.6770833730697632, "reward_mean": 1.6770833730697632, "reward_std": 0.541657567024231, "rewards/accuracy_reward": 0.7395833730697632, "rewards/format_reward": 0.9375, "step": 218 }, { "advantages": 7.450580596923828e-09, "completion_length": 142.6875, "epoch": 0.219, "grad_norm": 5.611164093017578, "kl": 0.298828125, "learning_rate": 7.81e-07, "loss": -0.0346, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 219 }, { "advantages": -5.21540641784668e-08, "completion_length": 152.9375, "epoch": 0.22, "grad_norm": 5.375847816467285, "kl": 0.287109375, "learning_rate": 7.799999999999999e-07, "loss": 0.0275, "reward": 1.3541667461395264, "reward_mean": 1.3541667461395264, "reward_std": 0.2335786670446396, "rewards/accuracy_reward": 0.3541666865348816, "rewards/format_reward": 1.0, "step": 220 }, { "advantages": -1.4901161193847656e-08, "completion_length": 125.8125, "epoch": 0.221, "grad_norm": 5.226174354553223, "kl": 0.328125, "learning_rate": 7.79e-07, "loss": 0.1148, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.875, "step": 221 }, { "advantages": -1.2665987014770508e-07, "completion_length": 138.4375, "epoch": 0.222, "grad_norm": 4.286291122436523, "kl": 0.3515625, "learning_rate": 7.78e-07, "loss": 0.0188, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 222 }, { "advantages": -1.4901161193847656e-08, "completion_length": 133.9375, "epoch": 0.223, "grad_norm": 5.600376605987549, "kl": 0.39453125, "learning_rate": 7.77e-07, "loss": -0.0054, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "step": 223 }, { "advantages": 0.0, "completion_length": 162.125, "epoch": 0.224, "grad_norm": 4.409877300262451, "kl": 0.34375, "learning_rate": 7.76e-07, "loss": 0.0028, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 224 }, { "advantages": 0.0, "completion_length": 115.5625, "epoch": 0.225, "grad_norm": 0.0, "kl": 0.291015625, "learning_rate": 7.75e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 225 }, { "advantages": 5.587935447692871e-08, "completion_length": 133.25, "epoch": 0.226, "grad_norm": 6.370109558105469, "kl": 0.30859375, "learning_rate": 7.74e-07, "loss": -0.1677, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.3450327515602112, "rewards/accuracy_reward": 0.7500000596046448, "rewards/format_reward": 1.0, "step": 226 }, { "advantages": 0.0, "completion_length": 101.8125, "epoch": 0.227, "grad_norm": 0.0, "kl": 0.3828125, "learning_rate": 7.729999999999999e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 227 }, { "advantages": 7.450580596923828e-09, "completion_length": 136.875, "epoch": 0.228, "grad_norm": 3.316059112548828, "kl": 0.44921875, "learning_rate": 7.72e-07, "loss": 0.0161, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 228 }, { "advantages": -1.4156103134155273e-07, "completion_length": 131.5625, "epoch": 0.229, "grad_norm": 5.905332088470459, "kl": 0.349609375, "learning_rate": 7.71e-07, "loss": 0.0514, "reward": 1.8541667461395264, "reward_mean": 1.8541667461395264, "reward_std": 0.2903805673122406, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 1.0, "step": 229 }, { "advantages": -7.450580596923828e-09, "completion_length": 146.5, "epoch": 0.23, "grad_norm": 4.266251564025879, "kl": 0.322265625, "learning_rate": 7.699999999999999e-07, "loss": 0.0306, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 230 }, { "advantages": 0.0, "completion_length": 127.9375, "epoch": 0.231, "grad_norm": 0.0, "kl": 0.30078125, "learning_rate": 7.69e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 231 }, { "advantages": -5.960464477539063e-08, "completion_length": 113.8125, "epoch": 0.232, "grad_norm": 6.29781436920166, "kl": 0.34765625, "learning_rate": 7.68e-07, "loss": -0.009, "reward": 1.7708333730697632, "reward_mean": 1.7708333730697632, "reward_std": 0.2048145979642868, "rewards/accuracy_reward": 0.7708333730697632, "rewards/format_reward": 1.0, "step": 232 }, { "advantages": 7.450580596923828e-09, "completion_length": 112.5625, "epoch": 0.233, "grad_norm": 4.406736373901367, "kl": 0.298828125, "learning_rate": 7.67e-07, "loss": -0.0023, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 233 }, { "advantages": 0.0, "completion_length": 100.125, "epoch": 0.234, "grad_norm": 0.0, "kl": 0.3359375, "learning_rate": 7.66e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 234 }, { "advantages": -6.705522537231445e-08, "completion_length": 120.125, "epoch": 0.235, "grad_norm": 3.8253743648529053, "kl": 0.3359375, "learning_rate": 7.65e-07, "loss": 0.0283, "reward": 1.0416667461395264, "reward_mean": 1.0416667461395264, "reward_std": 0.1178511530160904, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 1.0, "step": 235 }, { "advantages": -7.450580596923828e-09, "completion_length": 134.6875, "epoch": 0.236, "grad_norm": 3.9648969173431396, "kl": 0.373046875, "learning_rate": 7.64e-07, "loss": 0.0765, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.24775780737400055, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 236 }, { "advantages": 0.0, "completion_length": 132.0625, "epoch": 0.237, "grad_norm": 7.067671775817871, "kl": 0.36328125, "learning_rate": 7.629999999999999e-07, "loss": -0.1304, "reward": 1.6979167461395264, "reward_mean": 1.6979167461395264, "reward_std": 0.28634417057037354, "rewards/accuracy_reward": 0.6979166865348816, "rewards/format_reward": 1.0, "step": 237 }, { "advantages": -1.2665987014770508e-07, "completion_length": 116.75, "epoch": 0.238, "grad_norm": 5.4808030128479, "kl": 0.375, "learning_rate": 7.62e-07, "loss": 0.0004, "reward": 1.7604167461395264, "reward_mean": 1.7604167461395264, "reward_std": 0.1473138928413391, "rewards/accuracy_reward": 0.7604166865348816, "rewards/format_reward": 1.0, "step": 238 }, { "advantages": -7.450580596923828e-09, "completion_length": 110.3125, "epoch": 0.239, "grad_norm": 4.075715065002441, "kl": 0.31640625, "learning_rate": 7.61e-07, "loss": 0.0067, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 0.9375, "step": 239 }, { "advantages": 0.0, "completion_length": 103.3125, "epoch": 0.24, "grad_norm": 0.0, "kl": 0.396484375, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 240 }, { "advantages": 1.6391277313232422e-07, "completion_length": 109.4375, "epoch": 0.241, "grad_norm": 5.156554222106934, "kl": 0.421875, "learning_rate": 7.59e-07, "loss": -0.0393, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.08908708393573761, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 241 }, { "advantages": 1.4901161193847656e-08, "completion_length": 119.125, "epoch": 0.242, "grad_norm": 4.303339004516602, "kl": 0.3984375, "learning_rate": 7.58e-07, "loss": 0.1162, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 242 }, { "advantages": 1.4901161193847656e-08, "completion_length": 111.1875, "epoch": 0.243, "grad_norm": 4.342909336090088, "kl": 0.439453125, "learning_rate": 7.57e-07, "loss": 0.0012, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 243 }, { "advantages": 4.470348358154297e-08, "completion_length": 118.4375, "epoch": 0.244, "grad_norm": 7.558548450469971, "kl": 0.37890625, "learning_rate": 7.559999999999999e-07, "loss": -0.1255, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.4149954617023468, "rewards/accuracy_reward": 0.3750000298023224, "rewards/format_reward": 1.0, "step": 244 }, { "advantages": 0.0, "completion_length": 106.875, "epoch": 0.245, "grad_norm": 4.70227575302124, "kl": 0.36328125, "learning_rate": 7.55e-07, "loss": 0.0574, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.22903135418891907, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 245 }, { "advantages": 0.0, "completion_length": 129.9375, "epoch": 0.246, "grad_norm": 0.0, "kl": 0.37109375, "learning_rate": 7.54e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 246 }, { "advantages": -3.725290298461914e-09, "completion_length": 118.5625, "epoch": 0.247, "grad_norm": 4.569678783416748, "kl": 0.421875, "learning_rate": 7.529999999999999e-07, "loss": -0.0377, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 247 }, { "advantages": 1.4901161193847656e-08, "completion_length": 126.25, "epoch": 0.248, "grad_norm": 4.764584064483643, "kl": 0.33203125, "learning_rate": 7.52e-07, "loss": 0.0018, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 248 }, { "advantages": 0.0, "completion_length": 125.3125, "epoch": 0.249, "grad_norm": 5.263643264770508, "kl": 0.384765625, "learning_rate": 7.51e-07, "loss": 0.0607, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 249 }, { "advantages": -7.450580596923828e-09, "completion_length": 120.75, "epoch": 0.25, "grad_norm": 4.139052867889404, "kl": 0.38671875, "learning_rate": 7.5e-07, "loss": 0.0403, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 250 }, { "advantages": -7.450580596923828e-09, "completion_length": 113.0625, "epoch": 0.251, "grad_norm": 4.267086029052734, "kl": 0.40234375, "learning_rate": 7.489999999999999e-07, "loss": -0.0034, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 251 }, { "advantages": 0.0, "completion_length": 110.9375, "epoch": 0.252, "grad_norm": 0.0, "kl": 0.44140625, "learning_rate": 7.48e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 252 }, { "advantages": 1.2665987014770508e-07, "completion_length": 125.0, "epoch": 0.253, "grad_norm": 4.108771324157715, "kl": 0.375, "learning_rate": 7.47e-07, "loss": 0.0269, "reward": 1.8541667461395264, "reward_mean": 1.8541667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.8541667461395264, "rewards/format_reward": 1.0, "step": 253 }, { "advantages": 3.725290298461914e-09, "completion_length": 109.4375, "epoch": 0.254, "grad_norm": 6.75657320022583, "kl": 0.53125, "learning_rate": 7.459999999999999e-07, "loss": -0.0183, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 254 }, { "advantages": 1.043081283569336e-07, "completion_length": 125.5625, "epoch": 0.255, "grad_norm": 6.262571334838867, "kl": 0.443359375, "learning_rate": 7.45e-07, "loss": 0.1093, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.13908715546131134, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 255 }, { "advantages": -1.862645149230957e-08, "completion_length": 114.25, "epoch": 0.256, "grad_norm": 4.935299396514893, "kl": 0.5078125, "learning_rate": 7.44e-07, "loss": -0.0599, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 256 }, { "advantages": 0.0, "completion_length": 141.25, "epoch": 0.257, "grad_norm": 5.354793548583984, "kl": 0.419921875, "learning_rate": 7.429999999999999e-07, "loss": 0.0394, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.22201895713806152, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 257 }, { "advantages": -7.450580596923828e-09, "completion_length": 95.3125, "epoch": 0.258, "grad_norm": 4.425192832946777, "kl": 0.40234375, "learning_rate": 7.42e-07, "loss": 0.0065, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 258 }, { "advantages": -7.078051567077637e-08, "completion_length": 114.4375, "epoch": 0.259, "grad_norm": 6.3800835609436035, "kl": 0.41015625, "learning_rate": 7.41e-07, "loss": -0.017, "reward": 1.8645833730697632, "reward_mean": 1.8645833730697632, "reward_std": 0.1746465265750885, "rewards/accuracy_reward": 0.8645833730697632, "rewards/format_reward": 1.0, "step": 259 }, { "advantages": 1.4901161193847656e-08, "completion_length": 132.375, "epoch": 0.26, "grad_norm": 4.138468265533447, "kl": 0.41015625, "learning_rate": 7.4e-07, "loss": 0.0899, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 260 }, { "advantages": 7.450580596923828e-08, "completion_length": 127.375, "epoch": 0.261, "grad_norm": 5.36328649520874, "kl": 0.490234375, "learning_rate": 7.389999999999999e-07, "loss": -0.1071, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.2136232852935791, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "step": 261 }, { "advantages": 1.862645149230957e-08, "completion_length": 119.875, "epoch": 0.262, "grad_norm": 4.338840007781982, "kl": 0.451171875, "learning_rate": 7.38e-07, "loss": -0.0061, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.9375, "step": 262 }, { "advantages": -1.4901161193847656e-08, "completion_length": 126.5625, "epoch": 0.263, "grad_norm": 4.404613971710205, "kl": 0.5078125, "learning_rate": 7.37e-07, "loss": -0.0745, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 263 }, { "advantages": 0.0, "completion_length": 117.5625, "epoch": 0.264, "grad_norm": 0.0, "kl": 0.37109375, "learning_rate": 7.359999999999999e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 264 }, { "advantages": 0.0, "completion_length": 123.875, "epoch": 0.265, "grad_norm": 0.0, "kl": 0.404296875, "learning_rate": 7.35e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 265 }, { "advantages": -1.4901161193847656e-08, "completion_length": 117.4375, "epoch": 0.266, "grad_norm": 5.04351282119751, "kl": 0.4375, "learning_rate": 7.34e-07, "loss": 0.0671, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 266 }, { "advantages": 7.450580596923828e-09, "completion_length": 98.3125, "epoch": 0.267, "grad_norm": 4.765639305114746, "kl": 0.421875, "learning_rate": 7.329999999999999e-07, "loss": 0.0228, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 267 }, { "advantages": 0.0, "completion_length": 132.8125, "epoch": 0.268, "grad_norm": 0.0, "kl": 0.4296875, "learning_rate": 7.319999999999999e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 268 }, { "advantages": -1.9371509552001953e-07, "completion_length": 118.875, "epoch": 0.269, "grad_norm": 4.1043314933776855, "kl": 0.392578125, "learning_rate": 7.31e-07, "loss": -0.0282, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.07715165615081787, "rewards/accuracy_reward": 0.7083333730697632, "rewards/format_reward": 1.0, "step": 269 }, { "advantages": 7.450580596923828e-09, "completion_length": 137.8125, "epoch": 0.27, "grad_norm": 4.980680465698242, "kl": 0.41015625, "learning_rate": 7.3e-07, "loss": 0.0036, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 270 }, { "advantages": 0.0, "completion_length": 134.1875, "epoch": 0.271, "grad_norm": 0.0, "kl": 0.421875, "learning_rate": 7.289999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 271 }, { "advantages": 0.0, "completion_length": 93.8125, "epoch": 0.272, "grad_norm": 5.348329544067383, "kl": 0.46484375, "learning_rate": 7.28e-07, "loss": 0.0097, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 272 }, { "advantages": 3.203749656677246e-07, "completion_length": 134.125, "epoch": 0.273, "grad_norm": 3.749969244003296, "kl": 0.4375, "learning_rate": 7.27e-07, "loss": -0.062, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.058925580233335495, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 273 }, { "advantages": 0.0, "completion_length": 116.875, "epoch": 0.274, "grad_norm": 4.896990776062012, "kl": 0.455078125, "learning_rate": 7.259999999999999e-07, "loss": 0.0062, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "step": 274 }, { "advantages": 0.0, "completion_length": 98.25, "epoch": 0.275, "grad_norm": 5.642269611358643, "kl": 0.5, "learning_rate": 7.249999999999999e-07, "loss": -0.0376, "reward": 1.28125, "reward_mean": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 275 }, { "advantages": 0.0, "completion_length": 140.1875, "epoch": 0.276, "grad_norm": 3.443995714187622, "kl": 0.392578125, "learning_rate": 7.24e-07, "loss": -0.0466, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 276 }, { "advantages": 2.60770320892334e-08, "completion_length": 96.4375, "epoch": 0.277, "grad_norm": 7.930581092834473, "kl": 0.38671875, "learning_rate": 7.229999999999999e-07, "loss": -0.138, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.4355512857437134, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 277 }, { "advantages": 0.0, "completion_length": 125.25, "epoch": 0.278, "grad_norm": 0.0, "kl": 0.3984375, "learning_rate": 7.219999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 278 }, { "advantages": -7.450580596923828e-09, "completion_length": 108.0625, "epoch": 0.279, "grad_norm": 6.782789707183838, "kl": 0.375, "learning_rate": 7.21e-07, "loss": 0.0696, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 279 }, { "advantages": -3.725290298461914e-09, "completion_length": 106.5, "epoch": 0.28, "grad_norm": 4.9994611740112305, "kl": 0.443359375, "learning_rate": 7.2e-07, "loss": -0.0264, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 280 }, { "advantages": -1.2665987014770508e-07, "completion_length": 143.5625, "epoch": 0.281, "grad_norm": 6.117532253265381, "kl": 0.640625, "learning_rate": 7.189999999999999e-07, "loss": 0.0769, "reward": 1.9479167461395264, "reward_mean": 1.9479167461395264, "reward_std": 0.1473138928413391, "rewards/accuracy_reward": 0.9479166865348816, "rewards/format_reward": 1.0, "step": 281 }, { "advantages": -7.078051567077637e-08, "completion_length": 157.4375, "epoch": 0.282, "grad_norm": 3.375563859939575, "kl": 0.39453125, "learning_rate": 7.179999999999999e-07, "loss": -0.0054, "reward": 1.7083333730697632, "reward_mean": 1.7083333730697632, "reward_std": 0.1178511381149292, "rewards/accuracy_reward": 0.7083333134651184, "rewards/format_reward": 1.0, "step": 282 }, { "advantages": 0.0, "completion_length": 121.9375, "epoch": 0.283, "grad_norm": 0.0, "kl": 0.41015625, "learning_rate": 7.17e-07, "loss": 0.0, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.0, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 283 }, { "advantages": 7.450580596923828e-09, "completion_length": 100.875, "epoch": 0.284, "grad_norm": 6.159756183624268, "kl": 0.41015625, "learning_rate": 7.159999999999999e-07, "loss": 0.0617, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 284 }, { "advantages": -7.450580596923828e-09, "completion_length": 111.3125, "epoch": 0.285, "grad_norm": 5.778822898864746, "kl": 0.412109375, "learning_rate": 7.149999999999999e-07, "loss": -0.0822, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 285 }, { "advantages": 0.0, "completion_length": 110.1875, "epoch": 0.286, "grad_norm": 0.0, "kl": 0.40234375, "learning_rate": 7.14e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 286 }, { "advantages": -1.2665987014770508e-07, "completion_length": 121.25, "epoch": 0.287, "grad_norm": 4.658452987670898, "kl": 0.42578125, "learning_rate": 7.129999999999999e-07, "loss": -0.01, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 287 }, { "advantages": -1.2665987014770508e-07, "completion_length": 151.0, "epoch": 0.288, "grad_norm": 3.2589261531829834, "kl": 0.45703125, "learning_rate": 7.119999999999999e-07, "loss": -0.0573, "reward": 1.4791667461395264, "reward_mean": 1.4791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.4791666865348816, "rewards/format_reward": 1.0, "step": 288 }, { "advantages": -3.725290298461914e-08, "completion_length": 112.9375, "epoch": 0.289, "grad_norm": 4.990071773529053, "kl": 0.400390625, "learning_rate": 7.11e-07, "loss": -0.0088, "reward": 1.9166667461395264, "reward_mean": 1.9166667461395264, "reward_std": 0.12598814070224762, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "step": 289 }, { "advantages": 0.0, "completion_length": 123.6875, "epoch": 0.29, "grad_norm": 4.007847309112549, "kl": 0.4375, "learning_rate": 7.1e-07, "loss": 0.0285, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.1602174937725067, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 290 }, { "advantages": -1.4901161193847656e-08, "completion_length": 108.4375, "epoch": 0.291, "grad_norm": 4.9294867515563965, "kl": 0.443359375, "learning_rate": 7.089999999999999e-07, "loss": -0.0249, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 291 }, { "advantages": -3.725290298461914e-09, "completion_length": 135.375, "epoch": 0.292, "grad_norm": 4.507473945617676, "kl": 0.3984375, "learning_rate": 7.079999999999999e-07, "loss": 0.0089, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 292 }, { "advantages": 0.0, "completion_length": 122.25, "epoch": 0.293, "grad_norm": 0.0, "kl": 0.41796875, "learning_rate": 7.07e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 293 }, { "advantages": 0.0, "completion_length": 135.625, "epoch": 0.294, "grad_norm": 5.223430633544922, "kl": 0.466796875, "learning_rate": 7.059999999999999e-07, "loss": 0.0928, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 294 }, { "advantages": -1.2665987014770508e-07, "completion_length": 105.6875, "epoch": 0.295, "grad_norm": 5.42147970199585, "kl": 0.44140625, "learning_rate": 7.049999999999999e-07, "loss": 0.0288, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 295 }, { "advantages": 0.0, "completion_length": 126.6875, "epoch": 0.296, "grad_norm": 3.41044545173645, "kl": 0.51953125, "learning_rate": 7.04e-07, "loss": -0.0624, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 296 }, { "advantages": 0.0, "completion_length": 126.1875, "epoch": 0.297, "grad_norm": 0.0, "kl": 0.42578125, "learning_rate": 7.029999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 297 }, { "advantages": 7.450580596923828e-09, "completion_length": 107.6875, "epoch": 0.298, "grad_norm": 4.168430328369141, "kl": 0.390625, "learning_rate": 7.019999999999999e-07, "loss": 0.0322, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 298 }, { "advantages": 0.0, "completion_length": 135.125, "epoch": 0.299, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 7.009999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 299 }, { "advantages": -2.2351741790771484e-08, "completion_length": 123.4375, "epoch": 0.3, "grad_norm": 7.8173346519470215, "kl": 0.45703125, "learning_rate": 7e-07, "loss": 0.2183, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 300 }, { "advantages": 1.4901161193847656e-08, "completion_length": 112.4375, "epoch": 0.301, "grad_norm": 4.600705623626709, "kl": 0.46484375, "learning_rate": 6.989999999999999e-07, "loss": 0.0528, "reward": 1.6458333730697632, "reward_mean": 1.6458333730697632, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6458333730697632, "rewards/format_reward": 1.0, "step": 301 }, { "advantages": 7.078051567077637e-08, "completion_length": 119.1875, "epoch": 0.302, "grad_norm": 4.796161651611328, "kl": 0.53515625, "learning_rate": 6.979999999999999e-07, "loss": 0.057, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.0862581804394722, "rewards/accuracy_reward": 0.9375000596046448, "rewards/format_reward": 1.0, "step": 302 }, { "advantages": 0.0, "completion_length": 116.0, "epoch": 0.303, "grad_norm": 4.70276403427124, "kl": 0.515625, "learning_rate": 6.97e-07, "loss": -0.0333, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 303 }, { "advantages": 0.0, "completion_length": 123.875, "epoch": 0.304, "grad_norm": 4.684284687042236, "kl": 0.43359375, "learning_rate": 6.959999999999999e-07, "loss": -0.0197, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 304 }, { "advantages": 3.725290298461914e-09, "completion_length": 104.625, "epoch": 0.305, "grad_norm": 4.7765889167785645, "kl": 0.5078125, "learning_rate": 6.949999999999999e-07, "loss": -0.0436, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 305 }, { "advantages": 0.0, "completion_length": 112.125, "epoch": 0.306, "grad_norm": 0.0, "kl": 0.419921875, "learning_rate": 6.939999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 306 }, { "advantages": 0.0, "completion_length": 111.5, "epoch": 0.307, "grad_norm": 8.246498107910156, "kl": 0.4765625, "learning_rate": 6.929999999999999e-07, "loss": 0.1117, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 307 }, { "advantages": 0.0, "completion_length": 116.6875, "epoch": 0.308, "grad_norm": 0.0, "kl": 0.39453125, "learning_rate": 6.919999999999999e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 308 }, { "advantages": -2.60770320892334e-08, "completion_length": 105.5, "epoch": 0.309, "grad_norm": 8.390800476074219, "kl": 0.470703125, "learning_rate": 6.909999999999999e-07, "loss": -0.0529, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.4355512857437134, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9375, "step": 309 }, { "advantages": 0.0, "completion_length": 108.0, "epoch": 0.31, "grad_norm": 0.0, "kl": 0.482421875, "learning_rate": 6.9e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 310 }, { "advantages": 0.0, "completion_length": 112.5625, "epoch": 0.311, "grad_norm": 0.0, "kl": 0.51953125, "learning_rate": 6.889999999999999e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 311 }, { "advantages": 0.0, "completion_length": 105.8125, "epoch": 0.312, "grad_norm": 0.0, "kl": 0.41796875, "learning_rate": 6.879999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 312 }, { "advantages": 7.450580596923828e-09, "completion_length": 95.5625, "epoch": 0.313, "grad_norm": 5.6205830574035645, "kl": 0.5390625, "learning_rate": 6.87e-07, "loss": -0.0113, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 0.9375, "step": 313 }, { "advantages": 0.0, "completion_length": 111.5, "epoch": 0.314, "grad_norm": 0.0, "kl": 0.416015625, "learning_rate": 6.86e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 314 }, { "advantages": 0.0, "completion_length": 123.125, "epoch": 0.315, "grad_norm": 0.0, "kl": 0.46875, "learning_rate": 6.85e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 315 }, { "advantages": 0.0, "completion_length": 113.3125, "epoch": 0.316, "grad_norm": 0.0, "kl": 0.419921875, "learning_rate": 6.84e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 316 }, { "advantages": 1.862645149230957e-08, "completion_length": 116.9375, "epoch": 0.317, "grad_norm": 5.153122901916504, "kl": 0.453125, "learning_rate": 6.830000000000001e-07, "loss": 0.0341, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 317 }, { "advantages": 0.0, "completion_length": 144.5625, "epoch": 0.318, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 6.82e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 318 }, { "advantages": 0.0, "completion_length": 102.3125, "epoch": 0.319, "grad_norm": 0.0, "kl": 0.5234375, "learning_rate": 6.81e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 319 }, { "advantages": 0.0, "completion_length": 109.75, "epoch": 0.32, "grad_norm": 0.0, "kl": 0.5078125, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 320 }, { "advantages": -1.2665987014770508e-07, "completion_length": 104.5625, "epoch": 0.321, "grad_norm": 4.661564826965332, "kl": 0.53125, "learning_rate": 6.79e-07, "loss": -0.0622, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 321 }, { "advantages": 0.0, "completion_length": 121.375, "epoch": 0.322, "grad_norm": 5.486865043640137, "kl": 0.49609375, "learning_rate": 6.78e-07, "loss": 0.0983, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 322 }, { "advantages": -7.450580596923828e-09, "completion_length": 128.4375, "epoch": 0.323, "grad_norm": 3.9005072116851807, "kl": 0.4453125, "learning_rate": 6.77e-07, "loss": -0.041, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 323 }, { "advantages": 0.0, "completion_length": 115.8125, "epoch": 0.324, "grad_norm": 0.0, "kl": 0.44140625, "learning_rate": 6.76e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 324 }, { "advantages": 1.1175870895385742e-08, "completion_length": 125.4375, "epoch": 0.325, "grad_norm": 5.992334842681885, "kl": 0.40625, "learning_rate": 6.75e-07, "loss": 0.0715, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.447756826877594, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 325 }, { "advantages": 0.0, "completion_length": 117.0, "epoch": 0.326, "grad_norm": 0.0, "kl": 0.482421875, "learning_rate": 6.74e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 326 }, { "advantages": 7.450580596923828e-09, "completion_length": 121.6875, "epoch": 0.327, "grad_norm": 5.490609169006348, "kl": 0.5078125, "learning_rate": 6.730000000000001e-07, "loss": -0.0609, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 327 }, { "advantages": 0.0, "completion_length": 125.5, "epoch": 0.328, "grad_norm": 4.8279337882995605, "kl": 0.41796875, "learning_rate": 6.72e-07, "loss": -0.0221, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 328 }, { "advantages": 0.0, "completion_length": 115.3125, "epoch": 0.329, "grad_norm": 0.0, "kl": 1.2578125, "learning_rate": 6.71e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 329 }, { "advantages": 0.0, "completion_length": 124.5, "epoch": 0.33, "grad_norm": 0.0, "kl": 0.49609375, "learning_rate": 6.7e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 330 }, { "advantages": 0.0, "completion_length": 117.375, "epoch": 0.331, "grad_norm": 0.0, "kl": 0.4453125, "learning_rate": 6.69e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 331 }, { "advantages": -2.2351741790771484e-08, "completion_length": 113.0, "epoch": 0.332, "grad_norm": 6.589673042297363, "kl": 0.390625, "learning_rate": 6.68e-07, "loss": 0.0033, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.9375, "step": 332 }, { "advantages": 0.0, "completion_length": 116.5, "epoch": 0.333, "grad_norm": 0.0, "kl": 0.48828125, "learning_rate": 6.67e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 333 }, { "advantages": -7.450580596923828e-08, "completion_length": 125.5625, "epoch": 0.334, "grad_norm": 4.017887592315674, "kl": 0.625, "learning_rate": 6.66e-07, "loss": -0.0525, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.2182178944349289, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 334 }, { "advantages": -1.4901161193847656e-08, "completion_length": 118.5, "epoch": 0.335, "grad_norm": 5.249420166015625, "kl": 0.453125, "learning_rate": 6.65e-07, "loss": 0.0686, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 335 }, { "advantages": 0.0, "completion_length": 111.25, "epoch": 0.336, "grad_norm": 0.0, "kl": 0.44921875, "learning_rate": 6.64e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 336 }, { "advantages": 0.0, "completion_length": 121.125, "epoch": 0.337, "grad_norm": 0.0, "kl": 0.4609375, "learning_rate": 6.63e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 337 }, { "advantages": 0.0, "completion_length": 116.375, "epoch": 0.338, "grad_norm": 0.0, "kl": 0.466796875, "learning_rate": 6.62e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 338 }, { "advantages": 3.725290298461914e-09, "completion_length": 110.5, "epoch": 0.339, "grad_norm": 4.943254470825195, "kl": 0.48828125, "learning_rate": 6.61e-07, "loss": 0.0704, "reward": 1.15625, "reward_mean": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 1.0, "step": 339 }, { "advantages": 0.0, "completion_length": 114.625, "epoch": 0.34, "grad_norm": 4.797520637512207, "kl": 0.4921875, "learning_rate": 6.6e-07, "loss": 0.0547, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 340 }, { "advantages": -3.725290298461914e-09, "completion_length": 123.1875, "epoch": 0.341, "grad_norm": 5.215485095977783, "kl": 0.447265625, "learning_rate": 6.59e-07, "loss": 0.0024, "reward": 1.6770833730697632, "reward_mean": 1.6770833730697632, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.6770833730697632, "rewards/format_reward": 1.0, "step": 341 }, { "advantages": 0.0, "completion_length": 132.0625, "epoch": 0.342, "grad_norm": 0.0, "kl": 0.48828125, "learning_rate": 6.58e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 342 }, { "advantages": 0.0, "completion_length": 100.0, "epoch": 0.343, "grad_norm": 0.0, "kl": 0.43359375, "learning_rate": 6.57e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 343 }, { "advantages": -1.2665987014770508e-07, "completion_length": 124.125, "epoch": 0.344, "grad_norm": 5.066404819488525, "kl": 0.46484375, "learning_rate": 6.56e-07, "loss": 0.0417, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 344 }, { "advantages": 0.0, "completion_length": 111.9375, "epoch": 0.345, "grad_norm": 0.0, "kl": 0.5390625, "learning_rate": 6.55e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 345 }, { "advantages": 0.0, "completion_length": 125.25, "epoch": 0.346, "grad_norm": 5.6763505935668945, "kl": 0.4765625, "learning_rate": 6.54e-07, "loss": 0.0047, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 346 }, { "advantages": 3.725290298461914e-09, "completion_length": 108.8125, "epoch": 0.347, "grad_norm": 5.239328384399414, "kl": 0.71484375, "learning_rate": 6.53e-07, "loss": 0.0418, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 347 }, { "advantages": 0.0, "completion_length": 113.0, "epoch": 0.348, "grad_norm": 0.0, "kl": 0.4609375, "learning_rate": 6.52e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 348 }, { "advantages": 0.0, "completion_length": 116.125, "epoch": 0.349, "grad_norm": 0.0, "kl": 0.4765625, "learning_rate": 6.51e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 349 }, { "advantages": -2.2351741790771484e-08, "completion_length": 105.3125, "epoch": 0.35, "grad_norm": 7.965950012207031, "kl": 0.447265625, "learning_rate": 6.5e-07, "loss": -0.0053, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.6307864785194397, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.9375, "step": 350 }, { "advantages": 0.0, "completion_length": 135.9375, "epoch": 0.351, "grad_norm": 0.0, "kl": 0.625, "learning_rate": 6.49e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 351 }, { "advantages": 1.9371509552001953e-07, "completion_length": 142.0, "epoch": 0.352, "grad_norm": 3.862729787826538, "kl": 0.453125, "learning_rate": 6.48e-07, "loss": -0.0357, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.07715165615081787, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 352 }, { "advantages": 0.0, "completion_length": 109.625, "epoch": 0.353, "grad_norm": 0.0, "kl": 0.57421875, "learning_rate": 6.47e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 353 }, { "advantages": 0.0, "completion_length": 106.125, "epoch": 0.354, "grad_norm": 0.0, "kl": 0.5078125, "learning_rate": 6.46e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 354 }, { "advantages": 1.862645149230957e-08, "completion_length": 113.3125, "epoch": 0.355, "grad_norm": 5.779082775115967, "kl": 0.52734375, "learning_rate": 6.45e-07, "loss": -0.0804, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "step": 355 }, { "advantages": 0.0, "completion_length": 112.4375, "epoch": 0.356, "grad_norm": 0.0, "kl": 0.42578125, "learning_rate": 6.44e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 356 }, { "advantages": 1.4901161193847656e-08, "completion_length": 117.6875, "epoch": 0.357, "grad_norm": 6.260042190551758, "kl": 0.458984375, "learning_rate": 6.43e-07, "loss": -0.081, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 357 }, { "advantages": 0.0, "completion_length": 133.5625, "epoch": 0.358, "grad_norm": 0.0, "kl": 0.46875, "learning_rate": 6.42e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 358 }, { "advantages": 0.0, "completion_length": 123.0625, "epoch": 0.359, "grad_norm": 0.0, "kl": 1.40625, "learning_rate": 6.41e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 359 }, { "advantages": 0.0, "completion_length": 115.9375, "epoch": 0.36, "grad_norm": 0.0, "kl": 0.4453125, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 360 }, { "advantages": -7.078051567077637e-08, "completion_length": 125.375, "epoch": 0.361, "grad_norm": 4.933547019958496, "kl": 0.48828125, "learning_rate": 6.389999999999999e-07, "loss": -0.1138, "reward": 1.8958333730697632, "reward_mean": 1.8958333730697632, "reward_std": 0.0862581804394722, "rewards/accuracy_reward": 0.8958333730697632, "rewards/format_reward": 1.0, "step": 361 }, { "advantages": 1.4901161193847656e-08, "completion_length": 131.9375, "epoch": 0.362, "grad_norm": 5.297484874725342, "kl": 0.458984375, "learning_rate": 6.38e-07, "loss": -0.0348, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 362 }, { "advantages": 6.705522537231445e-08, "completion_length": 124.125, "epoch": 0.363, "grad_norm": 6.302598476409912, "kl": 0.4140625, "learning_rate": 6.37e-07, "loss": -0.0008, "reward": 1.7604167461395264, "reward_mean": 1.7604167461395264, "reward_std": 0.2062394917011261, "rewards/accuracy_reward": 0.7604167461395264, "rewards/format_reward": 1.0, "step": 363 }, { "advantages": -7.450580596923828e-09, "completion_length": 135.25, "epoch": 0.364, "grad_norm": 3.608915328979492, "kl": 0.390625, "learning_rate": 6.36e-07, "loss": -0.0212, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 364 }, { "advantages": -6.705522537231445e-08, "completion_length": 140.625, "epoch": 0.365, "grad_norm": 5.799376964569092, "kl": 0.4296875, "learning_rate": 6.35e-07, "loss": 0.0235, "reward": 1.4583333730697632, "reward_mean": 1.4583333730697632, "reward_std": 0.2630348801612854, "rewards/accuracy_reward": 0.4583333730697632, "rewards/format_reward": 1.0, "step": 365 }, { "advantages": 0.0, "completion_length": 129.8125, "epoch": 0.366, "grad_norm": 0.0, "kl": 0.4453125, "learning_rate": 6.34e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 366 }, { "advantages": 0.0, "completion_length": 137.6875, "epoch": 0.367, "grad_norm": 4.999783039093018, "kl": 0.44140625, "learning_rate": 6.33e-07, "loss": 0.0351, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 367 }, { "advantages": 0.0, "completion_length": 125.875, "epoch": 0.368, "grad_norm": 0.0, "kl": 0.3671875, "learning_rate": 6.319999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 368 }, { "advantages": 0.0, "completion_length": 134.875, "epoch": 0.369, "grad_norm": 0.0, "kl": 0.43359375, "learning_rate": 6.31e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 369 }, { "advantages": 0.0, "completion_length": 136.8125, "epoch": 0.37, "grad_norm": 0.0, "kl": 0.427734375, "learning_rate": 6.3e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 370 }, { "advantages": -8.195638656616211e-08, "completion_length": 142.375, "epoch": 0.371, "grad_norm": 6.962843418121338, "kl": 0.40625, "learning_rate": 6.289999999999999e-07, "loss": -0.0594, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.2630348801612854, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 371 }, { "advantages": -7.450580596923828e-09, "completion_length": 136.625, "epoch": 0.372, "grad_norm": 6.798043251037598, "kl": 0.447265625, "learning_rate": 6.28e-07, "loss": -0.0675, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 372 }, { "advantages": 0.0, "completion_length": 130.625, "epoch": 0.373, "grad_norm": 5.091549396514893, "kl": 0.447265625, "learning_rate": 6.27e-07, "loss": 0.0482, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 373 }, { "advantages": 3.725290298461914e-09, "completion_length": 131.5625, "epoch": 0.374, "grad_norm": 5.158649444580078, "kl": 0.4453125, "learning_rate": 6.26e-07, "loss": 0.0248, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 374 }, { "advantages": -3.725290298461914e-09, "completion_length": 150.875, "epoch": 0.375, "grad_norm": 4.258111953735352, "kl": 0.392578125, "learning_rate": 6.249999999999999e-07, "loss": -0.0959, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 375 }, { "advantages": 1.4901161193847656e-08, "completion_length": 128.4375, "epoch": 0.376, "grad_norm": 4.292641639709473, "kl": 0.40625, "learning_rate": 6.24e-07, "loss": 0.0573, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 376 }, { "advantages": 0.0, "completion_length": 129.8125, "epoch": 0.377, "grad_norm": 0.0, "kl": 0.41796875, "learning_rate": 6.23e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 377 }, { "advantages": 0.0, "completion_length": 161.0, "epoch": 0.378, "grad_norm": 0.0, "kl": 0.390625, "learning_rate": 6.219999999999999e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 378 }, { "advantages": 0.0, "completion_length": 148.5625, "epoch": 0.379, "grad_norm": 4.622002124786377, "kl": 0.42578125, "learning_rate": 6.21e-07, "loss": 0.0414, "reward": 1.9166667461395264, "reward_mean": 1.9166667461395264, "reward_std": 0.08908706158399582, "rewards/accuracy_reward": 0.9166666865348816, "rewards/format_reward": 1.0, "step": 379 }, { "advantages": 0.0, "completion_length": 130.8125, "epoch": 0.38, "grad_norm": 6.805364608764648, "kl": 0.4453125, "learning_rate": 6.2e-07, "loss": 0.1685, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 380 }, { "advantages": 0.0, "completion_length": 146.0, "epoch": 0.381, "grad_norm": 4.019841194152832, "kl": 0.416015625, "learning_rate": 6.189999999999999e-07, "loss": -0.0306, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 381 }, { "advantages": 0.0, "completion_length": 133.1875, "epoch": 0.382, "grad_norm": 0.0, "kl": 0.54296875, "learning_rate": 6.18e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 382 }, { "advantages": -7.450580596923828e-09, "completion_length": 129.8125, "epoch": 0.383, "grad_norm": 4.163370132446289, "kl": 0.435546875, "learning_rate": 6.17e-07, "loss": -0.0085, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 383 }, { "advantages": 7.450580596923828e-09, "completion_length": 170.8125, "epoch": 0.384, "grad_norm": 3.4316840171813965, "kl": 0.4453125, "learning_rate": 6.16e-07, "loss": -0.1353, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.18898223340511322, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 384 }, { "advantages": 6.705522537231445e-08, "completion_length": 181.375, "epoch": 0.385, "grad_norm": 3.732250690460205, "kl": 0.486328125, "learning_rate": 6.149999999999999e-07, "loss": -0.0222, "reward": 1.2916667461395264, "reward_mean": 1.2916667461395264, "reward_std": 0.1178511530160904, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 1.0, "step": 385 }, { "advantages": -7.450580596923828e-09, "completion_length": 143.1875, "epoch": 0.386, "grad_norm": 4.219268321990967, "kl": 0.404296875, "learning_rate": 6.14e-07, "loss": 0.0139, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 386 }, { "advantages": -7.450580596923828e-08, "completion_length": 145.0625, "epoch": 0.387, "grad_norm": 4.608545780181885, "kl": 0.5234375, "learning_rate": 6.13e-07, "loss": 0.0689, "reward": 1.0833333730697632, "reward_mean": 1.0833333730697632, "reward_std": 0.15430335700511932, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 1.0, "step": 387 }, { "advantages": 1.862645149230957e-08, "completion_length": 124.1875, "epoch": 0.388, "grad_norm": 5.094681262969971, "kl": 0.4140625, "learning_rate": 6.119999999999999e-07, "loss": -0.0963, "reward": 1.1875, "reward_mean": 1.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 1.0, "step": 388 }, { "advantages": 7.450580596923828e-09, "completion_length": 158.1875, "epoch": 0.389, "grad_norm": 4.464499473571777, "kl": 0.4296875, "learning_rate": 6.11e-07, "loss": -0.0446, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 389 }, { "advantages": 0.0, "completion_length": 182.6875, "epoch": 0.39, "grad_norm": 0.0, "kl": 0.3984375, "learning_rate": 6.1e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 390 }, { "advantages": 0.0, "completion_length": 154.5625, "epoch": 0.391, "grad_norm": 0.0, "kl": 0.75, "learning_rate": 6.089999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 391 }, { "advantages": 0.0, "completion_length": 149.4375, "epoch": 0.392, "grad_norm": 0.0, "kl": 0.46484375, "learning_rate": 6.079999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 392 }, { "advantages": 0.0, "completion_length": 136.8125, "epoch": 0.393, "grad_norm": 0.0, "kl": 0.421875, "learning_rate": 6.07e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 393 }, { "advantages": 0.0, "completion_length": 169.1875, "epoch": 0.394, "grad_norm": 0.0, "kl": 0.4296875, "learning_rate": 6.06e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 394 }, { "advantages": 0.0, "completion_length": 164.6875, "epoch": 0.395, "grad_norm": 0.0, "kl": 0.416015625, "learning_rate": 6.049999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 395 }, { "advantages": 0.0, "completion_length": 128.9375, "epoch": 0.396, "grad_norm": 0.0, "kl": 0.4296875, "learning_rate": 6.04e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 396 }, { "advantages": 0.0, "completion_length": 153.375, "epoch": 0.397, "grad_norm": 0.0, "kl": 0.392578125, "learning_rate": 6.03e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 397 }, { "advantages": 8.195638656616211e-08, "completion_length": 154.6875, "epoch": 0.398, "grad_norm": 5.41452169418335, "kl": 0.671875, "learning_rate": 6.019999999999999e-07, "loss": -0.184, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.3382667005062103, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 398 }, { "advantages": -1.4901161193847656e-08, "completion_length": 154.3125, "epoch": 0.399, "grad_norm": 4.080648899078369, "kl": 0.453125, "learning_rate": 6.009999999999999e-07, "loss": 0.0161, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 399 }, { "advantages": 0.0, "completion_length": 169.75, "epoch": 0.4, "grad_norm": 0.0, "kl": 0.4453125, "learning_rate": 6e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 400 }, { "advantages": 0.0, "completion_length": 170.5, "epoch": 0.401, "grad_norm": 0.0, "kl": 0.54296875, "learning_rate": 5.989999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 401 }, { "advantages": 0.0, "completion_length": 141.875, "epoch": 0.402, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 5.979999999999999e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 402 }, { "advantages": 0.0, "completion_length": 186.8125, "epoch": 0.403, "grad_norm": 4.032413959503174, "kl": 0.375, "learning_rate": 5.97e-07, "loss": 0.0794, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 403 }, { "advantages": -1.4901161193847656e-08, "completion_length": 142.25, "epoch": 0.404, "grad_norm": 4.112726211547852, "kl": 0.4140625, "learning_rate": 5.96e-07, "loss": -0.1048, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 404 }, { "advantages": 6.705522537231445e-08, "completion_length": 190.5625, "epoch": 0.405, "grad_norm": 3.8361196517944336, "kl": 0.357421875, "learning_rate": 5.949999999999999e-07, "loss": 0.0243, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.1178511530160904, "rewards/accuracy_reward": 0.6250000596046448, "rewards/format_reward": 1.0, "step": 405 }, { "advantages": 0.0, "completion_length": 168.5625, "epoch": 0.406, "grad_norm": 0.0, "kl": 0.390625, "learning_rate": 5.939999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 406 }, { "advantages": 0.0, "completion_length": 144.75, "epoch": 0.407, "grad_norm": 0.0, "kl": 0.396484375, "learning_rate": 5.93e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 407 }, { "advantages": 0.0, "completion_length": 200.375, "epoch": 0.408, "grad_norm": 0.0, "kl": 0.3828125, "learning_rate": 5.919999999999999e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 408 }, { "advantages": 0.0, "completion_length": 177.75, "epoch": 0.409, "grad_norm": 0.0, "kl": 0.44140625, "learning_rate": 5.909999999999999e-07, "loss": 0.0, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.8333333730697632, "rewards/format_reward": 1.0, "step": 409 }, { "advantages": 0.0, "completion_length": 150.4375, "epoch": 0.41, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.9e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 410 }, { "advantages": 7.450580596923828e-08, "completion_length": 191.9375, "epoch": 0.411, "grad_norm": 3.727123737335205, "kl": 0.404296875, "learning_rate": 5.89e-07, "loss": 0.0212, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.0862581878900528, "rewards/accuracy_reward": 0.9375000596046448, "rewards/format_reward": 1.0, "step": 411 }, { "advantages": -1.862645149230957e-08, "completion_length": 202.25, "epoch": 0.412, "grad_norm": 3.3219895362854004, "kl": 0.37890625, "learning_rate": 5.879999999999999e-07, "loss": -0.0367, "reward": 1.3125, "reward_mean": 1.3125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 1.0, "step": 412 }, { "advantages": -7.450580596923828e-09, "completion_length": 218.8125, "epoch": 0.413, "grad_norm": 3.1788170337677, "kl": 0.3515625, "learning_rate": 5.87e-07, "loss": -0.0693, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 413 }, { "advantages": 0.0, "completion_length": 229.3125, "epoch": 0.414, "grad_norm": 0.0, "kl": 0.36328125, "learning_rate": 5.86e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 414 }, { "advantages": 7.450580596923828e-09, "completion_length": 226.9375, "epoch": 0.415, "grad_norm": 2.9099948406219482, "kl": 0.388671875, "learning_rate": 5.849999999999999e-07, "loss": 0.0902, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 415 }, { "advantages": -2.9802322387695312e-08, "completion_length": 260.3125, "epoch": 0.416, "grad_norm": 4.535805702209473, "kl": 0.37109375, "learning_rate": 5.839999999999999e-07, "loss": -0.0929, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.44478052854537964, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 416 }, { "advantages": -3.725290298461914e-09, "completion_length": 213.5625, "epoch": 0.417, "grad_norm": 3.6784188747406006, "kl": 0.39453125, "learning_rate": 5.83e-07, "loss": 0.0398, "reward": 1.6770833730697632, "reward_mean": 1.6770833730697632, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.6770833730697632, "rewards/format_reward": 1.0, "step": 417 }, { "advantages": 0.0, "completion_length": 169.125, "epoch": 0.418, "grad_norm": 0.0, "kl": 0.34375, "learning_rate": 5.819999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 418 }, { "advantages": -7.078051567077637e-08, "completion_length": 241.4375, "epoch": 0.419, "grad_norm": 3.5229690074920654, "kl": 0.380859375, "learning_rate": 5.809999999999999e-07, "loss": 0.0437, "reward": 1.3958333730697632, "reward_mean": 1.3958333730697632, "reward_std": 0.0862581804394722, "rewards/accuracy_reward": 0.3958333730697632, "rewards/format_reward": 1.0, "step": 419 }, { "advantages": 0.0, "completion_length": 199.5, "epoch": 0.42, "grad_norm": 0.0, "kl": 0.392578125, "learning_rate": 5.8e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 420 }, { "advantages": -1.4901161193847656e-08, "completion_length": 234.625, "epoch": 0.421, "grad_norm": 4.843015193939209, "kl": 0.40234375, "learning_rate": 5.79e-07, "loss": -0.0363, "reward": 1.7916667461395264, "reward_mean": 1.7916667461395264, "reward_std": 0.3205420970916748, "rewards/accuracy_reward": 0.7916667461395264, "rewards/format_reward": 1.0, "step": 421 }, { "advantages": 1.2665987014770508e-07, "completion_length": 218.4375, "epoch": 0.422, "grad_norm": 5.061634540557861, "kl": 0.37890625, "learning_rate": 5.779999999999999e-07, "loss": 0.0372, "reward": 1.3854167461395264, "reward_mean": 1.3854167461395264, "reward_std": 0.1473138928413391, "rewards/accuracy_reward": 0.3854166865348816, "rewards/format_reward": 1.0, "step": 422 }, { "advantages": -1.4901161193847656e-08, "completion_length": 150.4375, "epoch": 0.423, "grad_norm": 4.365501880645752, "kl": 0.44140625, "learning_rate": 5.769999999999999e-07, "loss": 0.0644, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 423 }, { "advantages": 0.0, "completion_length": 184.5, "epoch": 0.424, "grad_norm": 0.0, "kl": 0.380859375, "learning_rate": 5.76e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 424 }, { "advantages": 3.725290298461914e-09, "completion_length": 170.8125, "epoch": 0.425, "grad_norm": 3.8927226066589355, "kl": 0.38671875, "learning_rate": 5.749999999999999e-07, "loss": -0.0322, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 425 }, { "advantages": 0.0, "completion_length": 201.1875, "epoch": 0.426, "grad_norm": 0.0, "kl": 0.390625, "learning_rate": 5.739999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 426 }, { "advantages": -7.450580596923828e-09, "completion_length": 238.875, "epoch": 0.427, "grad_norm": 2.8062853813171387, "kl": 0.37890625, "learning_rate": 5.73e-07, "loss": -0.0371, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 427 }, { "advantages": -7.450580596923828e-09, "completion_length": 191.8125, "epoch": 0.428, "grad_norm": 3.564711570739746, "kl": 0.39453125, "learning_rate": 5.719999999999999e-07, "loss": -0.012, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9375, "step": 428 }, { "advantages": -2.60770320892334e-08, "completion_length": 237.375, "epoch": 0.429, "grad_norm": 5.137650012969971, "kl": 0.35546875, "learning_rate": 5.709999999999999e-07, "loss": -0.0172, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.4355512857437134, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 429 }, { "advantages": 0.0, "completion_length": 228.375, "epoch": 0.43, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.699999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 430 }, { "advantages": 3.203749656677246e-07, "completion_length": 189.4375, "epoch": 0.431, "grad_norm": 3.582122325897217, "kl": 0.3828125, "learning_rate": 5.69e-07, "loss": 0.0219, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.058925580233335495, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 431 }, { "advantages": 0.0, "completion_length": 153.125, "epoch": 0.432, "grad_norm": 0.0, "kl": 0.3671875, "learning_rate": 5.679999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 432 }, { "advantages": 0.0, "completion_length": 226.8125, "epoch": 0.433, "grad_norm": 0.0, "kl": 0.38671875, "learning_rate": 5.669999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 433 }, { "advantages": 0.0, "completion_length": 194.875, "epoch": 0.434, "grad_norm": 0.0, "kl": 0.38671875, "learning_rate": 5.66e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 434 }, { "advantages": 0.0, "completion_length": 135.875, "epoch": 0.435, "grad_norm": 0.0, "kl": 0.396484375, "learning_rate": 5.649999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 435 }, { "advantages": 0.0, "completion_length": 173.5, "epoch": 0.436, "grad_norm": 0.0, "kl": 0.390625, "learning_rate": 5.639999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 436 }, { "advantages": -1.2665987014770508e-07, "completion_length": 221.3125, "epoch": 0.437, "grad_norm": 4.741230010986328, "kl": 0.3515625, "learning_rate": 5.629999999999999e-07, "loss": -0.0242, "reward": 1.7291667461395264, "reward_mean": 1.7291667461395264, "reward_std": 0.32618677616119385, "rewards/accuracy_reward": 0.7291666865348816, "rewards/format_reward": 1.0, "step": 437 }, { "advantages": 0.0, "completion_length": 229.0625, "epoch": 0.438, "grad_norm": 0.0, "kl": 0.359375, "learning_rate": 5.620000000000001e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 438 }, { "advantages": 0.0, "completion_length": 217.1875, "epoch": 0.439, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.61e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 439 }, { "advantages": 0.0, "completion_length": 221.0625, "epoch": 0.44, "grad_norm": 0.0, "kl": 0.49609375, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 1.3333333730697632, "reward_mean": 1.3333333730697632, "reward_std": 0.0, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 1.0, "step": 440 }, { "advantages": 7.450580596923828e-09, "completion_length": 192.6875, "epoch": 0.441, "grad_norm": 2.999258041381836, "kl": 0.47265625, "learning_rate": 5.590000000000001e-07, "loss": 0.0634, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 441 }, { "advantages": 1.2665987014770508e-07, "completion_length": 195.4375, "epoch": 0.442, "grad_norm": 4.589319705963135, "kl": 0.37109375, "learning_rate": 5.58e-07, "loss": -0.1397, "reward": 1.8541667461395264, "reward_mean": 1.8541667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.8541666865348816, "rewards/format_reward": 1.0, "step": 442 }, { "advantages": 0.0, "completion_length": 204.5625, "epoch": 0.443, "grad_norm": 3.8165395259857178, "kl": 0.46484375, "learning_rate": 5.57e-07, "loss": 0.122, "reward": 1.9166667461395264, "reward_mean": 1.9166667461395264, "reward_std": 0.08908707648515701, "rewards/accuracy_reward": 0.9166667461395264, "rewards/format_reward": 1.0, "step": 443 }, { "advantages": 1.4901161193847656e-08, "completion_length": 235.3125, "epoch": 0.444, "grad_norm": 3.3493289947509766, "kl": 0.40234375, "learning_rate": 5.560000000000001e-07, "loss": -0.0344, "reward": 1.4583333730697632, "reward_mean": 1.4583333730697632, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.4583333730697632, "rewards/format_reward": 1.0, "step": 444 }, { "advantages": -1.4901161193847656e-08, "completion_length": 173.0625, "epoch": 0.445, "grad_norm": 3.844341278076172, "kl": 0.52734375, "learning_rate": 5.55e-07, "loss": 0.0581, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 445 }, { "advantages": 0.0, "completion_length": 164.0, "epoch": 0.446, "grad_norm": 0.0, "kl": 0.421875, "learning_rate": 5.54e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 446 }, { "advantages": 7.450580596923828e-09, "completion_length": 160.0625, "epoch": 0.447, "grad_norm": 3.5318963527679443, "kl": 0.37890625, "learning_rate": 5.53e-07, "loss": 0.0321, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 447 }, { "advantages": 1.4901161193847656e-08, "completion_length": 207.6875, "epoch": 0.448, "grad_norm": 5.7608489990234375, "kl": 0.359375, "learning_rate": 5.520000000000001e-07, "loss": 0.0509, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.49871626496315, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 448 }, { "advantages": -7.450580596923828e-09, "completion_length": 242.75, "epoch": 0.449, "grad_norm": 4.810704708099365, "kl": 0.46875, "learning_rate": 5.51e-07, "loss": -0.0634, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 0.9375, "step": 449 }, { "advantages": 2.60770320892334e-08, "completion_length": 161.0625, "epoch": 0.45, "grad_norm": 7.428137302398682, "kl": 0.3984375, "learning_rate": 5.5e-07, "loss": 0.0819, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.4355512857437134, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 450 }, { "advantages": 0.0, "completion_length": 193.1875, "epoch": 0.451, "grad_norm": 0.0, "kl": 0.388671875, "learning_rate": 5.490000000000001e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 451 }, { "advantages": 1.862645149230957e-07, "completion_length": 235.75, "epoch": 0.452, "grad_norm": 3.380284547805786, "kl": 0.36328125, "learning_rate": 5.48e-07, "loss": -0.0072, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.0589255690574646, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 452 }, { "advantages": 7.078051567077637e-08, "completion_length": 177.75, "epoch": 0.453, "grad_norm": 4.489373683929443, "kl": 0.40234375, "learning_rate": 5.47e-07, "loss": -0.1424, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.0862581804394722, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 453 }, { "advantages": 0.0, "completion_length": 161.125, "epoch": 0.454, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.46e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 454 }, { "advantages": 1.862645149230957e-08, "completion_length": 179.1875, "epoch": 0.455, "grad_norm": 4.947906017303467, "kl": 0.419921875, "learning_rate": 5.45e-07, "loss": -0.1043, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 455 }, { "advantages": 0.0, "completion_length": 155.6875, "epoch": 0.456, "grad_norm": 0.0, "kl": 0.4140625, "learning_rate": 5.44e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 456 }, { "advantages": 0.0, "completion_length": 136.625, "epoch": 0.457, "grad_norm": 0.0, "kl": 0.361328125, "learning_rate": 5.43e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 457 }, { "advantages": 0.0, "completion_length": 149.5625, "epoch": 0.458, "grad_norm": 0.0, "kl": 0.421875, "learning_rate": 5.420000000000001e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 458 }, { "advantages": 0.0, "completion_length": 185.8125, "epoch": 0.459, "grad_norm": 0.0, "kl": 0.3984375, "learning_rate": 5.41e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 459 }, { "advantages": 1.4901161193847656e-08, "completion_length": 180.25, "epoch": 0.46, "grad_norm": 4.1631245613098145, "kl": 0.3671875, "learning_rate": 5.4e-07, "loss": 0.0139, "reward": 1.125, "reward_mean": 1.125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "step": 460 }, { "advantages": -1.2665987014770508e-07, "completion_length": 190.875, "epoch": 0.461, "grad_norm": 4.054723262786865, "kl": 0.37109375, "learning_rate": 5.39e-07, "loss": 0.0473, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 461 }, { "advantages": 0.0, "completion_length": 192.8125, "epoch": 0.462, "grad_norm": 4.4658122062683105, "kl": 0.447265625, "learning_rate": 5.38e-07, "loss": 0.0566, "reward": 1.5833333730697632, "reward_mean": 1.5833333730697632, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.5833333134651184, "rewards/format_reward": 1.0, "step": 462 }, { "advantages": 0.0, "completion_length": 192.625, "epoch": 0.463, "grad_norm": 0.0, "kl": 0.400390625, "learning_rate": 5.37e-07, "loss": 0.0, "reward": 1.6666667461395264, "reward_mean": 1.6666667461395264, "reward_std": 0.0, "rewards/accuracy_reward": 0.6666666269302368, "rewards/format_reward": 1.0, "step": 463 }, { "advantages": 0.0, "completion_length": 161.0, "epoch": 0.464, "grad_norm": 0.0, "kl": 0.36328125, "learning_rate": 5.36e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 464 }, { "advantages": 1.6391277313232422e-07, "completion_length": 180.875, "epoch": 0.465, "grad_norm": 4.05633544921875, "kl": 0.384765625, "learning_rate": 5.35e-07, "loss": 0.0124, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.08908708393573761, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 465 }, { "advantages": -7.82310962677002e-08, "completion_length": 189.9375, "epoch": 0.466, "grad_norm": 5.274670124053955, "kl": 0.3828125, "learning_rate": 5.34e-07, "loss": -0.021, "reward": 1.8333333730697632, "reward_mean": 1.8333333730697632, "reward_std": 0.2630348801612854, "rewards/accuracy_reward": 0.8958333730697632, "rewards/format_reward": 0.9375, "step": 466 }, { "advantages": 0.0, "completion_length": 219.5, "epoch": 0.467, "grad_norm": 0.0, "kl": 0.384765625, "learning_rate": 5.33e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 467 }, { "advantages": 0.0, "completion_length": 193.125, "epoch": 0.468, "grad_norm": 0.0, "kl": 0.369140625, "learning_rate": 5.32e-07, "loss": 0.0, "reward": 1.0, "reward_mean": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 1.0, "step": 468 }, { "advantages": -2.2351741790771484e-08, "completion_length": 181.75, "epoch": 0.469, "grad_norm": 5.394594669342041, "kl": 0.42578125, "learning_rate": 5.31e-07, "loss": 0.0852, "reward": 1.6458333730697632, "reward_mean": 1.6458333730697632, "reward_std": 0.4082317352294922, "rewards/accuracy_reward": 0.7708333730697632, "rewards/format_reward": 0.875, "step": 469 }, { "advantages": -7.450580596923828e-09, "completion_length": 162.875, "epoch": 0.47, "grad_norm": 5.32183837890625, "kl": 0.40234375, "learning_rate": 5.3e-07, "loss": 0.1006, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 470 }, { "advantages": 0.0, "completion_length": 171.9375, "epoch": 0.471, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.29e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 471 }, { "advantages": 0.0, "completion_length": 156.0, "epoch": 0.472, "grad_norm": 0.0, "kl": 0.390625, "learning_rate": 5.28e-07, "loss": 0.0, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 472 }, { "advantages": -1.2665987014770508e-07, "completion_length": 153.8125, "epoch": 0.473, "grad_norm": 5.101055145263672, "kl": 0.42578125, "learning_rate": 5.27e-07, "loss": 0.0903, "reward": 1.9791667461395264, "reward_mean": 1.9791667461395264, "reward_std": 0.05892554670572281, "rewards/accuracy_reward": 0.9791666865348816, "rewards/format_reward": 1.0, "step": 473 }, { "advantages": 0.0, "completion_length": 186.9375, "epoch": 0.474, "grad_norm": 0.0, "kl": 0.41796875, "learning_rate": 5.26e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 474 }, { "advantages": 3.725290298461914e-09, "completion_length": 185.75, "epoch": 0.475, "grad_norm": 4.348298072814941, "kl": 0.390625, "learning_rate": 5.25e-07, "loss": 0.0827, "reward": 1.15625, "reward_mean": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 1.0, "step": 475 }, { "advantages": 0.0, "completion_length": 199.0625, "epoch": 0.476, "grad_norm": 3.2782394886016846, "kl": 0.375, "learning_rate": 5.24e-07, "loss": -0.0661, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 476 }, { "advantages": 1.4901161193847656e-08, "completion_length": 170.6875, "epoch": 0.477, "grad_norm": 4.559285640716553, "kl": 0.40234375, "learning_rate": 5.23e-07, "loss": -0.0058, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 477 }, { "advantages": -1.862645149230957e-08, "completion_length": 182.0, "epoch": 0.478, "grad_norm": 3.9179017543792725, "kl": 0.44140625, "learning_rate": 5.22e-07, "loss": 0.0788, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 478 }, { "advantages": 0.0, "completion_length": 174.9375, "epoch": 0.479, "grad_norm": 4.1898298263549805, "kl": 0.3984375, "learning_rate": 5.21e-07, "loss": -0.058, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 479 }, { "advantages": 0.0, "completion_length": 158.5625, "epoch": 0.48, "grad_norm": 3.0333094596862793, "kl": 0.3828125, "learning_rate": 5.2e-07, "loss": 0.0462, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 480 }, { "advantages": 0.0, "completion_length": 142.5, "epoch": 0.481, "grad_norm": 0.0, "kl": 0.39453125, "learning_rate": 5.19e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 481 }, { "advantages": 0.0, "completion_length": 162.3125, "epoch": 0.482, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 5.18e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 482 }, { "advantages": 0.0, "completion_length": 152.4375, "epoch": 0.483, "grad_norm": 4.092982292175293, "kl": 0.3984375, "learning_rate": 5.17e-07, "loss": -0.008, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 483 }, { "advantages": 0.0, "completion_length": 175.5, "epoch": 0.484, "grad_norm": 0.0, "kl": 0.40625, "learning_rate": 5.16e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 484 }, { "advantages": 0.0, "completion_length": 166.8125, "epoch": 0.485, "grad_norm": 0.0, "kl": 0.4375, "learning_rate": 5.149999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 485 }, { "advantages": 1.862645149230957e-08, "completion_length": 151.8125, "epoch": 0.486, "grad_norm": 5.26322078704834, "kl": 0.41015625, "learning_rate": 5.14e-07, "loss": -0.0091, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 486 }, { "advantages": 0.0, "completion_length": 182.5625, "epoch": 0.487, "grad_norm": 0.0, "kl": 0.42578125, "learning_rate": 5.13e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 487 }, { "advantages": 0.0, "completion_length": 158.125, "epoch": 0.488, "grad_norm": 0.0, "kl": 0.44140625, "learning_rate": 5.12e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 488 }, { "advantages": 1.4901161193847656e-08, "completion_length": 202.3125, "epoch": 0.489, "grad_norm": 6.207299709320068, "kl": 0.33984375, "learning_rate": 5.11e-07, "loss": -0.1083, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 489 }, { "advantages": -7.450580596923828e-09, "completion_length": 167.625, "epoch": 0.49, "grad_norm": 3.2399141788482666, "kl": 0.6015625, "learning_rate": 5.1e-07, "loss": -0.042, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 490 }, { "advantages": 3.203749656677246e-07, "completion_length": 148.0625, "epoch": 0.491, "grad_norm": 4.004068851470947, "kl": 0.48046875, "learning_rate": 5.09e-07, "loss": 0.0066, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.058925580233335495, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 491 }, { "advantages": 0.0, "completion_length": 189.375, "epoch": 0.492, "grad_norm": 0.0, "kl": 0.40234375, "learning_rate": 5.079999999999999e-07, "loss": 0.0, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 492 }, { "advantages": 0.0, "completion_length": 157.5625, "epoch": 0.493, "grad_norm": 0.0, "kl": 0.40234375, "learning_rate": 5.07e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 493 }, { "advantages": 0.0, "completion_length": 145.6875, "epoch": 0.494, "grad_norm": 0.0, "kl": 0.494140625, "learning_rate": 5.06e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 494 }, { "advantages": 1.6391277313232422e-07, "completion_length": 151.25, "epoch": 0.495, "grad_norm": 4.36698579788208, "kl": 0.427734375, "learning_rate": 5.049999999999999e-07, "loss": -0.0001, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0890870913863182, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 495 }, { "advantages": -3.725290298461914e-09, "completion_length": 179.875, "epoch": 0.496, "grad_norm": 4.543258190155029, "kl": 0.3828125, "learning_rate": 5.04e-07, "loss": -0.0217, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 496 }, { "advantages": 0.0, "completion_length": 188.625, "epoch": 0.497, "grad_norm": 0.0, "kl": 0.41796875, "learning_rate": 5.03e-07, "loss": 0.0, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 497 }, { "advantages": -1.4901161193847656e-08, "completion_length": 148.8125, "epoch": 0.498, "grad_norm": 4.038569450378418, "kl": 0.4609375, "learning_rate": 5.02e-07, "loss": -0.0576, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 498 }, { "advantages": 0.0, "completion_length": 129.25, "epoch": 0.499, "grad_norm": 0.0, "kl": 0.42578125, "learning_rate": 5.009999999999999e-07, "loss": 0.0, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 499 }, { "advantages": 7.450580596923828e-09, "completion_length": 148.125, "epoch": 0.5, "grad_norm": 5.02844762802124, "kl": 0.4453125, "learning_rate": 5e-07, "loss": -0.0852, "reward": 1.0625, "reward_mean": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 1.0, "step": 500 } ], "logging_steps": 1.0, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }