{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 161, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantages": 0.0, "completion_length": 77.421875, "epoch": 0.006211180124223602, "grad_norm": 4.007043838500977, "kl": 0.0, "learning_rate": 9.937888198757763e-07, "loss": -0.0, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 1 }, { "advantages": 1.30385160446167e-08, "completion_length": 78.921875, "epoch": 0.012422360248447204, "grad_norm": 7.15122652053833, "kl": 0.00041961669921875, "learning_rate": 9.875776397515528e-07, "loss": 0.0, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.2845909595489502, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 2 }, { "advantages": 0.0, "completion_length": 78.671875, "epoch": 0.018633540372670808, "grad_norm": 2.9171459674835205, "kl": 0.000392913818359375, "learning_rate": 9.813664596273291e-07, "loss": 0.0, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 3 }, { "advantages": -1.862645149230957e-09, "completion_length": 77.78125, "epoch": 0.024844720496894408, "grad_norm": 5.474589824676514, "kl": 0.000759124755859375, "learning_rate": 9.751552795031055e-07, "loss": 0.0001, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.213067427277565, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 4 }, { "advantages": -2.7939677238464355e-09, "completion_length": 90.8125, "epoch": 0.031055900621118012, "grad_norm": 5.2480363845825195, "kl": 0.001495361328125, "learning_rate": 9.68944099378882e-07, "loss": 0.0001, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.17570313811302185, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 5 }, { "advantages": 6.51925802230835e-09, "completion_length": 81.203125, "epoch": 0.037267080745341616, "grad_norm": 8.329508781433105, "kl": 0.006256103515625, "learning_rate": 9.627329192546583e-07, "loss": 0.0006, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.26409146189689636, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 6 }, { "advantages": -3.725290298461914e-09, "completion_length": 84.265625, "epoch": 0.043478260869565216, "grad_norm": 9.400680541992188, "kl": 0.0106201171875, "learning_rate": 9.565217391304349e-07, "loss": 0.0011, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.2404065877199173, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 7 }, { "advantages": -2.7939677238464355e-09, "completion_length": 84.109375, "epoch": 0.049689440993788817, "grad_norm": 6.011223316192627, "kl": 0.0057373046875, "learning_rate": 9.503105590062112e-07, "loss": 0.0006, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 8 }, { "advantages": -3.725290298461914e-09, "completion_length": 87.046875, "epoch": 0.055900621118012424, "grad_norm": 4.103212356567383, "kl": 0.00244140625, "learning_rate": 9.440993788819875e-07, "loss": 0.0002, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 9 }, { "advantages": -7.450580596923828e-09, "completion_length": 76.15625, "epoch": 0.062111801242236024, "grad_norm": 7.4132466316223145, "kl": 0.01287841796875, "learning_rate": 9.37888198757764e-07, "loss": 0.0013, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.3335031569004059, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 1.0, "step": 10 }, { "advantages": 3.725290298461914e-09, "completion_length": 76.75, "epoch": 0.06832298136645963, "grad_norm": 4.6751017570495605, "kl": 0.0130615234375, "learning_rate": 9.316770186335403e-07, "loss": 0.0013, "reward": 1.28125, "reward_mean": 1.28125, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 1.0, "step": 11 }, { "advantages": -1.862645149230957e-09, "completion_length": 76.390625, "epoch": 0.07453416149068323, "grad_norm": 7.682182788848877, "kl": 0.018310546875, "learning_rate": 9.254658385093167e-07, "loss": 0.0018, "reward": 1.734375, "reward_mean": 1.734375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 12 }, { "advantages": -2.7939677238464355e-09, "completion_length": 85.234375, "epoch": 0.08074534161490683, "grad_norm": 4.814305305480957, "kl": 0.00396728515625, "learning_rate": 9.19254658385093e-07, "loss": 0.0004, "reward": 1.8125, "reward_mean": 1.8125, "reward_std": 0.22461533546447754, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 1.0, "step": 13 }, { "advantages": -1.862645149230957e-09, "completion_length": 86.3125, "epoch": 0.08695652173913043, "grad_norm": 189.3062744140625, "kl": 0.033203125, "learning_rate": 9.130434782608695e-07, "loss": 0.0033, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 14 }, { "advantages": 2.7939677238464355e-09, "completion_length": 69.8125, "epoch": 0.09316770186335403, "grad_norm": 3.5520412921905518, "kl": 0.01129150390625, "learning_rate": 9.06832298136646e-07, "loss": 0.0011, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 15 }, { "advantages": -9.313225746154785e-10, "completion_length": 76.296875, "epoch": 0.09937888198757763, "grad_norm": 3.526542901992798, "kl": 0.00909423828125, "learning_rate": 9.006211180124223e-07, "loss": 0.0009, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 16 }, { "advantages": 1.862645149230957e-09, "completion_length": 86.3125, "epoch": 0.10559006211180125, "grad_norm": 5.98048210144043, "kl": 0.0057373046875, "learning_rate": 8.944099378881988e-07, "loss": 0.0006, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 17 }, { "advantages": 0.0, "completion_length": 82.921875, "epoch": 0.11180124223602485, "grad_norm": 2.0705599784851074, "kl": 0.00592041015625, "learning_rate": 8.881987577639751e-07, "loss": 0.0006, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.984375, "step": 18 }, { "advantages": -2.7939677238464355e-09, "completion_length": 81.40625, "epoch": 0.11801242236024845, "grad_norm": 9.266715049743652, "kl": 0.0079345703125, "learning_rate": 8.819875776397515e-07, "loss": 0.0008, "reward": 1.546875, "reward_mean": 1.546875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 0.984375, "step": 19 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.203125, "epoch": 0.12422360248447205, "grad_norm": 10.367863655090332, "kl": 0.0072021484375, "learning_rate": 8.757763975155279e-07, "loss": 0.0007, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.2404065728187561, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 20 }, { "advantages": 1.862645149230957e-09, "completion_length": 75.734375, "epoch": 0.13043478260869565, "grad_norm": 2.6553478240966797, "kl": 0.00592041015625, "learning_rate": 8.695652173913043e-07, "loss": 0.0006, "reward": 1.578125, "reward_mean": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 21 }, { "advantages": 3.725290298461914e-09, "completion_length": 85.8125, "epoch": 0.13664596273291926, "grad_norm": 3.458266496658325, "kl": 0.00604248046875, "learning_rate": 8.633540372670807e-07, "loss": 0.0006, "reward": 1.515625, "reward_mean": 1.515625, "reward_std": 0.15981829166412354, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 0.984375, "step": 22 }, { "advantages": -2.7939677238464355e-09, "completion_length": 79.328125, "epoch": 0.14285714285714285, "grad_norm": 3.2002384662628174, "kl": 0.00543212890625, "learning_rate": 8.57142857142857e-07, "loss": 0.0005, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 23 }, { "advantages": -1.862645149230957e-09, "completion_length": 75.375, "epoch": 0.14906832298136646, "grad_norm": 5.946903705596924, "kl": 0.0087890625, "learning_rate": 8.509316770186336e-07, "loss": 0.0009, "reward": 1.484375, "reward_mean": 1.484375, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.984375, "step": 24 }, { "advantages": -9.313225746154785e-10, "completion_length": 68.8125, "epoch": 0.15527950310559005, "grad_norm": 4.977855682373047, "kl": 0.008544921875, "learning_rate": 8.447204968944099e-07, "loss": 0.0009, "reward": 1.734375, "reward_mean": 1.734375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 25 }, { "advantages": -8.381903171539307e-09, "completion_length": 83.59375, "epoch": 0.16149068322981366, "grad_norm": 4.409206390380859, "kl": 0.01239013671875, "learning_rate": 8.385093167701863e-07, "loss": 0.0012, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 26 }, { "advantages": 1.862645149230957e-09, "completion_length": 78.109375, "epoch": 0.16770186335403728, "grad_norm": 3.1185989379882812, "kl": 0.006072998046875, "learning_rate": 8.322981366459628e-07, "loss": 0.0006, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 27 }, { "advantages": 4.6566128730773926e-09, "completion_length": 72.0625, "epoch": 0.17391304347826086, "grad_norm": 4.9565935134887695, "kl": 0.010009765625, "learning_rate": 8.260869565217391e-07, "loss": 0.001, "reward": 1.34375, "reward_mean": 1.34375, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 28 }, { "advantages": -4.6566128730773926e-09, "completion_length": 84.953125, "epoch": 0.18012422360248448, "grad_norm": 4.35609769821167, "kl": 0.011962890625, "learning_rate": 8.198757763975155e-07, "loss": 0.0012, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.25513991713523865, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 29 }, { "advantages": -9.313225746154785e-10, "completion_length": 85.34375, "epoch": 0.18633540372670807, "grad_norm": 5.767938137054443, "kl": 0.009033203125, "learning_rate": 8.136645962732918e-07, "loss": 0.0009, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 30 }, { "advantages": -5.587935447692871e-09, "completion_length": 83.5625, "epoch": 0.19254658385093168, "grad_norm": 49.12059783935547, "kl": 0.0091552734375, "learning_rate": 8.074534161490683e-07, "loss": 0.0009, "reward": 1.578125, "reward_mean": 1.578125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 1.0, "step": 31 }, { "advantages": 4.6566128730773926e-09, "completion_length": 77.734375, "epoch": 0.19875776397515527, "grad_norm": 1.4828208684921265, "kl": 0.00970458984375, "learning_rate": 8.012422360248446e-07, "loss": 0.001, "reward": 1.421875, "reward_mean": 1.421875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 32 }, { "advantages": 1.862645149230957e-09, "completion_length": 78.453125, "epoch": 0.20496894409937888, "grad_norm": 7.876468658447266, "kl": 0.020263671875, "learning_rate": 7.95031055900621e-07, "loss": 0.002, "reward": 1.734375, "reward_mean": 1.734375, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 33 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.5, "epoch": 0.2111801242236025, "grad_norm": 3.8213541507720947, "kl": 0.01361083984375, "learning_rate": 7.888198757763976e-07, "loss": 0.0014, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 34 }, { "advantages": -6.51925802230835e-09, "completion_length": 89.5625, "epoch": 0.21739130434782608, "grad_norm": 3.453101634979248, "kl": 0.01470947265625, "learning_rate": 7.826086956521739e-07, "loss": 0.0015, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 35 }, { "advantages": 0.0, "completion_length": 90.28125, "epoch": 0.2236024844720497, "grad_norm": 2.642101526260376, "kl": 0.0107421875, "learning_rate": 7.763975155279503e-07, "loss": 0.0011, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 36 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.3125, "epoch": 0.22981366459627328, "grad_norm": 3.5424673557281494, "kl": 0.01239013671875, "learning_rate": 7.701863354037266e-07, "loss": 0.0012, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 37 }, { "advantages": 0.0, "completion_length": 84.140625, "epoch": 0.2360248447204969, "grad_norm": 0.38800248503685, "kl": 0.01275634765625, "learning_rate": 7.639751552795031e-07, "loss": 0.0013, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 38 }, { "advantages": 1.862645149230957e-09, "completion_length": 89.109375, "epoch": 0.2422360248447205, "grad_norm": 2.645474433898926, "kl": 0.01397705078125, "learning_rate": 7.577639751552795e-07, "loss": 0.0014, "reward": 1.515625, "reward_mean": 1.515625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 39 }, { "advantages": -3.725290298461914e-09, "completion_length": 72.296875, "epoch": 0.2484472049689441, "grad_norm": 8.574762344360352, "kl": 0.0159912109375, "learning_rate": 7.515527950310558e-07, "loss": 0.0016, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 40 }, { "advantages": 7.450580596923828e-09, "completion_length": 86.046875, "epoch": 0.2546583850931677, "grad_norm": 36.26329040527344, "kl": 0.0147705078125, "learning_rate": 7.453416149068323e-07, "loss": 0.0015, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 41 }, { "advantages": -3.725290298461914e-09, "completion_length": 77.0625, "epoch": 0.2608695652173913, "grad_norm": 10.992830276489258, "kl": 0.0113525390625, "learning_rate": 7.391304347826086e-07, "loss": 0.0011, "reward": 1.703125, "reward_mean": 1.703125, "reward_std": 0.24464011192321777, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 42 }, { "advantages": -1.862645149230957e-09, "completion_length": 86.53125, "epoch": 0.2670807453416149, "grad_norm": 6.251725673675537, "kl": 0.009033203125, "learning_rate": 7.329192546583851e-07, "loss": 0.0009, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 43 }, { "advantages": -4.6566128730773926e-09, "completion_length": 86.4375, "epoch": 0.2732919254658385, "grad_norm": 3.8048486709594727, "kl": 0.01385498046875, "learning_rate": 7.267080745341615e-07, "loss": 0.0014, "reward": 1.765625, "reward_mean": 1.765625, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 44 }, { "advantages": -1.862645149230957e-09, "completion_length": 84.21875, "epoch": 0.2795031055900621, "grad_norm": 2.5062499046325684, "kl": 0.00811767578125, "learning_rate": 7.204968944099379e-07, "loss": 0.0008, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.8125, "rewards/format_reward": 0.984375, "step": 45 }, { "advantages": 3.725290298461914e-09, "completion_length": 77.9375, "epoch": 0.2857142857142857, "grad_norm": 3.8415560722351074, "kl": 0.01165771484375, "learning_rate": 7.142857142857143e-07, "loss": 0.0012, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 46 }, { "advantages": 1.862645149230957e-09, "completion_length": 82.6875, "epoch": 0.2919254658385093, "grad_norm": 2.903069496154785, "kl": 0.012451171875, "learning_rate": 7.080745341614906e-07, "loss": 0.0012, "reward": 1.578125, "reward_mean": 1.578125, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.984375, "step": 47 }, { "advantages": -2.7939677238464355e-09, "completion_length": 75.578125, "epoch": 0.2981366459627329, "grad_norm": 11.884781837463379, "kl": 0.0125732421875, "learning_rate": 7.018633540372671e-07, "loss": 0.0013, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.17570312321186066, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 48 }, { "advantages": -1.862645149230957e-09, "completion_length": 73.34375, "epoch": 0.30434782608695654, "grad_norm": 2.234876871109009, "kl": 0.0084228515625, "learning_rate": 6.956521739130434e-07, "loss": 0.0008, "reward": 1.484375, "reward_mean": 1.484375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 49 }, { "advantages": 1.862645149230957e-09, "completion_length": 81.8125, "epoch": 0.3105590062111801, "grad_norm": 4.401739597320557, "kl": 0.007232666015625, "learning_rate": 6.894409937888198e-07, "loss": 0.0007, "reward": 1.765625, "reward_mean": 1.765625, "reward_std": 0.17782479524612427, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 50 }, { "advantages": 0.0, "completion_length": 84.015625, "epoch": 0.3167701863354037, "grad_norm": 0.28293830156326294, "kl": 0.0062255859375, "learning_rate": 6.832298136645962e-07, "loss": 0.0006, "reward": 2.0, "reward_mean": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 51 }, { "advantages": 0.0, "completion_length": 79.125, "epoch": 0.32298136645962733, "grad_norm": 2.2039246559143066, "kl": 0.0106201171875, "learning_rate": 6.770186335403726e-07, "loss": 0.0011, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 52 }, { "advantages": 9.313225746154785e-10, "completion_length": 76.0625, "epoch": 0.32919254658385094, "grad_norm": 4.176709175109863, "kl": 0.01123046875, "learning_rate": 6.708074534161491e-07, "loss": 0.0011, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 53 }, { "advantages": -3.725290298461914e-09, "completion_length": 81.125, "epoch": 0.33540372670807456, "grad_norm": 30.12848663330078, "kl": 0.099609375, "learning_rate": 6.645962732919254e-07, "loss": 0.01, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 54 }, { "advantages": -1.862645149230957e-09, "completion_length": 80.609375, "epoch": 0.3416149068322981, "grad_norm": 12.808406829833984, "kl": 0.01416015625, "learning_rate": 6.583850931677019e-07, "loss": 0.0014, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.2238783985376358, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 55 }, { "advantages": -5.587935447692871e-09, "completion_length": 76.15625, "epoch": 0.34782608695652173, "grad_norm": 5.750046253204346, "kl": 0.01019287109375, "learning_rate": 6.521739130434782e-07, "loss": 0.001, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.984375, "step": 56 }, { "advantages": -3.725290298461914e-09, "completion_length": 76.765625, "epoch": 0.35403726708074534, "grad_norm": 4.7853102684021, "kl": 0.010986328125, "learning_rate": 6.459627329192546e-07, "loss": 0.0011, "reward": 1.328125, "reward_mean": 1.328125, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 57 }, { "advantages": 4.6566128730773926e-09, "completion_length": 88.796875, "epoch": 0.36024844720496896, "grad_norm": 1.7344197034835815, "kl": 0.00982666015625, "learning_rate": 6.39751552795031e-07, "loss": 0.001, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 58 }, { "advantages": -9.313225746154785e-09, "completion_length": 84.28125, "epoch": 0.36645962732919257, "grad_norm": 3.1260499954223633, "kl": 0.01416015625, "learning_rate": 6.335403726708074e-07, "loss": 0.0014, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1828794628381729, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 0.984375, "step": 59 }, { "advantages": -3.725290298461914e-09, "completion_length": 77.21875, "epoch": 0.37267080745341613, "grad_norm": 1.7963190078735352, "kl": 0.0089111328125, "learning_rate": 6.273291925465838e-07, "loss": 0.0009, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 60 }, { "advantages": 0.0, "completion_length": 82.25, "epoch": 0.37888198757763975, "grad_norm": 2.5049538612365723, "kl": 0.00787353515625, "learning_rate": 6.211180124223601e-07, "loss": 0.0008, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 1.0, "step": 61 }, { "advantages": -1.210719347000122e-08, "completion_length": 80.375, "epoch": 0.38509316770186336, "grad_norm": 5.739541530609131, "kl": 0.01312255859375, "learning_rate": 6.149068322981367e-07, "loss": 0.0013, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 62 }, { "advantages": 1.862645149230957e-09, "completion_length": 84.296875, "epoch": 0.391304347826087, "grad_norm": 4.335031032562256, "kl": 0.01116943359375, "learning_rate": 6.08695652173913e-07, "loss": 0.0011, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 63 }, { "advantages": 1.862645149230957e-09, "completion_length": 86.828125, "epoch": 0.39751552795031053, "grad_norm": 4.443232536315918, "kl": 0.01336669921875, "learning_rate": 6.024844720496894e-07, "loss": 0.0013, "reward": 1.703125, "reward_mean": 1.703125, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.703125, "rewards/format_reward": 1.0, "step": 64 }, { "advantages": 5.587935447692871e-09, "completion_length": 75.71875, "epoch": 0.40372670807453415, "grad_norm": 7.092515468597412, "kl": 0.01251220703125, "learning_rate": 5.962732919254659e-07, "loss": 0.0013, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.23827511072158813, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 65 }, { "advantages": 4.6566128730773926e-09, "completion_length": 82.140625, "epoch": 0.40993788819875776, "grad_norm": 4.468729496002197, "kl": 0.0211181640625, "learning_rate": 5.900621118012422e-07, "loss": 0.0021, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 66 }, { "advantages": 4.6566128730773926e-09, "completion_length": 74.890625, "epoch": 0.4161490683229814, "grad_norm": 9.289567947387695, "kl": 0.01611328125, "learning_rate": 5.838509316770186e-07, "loss": 0.0016, "reward": 1.421875, "reward_mean": 1.421875, "reward_std": 0.1983242630958557, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 67 }, { "advantages": 0.0, "completion_length": 77.625, "epoch": 0.422360248447205, "grad_norm": 0.4326918125152588, "kl": 0.0140380859375, "learning_rate": 5.77639751552795e-07, "loss": 0.0014, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 68 }, { "advantages": -9.313225746154785e-10, "completion_length": 81.71875, "epoch": 0.42857142857142855, "grad_norm": 5.539842128753662, "kl": 0.04296875, "learning_rate": 5.714285714285714e-07, "loss": 0.0043, "reward": 1.4375, "reward_mean": 1.4375, "reward_std": 0.34352827072143555, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 0.984375, "step": 69 }, { "advantages": 0.0, "completion_length": 90.703125, "epoch": 0.43478260869565216, "grad_norm": 0.46686819195747375, "kl": 0.0074462890625, "learning_rate": 5.652173913043477e-07, "loss": 0.0007, "reward": 1.875, "reward_mean": 1.875, "reward_std": 0.0, "rewards/accuracy_reward": 0.875, "rewards/format_reward": 1.0, "step": 70 }, { "advantages": 1.862645149230957e-09, "completion_length": 83.578125, "epoch": 0.4409937888198758, "grad_norm": 8.54028606414795, "kl": 0.00897216796875, "learning_rate": 5.590062111801241e-07, "loss": 0.0009, "reward": 1.765625, "reward_mean": 1.765625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 71 }, { "advantages": 7.450580596923828e-09, "completion_length": 84.25, "epoch": 0.4472049689440994, "grad_norm": 12.895256996154785, "kl": 0.00579833984375, "learning_rate": 5.527950310559007e-07, "loss": 0.0006, "reward": 1.453125, "reward_mean": 1.453125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 72 }, { "advantages": -9.313225746154785e-09, "completion_length": 77.15625, "epoch": 0.453416149068323, "grad_norm": 5.548634052276611, "kl": 0.0123291015625, "learning_rate": 5.46583850931677e-07, "loss": 0.0012, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.31983357667922974, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 73 }, { "advantages": -1.0244548320770264e-08, "completion_length": 84.75, "epoch": 0.45962732919254656, "grad_norm": 3.4154112339019775, "kl": 0.018798828125, "learning_rate": 5.403726708074534e-07, "loss": 0.0019, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.19727616012096405, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 74 }, { "advantages": -4.6566128730773926e-09, "completion_length": 83.015625, "epoch": 0.4658385093167702, "grad_norm": 3.4328691959381104, "kl": 0.01275634765625, "learning_rate": 5.341614906832298e-07, "loss": 0.0013, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 75 }, { "advantages": -1.862645149230957e-09, "completion_length": 78.609375, "epoch": 0.4720496894409938, "grad_norm": 3.627190113067627, "kl": 0.0142822265625, "learning_rate": 5.279503105590062e-07, "loss": 0.0014, "reward": 1.9375, "reward_mean": 1.9375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.9375, "rewards/format_reward": 1.0, "step": 76 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.46875, "epoch": 0.4782608695652174, "grad_norm": 10.168981552124023, "kl": 0.01251220703125, "learning_rate": 5.217391304347825e-07, "loss": 0.0013, "reward": 1.515625, "reward_mean": 1.515625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 77 }, { "advantages": 0.0, "completion_length": 83.421875, "epoch": 0.484472049689441, "grad_norm": 20.923242568969727, "kl": 0.0113525390625, "learning_rate": 5.15527950310559e-07, "loss": 0.0011, "reward": 1.828125, "reward_mean": 1.828125, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 78 }, { "advantages": 2.7939677238464355e-09, "completion_length": 75.109375, "epoch": 0.4906832298136646, "grad_norm": 3.643770933151245, "kl": 0.00811767578125, "learning_rate": 5.093167701863354e-07, "loss": 0.0008, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 79 }, { "advantages": 1.862645149230957e-09, "completion_length": 81.65625, "epoch": 0.4968944099378882, "grad_norm": 4.883938312530518, "kl": 0.015625, "learning_rate": 5.031055900621117e-07, "loss": 0.0016, "reward": 1.25, "reward_mean": 1.25, "reward_std": 0.2130674123764038, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 80 }, { "advantages": -3.725290298461914e-09, "completion_length": 82.515625, "epoch": 0.5031055900621118, "grad_norm": 1.3860398530960083, "kl": 0.00799560546875, "learning_rate": 4.968944099378881e-07, "loss": 0.0008, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 81 }, { "advantages": 9.313225746154785e-10, "completion_length": 77.6875, "epoch": 0.5093167701863354, "grad_norm": 3.7328872680664062, "kl": 0.0181884765625, "learning_rate": 4.906832298136646e-07, "loss": 0.0018, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 82 }, { "advantages": 3.725290298461914e-09, "completion_length": 76.4375, "epoch": 0.515527950310559, "grad_norm": 3.6228644847869873, "kl": 0.01446533203125, "learning_rate": 4.84472049689441e-07, "loss": 0.0014, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.1246790662407875, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 83 }, { "advantages": -1.862645149230957e-09, "completion_length": 79.140625, "epoch": 0.5217391304347826, "grad_norm": 5.579171180725098, "kl": 0.0159912109375, "learning_rate": 4.782608695652174e-07, "loss": 0.0016, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.23356688022613525, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 84 }, { "advantages": -5.587935447692871e-09, "completion_length": 80.0, "epoch": 0.5279503105590062, "grad_norm": 9.611387252807617, "kl": 0.01080322265625, "learning_rate": 4.7204968944099376e-07, "loss": 0.0011, "reward": 1.828125, "reward_mean": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 85 }, { "advantages": -1.862645149230957e-09, "completion_length": 85.5, "epoch": 0.5341614906832298, "grad_norm": 4.1448540687561035, "kl": 0.01007080078125, "learning_rate": 4.6583850931677014e-07, "loss": 0.001, "reward": 1.859375, "reward_mean": 1.859375, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 86 }, { "advantages": 1.862645149230957e-09, "completion_length": 75.5, "epoch": 0.5403726708074534, "grad_norm": 5.654483795166016, "kl": 0.01123046875, "learning_rate": 4.596273291925465e-07, "loss": 0.0011, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 87 }, { "advantages": -3.725290298461914e-09, "completion_length": 72.671875, "epoch": 0.546583850931677, "grad_norm": 2.2370052337646484, "kl": 0.0137939453125, "learning_rate": 4.53416149068323e-07, "loss": 0.0014, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/format_reward": 1.0, "step": 88 }, { "advantages": 0.0, "completion_length": 81.84375, "epoch": 0.5527950310559007, "grad_norm": 1.389394760131836, "kl": 0.00836181640625, "learning_rate": 4.472049689440994e-07, "loss": 0.0008, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 0.984375, "step": 89 }, { "advantages": 0.0, "completion_length": 74.59375, "epoch": 0.5590062111801242, "grad_norm": 2.353760242462158, "kl": 0.00811767578125, "learning_rate": 4.4099378881987576e-07, "loss": 0.0008, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 90 }, { "advantages": 0.0, "completion_length": 85.28125, "epoch": 0.5652173913043478, "grad_norm": 1.5767848491668701, "kl": 0.009765625, "learning_rate": 4.3478260869565214e-07, "loss": 0.001, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 91 }, { "advantages": -8.381903171539307e-09, "completion_length": 81.859375, "epoch": 0.5714285714285714, "grad_norm": 3.835320234298706, "kl": 0.0181884765625, "learning_rate": 4.285714285714285e-07, "loss": 0.0018, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 0.984375, "step": 92 }, { "advantages": 3.725290298461914e-09, "completion_length": 83.671875, "epoch": 0.577639751552795, "grad_norm": 9.30271053314209, "kl": 0.017822265625, "learning_rate": 4.2236024844720495e-07, "loss": 0.0018, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 93 }, { "advantages": -4.6566128730773926e-09, "completion_length": 77.53125, "epoch": 0.5838509316770186, "grad_norm": 6.170975685119629, "kl": 0.009521484375, "learning_rate": 4.161490683229814e-07, "loss": 0.001, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 94 }, { "advantages": -1.862645149230957e-09, "completion_length": 85.640625, "epoch": 0.5900621118012422, "grad_norm": 4.217593669891357, "kl": 0.01409912109375, "learning_rate": 4.0993788819875776e-07, "loss": 0.0014, "reward": 1.734375, "reward_mean": 1.734375, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.984375, "step": 95 }, { "advantages": 0.0, "completion_length": 77.296875, "epoch": 0.5962732919254659, "grad_norm": 6.138365268707275, "kl": 0.0106201171875, "learning_rate": 4.0372670807453413e-07, "loss": 0.0011, "reward": 1.375, "reward_mean": 1.375, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.984375, "step": 96 }, { "advantages": 1.862645149230957e-09, "completion_length": 76.484375, "epoch": 0.6024844720496895, "grad_norm": 1.2896429300308228, "kl": 0.00970458984375, "learning_rate": 3.975155279503105e-07, "loss": 0.001, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 97 }, { "advantages": -3.725290298461914e-09, "completion_length": 81.71875, "epoch": 0.6086956521739131, "grad_norm": 6.941093444824219, "kl": 0.01165771484375, "learning_rate": 3.9130434782608694e-07, "loss": 0.0012, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 98 }, { "advantages": -3.725290298461914e-09, "completion_length": 81.390625, "epoch": 0.6149068322981367, "grad_norm": 3.163457155227661, "kl": 0.00787353515625, "learning_rate": 3.850931677018633e-07, "loss": 0.0008, "reward": 1.96875, "reward_mean": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/format_reward": 1.0, "step": 99 }, { "advantages": 4.6566128730773926e-09, "completion_length": 79.1875, "epoch": 0.6211180124223602, "grad_norm": 4.2669830322265625, "kl": 0.0108642578125, "learning_rate": 3.7888198757763975e-07, "loss": 0.0011, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 100 }, { "advantages": -1.862645149230957e-09, "completion_length": 77.578125, "epoch": 0.6273291925465838, "grad_norm": 6.153615474700928, "kl": 0.0101318359375, "learning_rate": 3.7267080745341613e-07, "loss": 0.001, "reward": 1.359375, "reward_mean": 1.359375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 101 }, { "advantages": -1.862645149230957e-09, "completion_length": 81.53125, "epoch": 0.6335403726708074, "grad_norm": 4.077609539031982, "kl": 0.0181884765625, "learning_rate": 3.6645962732919256e-07, "loss": 0.0018, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 102 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.375, "epoch": 0.639751552795031, "grad_norm": 3.084027051925659, "kl": 0.01007080078125, "learning_rate": 3.6024844720496894e-07, "loss": 0.001, "reward": 1.53125, "reward_mean": 1.53125, "reward_std": 0.1462521106004715, "rewards/accuracy_reward": 0.53125, "rewards/format_reward": 1.0, "step": 103 }, { "advantages": -3.725290298461914e-09, "completion_length": 83.953125, "epoch": 0.6459627329192547, "grad_norm": 2.0512335300445557, "kl": 0.007476806640625, "learning_rate": 3.540372670807453e-07, "loss": 0.0007, "reward": 1.453125, "reward_mean": 1.453125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.453125, "rewards/format_reward": 1.0, "step": 104 }, { "advantages": 0.0, "completion_length": 82.5625, "epoch": 0.6521739130434783, "grad_norm": 0.5302374362945557, "kl": 0.00982666015625, "learning_rate": 3.478260869565217e-07, "loss": 0.001, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 105 }, { "advantages": -4.6566128730773926e-09, "completion_length": 75.953125, "epoch": 0.6583850931677019, "grad_norm": 6.2678751945495605, "kl": 0.0111083984375, "learning_rate": 3.416149068322981e-07, "loss": 0.0011, "reward": 1.890625, "reward_mean": 1.890625, "reward_std": 0.1315089464187622, "rewards/accuracy_reward": 0.890625, "rewards/format_reward": 1.0, "step": 106 }, { "advantages": -3.725290298461914e-09, "completion_length": 78.328125, "epoch": 0.6645962732919255, "grad_norm": 1.7859537601470947, "kl": 0.00946044921875, "learning_rate": 3.3540372670807456e-07, "loss": 0.0009, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 107 }, { "advantages": -3.725290298461914e-09, "completion_length": 91.8125, "epoch": 0.6708074534161491, "grad_norm": 2.7167623043060303, "kl": 0.0081787109375, "learning_rate": 3.2919254658385094e-07, "loss": 0.0008, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 0.984375, "step": 108 }, { "advantages": 0.0, "completion_length": 76.984375, "epoch": 0.6770186335403726, "grad_norm": 5.3628058433532715, "kl": 0.009033203125, "learning_rate": 3.229813664596273e-07, "loss": 0.0009, "reward": 1.515625, "reward_mean": 1.515625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 1.0, "step": 109 }, { "advantages": -1.862645149230957e-09, "completion_length": 73.5, "epoch": 0.6832298136645962, "grad_norm": 3.2727582454681396, "kl": 0.0108642578125, "learning_rate": 3.167701863354037e-07, "loss": 0.0011, "reward": 1.609375, "reward_mean": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.609375, "rewards/format_reward": 1.0, "step": 110 }, { "advantages": 0.0, "completion_length": 75.75, "epoch": 0.6894409937888198, "grad_norm": 11.552366256713867, "kl": 0.0145263671875, "learning_rate": 3.105590062111801e-07, "loss": 0.0015, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 111 }, { "advantages": 3.725290298461914e-09, "completion_length": 85.734375, "epoch": 0.6956521739130435, "grad_norm": 6.025736331939697, "kl": 0.01556396484375, "learning_rate": 3.043478260869565e-07, "loss": 0.0016, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 112 }, { "advantages": -3.725290298461914e-09, "completion_length": 82.734375, "epoch": 0.7018633540372671, "grad_norm": 15.336418151855469, "kl": 0.057373046875, "learning_rate": 2.9813664596273294e-07, "loss": 0.0057, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 1.0, "step": 113 }, { "advantages": -3.725290298461914e-09, "completion_length": 78.953125, "epoch": 0.7080745341614907, "grad_norm": 65.76184844970703, "kl": 0.01385498046875, "learning_rate": 2.919254658385093e-07, "loss": 0.0014, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 114 }, { "advantages": -1.862645149230957e-09, "completion_length": 79.765625, "epoch": 0.7142857142857143, "grad_norm": 3.660456657409668, "kl": 0.0194091796875, "learning_rate": 2.857142857142857e-07, "loss": 0.0019, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 115 }, { "advantages": -5.587935447692871e-09, "completion_length": 76.34375, "epoch": 0.7204968944099379, "grad_norm": 4.989613056182861, "kl": 0.00787353515625, "learning_rate": 2.7950310559006207e-07, "loss": 0.0008, "reward": 1.828125, "reward_mean": 1.828125, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.84375, "rewards/format_reward": 0.984375, "step": 116 }, { "advantages": -9.313225746154785e-10, "completion_length": 77.859375, "epoch": 0.7267080745341615, "grad_norm": 2.4932050704956055, "kl": 0.0081787109375, "learning_rate": 2.732919254658385e-07, "loss": 0.0008, "reward": 1.859375, "reward_mean": 1.859375, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 117 }, { "advantages": 1.862645149230957e-09, "completion_length": 84.5, "epoch": 0.7329192546583851, "grad_norm": 5.0420732498168945, "kl": 0.01226806640625, "learning_rate": 2.670807453416149e-07, "loss": 0.0012, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.23144522309303284, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 118 }, { "advantages": -1.862645149230957e-09, "completion_length": 79.703125, "epoch": 0.7391304347826086, "grad_norm": 3.599855899810791, "kl": 0.0086669921875, "learning_rate": 2.6086956521739126e-07, "loss": 0.0009, "reward": 1.484375, "reward_mean": 1.484375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 1.0, "step": 119 }, { "advantages": -5.587935447692871e-09, "completion_length": 79.1875, "epoch": 0.7453416149068323, "grad_norm": 3.320706605911255, "kl": 0.0133056640625, "learning_rate": 2.546583850931677e-07, "loss": 0.0013, "reward": 1.828125, "reward_mean": 1.828125, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 1.0, "step": 120 }, { "advantages": -8.381903171539307e-09, "completion_length": 95.203125, "epoch": 0.7515527950310559, "grad_norm": 2.8366851806640625, "kl": 0.0064697265625, "learning_rate": 2.4844720496894407e-07, "loss": 0.0006, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.1530819833278656, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 121 }, { "advantages": 1.862645149230957e-09, "completion_length": 78.765625, "epoch": 0.7577639751552795, "grad_norm": 3.376732587814331, "kl": 0.00860595703125, "learning_rate": 2.422360248447205e-07, "loss": 0.0009, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 122 }, { "advantages": 1.862645149230957e-09, "completion_length": 79.453125, "epoch": 0.7639751552795031, "grad_norm": 3.5682129859924316, "kl": 0.018798828125, "learning_rate": 2.3602484472049688e-07, "loss": 0.0019, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.1804211586713791, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 123 }, { "advantages": 4.6566128730773926e-09, "completion_length": 74.96875, "epoch": 0.7701863354037267, "grad_norm": 2.6698434352874756, "kl": 0.006256103515625, "learning_rate": 2.2981366459627326e-07, "loss": 0.0006, "reward": 1.546875, "reward_mean": 1.546875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 124 }, { "advantages": 0.0, "completion_length": 78.421875, "epoch": 0.7763975155279503, "grad_norm": 3.1063811779022217, "kl": 0.01214599609375, "learning_rate": 2.236024844720497e-07, "loss": 0.0012, "reward": 1.765625, "reward_mean": 1.765625, "reward_std": 0.12255740165710449, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 125 }, { "advantages": 1.862645149230957e-09, "completion_length": 86.703125, "epoch": 0.782608695652174, "grad_norm": 2.7392446994781494, "kl": 0.00634765625, "learning_rate": 2.1739130434782607e-07, "loss": 0.0006, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 126 }, { "advantages": 7.450580596923828e-09, "completion_length": 83.546875, "epoch": 0.7888198757763976, "grad_norm": 9.345684051513672, "kl": 0.008056640625, "learning_rate": 2.1118012422360247e-07, "loss": 0.0008, "reward": 1.296875, "reward_mean": 1.296875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 1.0, "step": 127 }, { "advantages": 0.0, "completion_length": 85.84375, "epoch": 0.7950310559006211, "grad_norm": 0.22835175693035126, "kl": 0.0084228515625, "learning_rate": 2.0496894409937888e-07, "loss": 0.0008, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 128 }, { "advantages": 0.0, "completion_length": 81.828125, "epoch": 0.8012422360248447, "grad_norm": 2.44989275932312, "kl": 0.007171630859375, "learning_rate": 1.9875776397515526e-07, "loss": 0.0007, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 129 }, { "advantages": 9.313225746154785e-10, "completion_length": 83.296875, "epoch": 0.8074534161490683, "grad_norm": 26.60379409790039, "kl": 0.01031494140625, "learning_rate": 1.9254658385093166e-07, "loss": 0.001, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 1.0, "step": 130 }, { "advantages": -3.725290298461914e-09, "completion_length": 75.296875, "epoch": 0.8136645962732919, "grad_norm": 2.649775981903076, "kl": 0.00787353515625, "learning_rate": 1.8633540372670807e-07, "loss": 0.0008, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/format_reward": 1.0, "step": 131 }, { "advantages": -7.450580596923828e-09, "completion_length": 72.921875, "epoch": 0.8198757763975155, "grad_norm": 6.021523952484131, "kl": 0.017333984375, "learning_rate": 1.8012422360248447e-07, "loss": 0.0017, "reward": 1.546875, "reward_mean": 1.546875, "reward_std": 0.17358146607875824, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 132 }, { "advantages": 1.862645149230957e-09, "completion_length": 80.203125, "epoch": 0.8260869565217391, "grad_norm": 5.850553035736084, "kl": 0.0118408203125, "learning_rate": 1.7391304347826085e-07, "loss": 0.0012, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.25726157426834106, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 133 }, { "advantages": -5.587935447692871e-09, "completion_length": 90.234375, "epoch": 0.8322981366459627, "grad_norm": 9.700899124145508, "kl": 0.01422119140625, "learning_rate": 1.6770186335403728e-07, "loss": 0.0014, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.828125, "rewards/format_reward": 0.953125, "step": 134 }, { "advantages": 3.725290298461914e-09, "completion_length": 81.90625, "epoch": 0.8385093167701864, "grad_norm": 2.9975473880767822, "kl": 0.00909423828125, "learning_rate": 1.6149068322981366e-07, "loss": 0.0009, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 135 }, { "advantages": -3.725290298461914e-09, "completion_length": 79.3125, "epoch": 0.84472049689441, "grad_norm": 4.324582099914551, "kl": 0.01165771484375, "learning_rate": 1.5527950310559004e-07, "loss": 0.0012, "reward": 1.84375, "reward_mean": 1.84375, "reward_std": 0.2177756428718567, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 0.984375, "step": 136 }, { "advantages": 7.450580596923828e-09, "completion_length": 80.265625, "epoch": 0.8509316770186336, "grad_norm": 3.8911736011505127, "kl": 0.009521484375, "learning_rate": 1.4906832298136647e-07, "loss": 0.001, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 137 }, { "advantages": 0.0, "completion_length": 78.40625, "epoch": 0.8571428571428571, "grad_norm": 2.864941120147705, "kl": 0.01129150390625, "learning_rate": 1.4285714285714285e-07, "loss": 0.0011, "reward": 1.765625, "reward_mean": 1.765625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.765625, "rewards/format_reward": 1.0, "step": 138 }, { "advantages": 3.725290298461914e-09, "completion_length": 80.21875, "epoch": 0.8633540372670807, "grad_norm": 5.788990497589111, "kl": 0.00799560546875, "learning_rate": 1.3664596273291925e-07, "loss": 0.0008, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 139 }, { "advantages": -1.862645149230957e-09, "completion_length": 92.90625, "epoch": 0.8695652173913043, "grad_norm": 4.130926609039307, "kl": 0.0111083984375, "learning_rate": 1.3043478260869563e-07, "loss": 0.0011, "reward": 1.859375, "reward_mean": 1.859375, "reward_std": 0.2198973000049591, "rewards/accuracy_reward": 0.859375, "rewards/format_reward": 1.0, "step": 140 }, { "advantages": -3.725290298461914e-09, "completion_length": 79.109375, "epoch": 0.8757763975155279, "grad_norm": 3.025212287902832, "kl": 0.01324462890625, "learning_rate": 1.2422360248447204e-07, "loss": 0.0013, "reward": 1.46875, "reward_mean": 1.46875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.484375, "rewards/format_reward": 0.984375, "step": 141 }, { "advantages": 3.725290298461914e-09, "completion_length": 78.25, "epoch": 0.8819875776397516, "grad_norm": 6.828762531280518, "kl": 0.01177978515625, "learning_rate": 1.1801242236024844e-07, "loss": 0.0012, "reward": 1.40625, "reward_mean": 1.40625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 1.0, "step": 142 }, { "advantages": 0.0, "completion_length": 73.625, "epoch": 0.8881987577639752, "grad_norm": 3.4486515522003174, "kl": 0.00762939453125, "learning_rate": 1.1180124223602484e-07, "loss": 0.0008, "reward": 1.65625, "reward_mean": 1.65625, "reward_std": 0.1552036553621292, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 1.0, "step": 143 }, { "advantages": -3.725290298461914e-09, "completion_length": 80.359375, "epoch": 0.8944099378881988, "grad_norm": 8.272978782653809, "kl": 0.00628662109375, "learning_rate": 1.0559006211180124e-07, "loss": 0.0006, "reward": 1.71875, "reward_mean": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 0.984375, "step": 144 }, { "advantages": -7.450580596923828e-09, "completion_length": 81.1875, "epoch": 0.9006211180124224, "grad_norm": 4.848587512969971, "kl": 0.017578125, "learning_rate": 9.937888198757763e-08, "loss": 0.0018, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.2041158676147461, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 145 }, { "advantages": 0.0, "completion_length": 79.984375, "epoch": 0.906832298136646, "grad_norm": 0.3604845702648163, "kl": 0.00958251953125, "learning_rate": 9.316770186335403e-08, "loss": 0.001, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 146 }, { "advantages": 5.587935447692871e-09, "completion_length": 76.5625, "epoch": 0.9130434782608695, "grad_norm": 10.680438995361328, "kl": 0.01116943359375, "learning_rate": 8.695652173913042e-08, "loss": 0.0011, "reward": 1.671875, "reward_mean": 1.671875, "reward_std": 0.19939783215522766, "rewards/accuracy_reward": 0.671875, "rewards/format_reward": 1.0, "step": 147 }, { "advantages": -3.725290298461914e-09, "completion_length": 76.984375, "epoch": 0.9192546583850931, "grad_norm": 2.091907024383545, "kl": 0.010498046875, "learning_rate": 8.074534161490683e-08, "loss": 0.0011, "reward": 1.640625, "reward_mean": 1.640625, "reward_std": 0.08010874688625336, "rewards/accuracy_reward": 0.65625, "rewards/format_reward": 0.984375, "step": 148 }, { "advantages": 0.0, "completion_length": 76.203125, "epoch": 0.9254658385093167, "grad_norm": 0.20045147836208344, "kl": 0.0078125, "learning_rate": 7.453416149068323e-08, "loss": 0.0008, "reward": 1.75, "reward_mean": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 1.0, "step": 149 }, { "advantages": 0.0, "completion_length": 84.140625, "epoch": 0.9316770186335404, "grad_norm": 3.21720814704895, "kl": 0.008544921875, "learning_rate": 6.832298136645963e-08, "loss": 0.0009, "reward": 1.6875, "reward_mean": 1.6875, "reward_std": 0.06681530922651291, "rewards/accuracy_reward": 0.6875, "rewards/format_reward": 1.0, "step": 150 }, { "advantages": -1.862645149230957e-09, "completion_length": 82.734375, "epoch": 0.937888198757764, "grad_norm": 7.955801963806152, "kl": 0.01263427734375, "learning_rate": 6.211180124223602e-08, "loss": 0.0013, "reward": 1.734375, "reward_mean": 1.734375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward": 0.734375, "rewards/format_reward": 1.0, "step": 151 }, { "advantages": 3.725290298461914e-09, "completion_length": 72.96875, "epoch": 0.9440993788819876, "grad_norm": 3.563530921936035, "kl": 0.0093994140625, "learning_rate": 5.590062111801242e-08, "loss": 0.0009, "reward": 1.90625, "reward_mean": 1.90625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward": 0.90625, "rewards/format_reward": 1.0, "step": 152 }, { "advantages": -9.313225746154785e-09, "completion_length": 83.125, "epoch": 0.9503105590062112, "grad_norm": 9.811988830566406, "kl": 0.0184326171875, "learning_rate": 4.9689440993788814e-08, "loss": 0.0019, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 153 }, { "advantages": -1.0244548320770264e-08, "completion_length": 83.515625, "epoch": 0.9565217391304348, "grad_norm": 3.8269639015197754, "kl": 0.0133056640625, "learning_rate": 4.347826086956521e-08, "loss": 0.0013, "reward": 1.78125, "reward_mean": 1.78125, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.78125, "rewards/format_reward": 1.0, "step": 154 }, { "advantages": -7.450580596923828e-09, "completion_length": 85.671875, "epoch": 0.9627329192546584, "grad_norm": 3.470165252685547, "kl": 0.01300048828125, "learning_rate": 3.726708074534162e-08, "loss": 0.0013, "reward": 1.5625, "reward_mean": 1.5625, "reward_std": 0.1462520956993103, "rewards/accuracy_reward": 0.5625, "rewards/format_reward": 1.0, "step": 155 }, { "advantages": -9.313225746154785e-10, "completion_length": 84.203125, "epoch": 0.968944099378882, "grad_norm": 2.550407648086548, "kl": 0.00946044921875, "learning_rate": 3.105590062111801e-08, "loss": 0.0009, "reward": 1.625, "reward_mean": 1.625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward": 0.640625, "rewards/format_reward": 0.984375, "step": 156 }, { "advantages": 3.725290298461914e-09, "completion_length": 76.296875, "epoch": 0.9751552795031055, "grad_norm": 3.396425247192383, "kl": 0.00714111328125, "learning_rate": 2.4844720496894407e-08, "loss": 0.0007, "reward": 1.546875, "reward_mean": 1.546875, "reward_std": 0.15992169082164764, "rewards/accuracy_reward": 0.546875, "rewards/format_reward": 1.0, "step": 157 }, { "advantages": -2.7939677238464355e-09, "completion_length": 73.859375, "epoch": 0.9813664596273292, "grad_norm": 3.776041030883789, "kl": 0.00921630859375, "learning_rate": 1.863354037267081e-08, "loss": 0.0009, "reward": 1.59375, "reward_mean": 1.59375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 1.0, "step": 158 }, { "advantages": 1.862645149230957e-09, "completion_length": 77.953125, "epoch": 0.9875776397515528, "grad_norm": 3.304471254348755, "kl": 0.01263427734375, "learning_rate": 1.2422360248447204e-08, "loss": 0.0013, "reward": 1.796875, "reward_mean": 1.796875, "reward_std": 0.11100947856903076, "rewards/accuracy_reward": 0.796875, "rewards/format_reward": 1.0, "step": 159 }, { "advantages": 0.0, "completion_length": 79.25, "epoch": 0.9937888198757764, "grad_norm": 5.823967456817627, "kl": 0.00897216796875, "learning_rate": 6.211180124223602e-09, "loss": 0.0009, "reward": 1.5, "reward_mean": 1.5, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 1.0, "step": 160 }, { "advantages": -0.5890890955924988, "completion_length": 89.33333587646484, "epoch": 1.0, "grad_norm": 2.0931286811828613, "kl": 0.00677490234375, "learning_rate": 0.0, "loss": 0.001, "reward": 1.6666667461395264, "reward_mean": 1.875, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.6666666865348816, "rewards/format_reward": 1.0, "step": 161 } ], "logging_steps": 1.0, "max_steps": 161, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }