{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.672268907563026, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 621.6719055175781, "epoch": 0.03361344537815126, "grad_norm": 0.0801367936133843, "kl": 0.0, "learning_rate": 3.3333333333333334e-08, "loss": -0.0, "reward": 0.0989583358168602, "reward_std": 0.004166666883975267, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.0, "step": 2 }, { "completion_length": 490.7083435058594, "epoch": 0.06722689075630252, "grad_norm": 0.27032084439912185, "kl": 0.00024890899658203125, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.0989583395421505, "reward_std": 0.004166666883975267, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.0, "step": 4 }, { "completion_length": 425.6979293823242, "epoch": 0.10084033613445378, "grad_norm": 0.175008762881728, "kl": 0.0002465248107910156, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.13593751192092896, "reward_std": 0.06458333344198763, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0364583358168602, "step": 6 }, { "completion_length": 464.0364761352539, "epoch": 0.13445378151260504, "grad_norm": 0.0925824677352829, "kl": 0.00026226043701171875, "learning_rate": 1.3333333333333334e-07, "loss": 0.0, "reward": 0.0989583358168602, "reward_std": 0.004166666883975267, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.0, "step": 8 }, { "completion_length": 639.0052261352539, "epoch": 0.16806722689075632, "grad_norm": 0.1485724351337979, "kl": 0.0002181529998779297, "learning_rate": 1.6666666666666665e-07, "loss": 0.0, "reward": 0.10885417088866234, "reward_std": 0.03471375140361488, "rewards/format_reward_func": 0.09843751601874828, "rewards/solution_reward_func": 0.010416666977107525, "step": 10 }, { "completion_length": 463.8958435058594, "epoch": 0.20168067226890757, "grad_norm": 0.00011354212288655966, "kl": 0.00024199485778808594, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 12 }, { "completion_length": 578.8594055175781, "epoch": 0.23529411764705882, "grad_norm": 0.07730880319439308, "kl": 0.00025010108947753906, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 14 }, { "completion_length": 552.5937728881836, "epoch": 0.2689075630252101, "grad_norm": 0.09263201685945976, "kl": 0.0002052783966064453, "learning_rate": 2.6666666666666667e-07, "loss": 0.0, "reward": 0.10260417126119137, "reward_std": 0.02860941761173308, "rewards/format_reward_func": 0.09739584848284721, "rewards/solution_reward_func": 0.0052083334885537624, "step": 16 }, { "completion_length": 442.5521011352539, "epoch": 0.3025210084033613, "grad_norm": 0.23576421207830434, "kl": 0.00022339820861816406, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 18 }, { "completion_length": 467.2760543823242, "epoch": 0.33613445378151263, "grad_norm": 0.22791132298206718, "kl": 0.0002951622009277344, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.0989583358168602, "reward_std": 0.004166666883975267, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.0, "step": 20 }, { "completion_length": 484.4479217529297, "epoch": 0.3697478991596639, "grad_norm": 0.00012183057825469897, "kl": 0.0002396106719970703, "learning_rate": 3.666666666666666e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 22 }, { "completion_length": 413.3333435058594, "epoch": 0.40336134453781514, "grad_norm": 0.34473540698627514, "kl": 0.0002987384796142578, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.166666679084301, "reward_std": 0.08413009555079043, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.06770833395421505, "step": 24 }, { "completion_length": 488.56251525878906, "epoch": 0.4369747899159664, "grad_norm": 0.00015667411397720364, "kl": 0.0002536773681640625, "learning_rate": 4.3333333333333335e-07, "loss": 0.0, "reward": 0.1041666716337204, "reward_std": 0.025000000838190317, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.0052083334885537624, "step": 26 }, { "completion_length": 397.0833435058594, "epoch": 0.47058823529411764, "grad_norm": 0.00013639205681583485, "kl": 0.0003132820129394531, "learning_rate": 4.6666666666666666e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 28 }, { "completion_length": 468.6979217529297, "epoch": 0.5042016806722689, "grad_norm": 0.1847110341404601, "kl": 0.00031375885009765625, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.11406250670552254, "reward_std": 0.052656359039247036, "rewards/format_reward_func": 0.09843751788139343, "rewards/solution_reward_func": 0.015625000465661287, "step": 30 }, { "completion_length": 488.58856201171875, "epoch": 0.5378151260504201, "grad_norm": 0.303657526759844, "kl": 0.0002887248992919922, "learning_rate": 4.999947552503497e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 32 }, { "completion_length": 390.3645935058594, "epoch": 0.5714285714285714, "grad_norm": 0.00025776879845989663, "kl": 0.0004658699035644531, "learning_rate": 4.999790212214579e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 34 }, { "completion_length": 508.9687728881836, "epoch": 0.6050420168067226, "grad_norm": 0.1251668733841362, "kl": 0.00040602684020996094, "learning_rate": 4.999527985734931e-07, "loss": 0.0, "reward": 0.09843750298023224, "reward_std": 0.0049297085497528315, "rewards/format_reward_func": 0.09843751601874828, "rewards/solution_reward_func": 0.0, "step": 36 }, { "completion_length": 505.42189025878906, "epoch": 0.6386554621848739, "grad_norm": 0.15320179547134924, "kl": 0.0005106925964355469, "learning_rate": 4.99916088406705e-07, "loss": 0.0, "reward": 0.09739583730697632, "reward_std": 0.00752594112418592, "rewards/format_reward_func": 0.09739585034549236, "rewards/solution_reward_func": 0.0, "step": 38 }, { "completion_length": 577.3645935058594, "epoch": 0.6722689075630253, "grad_norm": 0.3609111340706473, "kl": 0.0005517005920410156, "learning_rate": 4.998688922613787e-07, "loss": 0.0, "reward": 0.10364583507180214, "reward_std": 0.027083334047347307, "rewards/format_reward_func": 0.09843751788139343, "rewards/solution_reward_func": 0.0052083334885537624, "step": 40 }, { "completion_length": 429.1823043823242, "epoch": 0.7058823529411765, "grad_norm": 0.00029926095891074706, "kl": 0.00060272216796875, "learning_rate": 4.998112121177698e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 42 }, { "completion_length": 425.10418701171875, "epoch": 0.7394957983193278, "grad_norm": 0.00032084496825872, "kl": 0.0008230209350585938, "learning_rate": 4.997430503960219e-07, "loss": 0.0, "reward": 0.13125001266598701, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 44 }, { "completion_length": 323.8229293823242, "epoch": 0.773109243697479, "grad_norm": 0.0006460538202990007, "kl": 0.001178741455078125, "learning_rate": 4.996644099560641e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 46 }, { "completion_length": 456.7239685058594, "epoch": 0.8067226890756303, "grad_norm": 0.0005616889391318414, "kl": 0.0009098052978515625, "learning_rate": 4.995752940974918e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 48 }, { "completion_length": 471.1979217529297, "epoch": 0.8403361344537815, "grad_norm": 0.0005527827401151652, "kl": 0.0010004043579101562, "learning_rate": 4.994757065594279e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 50 }, { "completion_length": 390.73958587646484, "epoch": 0.8739495798319328, "grad_norm": 0.28218859975368193, "kl": 0.0015735626220703125, "learning_rate": 4.993656515203662e-07, "loss": 0.0, "reward": 0.13125001266598701, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 52 }, { "completion_length": 419.03126525878906, "epoch": 0.907563025210084, "grad_norm": 0.0006289108444375812, "kl": 0.0016717910766601562, "learning_rate": 4.992451335979955e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 54 }, { "completion_length": 404.08333587646484, "epoch": 0.9411764705882353, "grad_norm": 0.000474366079936863, "kl": 0.001483917236328125, "learning_rate": 4.991141578490066e-07, "loss": 0.0, "reward": 0.11041667684912682, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 56 }, { "completion_length": 506.69793701171875, "epoch": 0.9747899159663865, "grad_norm": 0.06271699990344082, "kl": 0.0015659332275390625, "learning_rate": 4.989727297688796e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 58 }, { "completion_length": 457.99306233723956, "epoch": 1.0, "grad_norm": 0.6492803666487954, "kl": 0.0022710164388020835, "learning_rate": 4.988208552916535e-07, "loss": 0.0, "reward": 0.10625000794728597, "reward_std": 0.030555556217829388, "rewards/format_reward_func": 0.0993055726091067, "rewards/solution_reward_func": 0.006944444651405017, "step": 60 }, { "completion_length": 415.7291793823242, "epoch": 1.0336134453781514, "grad_norm": 0.001010416552496197, "kl": 0.00310516357421875, "learning_rate": 4.986585407896771e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 62 }, { "completion_length": 410.3489685058594, "epoch": 1.0672268907563025, "grad_norm": 0.4377155765065235, "kl": 0.00357818603515625, "learning_rate": 4.984857930733419e-07, "loss": 0.0, "reward": 0.1520833484828472, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0520833358168602, "step": 64 }, { "completion_length": 400.1927185058594, "epoch": 1.1008403361344539, "grad_norm": 0.0008410280438695629, "kl": 0.0035247802734375, "learning_rate": 4.98302619390796e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 66 }, { "completion_length": 517.3489685058594, "epoch": 1.134453781512605, "grad_norm": 0.0012400229988806556, "kl": 0.00423431396484375, "learning_rate": 4.981090274276405e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 68 }, { "completion_length": 422.1197967529297, "epoch": 1.1680672268907564, "grad_norm": 0.0016035653025600468, "kl": 0.00673675537109375, "learning_rate": 4.979050253066063e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 70 }, { "completion_length": 360.5885543823242, "epoch": 1.2016806722689075, "grad_norm": 0.0013723406744808482, "kl": 0.00720977783203125, "learning_rate": 4.976906215872137e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 72 }, { "completion_length": 535.5625152587891, "epoch": 1.2352941176470589, "grad_norm": 0.14884839987208168, "kl": 0.006267547607421875, "learning_rate": 4.974658252654134e-07, "loss": 0.0, "reward": 0.09427084028720856, "reward_std": 0.003989280201494694, "rewards/format_reward_func": 0.09427084773778915, "rewards/solution_reward_func": 0.0, "step": 74 }, { "completion_length": 387.6666717529297, "epoch": 1.26890756302521, "grad_norm": 0.0014803546647266586, "kl": 0.009185791015625, "learning_rate": 4.97230645773209e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 76 }, { "completion_length": 425.26563262939453, "epoch": 1.3025210084033614, "grad_norm": 0.46211414374103155, "kl": 0.0101318359375, "learning_rate": 4.96985092978261e-07, "loss": 0.0, "reward": 0.13125000894069672, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 78 }, { "completion_length": 380.0260543823242, "epoch": 1.3361344537815127, "grad_norm": 0.0010418820155503024, "kl": 0.00743865966796875, "learning_rate": 4.967291771834726e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 80 }, { "completion_length": 385.01563262939453, "epoch": 1.3697478991596639, "grad_norm": 0.0012203032212272564, "kl": 0.00830078125, "learning_rate": 4.964629091265583e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 82 }, { "completion_length": 396.2291793823242, "epoch": 1.403361344537815, "grad_norm": 0.1133029293441453, "kl": 0.01006317138671875, "learning_rate": 4.961862999795923e-07, "loss": 0.0, "reward": 0.18072918057441711, "reward_std": 0.06091678235679865, "rewards/format_reward_func": 0.09739585034549236, "rewards/solution_reward_func": 0.0833333358168602, "step": 84 }, { "completion_length": 401.1979293823242, "epoch": 1.4369747899159664, "grad_norm": 0.001999131456116592, "kl": 0.0111083984375, "learning_rate": 4.958993613485405e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 86 }, { "completion_length": 352.9843864440918, "epoch": 1.4705882352941178, "grad_norm": 0.15210228397159312, "kl": 0.0132598876953125, "learning_rate": 4.956021052727731e-07, "loss": 0.0, "reward": 0.0989583395421505, "reward_std": 0.0028463751077651978, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.0, "step": 88 }, { "completion_length": 302.56250762939453, "epoch": 1.504201680672269, "grad_norm": 0.3691590573105211, "kl": 0.0166015625, "learning_rate": 4.952945442245597e-07, "loss": 0.0, "reward": 0.16250000894069672, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.06250000139698386, "step": 90 }, { "completion_length": 340.72396087646484, "epoch": 1.53781512605042, "grad_norm": 0.43877267569380435, "kl": 0.018524169921875, "learning_rate": 4.949766911085461e-07, "loss": 0.0, "reward": 0.14687501266598701, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.046875, "step": 92 }, { "completion_length": 379.5260543823242, "epoch": 1.5714285714285714, "grad_norm": 0.4701626303264626, "kl": 0.0168609619140625, "learning_rate": 4.946485592612122e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 94 }, { "completion_length": 218.46875762939453, "epoch": 1.6050420168067228, "grad_norm": 0.0039619447014435484, "kl": 0.02301025390625, "learning_rate": 4.943101624503132e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 96 }, { "completion_length": 250.26563262939453, "epoch": 1.638655462184874, "grad_norm": 0.5016542124950076, "kl": 0.0252685546875, "learning_rate": 4.939615148743017e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.049297086894512177, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 98 }, { "completion_length": 351.2395935058594, "epoch": 1.6722689075630253, "grad_norm": 0.0017673068432889033, "kl": 0.01922607421875, "learning_rate": 4.936026311617316e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 100 }, { "completion_length": 229.41666793823242, "epoch": 1.7058823529411766, "grad_norm": 0.0036472118938720346, "kl": 0.027374267578125, "learning_rate": 4.932335263706445e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 102 }, { "completion_length": 347.8072967529297, "epoch": 1.7394957983193278, "grad_norm": 0.15696120658135326, "kl": 0.0265045166015625, "learning_rate": 4.928542159879385e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 104 }, { "completion_length": 412.4583435058594, "epoch": 1.773109243697479, "grad_norm": 0.00235152353922058, "kl": 0.0222625732421875, "learning_rate": 4.924647159287175e-07, "loss": 0.0, "reward": 0.12604168057441711, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0260416679084301, "step": 106 }, { "completion_length": 494.6823043823242, "epoch": 1.8067226890756303, "grad_norm": 0.0028179582025939544, "kl": 0.0166168212890625, "learning_rate": 4.920650425356239e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 108 }, { "completion_length": 350.18751525878906, "epoch": 1.8403361344537816, "grad_norm": 0.4410132711811432, "kl": 0.020263671875, "learning_rate": 4.916552125781528e-07, "loss": 0.0, "reward": 0.12083334103226662, "reward_std": 0.054426075890660286, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.020833333488553762, "step": 110 }, { "completion_length": 292.17188262939453, "epoch": 1.8739495798319328, "grad_norm": 0.11691145085915705, "kl": 0.0254974365234375, "learning_rate": 4.912352432519484e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 112 }, { "completion_length": 284.3177185058594, "epoch": 1.907563025210084, "grad_norm": 0.7003567235287095, "kl": 0.03863525390625, "learning_rate": 4.908051521780824e-07, "loss": 0.0, "reward": 0.2093750201165676, "reward_std": 0.12322613596916199, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1093750037252903, "step": 114 }, { "completion_length": 336.6146011352539, "epoch": 1.9411764705882353, "grad_norm": 0.003275626393694024, "kl": 0.0210418701171875, "learning_rate": 4.90364957402315e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 116 }, { "completion_length": 387.1823043823242, "epoch": 1.9747899159663866, "grad_norm": 0.0014315333787240038, "kl": 0.01727294921875, "learning_rate": 4.899146773943373e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 118 }, { "completion_length": 398.09722900390625, "epoch": 2.0, "grad_norm": 0.002235676467817067, "kl": 0.021931966145833332, "learning_rate": 4.894543310469967e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 120 }, { "completion_length": 363.96876525878906, "epoch": 2.033613445378151, "grad_norm": 0.0028512365272910394, "kl": 0.019744873046875, "learning_rate": 4.88983937675504e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 122 }, { "completion_length": 324.75000762939453, "epoch": 2.0672268907563027, "grad_norm": 0.0032159901825156378, "kl": 0.024017333984375, "learning_rate": 4.885035170166228e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 124 }, { "completion_length": 259.03125381469727, "epoch": 2.100840336134454, "grad_norm": 0.001761468705277778, "kl": 0.0211334228515625, "learning_rate": 4.880130892278419e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 126 }, { "completion_length": 402.97396087646484, "epoch": 2.134453781512605, "grad_norm": 0.0029818895099654563, "kl": 0.022796630859375, "learning_rate": 4.875126748865289e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 128 }, { "completion_length": 443.5364761352539, "epoch": 2.168067226890756, "grad_norm": 0.003568087255322955, "kl": 0.0235595703125, "learning_rate": 4.870022949890676e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 130 }, { "completion_length": 273.77083587646484, "epoch": 2.2016806722689077, "grad_norm": 0.9125842294177587, "kl": 0.0274658203125, "learning_rate": 4.864819709499761e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 132 }, { "completion_length": 370.6823043823242, "epoch": 2.235294117647059, "grad_norm": 0.13384343254939152, "kl": 0.0224761962890625, "learning_rate": 4.85951724601009e-07, "loss": 0.0, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 134 }, { "completion_length": 370.6458435058594, "epoch": 2.26890756302521, "grad_norm": 0.003290935380335244, "kl": 0.0223388671875, "learning_rate": 4.854115781902414e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 136 }, { "completion_length": 371.68750762939453, "epoch": 2.302521008403361, "grad_norm": 0.001501973217418717, "kl": 0.01715087890625, "learning_rate": 4.848615543811344e-07, "loss": 0.0, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 138 }, { "completion_length": 356.9166793823242, "epoch": 2.3361344537815127, "grad_norm": 0.013205663197367531, "kl": 0.03387451171875, "learning_rate": 4.843016762515859e-07, "loss": 0.0, "reward": 0.1572916842997074, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0572916679084301, "step": 140 }, { "completion_length": 349.26563262939453, "epoch": 2.369747899159664, "grad_norm": 0.0025208087801339408, "kl": 0.021820068359375, "learning_rate": 4.837319672929606e-07, "loss": 0.0, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 142 }, { "completion_length": 299.31771087646484, "epoch": 2.403361344537815, "grad_norm": 0.0020220601267428705, "kl": 0.0236358642578125, "learning_rate": 4.831524514091056e-07, "loss": 0.0, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 144 }, { "completion_length": 324.6979217529297, "epoch": 2.4369747899159666, "grad_norm": 0.395149886152443, "kl": 0.0274505615234375, "learning_rate": 4.825631529153466e-07, "loss": 0.0, "reward": 0.16250001266598701, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 146 }, { "completion_length": 422.1458511352539, "epoch": 2.4705882352941178, "grad_norm": 0.0015328340372073913, "kl": 0.01739501953125, "learning_rate": 4.81964096537468e-07, "loss": 0.0, "reward": 0.09843750670552254, "reward_std": 0.004929708316922188, "rewards/format_reward_func": 0.09843751601874828, "rewards/solution_reward_func": 0.0, "step": 148 }, { "completion_length": 354.7239685058594, "epoch": 2.504201680672269, "grad_norm": 0.0013994782066214318, "kl": 0.0224151611328125, "learning_rate": 4.81355307410676e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 150 }, { "completion_length": 301.70313262939453, "epoch": 2.53781512605042, "grad_norm": 0.24763478990837257, "kl": 0.033599853515625, "learning_rate": 4.80736811078543e-07, "loss": 0.0, "reward": 0.15677084401249886, "reward_std": 0.06458333530463278, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.05729166930541396, "step": 152 }, { "completion_length": 255.9270896911621, "epoch": 2.571428571428571, "grad_norm": 0.12329395496821485, "kl": 0.035980224609375, "learning_rate": 4.80108633491936e-07, "loss": 0.0, "reward": 0.12031250447034836, "reward_std": 0.039351133862510324, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.02083333395421505, "step": 154 }, { "completion_length": 281.31250381469727, "epoch": 2.6050420168067228, "grad_norm": 0.002735026301519947, "kl": 0.02569580078125, "learning_rate": 4.794708010079288e-07, "loss": 0.0, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 156 }, { "completion_length": 353.95313262939453, "epoch": 2.638655462184874, "grad_norm": 0.00677577023994385, "kl": 0.026336669921875, "learning_rate": 4.788233403886949e-07, "loss": 0.0, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 158 }, { "completion_length": 331.2604217529297, "epoch": 2.6722689075630255, "grad_norm": 0.0018809466014367724, "kl": 0.0211334228515625, "learning_rate": 4.78166278800385e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 160 }, { "completion_length": 309.6770935058594, "epoch": 2.7058823529411766, "grad_norm": 0.5544656996263257, "kl": 0.030364990234375, "learning_rate": 4.774996438119876e-07, "loss": 0.0, "reward": 0.13125001266598701, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125000139698386, "step": 162 }, { "completion_length": 349.2916717529297, "epoch": 2.7394957983193278, "grad_norm": 0.49145004972437356, "kl": 0.0228729248046875, "learning_rate": 4.7682346339417157e-07, "loss": 0.0, "reward": 0.12604167684912682, "reward_std": 0.062056493014097214, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.026041666977107525, "step": 164 }, { "completion_length": 250.7239646911621, "epoch": 2.773109243697479, "grad_norm": 0.47447691376343265, "kl": 0.05035400390625, "learning_rate": 4.7613776591811295e-07, "loss": 0.0001, "reward": 0.2031250186264515, "reward_std": 0.09893911704421043, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.1041666716337204, "step": 166 }, { "completion_length": 345.64063262939453, "epoch": 2.80672268907563, "grad_norm": 0.0026459922218496257, "kl": 0.025238037109375, "learning_rate": 4.754425801543046e-07, "loss": 0.0, "reward": 0.1572916842997074, "reward_std": 0.08155946433544159, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0572916679084301, "step": 168 }, { "completion_length": 333.87500762939453, "epoch": 2.8403361344537816, "grad_norm": 0.0015567010322838591, "kl": 0.0333404541015625, "learning_rate": 4.747379352713488e-07, "loss": 0.0, "reward": 0.1468750163912773, "reward_std": 0.06386648304760456, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.04687500139698386, "step": 170 }, { "completion_length": 262.1354217529297, "epoch": 2.8739495798319328, "grad_norm": 0.5511050261269926, "kl": 0.0656890869140625, "learning_rate": 4.7402386083473364e-07, "loss": 0.0001, "reward": 0.11510417237877846, "reward_std": 0.05138041847385466, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.015625000465661287, "step": 172 }, { "completion_length": 293.2864646911621, "epoch": 2.907563025210084, "grad_norm": 0.01800527962711472, "kl": 0.043487548828125, "learning_rate": 4.7330038680559224e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 174 }, { "completion_length": 370.3073043823242, "epoch": 2.9411764705882355, "grad_norm": 0.36111501285099934, "kl": 0.030181884765625, "learning_rate": 4.72567543539446e-07, "loss": 0.0, "reward": 0.16718751192092896, "reward_std": 0.03567607537843287, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0677083358168602, "step": 176 }, { "completion_length": 243.7135467529297, "epoch": 2.9747899159663866, "grad_norm": 0.5903132372700801, "kl": 0.03546142578125, "learning_rate": 4.718253617849305e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625000465661287, "step": 178 }, { "completion_length": 318.87501017252606, "epoch": 3.0, "grad_norm": 0.5048600121415732, "kl": 0.0343017578125, "learning_rate": 4.7107387268250586e-07, "loss": 0.0, "reward": 0.10694445172945659, "reward_std": 0.027777778605620067, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.006944444651405017, "step": 180 }, { "completion_length": 299.0937614440918, "epoch": 3.033613445378151, "grad_norm": 0.005780050456035791, "kl": 0.037506103515625, "learning_rate": 4.703131077631497e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 182 }, { "completion_length": 269.8854179382324, "epoch": 3.0672268907563027, "grad_norm": 0.6166299601553784, "kl": 0.049407958984375, "learning_rate": 4.6954309894703426e-07, "loss": 0.0, "reward": 0.13125000894069672, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125000139698386, "step": 184 }, { "completion_length": 329.3281364440918, "epoch": 3.100840336134454, "grad_norm": 0.003973686438788461, "kl": 0.04754638671875, "learning_rate": 4.6876387854218744e-07, "loss": 0.0, "reward": 0.13593751192092896, "reward_std": 0.04477895796298981, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0364583358168602, "step": 186 }, { "completion_length": 344.95314025878906, "epoch": 3.134453781512605, "grad_norm": 0.00185996346424368, "kl": 0.0301513671875, "learning_rate": 4.6797547924313673e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 188 }, { "completion_length": 366.5416717529297, "epoch": 3.168067226890756, "grad_norm": 0.0852771175410214, "kl": 0.032684326171875, "learning_rate": 4.6717793412953776e-07, "loss": 0.0, "reward": 0.17968751303851604, "reward_std": 0.004269563127309084, "rewards/format_reward_func": 0.09635418094694614, "rewards/solution_reward_func": 0.0833333358168602, "step": 190 }, { "completion_length": 545.8073043823242, "epoch": 3.2016806722689077, "grad_norm": 0.002354030426399684, "kl": 0.0242919921875, "learning_rate": 4.6637127666478617e-07, "loss": 0.0, "reward": 0.0989583395421505, "reward_std": 0.0028463751077651978, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.0, "step": 192 }, { "completion_length": 305.4687614440918, "epoch": 3.235294117647059, "grad_norm": 0.7198493590761849, "kl": 0.0433349609375, "learning_rate": 4.6555554069461346e-07, "loss": 0.0, "reward": 0.2614583671092987, "reward_std": 0.07776083797216415, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.16145833861082792, "step": 194 }, { "completion_length": 329.9791717529297, "epoch": 3.26890756302521, "grad_norm": 0.6213682420460217, "kl": 0.05126953125, "learning_rate": 4.647307604456674e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 196 }, { "completion_length": 278.6614646911621, "epoch": 3.302521008403361, "grad_norm": 0.09446674309058874, "kl": 0.046905517578125, "learning_rate": 4.6389697052407526e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 198 }, { "completion_length": 392.9166793823242, "epoch": 3.3361344537815127, "grad_norm": 0.3130859963847107, "kl": 0.03375244140625, "learning_rate": 4.630542059139923e-07, "loss": 0.0, "reward": 0.12083334848284721, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.02083333395421505, "step": 200 }, { "completion_length": 364.2916717529297, "epoch": 3.369747899159664, "grad_norm": 0.05531185206394403, "kl": 0.030609130859375, "learning_rate": 4.622025019761336e-07, "loss": 0.0, "reward": 0.0989583358168602, "reward_std": 0.004166666883975267, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.0, "step": 202 }, { "completion_length": 231.47917938232422, "epoch": 3.403361344537815, "grad_norm": 0.11378729038977049, "kl": 0.052581787109375, "learning_rate": 4.613418944462906e-07, "loss": 0.0001, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 204 }, { "completion_length": 327.32292556762695, "epoch": 3.4369747899159666, "grad_norm": 0.001937226818489755, "kl": 0.0352935791015625, "learning_rate": 4.6047241943383173e-07, "loss": 0.0, "reward": 0.13125000894069672, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 206 }, { "completion_length": 218.39063262939453, "epoch": 3.4705882352941178, "grad_norm": 0.41454143509770136, "kl": 0.10504150390625, "learning_rate": 4.5959411342018704e-07, "loss": 0.0001, "reward": 0.19895834475755692, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.09895833348855376, "step": 208 }, { "completion_length": 375.73959732055664, "epoch": 3.504201680672269, "grad_norm": 0.41487639917807506, "kl": 0.04254150390625, "learning_rate": 4.5870701325731773e-07, "loss": 0.0, "reward": 0.10989583656191826, "reward_std": 0.04375000135041773, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.010416666977107525, "step": 210 }, { "completion_length": 222.1614646911621, "epoch": 3.53781512605042, "grad_norm": 0.004711423015289795, "kl": 0.0560302734375, "learning_rate": 4.578111561661702e-07, "loss": 0.0001, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 212 }, { "completion_length": 293.6510543823242, "epoch": 3.571428571428571, "grad_norm": 0.0047083814929660085, "kl": 0.0430908203125, "learning_rate": 4.569065797351135e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 214 }, { "completion_length": 347.82813262939453, "epoch": 3.6050420168067228, "grad_norm": 0.001882710598052609, "kl": 0.03912353515625, "learning_rate": 4.559933219183631e-07, "loss": 0.0, "reward": 0.14166667312383652, "reward_std": 0.07453560084104538, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0416666679084301, "step": 216 }, { "completion_length": 392.2239685058594, "epoch": 3.638655462184874, "grad_norm": 0.0015965613721868007, "kl": 0.02435302734375, "learning_rate": 4.550714210343879e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 218 }, { "completion_length": 247.1510467529297, "epoch": 3.6722689075630255, "grad_norm": 0.9358996531947065, "kl": 0.08123779296875, "learning_rate": 4.541409157643027e-07, "loss": 0.0001, "reward": 0.15208334103226662, "reward_std": 0.1358619760721922, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.05208333348855376, "step": 220 }, { "completion_length": 341.96356201171875, "epoch": 3.7058823529411766, "grad_norm": 0.5471411305666429, "kl": 0.05560302734375, "learning_rate": 4.5320184515024493e-07, "loss": 0.0001, "reward": 0.16041667759418488, "reward_std": 0.040994580602273345, "rewards/format_reward_func": 0.09791668318212032, "rewards/solution_reward_func": 0.0625, "step": 222 }, { "completion_length": 300.1354293823242, "epoch": 3.7394957983193278, "grad_norm": 0.0034163686519730406, "kl": 0.0570068359375, "learning_rate": 4.5225424859373684e-07, "loss": 0.0001, "reward": 0.2510416880249977, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 224 }, { "completion_length": 310.88542556762695, "epoch": 3.773109243697479, "grad_norm": 0.3166757367914985, "kl": 0.050567626953125, "learning_rate": 4.51298165854032e-07, "loss": 0.0001, "reward": 0.2614583484828472, "reward_std": 0.06250000186264515, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.16145833348855376, "step": 226 }, { "completion_length": 286.53125762939453, "epoch": 3.80672268907563, "grad_norm": 0.003343243746434635, "kl": 0.04010009765625, "learning_rate": 4.503336370464475e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 228 }, { "completion_length": 347.9323043823242, "epoch": 3.8403361344537816, "grad_norm": 0.3148793125351533, "kl": 0.041290283203125, "learning_rate": 4.4936070264068016e-07, "loss": 0.0, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 230 }, { "completion_length": 377.4322967529297, "epoch": 3.8739495798319328, "grad_norm": 0.20834477983630245, "kl": 0.04046630859375, "learning_rate": 4.4837940345910917e-07, "loss": 0.0, "reward": 0.13125001266598701, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 232 }, { "completion_length": 239.21355056762695, "epoch": 3.907563025210084, "grad_norm": 0.0024386771687048563, "kl": 0.05242919921875, "learning_rate": 4.473897806750828e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 234 }, { "completion_length": 275.9479217529297, "epoch": 3.9411764705882355, "grad_norm": 0.6360428477419187, "kl": 0.056121826171875, "learning_rate": 4.4639187581119116e-07, "loss": 0.0001, "reward": 0.11562500894069672, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625000465661287, "step": 236 }, { "completion_length": 306.2395935058594, "epoch": 3.9747899159663866, "grad_norm": 0.004821758450074375, "kl": 0.058837890625, "learning_rate": 4.453857307375236e-07, "loss": 0.0001, "reward": 0.1416666842997074, "reward_std": 0.04303314909338951, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0416666679084301, "step": 238 }, { "completion_length": 294.2916768391927, "epoch": 4.0, "grad_norm": 0.004664537962938436, "kl": 0.048014322916666664, "learning_rate": 4.443713876699123e-07, "loss": 0.0, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 240 }, { "completion_length": 340.10939025878906, "epoch": 4.033613445378151, "grad_norm": 0.27486536874625467, "kl": 0.0467529296875, "learning_rate": 4.433488891681609e-07, "loss": 0.0, "reward": 0.13593750447034836, "reward_std": 0.0447789600584656, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0364583358168602, "step": 242 }, { "completion_length": 354.17188262939453, "epoch": 4.067226890756302, "grad_norm": 0.806685605805211, "kl": 0.0802001953125, "learning_rate": 4.423182781342588e-07, "loss": 0.0001, "reward": 0.229166679084301, "reward_std": 0.13723588176071644, "rewards/format_reward_func": 0.09895835071802139, "rewards/solution_reward_func": 0.13020833348855376, "step": 244 }, { "completion_length": 377.2708435058594, "epoch": 4.100840336134453, "grad_norm": 0.001932432667307889, "kl": 0.04119873046875, "learning_rate": 4.412795978105807e-07, "loss": 0.0, "reward": 0.09791667386889458, "reward_std": 0.003726780414581299, "rewards/format_reward_func": 0.09791668318212032, "rewards/solution_reward_func": 0.0, "step": 246 }, { "completion_length": 201.68750762939453, "epoch": 4.1344537815126055, "grad_norm": 0.6661792610194608, "kl": 0.08123779296875, "learning_rate": 4.402328917780728e-07, "loss": 0.0001, "reward": 0.15729167684912682, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0572916679084301, "step": 248 }, { "completion_length": 186.31250762939453, "epoch": 4.168067226890757, "grad_norm": 0.7529731375310115, "kl": 0.0986328125, "learning_rate": 4.391782039544238e-07, "loss": 0.0001, "reward": 0.11562500521540642, "reward_std": 0.06250000186264515, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625000465661287, "step": 250 }, { "completion_length": 318.3229293823242, "epoch": 4.201680672268908, "grad_norm": 0.6265542764192513, "kl": 0.0982666015625, "learning_rate": 4.381155785922225e-07, "loss": 0.0001, "reward": 0.18854167684912682, "reward_std": 0.12602896243333817, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08854166697710752, "step": 252 }, { "completion_length": 232.78125381469727, "epoch": 4.235294117647059, "grad_norm": 0.004504345225298719, "kl": 0.0882568359375, "learning_rate": 4.37045060277101e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 254 }, { "completion_length": 218.16667556762695, "epoch": 4.26890756302521, "grad_norm": 0.00494520880194292, "kl": 0.0882568359375, "learning_rate": 4.3596669392586363e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 256 }, { "completion_length": 286.3020896911621, "epoch": 4.302521008403361, "grad_norm": 0.003908254855839671, "kl": 0.070556640625, "learning_rate": 4.348805247846027e-07, "loss": 0.0001, "reward": 0.11562500894069672, "reward_std": 0.049297086894512177, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 258 }, { "completion_length": 244.34375762939453, "epoch": 4.336134453781512, "grad_norm": 0.45940985940622603, "kl": 0.0684814453125, "learning_rate": 4.337865984268001e-07, "loss": 0.0001, "reward": 0.14687501266598701, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.046875, "step": 260 }, { "completion_length": 287.06250381469727, "epoch": 4.369747899159664, "grad_norm": 0.00874214776920602, "kl": 0.06689453125, "learning_rate": 4.326849607514148e-07, "loss": 0.0001, "reward": 0.13072917610406876, "reward_std": 0.04375000135041773, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.03125, "step": 262 }, { "completion_length": 295.40625762939453, "epoch": 4.4033613445378155, "grad_norm": 0.4428610572315149, "kl": 0.06109619140625, "learning_rate": 4.3157565798095746e-07, "loss": 0.0001, "reward": 0.1625000163912773, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.06250000139698386, "step": 264 }, { "completion_length": 232.04167556762695, "epoch": 4.436974789915967, "grad_norm": 0.005178787079259263, "kl": 0.08477783203125, "learning_rate": 4.304587366595505e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 266 }, { "completion_length": 309.6614685058594, "epoch": 4.470588235294118, "grad_norm": 0.37164177033937396, "kl": 0.0654296875, "learning_rate": 4.293342436509756e-07, "loss": 0.0001, "reward": 0.17760417610406876, "reward_std": 0.022916667396202683, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.078125, "step": 268 }, { "completion_length": 300.76562881469727, "epoch": 4.504201680672269, "grad_norm": 0.4431821324059999, "kl": 0.0758056640625, "learning_rate": 4.282022261367073e-07, "loss": 0.0001, "reward": 0.13125000894069672, "reward_std": 0.06718548387289047, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 270 }, { "completion_length": 294.60417556762695, "epoch": 4.53781512605042, "grad_norm": 0.007990926426986908, "kl": 0.09228515625, "learning_rate": 4.2706273161393326e-07, "loss": 0.0001, "reward": 0.25052086263895035, "reward_std": 0.05138042033649981, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1510416716337204, "step": 272 }, { "completion_length": 300.7135467529297, "epoch": 4.571428571428571, "grad_norm": 0.005388618453356045, "kl": 0.07293701171875, "learning_rate": 4.259158078935615e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 274 }, { "completion_length": 328.0885543823242, "epoch": 4.605042016806722, "grad_norm": 0.4535545707288815, "kl": 0.06951904296875, "learning_rate": 4.2476150309821437e-07, "loss": 0.0001, "reward": 0.1364583484828472, "reward_std": 0.06835655122995377, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.036458334885537624, "step": 276 }, { "completion_length": 346.4791831970215, "epoch": 4.6386554621848735, "grad_norm": 0.003400716750454624, "kl": 0.054901123046875, "learning_rate": 4.235998656602091e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 278 }, { "completion_length": 420.37500762939453, "epoch": 4.6722689075630255, "grad_norm": 0.005817124828446267, "kl": 0.053955078125, "learning_rate": 4.2243094431952607e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 280 }, { "completion_length": 316.5520935058594, "epoch": 4.705882352941177, "grad_norm": 0.6998918184411341, "kl": 0.07061767578125, "learning_rate": 4.2125478812176363e-07, "loss": 0.0001, "reward": 0.12604167312383652, "reward_std": 0.07525940984487534, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.026041666977107525, "step": 282 }, { "completion_length": 358.15626525878906, "epoch": 4.739495798319328, "grad_norm": 0.4308722420669692, "kl": 0.084716796875, "learning_rate": 4.2007144641608035e-07, "loss": 0.0001, "reward": 0.1520833522081375, "reward_std": 0.07893446832895279, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.05208333395421505, "step": 284 }, { "completion_length": 344.07813262939453, "epoch": 4.773109243697479, "grad_norm": 0.004113712525176569, "kl": 0.06121826171875, "learning_rate": 4.188809688531241e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 286 }, { "completion_length": 234.23438262939453, "epoch": 4.80672268907563, "grad_norm": 0.5112298113873388, "kl": 0.091796875, "learning_rate": 4.1768340538294914e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 288 }, { "completion_length": 317.43751525878906, "epoch": 4.840336134453781, "grad_norm": 0.004962546541418084, "kl": 0.0760498046875, "learning_rate": 4.1647880625292027e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 290 }, { "completion_length": 315.85417556762695, "epoch": 4.873949579831933, "grad_norm": 0.4514363360707906, "kl": 0.06011962890625, "learning_rate": 4.1526722200560436e-07, "loss": 0.0001, "reward": 0.2291666865348816, "reward_std": 0.09517661295831203, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.1302083358168602, "step": 292 }, { "completion_length": 364.1354293823242, "epoch": 4.907563025210084, "grad_norm": 0.00292400682887999, "kl": 0.05596923828125, "learning_rate": 4.140487034766499e-07, "loss": 0.0001, "reward": 0.13645834103226662, "reward_std": 0.07086054235696793, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03645833395421505, "step": 294 }, { "completion_length": 410.1666717529297, "epoch": 4.9411764705882355, "grad_norm": 0.004090386239142373, "kl": 0.05633544921875, "learning_rate": 4.1282330179265377e-07, "loss": 0.0001, "reward": 0.12083334103226662, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.02083333395421505, "step": 296 }, { "completion_length": 356.8541793823242, "epoch": 4.974789915966387, "grad_norm": 0.4926778113410085, "kl": 0.08074951171875, "learning_rate": 4.115910683690167e-07, "loss": 0.0001, "reward": 0.11510417237877846, "reward_std": 0.05138041847385466, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.015625000465661287, "step": 298 }, { "completion_length": 313.0555725097656, "epoch": 5.0, "grad_norm": 0.692508798855638, "kl": 0.07649739583333333, "learning_rate": 4.1035205490778496e-07, "loss": 0.0001, "reward": 0.17638889948527017, "reward_std": 0.05319040020306905, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.07638889054457347, "step": 300 }, { "completion_length": 331.9479217529297, "epoch": 5.033613445378151, "grad_norm": 0.003976789504191413, "kl": 0.05645751953125, "learning_rate": 4.09106313395482e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 302 }, { "completion_length": 370.62500762939453, "epoch": 5.067226890756302, "grad_norm": 0.22704491806065916, "kl": 0.06939697265625, "learning_rate": 4.078538961009268e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 304 }, { "completion_length": 228.07291793823242, "epoch": 5.100840336134453, "grad_norm": 0.4237533414058654, "kl": 0.10888671875, "learning_rate": 4.0659485557304047e-07, "loss": 0.0001, "reward": 0.18854168057441711, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08854166930541396, "step": 306 }, { "completion_length": 233.2604217529297, "epoch": 5.1344537815126055, "grad_norm": 1.0261194882547862, "kl": 0.08538818359375, "learning_rate": 4.0532924463864214e-07, "loss": 0.0001, "reward": 0.339583370834589, "reward_std": 0.18413009867072105, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.23958334140479565, "step": 308 }, { "completion_length": 328.6041717529297, "epoch": 5.168067226890757, "grad_norm": 0.5315207527005077, "kl": 0.05841064453125, "learning_rate": 4.040571164002318e-07, "loss": 0.0001, "reward": 0.17291668057441711, "reward_std": 0.07893446832895279, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.07291666977107525, "step": 310 }, { "completion_length": 289.46876335144043, "epoch": 5.201680672268908, "grad_norm": 0.005494587051281489, "kl": 0.0968017578125, "learning_rate": 4.027785242337625e-07, "loss": 0.0001, "reward": 0.2562500275671482, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15625, "step": 312 }, { "completion_length": 407.2552261352539, "epoch": 5.235294117647059, "grad_norm": 0.23555061577036132, "kl": 0.0577392578125, "learning_rate": 4.0149352178640084e-07, "loss": 0.0001, "reward": 0.1625000163912773, "reward_std": 0.0803009495139122, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.06250000186264515, "step": 314 }, { "completion_length": 301.6197967529297, "epoch": 5.26890756302521, "grad_norm": 0.44699271790119377, "kl": 0.0704345703125, "learning_rate": 4.002021629742759e-07, "loss": 0.0001, "reward": 0.2458333596587181, "reward_std": 0.11105217784643173, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333395421505, "step": 316 }, { "completion_length": 274.4791793823242, "epoch": 5.302521008403361, "grad_norm": 0.003803966426898332, "kl": 0.0848388671875, "learning_rate": 3.9890450198021705e-07, "loss": 0.0001, "reward": 0.2302083559334278, "reward_std": 0.0625, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1302083432674408, "step": 318 }, { "completion_length": 385.4010581970215, "epoch": 5.336134453781512, "grad_norm": 0.005520512930507831, "kl": 0.062774658203125, "learning_rate": 3.9760059325148063e-07, "loss": 0.0001, "reward": 0.18281251564621925, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0833333358168602, "step": 320 }, { "completion_length": 299.29688262939453, "epoch": 5.369747899159664, "grad_norm": 0.005787108659773324, "kl": 0.082275390625, "learning_rate": 3.9629049149746556e-07, "loss": 0.0001, "reward": 0.12604167684912682, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0260416679084301, "step": 322 }, { "completion_length": 219.5208396911621, "epoch": 5.4033613445378155, "grad_norm": 0.003977844952312375, "kl": 0.09033203125, "learning_rate": 3.949742516874175e-07, "loss": 0.0001, "reward": 0.12604168057441711, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0260416679084301, "step": 324 }, { "completion_length": 259.7239685058594, "epoch": 5.436974789915967, "grad_norm": 0.7332299676680101, "kl": 0.0986328125, "learning_rate": 3.9365192904812263e-07, "loss": 0.0001, "reward": 0.12604167312383652, "reward_std": 0.09096375480294228, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.026041666977107525, "step": 326 }, { "completion_length": 325.2291717529297, "epoch": 5.470588235294118, "grad_norm": 0.21639002927451828, "kl": 0.072021484375, "learning_rate": 3.9232357906159065e-07, "loss": 0.0001, "reward": 0.1677083522081375, "reward_std": 0.08436229452490807, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 328 }, { "completion_length": 275.8281364440918, "epoch": 5.504201680672269, "grad_norm": 0.4156972740588642, "kl": 0.07879638671875, "learning_rate": 3.909892574627266e-07, "loss": 0.0001, "reward": 0.16250000894069672, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.06250000139698386, "step": 330 }, { "completion_length": 312.4322967529297, "epoch": 5.53781512605042, "grad_norm": 0.004431438650385246, "kl": 0.07269287109375, "learning_rate": 3.8964902023699234e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 332 }, { "completion_length": 298.65625762939453, "epoch": 5.571428571428571, "grad_norm": 0.026171564181720158, "kl": 0.0762939453125, "learning_rate": 3.8830292361805767e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 334 }, { "completion_length": 301.5937614440918, "epoch": 5.605042016806722, "grad_norm": 0.5635842425763482, "kl": 0.091796875, "learning_rate": 3.869510240854407e-07, "loss": 0.0001, "reward": 0.12083334103226662, "reward_std": 0.054426075890660286, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.020833333488553762, "step": 336 }, { "completion_length": 241.86458587646484, "epoch": 5.6386554621848735, "grad_norm": 0.0073095019134156135, "kl": 0.124267578125, "learning_rate": 3.855933783621383e-07, "loss": 0.0001, "reward": 0.14687501266598701, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.046875, "step": 338 }, { "completion_length": 371.3073043823242, "epoch": 5.6722689075630255, "grad_norm": 0.0027847795037786055, "kl": 0.066162109375, "learning_rate": 3.8423004341224595e-07, "loss": 0.0001, "reward": 0.11562500894069672, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.015625, "step": 340 }, { "completion_length": 326.3593864440918, "epoch": 5.705882352941177, "grad_norm": 0.004290759127866285, "kl": 0.08599853515625, "learning_rate": 3.828610764385676e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 342 }, { "completion_length": 364.30731201171875, "epoch": 5.739495798319328, "grad_norm": 0.6183588751246543, "kl": 0.073486328125, "learning_rate": 3.8148653488021566e-07, "loss": 0.0001, "reward": 0.16718750819563866, "reward_std": 0.06018446781672537, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.06770833348855376, "step": 344 }, { "completion_length": 303.52605056762695, "epoch": 5.773109243697479, "grad_norm": 0.01688718911460057, "kl": 0.097412109375, "learning_rate": 3.801064764102011e-07, "loss": 0.0001, "reward": 0.14114584401249886, "reward_std": 0.04511648043990135, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0416666679084301, "step": 346 }, { "completion_length": 255.6666717529297, "epoch": 5.80672268907563, "grad_norm": 0.004030938738462692, "kl": 0.0947265625, "learning_rate": 3.787209589330134e-07, "loss": 0.0001, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 348 }, { "completion_length": 361.8020935058594, "epoch": 5.840336134453781, "grad_norm": 0.0035375041184368787, "kl": 0.09124755859375, "learning_rate": 3.773300405821908e-07, "loss": 0.0001, "reward": 0.2354166842997074, "reward_std": 0.06072613224387169, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 350 }, { "completion_length": 336.4270935058594, "epoch": 5.873949579831933, "grad_norm": 0.007175603878940048, "kl": 0.0863037109375, "learning_rate": 3.759337797178816e-07, "loss": 0.0001, "reward": 0.2145833484828472, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1145833358168602, "step": 352 }, { "completion_length": 253.83855056762695, "epoch": 5.907563025210084, "grad_norm": 0.7332468682916561, "kl": 0.10400390625, "learning_rate": 3.745322349243954e-07, "loss": 0.0001, "reward": 0.17239584773778915, "reward_std": 0.10584261477924883, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0729166716337204, "step": 354 }, { "completion_length": 270.6041831970215, "epoch": 5.9411764705882355, "grad_norm": 0.5812090123487667, "kl": 0.09564208984375, "learning_rate": 3.7312546500774455e-07, "loss": 0.0001, "reward": 0.21979167684912682, "reward_std": 0.09976780228316784, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.11979166930541396, "step": 356 }, { "completion_length": 347.25000762939453, "epoch": 5.974789915966387, "grad_norm": 0.002938832703303519, "kl": 0.068603515625, "learning_rate": 3.717135289931774e-07, "loss": 0.0001, "reward": 0.20416668057441711, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.10416666977107525, "step": 358 }, { "completion_length": 310.7708333333333, "epoch": 6.0, "grad_norm": 0.3977959168312144, "kl": 0.078857421875, "learning_rate": 3.7029648612270123e-07, "loss": 0.0001, "reward": 0.12083334227403005, "reward_std": 0.04479032258192698, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.020833333333333332, "step": 360 }, { "completion_length": 353.5000114440918, "epoch": 6.033613445378151, "grad_norm": 0.14776976897161848, "kl": 0.0682373046875, "learning_rate": 3.688743958525969e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 362 }, { "completion_length": 227.23959350585938, "epoch": 6.067226890756302, "grad_norm": 0.19290693481758328, "kl": 0.1162109375, "learning_rate": 3.6744731785092393e-07, "loss": 0.0001, "reward": 0.2927083522081375, "reward_std": 0.05810113437473774, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1927083390764892, "step": 364 }, { "completion_length": 244.61459350585938, "epoch": 6.100840336134453, "grad_norm": 0.26807458396276374, "kl": 0.093994140625, "learning_rate": 3.660153119950171e-07, "loss": 0.0001, "reward": 0.1364583484828472, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0364583358168602, "step": 366 }, { "completion_length": 289.4583435058594, "epoch": 6.1344537815126055, "grad_norm": 0.3074715915927646, "kl": 0.07244873046875, "learning_rate": 3.6457843836897417e-07, "loss": 0.0001, "reward": 0.1781250163912773, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.07812500512227416, "step": 368 }, { "completion_length": 417.2812728881836, "epoch": 6.168067226890757, "grad_norm": 0.0025652428149402157, "kl": 0.052825927734375, "learning_rate": 3.6313675726113475e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 370 }, { "completion_length": 360.14062881469727, "epoch": 6.201680672268908, "grad_norm": 0.02809673866920517, "kl": 0.0933837890625, "learning_rate": 3.6169032916155055e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 372 }, { "completion_length": 334.81251525878906, "epoch": 6.235294117647059, "grad_norm": 0.008321765133696193, "kl": 0.06768798828125, "learning_rate": 3.602392147594479e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 374 }, { "completion_length": 321.2291679382324, "epoch": 6.26890756302521, "grad_norm": 0.005269844482915654, "kl": 0.0859375, "learning_rate": 3.587834749406808e-07, "loss": 0.0001, "reward": 0.1364583484828472, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0364583358168602, "step": 376 }, { "completion_length": 300.1302185058594, "epoch": 6.302521008403361, "grad_norm": 0.08155198298691421, "kl": 0.10186767578125, "learning_rate": 3.573231707851765e-07, "loss": 0.0001, "reward": 0.18697917088866234, "reward_std": 0.08772156853228807, "rewards/format_reward_func": 0.09843751788139343, "rewards/solution_reward_func": 0.0885416716337204, "step": 378 }, { "completion_length": 315.4114685058594, "epoch": 6.336134453781512, "grad_norm": 0.4573624849299947, "kl": 0.07080078125, "learning_rate": 3.558583635643726e-07, "loss": 0.0001, "reward": 0.16250001266598701, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 380 }, { "completion_length": 354.03125762939453, "epoch": 6.369747899159664, "grad_norm": 0.7816242785406106, "kl": 0.07342529296875, "learning_rate": 3.543891147386463e-07, "loss": 0.0001, "reward": 0.22968751937150955, "reward_std": 0.06458333507180214, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1302083358168602, "step": 382 }, { "completion_length": 306.90625, "epoch": 6.4033613445378155, "grad_norm": 0.0031974749249534733, "kl": 0.072998046875, "learning_rate": 3.52915485954736e-07, "loss": 0.0001, "reward": 0.1885416842997074, "reward_std": 0.062056493014097214, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0885416716337204, "step": 384 }, { "completion_length": 340.9687614440918, "epoch": 6.436974789915967, "grad_norm": 0.4423536638302119, "kl": 0.0994873046875, "learning_rate": 3.514375390431539e-07, "loss": 0.0001, "reward": 0.1520833484828472, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0520833358168602, "step": 386 }, { "completion_length": 269.3177185058594, "epoch": 6.470588235294118, "grad_norm": 0.4817681621127007, "kl": 0.084228515625, "learning_rate": 3.4995533601559225e-07, "loss": 0.0001, "reward": 0.20416668057441711, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.10416666977107525, "step": 388 }, { "completion_length": 253.63542556762695, "epoch": 6.504201680672269, "grad_norm": 0.3361832205973872, "kl": 0.087158203125, "learning_rate": 3.484689390623218e-07, "loss": 0.0001, "reward": 0.3239583633840084, "reward_std": 0.1280868798494339, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2239583432674408, "step": 390 }, { "completion_length": 258.3333396911621, "epoch": 6.53781512605042, "grad_norm": 0.5412333179913434, "kl": 0.093505859375, "learning_rate": 3.469784105495816e-07, "loss": 0.0001, "reward": 0.3343750275671482, "reward_std": 0.08656488917768002, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2343750074505806, "step": 392 }, { "completion_length": 286.1302185058594, "epoch": 6.571428571428571, "grad_norm": 0.4459051543272153, "kl": 0.08331298828125, "learning_rate": 3.4548381301696295e-07, "loss": 0.0001, "reward": 0.2822916954755783, "reward_std": 0.06835655122995377, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1822916753590107, "step": 394 }, { "completion_length": 301.5052146911621, "epoch": 6.605042016806722, "grad_norm": 0.39392024757824967, "kl": 0.0714111328125, "learning_rate": 3.4398520917478476e-07, "loss": 0.0001, "reward": 0.12083334848284721, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.02083333395421505, "step": 396 }, { "completion_length": 258.60937881469727, "epoch": 6.6386554621848735, "grad_norm": 0.005544019530604207, "kl": 0.1090087890625, "learning_rate": 3.42482661901463e-07, "loss": 0.0001, "reward": 0.1989583484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0989583358168602, "step": 398 }, { "completion_length": 251.80209350585938, "epoch": 6.6722689075630255, "grad_norm": 0.4716520013640562, "kl": 0.1103515625, "learning_rate": 3.409762342408719e-07, "loss": 0.0001, "reward": 0.15729168057441711, "reward_std": 0.07115937769412994, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.057291666977107525, "step": 400 }, { "completion_length": 310.75000762939453, "epoch": 6.705882352941177, "grad_norm": 0.003255524037399874, "kl": 0.0640869140625, "learning_rate": 3.3946598939969893e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 402 }, { "completion_length": 237.4323024749756, "epoch": 6.739495798319328, "grad_norm": 0.008504424460508943, "kl": 0.1221923828125, "learning_rate": 3.379519907447931e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 404 }, { "completion_length": 367.09375762939453, "epoch": 6.773109243697479, "grad_norm": 0.004187864124011303, "kl": 0.0701904296875, "learning_rate": 3.364343018005057e-07, "loss": 0.0001, "reward": 0.2041666880249977, "reward_std": 0.0762883685529232, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1041666716337204, "step": 406 }, { "completion_length": 304.4218864440918, "epoch": 6.80672268907563, "grad_norm": 0.5787228811432347, "kl": 0.10296630859375, "learning_rate": 3.349129862460251e-07, "loss": 0.0001, "reward": 0.15208334475755692, "reward_std": 0.06352896057069302, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.05208333348855376, "step": 408 }, { "completion_length": 345.68750762939453, "epoch": 6.840336134453781, "grad_norm": 0.538023000721197, "kl": 0.0804443359375, "learning_rate": 3.3338810791270517e-07, "loss": 0.0001, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 410 }, { "completion_length": 300.2135543823242, "epoch": 6.873949579831933, "grad_norm": 0.46559225096079665, "kl": 0.06329345703125, "learning_rate": 3.318597307813866e-07, "loss": 0.0001, "reward": 0.2458333522081375, "reward_std": 0.054426075890660286, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333358168602, "step": 412 }, { "completion_length": 346.2552185058594, "epoch": 6.907563025210084, "grad_norm": 0.7156195718305587, "kl": 0.0623779296875, "learning_rate": 3.3032791897971307e-07, "loss": 0.0001, "reward": 0.1937500163912773, "reward_std": 0.08288982696831226, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.09375000512227416, "step": 414 }, { "completion_length": 356.2552146911621, "epoch": 6.9411764705882355, "grad_norm": 0.5889271668975388, "kl": 0.06939697265625, "learning_rate": 3.287927367794397e-07, "loss": 0.0001, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 416 }, { "completion_length": 274.0052146911621, "epoch": 6.974789915966387, "grad_norm": 0.005902260993884556, "kl": 0.0972900390625, "learning_rate": 3.272542485937368e-07, "loss": 0.0001, "reward": 0.1677083484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 418 }, { "completion_length": 419.5208333333333, "epoch": 7.0, "grad_norm": 0.5277366247449402, "kl": 0.07820638020833333, "learning_rate": 3.2571251897448763e-07, "loss": 0.0001, "reward": 0.19722224275271097, "reward_std": 0.03795166810353597, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.09722222884496053, "step": 420 }, { "completion_length": 259.93750762939453, "epoch": 7.033613445378151, "grad_norm": 0.5009669125967887, "kl": 0.0908203125, "learning_rate": 3.241676126095792e-07, "loss": 0.0001, "reward": 0.16250001266598701, "reward_std": 0.08258842676877975, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 422 }, { "completion_length": 282.2083435058594, "epoch": 7.067226890756302, "grad_norm": 0.6550871349737686, "kl": 0.0823974609375, "learning_rate": 3.226195943201883e-07, "loss": 0.0001, "reward": 0.3187500089406967, "reward_std": 0.08288982883095741, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2187500074505806, "step": 424 }, { "completion_length": 354.31250762939453, "epoch": 7.100840336134453, "grad_norm": 0.00397697663282839, "kl": 0.06005859375, "learning_rate": 3.2106852905806216e-07, "loss": 0.0001, "reward": 0.09791667386889458, "reward_std": 0.003726780414581299, "rewards/format_reward_func": 0.09791668318212032, "rewards/solution_reward_func": 0.0, "step": 426 }, { "completion_length": 241.1197967529297, "epoch": 7.1344537815126055, "grad_norm": 0.006003511711751473, "kl": 0.11328125, "learning_rate": 3.1951448190279253e-07, "loss": 0.0001, "reward": 0.1989583484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0989583358168602, "step": 428 }, { "completion_length": 330.20833587646484, "epoch": 7.168067226890757, "grad_norm": 0.0032583513701371293, "kl": 0.072998046875, "learning_rate": 3.179575180590857e-07, "loss": 0.0001, "reward": 0.2197916880249977, "reward_std": 0.07115937769412994, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1197916716337204, "step": 430 }, { "completion_length": 245.54687881469727, "epoch": 7.201680672268908, "grad_norm": 0.004803635863370439, "kl": 0.08544921875, "learning_rate": 3.163977028540263e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 432 }, { "completion_length": 278.78125762939453, "epoch": 7.235294117647059, "grad_norm": 0.007489149847475061, "kl": 0.074951171875, "learning_rate": 3.1483510173433627e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 434 }, { "completion_length": 420.5364685058594, "epoch": 7.26890756302521, "grad_norm": 0.0035200468804183526, "kl": 0.0665283203125, "learning_rate": 3.1326978026362905e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 436 }, { "completion_length": 242.71354866027832, "epoch": 7.302521008403361, "grad_norm": 0.7127436161995736, "kl": 0.1019287109375, "learning_rate": 3.1170180411965854e-07, "loss": 0.0001, "reward": 0.3656250350177288, "reward_std": 0.17060723900794983, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2656250149011612, "step": 438 }, { "completion_length": 281.33855056762695, "epoch": 7.336134453781512, "grad_norm": 0.00892945104511247, "kl": 0.08880615234375, "learning_rate": 3.101312390915634e-07, "loss": 0.0001, "reward": 0.16250001266598701, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 440 }, { "completion_length": 320.5520935058594, "epoch": 7.369747899159664, "grad_norm": 0.4692012635202913, "kl": 0.07550048828125, "learning_rate": 3.0855815107710665e-07, "loss": 0.0001, "reward": 0.1989583484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0989583358168602, "step": 442 }, { "completion_length": 352.53125762939453, "epoch": 7.4033613445378155, "grad_norm": 0.0032313000404080574, "kl": 0.0660400390625, "learning_rate": 3.069826060799109e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 444 }, { "completion_length": 390.0989685058594, "epoch": 7.436974789915967, "grad_norm": 0.5979586278915581, "kl": 0.05596923828125, "learning_rate": 3.054046702066886e-07, "loss": 0.0001, "reward": 0.1677083484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 446 }, { "completion_length": 259.80208587646484, "epoch": 7.470588235294118, "grad_norm": 0.42251764106374134, "kl": 0.086669921875, "learning_rate": 3.038244096644687e-07, "loss": 0.0001, "reward": 0.19322917982935905, "reward_std": 0.03054708451963961, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.09375000279396772, "step": 448 }, { "completion_length": 373.07814025878906, "epoch": 7.504201680672269, "grad_norm": 0.6367532522463891, "kl": 0.058319091796875, "learning_rate": 3.022418907578188e-07, "loss": 0.0001, "reward": 0.2250000238418579, "reward_std": 0.07013041898608208, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1250000074505806, "step": 450 }, { "completion_length": 346.81771087646484, "epoch": 7.53781512605042, "grad_norm": 0.0032189297282872124, "kl": 0.06268310546875, "learning_rate": 3.0065717988606256e-07, "loss": 0.0001, "reward": 0.23020834475755692, "reward_std": 0.06250000186264515, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1302083358168602, "step": 452 }, { "completion_length": 224.6302146911621, "epoch": 7.571428571428571, "grad_norm": 0.0070145705742728215, "kl": 0.1085205078125, "learning_rate": 2.990703435404944e-07, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 454 }, { "completion_length": 311.1198043823242, "epoch": 7.605042016806722, "grad_norm": 0.002328189721551877, "kl": 0.0562744140625, "learning_rate": 2.974814483015892e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 456 }, { "completion_length": 230.9791717529297, "epoch": 7.6386554621848735, "grad_norm": 0.6164935504116354, "kl": 0.083984375, "learning_rate": 2.95890560836209e-07, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.06573155522346497, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 458 }, { "completion_length": 176.54167366027832, "epoch": 7.6722689075630255, "grad_norm": 0.5822173835565376, "kl": 0.1627197265625, "learning_rate": 2.942977478948057e-07, "loss": 0.0002, "reward": 0.3500000163912773, "reward_std": 0.09609274379909039, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.25, "step": 460 }, { "completion_length": 200.66146087646484, "epoch": 7.705882352941177, "grad_norm": 0.47279850730171785, "kl": 0.0819091796875, "learning_rate": 2.9270307630862006e-07, "loss": 0.0001, "reward": 0.3395833633840084, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 462 }, { "completion_length": 387.1927185058594, "epoch": 7.739495798319328, "grad_norm": 0.34987458156755596, "kl": 0.065704345703125, "learning_rate": 2.911066129868782e-07, "loss": 0.0001, "reward": 0.3343750312924385, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2343750149011612, "step": 464 }, { "completion_length": 213.9791717529297, "epoch": 7.773109243697479, "grad_norm": 0.010540591350387818, "kl": 0.1251220703125, "learning_rate": 2.8950842491398355e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 466 }, { "completion_length": 298.5156364440918, "epoch": 7.80672268907563, "grad_norm": 0.375936919745997, "kl": 0.0787353515625, "learning_rate": 2.87908579146707e-07, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 468 }, { "completion_length": 342.28125762939453, "epoch": 7.840336134453781, "grad_norm": 0.010771183271059725, "kl": 0.0621337890625, "learning_rate": 2.863071428113726e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 470 }, { "completion_length": 378.4948081970215, "epoch": 7.873949579831933, "grad_norm": 0.04834529793389601, "kl": 0.05743408203125, "learning_rate": 2.847041831010417e-07, "loss": 0.0001, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 472 }, { "completion_length": 279.18750762939453, "epoch": 7.907563025210084, "grad_norm": 0.8882661114784222, "kl": 0.067138671875, "learning_rate": 2.830997672726933e-07, "loss": 0.0001, "reward": 0.18333334475755692, "reward_std": 0.07013042084872723, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08333333348855376, "step": 474 }, { "completion_length": 257.3177185058594, "epoch": 7.9411764705882355, "grad_norm": 0.6232743528149389, "kl": 0.0765380859375, "learning_rate": 2.8149396264440227e-07, "loss": 0.0001, "reward": 0.18854168057441711, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08854166930541396, "step": 476 }, { "completion_length": 233.8072967529297, "epoch": 7.974789915966387, "grad_norm": 0.25884898154635494, "kl": 0.0908203125, "learning_rate": 2.798868365925147e-07, "loss": 0.0001, "reward": 0.18854168057441711, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08854166930541396, "step": 478 }, { "completion_length": 231.5277862548828, "epoch": 8.0, "grad_norm": 0.775511609460672, "kl": 0.09716796875, "learning_rate": 2.782784565488211e-07, "loss": 0.0001, "reward": 0.1965278039375941, "reward_std": 0.11571086446444194, "rewards/format_reward_func": 0.0993055726091067, "rewards/solution_reward_func": 0.0972222238779068, "step": 480 }, { "completion_length": 215.39062881469727, "epoch": 8.033613445378151, "grad_norm": 0.6185701356178138, "kl": 0.0933837890625, "learning_rate": 2.7666888999772656e-07, "loss": 0.0001, "reward": 0.18333334475755692, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08333333348855376, "step": 482 }, { "completion_length": 371.9739685058594, "epoch": 8.067226890756302, "grad_norm": 0.2212655480609394, "kl": 0.05657958984375, "learning_rate": 2.7505820447342024e-07, "loss": 0.0001, "reward": 0.2875000275671482, "reward_std": 0.07013041898608208, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1875000074505806, "step": 484 }, { "completion_length": 278.1718864440918, "epoch": 8.100840336134453, "grad_norm": 0.7701258102106249, "kl": 0.127197265625, "learning_rate": 2.7344646755704073e-07, "loss": 0.0001, "reward": 0.2562500201165676, "reward_std": 0.07013041898608208, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15625000512227416, "step": 486 }, { "completion_length": 308.5833435058594, "epoch": 8.134453781512605, "grad_norm": 0.003463822900695765, "kl": 0.07781982421875, "learning_rate": 2.7183374687384096e-07, "loss": 0.0001, "reward": 0.1989583484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0989583358168602, "step": 488 }, { "completion_length": 400.25001525878906, "epoch": 8.168067226890756, "grad_norm": 0.7010254972886426, "kl": 0.04412841796875, "learning_rate": 2.7022011009035107e-07, "loss": 0.0, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 490 }, { "completion_length": 272.82812881469727, "epoch": 8.201680672268907, "grad_norm": 0.3815532737760802, "kl": 0.08648681640625, "learning_rate": 2.686056249115385e-07, "loss": 0.0001, "reward": 0.3604166954755783, "reward_std": 0.07348554208874702, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2604166753590107, "step": 492 }, { "completion_length": 477.3020935058594, "epoch": 8.235294117647058, "grad_norm": 0.008021431156731727, "kl": 0.046417236328125, "learning_rate": 2.669903590779679e-07, "loss": 0.0, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 494 }, { "completion_length": 277.81250381469727, "epoch": 8.268907563025211, "grad_norm": 0.04266518197489792, "kl": 0.09765625, "learning_rate": 2.653743803629587e-07, "loss": 0.0001, "reward": 0.25052086263895035, "reward_std": 0.03567607537843287, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1510416716337204, "step": 496 }, { "completion_length": 204.90105438232422, "epoch": 8.302521008403362, "grad_norm": 0.5870014548055794, "kl": 0.111083984375, "learning_rate": 2.637577565697412e-07, "loss": 0.0001, "reward": 0.1625000201165676, "reward_std": 0.08258842676877975, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625000037252903, "step": 498 }, { "completion_length": 293.453125, "epoch": 8.336134453781513, "grad_norm": 0.5780343895053853, "kl": 0.1341552734375, "learning_rate": 2.621405555286121e-07, "loss": 0.0001, "reward": 0.235416691750288, "reward_std": 0.101949293166399, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.13541667070239782, "step": 500 }, { "completion_length": 272.40625381469727, "epoch": 8.369747899159664, "grad_norm": 0.0035389874725178733, "kl": 0.08929443359375, "learning_rate": 2.60522845094088e-07, "loss": 0.0001, "reward": 0.2822916880249977, "reward_std": 0.06835655122995377, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1822916716337204, "step": 502 }, { "completion_length": 305.78646087646484, "epoch": 8.403361344537815, "grad_norm": 0.008320591410409525, "kl": 0.0771484375, "learning_rate": 2.589046931420589e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 504 }, { "completion_length": 336.52605056762695, "epoch": 8.436974789915967, "grad_norm": 0.0037538139247778163, "kl": 0.054229736328125, "learning_rate": 2.572861675669399e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 506 }, { "completion_length": 258.96875762939453, "epoch": 8.470588235294118, "grad_norm": 0.0043383543240277725, "kl": 0.11865234375, "learning_rate": 2.556673362788225e-07, "loss": 0.0001, "reward": 0.2145833522081375, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1145833358168602, "step": 508 }, { "completion_length": 294.8437614440918, "epoch": 8.504201680672269, "grad_norm": 0.0033526391772074966, "kl": 0.06878662109375, "learning_rate": 2.540482672006254e-07, "loss": 0.0001, "reward": 0.4125000461935997, "reward_std": 0.07013042084872723, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3125000149011612, "step": 510 }, { "completion_length": 287.7552146911621, "epoch": 8.53781512605042, "grad_norm": 0.006122461290136909, "kl": 0.1024169921875, "learning_rate": 2.524290282652443e-07, "loss": 0.0001, "reward": 0.2666666693985462, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 512 }, { "completion_length": 304.98438262939453, "epoch": 8.571428571428571, "grad_norm": 0.5579592198788391, "kl": 0.08380126953125, "learning_rate": 2.508096874127022e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 514 }, { "completion_length": 277.1041717529297, "epoch": 8.605042016806722, "grad_norm": 0.10526791891414893, "kl": 0.104248046875, "learning_rate": 2.4919031258729785e-07, "loss": 0.0001, "reward": 0.18802084401249886, "reward_std": 0.022916667396202683, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.08854166930541396, "step": 516 }, { "completion_length": 193.20833587646484, "epoch": 8.638655462184873, "grad_norm": 0.9568123429854551, "kl": 0.1287841796875, "learning_rate": 2.475709717347557e-07, "loss": 0.0001, "reward": 0.4281250163912773, "reward_std": 0.0625, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3281250149011612, "step": 518 }, { "completion_length": 418.1770935058594, "epoch": 8.672268907563025, "grad_norm": 0.07005965494562287, "kl": 0.05169677734375, "learning_rate": 2.459517327993746e-07, "loss": 0.0001, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 520 }, { "completion_length": 255.0520896911621, "epoch": 8.705882352941176, "grad_norm": 0.7577414046012017, "kl": 0.09326171875, "learning_rate": 2.443326637211775e-07, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 522 }, { "completion_length": 256.17188262939453, "epoch": 8.739495798319329, "grad_norm": 0.007952853794502448, "kl": 0.07794189453125, "learning_rate": 2.427138324330601e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 524 }, { "completion_length": 348.1458396911621, "epoch": 8.77310924369748, "grad_norm": 0.0034606017743952578, "kl": 0.06988525390625, "learning_rate": 2.4109530685794106e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 526 }, { "completion_length": 295.59375762939453, "epoch": 8.806722689075631, "grad_norm": 0.004754275690798883, "kl": 0.0775146484375, "learning_rate": 2.3947715490591203e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 528 }, { "completion_length": 321.43751525878906, "epoch": 8.840336134453782, "grad_norm": 0.7246512047403145, "kl": 0.091064453125, "learning_rate": 2.37859444471388e-07, "loss": 0.0001, "reward": 0.3135416992008686, "reward_std": 0.09052024409174919, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.213541679084301, "step": 530 }, { "completion_length": 323.1198081970215, "epoch": 8.873949579831933, "grad_norm": 0.4892724553026726, "kl": 0.099609375, "learning_rate": 2.3624224343025876e-07, "loss": 0.0001, "reward": 0.3395833559334278, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833358168602, "step": 532 }, { "completion_length": 289.08334732055664, "epoch": 8.907563025210084, "grad_norm": 0.5480894515702789, "kl": 0.08551025390625, "learning_rate": 2.346256196370413e-07, "loss": 0.0001, "reward": 0.15208334103226662, "reward_std": 0.0762883685529232, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0520833358168602, "step": 534 }, { "completion_length": 265.41667556762695, "epoch": 8.941176470588236, "grad_norm": 0.009228270434359776, "kl": 0.0875244140625, "learning_rate": 2.3300964092203203e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 536 }, { "completion_length": 325.2604179382324, "epoch": 8.974789915966387, "grad_norm": 0.4748058813958341, "kl": 0.07147216796875, "learning_rate": 2.3139437508846152e-07, "loss": 0.0001, "reward": 0.2302083522081375, "reward_std": 0.07086054235696793, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1302083358168602, "step": 538 }, { "completion_length": 305.97223409016925, "epoch": 9.0, "grad_norm": 0.5192242496691418, "kl": 0.08723958333333333, "learning_rate": 2.2977988990964896e-07, "loss": 0.0001, "reward": 0.3013888895511627, "reward_std": 0.04479032258192698, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2013888955116272, "step": 540 }, { "completion_length": 257.2083396911621, "epoch": 9.033613445378151, "grad_norm": 0.7835809708951533, "kl": 0.10400390625, "learning_rate": 2.28166253126159e-07, "loss": 0.0001, "reward": 0.2510416693985462, "reward_std": 0.049297086894512177, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 542 }, { "completion_length": 290.59375762939453, "epoch": 9.067226890756302, "grad_norm": 0.2951830812347451, "kl": 0.0877685546875, "learning_rate": 2.2655353244295927e-07, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 544 }, { "completion_length": 158.9427146911621, "epoch": 9.100840336134453, "grad_norm": 0.8522344807660408, "kl": 0.1275634765625, "learning_rate": 2.2494179552657974e-07, "loss": 0.0001, "reward": 0.3447916805744171, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.244791679084301, "step": 546 }, { "completion_length": 265.7239646911621, "epoch": 9.134453781512605, "grad_norm": 0.005651887468855693, "kl": 0.114013671875, "learning_rate": 2.233311100022734e-07, "loss": 0.0001, "reward": 0.3395833671092987, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833358168602, "step": 548 }, { "completion_length": 281.7239685058594, "epoch": 9.168067226890756, "grad_norm": 0.6098897971418463, "kl": 0.1029052734375, "learning_rate": 2.2172154345117894e-07, "loss": 0.0001, "reward": 0.22447917610406876, "reward_std": 0.0656122958753258, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.125, "step": 550 }, { "completion_length": 242.2187557220459, "epoch": 9.201680672268907, "grad_norm": 0.9822221419762381, "kl": 0.11492919921875, "learning_rate": 2.2011316340748528e-07, "loss": 0.0001, "reward": 0.344791691750288, "reward_std": 0.09096375480294228, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2447916716337204, "step": 552 }, { "completion_length": 308.53126525878906, "epoch": 9.235294117647058, "grad_norm": 0.1885437904997469, "kl": 0.083831787109375, "learning_rate": 2.1850603735559776e-07, "loss": 0.0001, "reward": 0.2562500163912773, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15625, "step": 554 }, { "completion_length": 293.29688262939453, "epoch": 9.268907563025211, "grad_norm": 0.005611452795609897, "kl": 0.08160400390625, "learning_rate": 2.1690023272730678e-07, "loss": 0.0001, "reward": 0.14166668057441711, "reward_std": 0.06352896057069302, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.04166666930541396, "step": 556 }, { "completion_length": 446.8385467529297, "epoch": 9.302521008403362, "grad_norm": 0.002865293733617532, "kl": 0.05181884765625, "learning_rate": 2.1529581689895836e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 558 }, { "completion_length": 301.7083435058594, "epoch": 9.336134453781513, "grad_norm": 0.004643867255213824, "kl": 0.077392578125, "learning_rate": 2.1369285718862748e-07, "loss": 0.0001, "reward": 0.2510416880249977, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 560 }, { "completion_length": 425.6979293823242, "epoch": 9.369747899159664, "grad_norm": 0.0060364768094503885, "kl": 0.062744140625, "learning_rate": 2.1209142085329298e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 562 }, { "completion_length": 367.24480056762695, "epoch": 9.403361344537815, "grad_norm": 0.19744639090946212, "kl": 0.0653076171875, "learning_rate": 2.104915750860164e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 564 }, { "completion_length": 254.57813262939453, "epoch": 9.436974789915967, "grad_norm": 0.006675575539801922, "kl": 0.100341796875, "learning_rate": 2.088933870131218e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 566 }, { "completion_length": 301.3281364440918, "epoch": 9.470588235294118, "grad_norm": 0.7003600247485652, "kl": 0.09130859375, "learning_rate": 2.072969236913799e-07, "loss": 0.0001, "reward": 0.2145833522081375, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1145833358168602, "step": 568 }, { "completion_length": 255.64584350585938, "epoch": 9.504201680672269, "grad_norm": 0.005376980604359961, "kl": 0.0914306640625, "learning_rate": 2.0570225210519433e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 570 }, { "completion_length": 297.3385543823242, "epoch": 9.53781512605042, "grad_norm": 0.04687088025945674, "kl": 0.10687255859375, "learning_rate": 2.0410943916379097e-07, "loss": 0.0001, "reward": 0.2656250335276127, "reward_std": 0.0028463751077651978, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.1666666716337204, "step": 572 }, { "completion_length": 247.78125762939453, "epoch": 9.571428571428571, "grad_norm": 0.006219759823107143, "kl": 0.0921630859375, "learning_rate": 2.0251855169841075e-07, "loss": 0.0001, "reward": 0.3395833447575569, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 574 }, { "completion_length": 234.68750381469727, "epoch": 9.605042016806722, "grad_norm": 0.8534496465930611, "kl": 0.10888671875, "learning_rate": 2.0092965645950564e-07, "loss": 0.0001, "reward": 0.360416691750288, "reward_std": 0.10375928319990635, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.260416679084301, "step": 576 }, { "completion_length": 359.671875, "epoch": 9.638655462184873, "grad_norm": 0.43358431816878246, "kl": 0.103271484375, "learning_rate": 1.993428201139375e-07, "loss": 0.0001, "reward": 0.1885416842997074, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0885416716337204, "step": 578 }, { "completion_length": 366.21875762939453, "epoch": 9.672268907563025, "grad_norm": 0.7734153510431392, "kl": 0.09124755859375, "learning_rate": 1.977581092421812e-07, "loss": 0.0001, "reward": 0.2093750163912773, "reward_std": 0.08656488358974457, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.10937500279396772, "step": 580 }, { "completion_length": 362.6458435058594, "epoch": 9.705882352941176, "grad_norm": 0.259139772069165, "kl": 0.0731201171875, "learning_rate": 1.9617559033553126e-07, "loss": 0.0001, "reward": 0.2302083522081375, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1302083358168602, "step": 582 }, { "completion_length": 264.66667556762695, "epoch": 9.739495798319329, "grad_norm": 0.5785865980178528, "kl": 0.0960693359375, "learning_rate": 1.9459532979331148e-07, "loss": 0.0001, "reward": 0.2562500163912773, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15625, "step": 584 }, { "completion_length": 340.3177185058594, "epoch": 9.77310924369748, "grad_norm": 0.002878573369429767, "kl": 0.09002685546875, "learning_rate": 1.930173939200892e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 586 }, { "completion_length": 313.12501525878906, "epoch": 9.806722689075631, "grad_norm": 0.00482642407002359, "kl": 0.07318115234375, "learning_rate": 1.9144184892289336e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 588 }, { "completion_length": 321.1979217529297, "epoch": 9.840336134453782, "grad_norm": 0.0036796111689659074, "kl": 0.0645751953125, "learning_rate": 1.8986876090843664e-07, "loss": 0.0001, "reward": 0.2718750201165676, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.17187500279396772, "step": 590 }, { "completion_length": 362.9583435058594, "epoch": 9.873949579831933, "grad_norm": 0.8570968090880219, "kl": 0.0810546875, "learning_rate": 1.882981958803414e-07, "loss": 0.0001, "reward": 0.3968750238418579, "reward_std": 0.09976780228316784, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.296875, "step": 592 }, { "completion_length": 321.5677185058594, "epoch": 9.907563025210084, "grad_norm": 0.44306141612544053, "kl": 0.06744384765625, "learning_rate": 1.8673021973637093e-07, "loss": 0.0001, "reward": 0.1677083484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 594 }, { "completion_length": 307.9739646911621, "epoch": 9.941176470588236, "grad_norm": 0.07862960541736436, "kl": 0.1390380859375, "learning_rate": 1.8516489826566374e-07, "loss": 0.0001, "reward": 0.29739586636424065, "reward_std": 0.07358023361302912, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1979166753590107, "step": 596 }, { "completion_length": 361.4583435058594, "epoch": 9.974789915966387, "grad_norm": 0.004307893394110164, "kl": 0.0771484375, "learning_rate": 1.8360229714597368e-07, "loss": 0.0001, "reward": 0.2354166880249977, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 598 }, { "completion_length": 372.5486195882161, "epoch": 10.0, "grad_norm": 0.3191218976519368, "kl": 0.0772705078125, "learning_rate": 1.8204248194091425e-07, "loss": 0.0001, "reward": 0.16944446166356406, "reward_std": 0.055555557211240135, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0694444477558136, "step": 600 }, { "completion_length": 241.42187881469727, "epoch": 10.033613445378151, "grad_norm": 0.3908517811128159, "kl": 0.0975341796875, "learning_rate": 1.804855180972075e-07, "loss": 0.0001, "reward": 0.3083333559334278, "reward_std": 0.07453560084104538, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2083333358168602, "step": 602 }, { "completion_length": 245.83333587646484, "epoch": 10.067226890756302, "grad_norm": 0.005583650837234494, "kl": 0.09808349609375, "learning_rate": 1.7893147094193784e-07, "loss": 0.0001, "reward": 0.391666691750288, "reward_std": 0.11515220627188683, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2916666716337204, "step": 604 }, { "completion_length": 264.625, "epoch": 10.100840336134453, "grad_norm": 0.00629135149917365, "kl": 0.095458984375, "learning_rate": 1.7738040567981165e-07, "loss": 0.0001, "reward": 0.16250001266598701, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 606 }, { "completion_length": 276.79687881469727, "epoch": 10.134453781512605, "grad_norm": 0.0074588315645399885, "kl": 0.0931396484375, "learning_rate": 1.7583238739042084e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 608 }, { "completion_length": 335.3541793823242, "epoch": 10.168067226890756, "grad_norm": 0.881684726639755, "kl": 0.08636474609375, "learning_rate": 1.7428748102551234e-07, "loss": 0.0001, "reward": 0.21770834550261497, "reward_std": 0.06759326322935522, "rewards/format_reward_func": 0.09791668318212032, "rewards/solution_reward_func": 0.1197916679084301, "step": 610 }, { "completion_length": 380.1458435058594, "epoch": 10.201680672268907, "grad_norm": 0.035539592870514024, "kl": 0.10882568359375, "learning_rate": 1.7274575140626315e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 612 }, { "completion_length": 177.77083778381348, "epoch": 10.235294117647058, "grad_norm": 0.008151893338596393, "kl": 0.133056640625, "learning_rate": 1.712072632205604e-07, "loss": 0.0001, "reward": 0.3500000275671482, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2500000074505806, "step": 614 }, { "completion_length": 348.59897232055664, "epoch": 10.268907563025211, "grad_norm": 0.6522621398010783, "kl": 0.076019287109375, "learning_rate": 1.6967208102028696e-07, "loss": 0.0001, "reward": 0.2458333522081375, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333358168602, "step": 616 }, { "completion_length": 265.7135543823242, "epoch": 10.302521008403362, "grad_norm": 0.009800657169689538, "kl": 0.1063232421875, "learning_rate": 1.6814026921861335e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 618 }, { "completion_length": 344.2448043823242, "epoch": 10.336134453781513, "grad_norm": 0.5653048540417516, "kl": 0.06671142578125, "learning_rate": 1.6661189208729489e-07, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 620 }, { "completion_length": 288.6979217529297, "epoch": 10.369747899159664, "grad_norm": 0.7376311172377286, "kl": 0.1240234375, "learning_rate": 1.6508701375397486e-07, "loss": 0.0001, "reward": 0.2041666880249977, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.10416666977107525, "step": 622 }, { "completion_length": 285.9791717529297, "epoch": 10.403361344537815, "grad_norm": 0.005882996843449139, "kl": 0.0888671875, "learning_rate": 1.6356569819949427e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 624 }, { "completion_length": 438.2864685058594, "epoch": 10.436974789915967, "grad_norm": 0.0034224313159851405, "kl": 0.05291748046875, "learning_rate": 1.6204800925520685e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 626 }, { "completion_length": 293.8541717529297, "epoch": 10.470588235294118, "grad_norm": 0.021787024469213906, "kl": 0.09100341796875, "learning_rate": 1.6053401060030097e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 628 }, { "completion_length": 357.2447967529297, "epoch": 10.504201680672269, "grad_norm": 0.005647735020516258, "kl": 0.067138671875, "learning_rate": 1.5902376575912814e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 630 }, { "completion_length": 231.5885467529297, "epoch": 10.53781512605042, "grad_norm": 0.8336139638323953, "kl": 0.1038818359375, "learning_rate": 1.57517338098537e-07, "loss": 0.0001, "reward": 0.2562500238418579, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1562500074505806, "step": 632 }, { "completion_length": 250.3072967529297, "epoch": 10.571428571428571, "grad_norm": 0.8356424677382668, "kl": 0.1336669921875, "learning_rate": 1.5601479082521525e-07, "loss": 0.0001, "reward": 0.14166667684912682, "reward_std": 0.04303314909338951, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0416666679084301, "step": 634 }, { "completion_length": 344.9271049499512, "epoch": 10.605042016806722, "grad_norm": 0.0035056662420766566, "kl": 0.11224365234375, "learning_rate": 1.545161869830371e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 636 }, { "completion_length": 299.5208396911621, "epoch": 10.638655462184873, "grad_norm": 0.007154394360479766, "kl": 0.07421875, "learning_rate": 1.5302158945041837e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 638 }, { "completion_length": 339.4270935058594, "epoch": 10.672268907563025, "grad_norm": 0.8255871150168281, "kl": 0.0926513671875, "learning_rate": 1.5153106093767825e-07, "loss": 0.0001, "reward": 0.287500012665987, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1875000111758709, "step": 640 }, { "completion_length": 364.01563262939453, "epoch": 10.705882352941176, "grad_norm": 0.004029284764535584, "kl": 0.06903076171875, "learning_rate": 1.5004466398440773e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 642 }, { "completion_length": 241.4739646911621, "epoch": 10.739495798319329, "grad_norm": 0.24873228031508388, "kl": 0.1231689453125, "learning_rate": 1.4856246095684622e-07, "loss": 0.0001, "reward": 0.4125000275671482, "reward_std": 0.056927502155303955, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3125000074505806, "step": 644 }, { "completion_length": 273.3020896911621, "epoch": 10.77310924369748, "grad_norm": 0.517040798143609, "kl": 0.101318359375, "learning_rate": 1.4708451404526407e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 646 }, { "completion_length": 324.92188262939453, "epoch": 10.806722689075631, "grad_norm": 0.005372034676039942, "kl": 0.076904296875, "learning_rate": 1.4561088526135374e-07, "loss": 0.0001, "reward": 0.2354166880249977, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 648 }, { "completion_length": 263.44792556762695, "epoch": 10.840336134453782, "grad_norm": 1.080748525385016, "kl": 0.1195068359375, "learning_rate": 1.4414163643562753e-07, "loss": 0.0001, "reward": 0.3343750163912773, "reward_std": 0.10562435537576675, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.234375, "step": 650 }, { "completion_length": 407.1354293823242, "epoch": 10.873949579831933, "grad_norm": 0.004519233002901432, "kl": 0.06085205078125, "learning_rate": 1.4267682921482356e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 652 }, { "completion_length": 367.9323081970215, "epoch": 10.907563025210084, "grad_norm": 0.23028488880398254, "kl": 0.08795166015625, "learning_rate": 1.4121652505931918e-07, "loss": 0.0001, "reward": 0.313541691750288, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2135416716337204, "step": 654 }, { "completion_length": 292.2135467529297, "epoch": 10.941176470588236, "grad_norm": 0.6894185016028704, "kl": 0.11322021484375, "learning_rate": 1.3976078524055203e-07, "loss": 0.0001, "reward": 0.3500000312924385, "reward_std": 0.08288982696831226, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.25, "step": 656 }, { "completion_length": 357.2760543823242, "epoch": 10.974789915966387, "grad_norm": 0.00543910499739779, "kl": 0.07342529296875, "learning_rate": 1.383096708384494e-07, "loss": 0.0001, "reward": 0.18854168057441711, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08854166930541396, "step": 658 }, { "completion_length": 236.81944783528647, "epoch": 11.0, "grad_norm": 0.008429435225204571, "kl": 0.13151041666666666, "learning_rate": 1.3686324273886528e-07, "loss": 0.0001, "reward": 0.2111111283302307, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.11111111442248027, "step": 660 }, { "completion_length": 290.3489685058594, "epoch": 11.033613445378151, "grad_norm": 1.1654982828774354, "kl": 0.1307373046875, "learning_rate": 1.354215616310258e-07, "loss": 0.0001, "reward": 0.2510416880249977, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 662 }, { "completion_length": 307.65105056762695, "epoch": 11.067226890756302, "grad_norm": 0.7909150095816768, "kl": 0.10833740234375, "learning_rate": 1.339846880049829e-07, "loss": 0.0001, "reward": 0.4281250201165676, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3281250149011612, "step": 664 }, { "completion_length": 254.92188262939453, "epoch": 11.100840336134453, "grad_norm": 0.35217027000712403, "kl": 0.1090087890625, "learning_rate": 1.325526821490761e-07, "loss": 0.0001, "reward": 0.3864583633840084, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2864583432674408, "step": 666 }, { "completion_length": 330.2864646911621, "epoch": 11.134453781512605, "grad_norm": 0.0073745461407176225, "kl": 0.09051513671875, "learning_rate": 1.3112560414740313e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 668 }, { "completion_length": 315.76042556762695, "epoch": 11.168067226890756, "grad_norm": 0.5403360796429658, "kl": 0.093017578125, "learning_rate": 1.2970351387729872e-07, "loss": 0.0001, "reward": 0.20416668057441711, "reward_std": 0.0803009495139122, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1041666679084301, "step": 670 }, { "completion_length": 273.2447967529297, "epoch": 11.201680672268907, "grad_norm": 0.06935201957593491, "kl": 0.11627197265625, "learning_rate": 1.2828647100682261e-07, "loss": 0.0001, "reward": 0.26614585146307945, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1666666716337204, "step": 672 }, { "completion_length": 273.6718864440918, "epoch": 11.235294117647058, "grad_norm": 0.004798147961259559, "kl": 0.1021728515625, "learning_rate": 1.2687453499225546e-07, "loss": 0.0001, "reward": 0.2458333522081375, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333358168602, "step": 674 }, { "completion_length": 417.1146011352539, "epoch": 11.268907563025211, "grad_norm": 0.6188661588776337, "kl": 0.0777587890625, "learning_rate": 1.2546776507560467e-07, "loss": 0.0001, "reward": 0.2562500238418579, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1562500074505806, "step": 676 }, { "completion_length": 281.4270935058594, "epoch": 11.302521008403362, "grad_norm": 1.2487408673494609, "kl": 0.1129150390625, "learning_rate": 1.2406622028211843e-07, "loss": 0.0001, "reward": 0.2666666954755783, "reward_std": 0.0833333358168602, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 678 }, { "completion_length": 368.48959732055664, "epoch": 11.336134453781513, "grad_norm": 0.02006347331748632, "kl": 0.086822509765625, "learning_rate": 1.2266995941780933e-07, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 680 }, { "completion_length": 286.7604217529297, "epoch": 11.369747899159664, "grad_norm": 0.0037060063339731907, "kl": 0.08837890625, "learning_rate": 1.2127904106698665e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 682 }, { "completion_length": 312.97396087646484, "epoch": 11.403361344537815, "grad_norm": 0.004529713519338045, "kl": 0.07354736328125, "learning_rate": 1.1989352358979888e-07, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 684 }, { "completion_length": 243.83334350585938, "epoch": 11.436974789915967, "grad_norm": 0.008582596407864693, "kl": 0.112548828125, "learning_rate": 1.1851346511978424e-07, "loss": 0.0001, "reward": 0.19375001266598701, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.09375, "step": 686 }, { "completion_length": 271.65625762939453, "epoch": 11.470588235294118, "grad_norm": 0.005887216695610677, "kl": 0.10125732421875, "learning_rate": 1.1713892356143238e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 688 }, { "completion_length": 315.09376525878906, "epoch": 11.504201680672269, "grad_norm": 0.614171785294004, "kl": 0.0902099609375, "learning_rate": 1.1576995658775404e-07, "loss": 0.0001, "reward": 0.3291666992008686, "reward_std": 0.056927502155303955, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.229166679084301, "step": 690 }, { "completion_length": 384.4687614440918, "epoch": 11.53781512605042, "grad_norm": 0.8034815955429827, "kl": 0.07061767578125, "learning_rate": 1.1440662163786166e-07, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 692 }, { "completion_length": 366.9114685058594, "epoch": 11.571428571428571, "grad_norm": 0.011868136767574996, "kl": 0.07867431640625, "learning_rate": 1.1304897591455928e-07, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 694 }, { "completion_length": 218.17188262939453, "epoch": 11.605042016806722, "grad_norm": 0.32492573340382475, "kl": 0.114501953125, "learning_rate": 1.1169707638194237e-07, "loss": 0.0001, "reward": 0.2666666842997074, "reward_std": 0.09799393452703953, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666679084301, "step": 696 }, { "completion_length": 280.0, "epoch": 11.638655462184873, "grad_norm": 0.631639919614747, "kl": 0.118896484375, "learning_rate": 1.103509797630077e-07, "loss": 0.0001, "reward": 0.2510416992008686, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 698 }, { "completion_length": 306.18750762939453, "epoch": 11.672268907563025, "grad_norm": 1.146015482310564, "kl": 0.1170654296875, "learning_rate": 1.0901074253727336e-07, "loss": 0.0001, "reward": 0.2875000238418579, "reward_std": 0.07013041898608208, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1875000074505806, "step": 700 }, { "completion_length": 296.04688262939453, "epoch": 11.705882352941176, "grad_norm": 0.29793048722036486, "kl": 0.08282470703125, "learning_rate": 1.0767642093840932e-07, "loss": 0.0001, "reward": 0.13125001266598701, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.03125, "step": 702 }, { "completion_length": 384.16147232055664, "epoch": 11.739495798319329, "grad_norm": 0.005625102562048634, "kl": 0.0933837890625, "learning_rate": 1.0634807095187737e-07, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 704 }, { "completion_length": 411.3802261352539, "epoch": 11.77310924369748, "grad_norm": 0.01217459597943632, "kl": 0.06707763671875, "learning_rate": 1.0502574831258257e-07, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 706 }, { "completion_length": 247.71875762939453, "epoch": 11.806722689075631, "grad_norm": 0.19957260841732413, "kl": 0.0972900390625, "learning_rate": 1.0370950850253449e-07, "loss": 0.0001, "reward": 0.16250000894069672, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.06250000139698386, "step": 708 }, { "completion_length": 289.3229293823242, "epoch": 11.840336134453782, "grad_norm": 0.006041845142227962, "kl": 0.1116943359375, "learning_rate": 1.0239940674851941e-07, "loss": 0.0001, "reward": 0.3489583544433117, "reward_std": 0.0028463751077651978, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.2500000074505806, "step": 710 }, { "completion_length": 326.96875381469727, "epoch": 11.873949579831933, "grad_norm": 0.006151330078350502, "kl": 0.1517333984375, "learning_rate": 1.0109549801978304e-07, "loss": 0.0002, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 712 }, { "completion_length": 230.8697967529297, "epoch": 11.907563025210084, "grad_norm": 0.40487088785790015, "kl": 0.1290283203125, "learning_rate": 9.979783702572411e-08, "loss": 0.0001, "reward": 0.3343750275671482, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2343750074505806, "step": 714 }, { "completion_length": 264.3645935058594, "epoch": 11.941176470588236, "grad_norm": 0.44556064226256137, "kl": 0.095458984375, "learning_rate": 9.850647821359917e-08, "loss": 0.0001, "reward": 0.3760416954755783, "reward_std": 0.07996342703700066, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2760416716337204, "step": 716 }, { "completion_length": 328.7239646911621, "epoch": 11.974789915966387, "grad_norm": 0.004107674650873193, "kl": 0.0845947265625, "learning_rate": 9.722147576623744e-08, "loss": 0.0001, "reward": 0.3395833559334278, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 718 }, { "completion_length": 241.56251017252603, "epoch": 12.0, "grad_norm": 0.00952493843638888, "kl": 0.11832682291666667, "learning_rate": 9.594288359976815e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 720 }, { "completion_length": 252.4479217529297, "epoch": 12.033613445378151, "grad_norm": 0.00986294919129773, "kl": 0.127685546875, "learning_rate": 9.467075536135785e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 722 }, { "completion_length": 307.11980056762695, "epoch": 12.067226890756302, "grad_norm": 0.011493373295556296, "kl": 0.098388671875, "learning_rate": 9.340514442695952e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 724 }, { "completion_length": 231.78125953674316, "epoch": 12.100840336134453, "grad_norm": 0.747475304272941, "kl": 0.1522216796875, "learning_rate": 9.214610389907326e-08, "loss": 0.0002, "reward": 0.3291666992008686, "reward_std": 0.05442607402801514, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2291666716337204, "step": 726 }, { "completion_length": 385.3854217529297, "epoch": 12.134453781512605, "grad_norm": 0.0075720542377330615, "kl": 0.0850830078125, "learning_rate": 9.089368660451798e-08, "loss": 0.0001, "reward": 0.18802084773778915, "reward_std": 0.022916667396202683, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.08854166930541396, "step": 728 }, { "completion_length": 271.5989685058594, "epoch": 12.168067226890756, "grad_norm": 0.7325108075032994, "kl": 0.107666015625, "learning_rate": 8.964794509221507e-08, "loss": 0.0001, "reward": 0.2354166842997074, "reward_std": 0.06573155522346497, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 730 }, { "completion_length": 260.8541717529297, "epoch": 12.201680672268907, "grad_norm": 0.0052402489889907585, "kl": 0.1241455078125, "learning_rate": 8.840893163098332e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 732 }, { "completion_length": 393.32814025878906, "epoch": 12.235294117647058, "grad_norm": 0.005839856133157586, "kl": 0.080657958984375, "learning_rate": 8.717669820734619e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 734 }, { "completion_length": 272.2916793823242, "epoch": 12.268907563025211, "grad_norm": 0.6155182375164174, "kl": 0.100830078125, "learning_rate": 8.595129652335017e-08, "loss": 0.0001, "reward": 0.4385417029261589, "reward_std": 0.13848697021603584, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.338541679084301, "step": 736 }, { "completion_length": 392.85938262939453, "epoch": 12.302521008403362, "grad_norm": 0.36403761668152484, "kl": 0.05914306640625, "learning_rate": 8.473277799439568e-08, "loss": 0.0001, "reward": 0.1833333522081375, "reward_std": 0.056927502155303955, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.08333333861082792, "step": 738 }, { "completion_length": 319.17708587646484, "epoch": 12.336134453781513, "grad_norm": 0.551177969993759, "kl": 0.1339111328125, "learning_rate": 8.352119374707977e-08, "loss": 0.0001, "reward": 0.24583334103226662, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333432674408, "step": 740 }, { "completion_length": 279.69272232055664, "epoch": 12.369747899159664, "grad_norm": 0.3586755174946672, "kl": 0.0850830078125, "learning_rate": 8.23165946170509e-08, "loss": 0.0001, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 742 }, { "completion_length": 191.2395896911621, "epoch": 12.403361344537815, "grad_norm": 0.5919946490310048, "kl": 0.1617431640625, "learning_rate": 8.11190311468759e-08, "loss": 0.0002, "reward": 0.2510416842997074, "reward_std": 0.05810113437473774, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15104166930541396, "step": 744 }, { "completion_length": 309.1145896911621, "epoch": 12.436974789915967, "grad_norm": 0.006602732475021676, "kl": 0.091064453125, "learning_rate": 7.992855358391967e-08, "loss": 0.0001, "reward": 0.3395833633840084, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 746 }, { "completion_length": 232.97396087646484, "epoch": 12.470588235294118, "grad_norm": 0.8696964758836045, "kl": 0.1104736328125, "learning_rate": 7.87452118782363e-08, "loss": 0.0001, "reward": 0.4489583633840084, "reward_std": 0.1321869120001793, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3489583507180214, "step": 748 }, { "completion_length": 269.78125762939453, "epoch": 12.504201680672269, "grad_norm": 0.0060405505157196965, "kl": 0.1015625, "learning_rate": 7.756905568047392e-08, "loss": 0.0001, "reward": 0.1572916842997074, "reward_std": 0.03989280015230179, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0572916679084301, "step": 750 }, { "completion_length": 435.40626525878906, "epoch": 12.53781512605042, "grad_norm": 0.002602008616345893, "kl": 0.06072998046875, "learning_rate": 7.640013433979093e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 752 }, { "completion_length": 323.25521087646484, "epoch": 12.571428571428571, "grad_norm": 0.006797283726563493, "kl": 0.08563232421875, "learning_rate": 7.523849690178566e-08, "loss": 0.0001, "reward": 0.12083334848284721, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.02083333395421505, "step": 754 }, { "completion_length": 285.63542556762695, "epoch": 12.605042016806722, "grad_norm": 0.012502473295343316, "kl": 0.1177978515625, "learning_rate": 7.408419210643846e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 756 }, { "completion_length": 397.17187881469727, "epoch": 12.638655462184873, "grad_norm": 0.007470449319281652, "kl": 0.0933837890625, "learning_rate": 7.293726838606673e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 758 }, { "completion_length": 320.3125114440918, "epoch": 12.672268907563025, "grad_norm": 0.0048389039261022065, "kl": 0.09417724609375, "learning_rate": 7.179777386329275e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 760 }, { "completion_length": 342.85938262939453, "epoch": 12.705882352941176, "grad_norm": 0.6087394360106246, "kl": 0.10589599609375, "learning_rate": 7.066575634902435e-08, "loss": 0.0001, "reward": 0.3135416954755783, "reward_std": 0.042695626616477966, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.213541679084301, "step": 762 }, { "completion_length": 313.3645935058594, "epoch": 12.739495798319329, "grad_norm": 0.005906665579025605, "kl": 0.0860595703125, "learning_rate": 6.954126334044949e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 764 }, { "completion_length": 355.98438262939453, "epoch": 12.77310924369748, "grad_norm": 0.0056999879735242474, "kl": 0.07330322265625, "learning_rate": 6.842434201904255e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 766 }, { "completion_length": 241.91146850585938, "epoch": 12.806722689075631, "grad_norm": 0.028147754380362787, "kl": 0.1492919921875, "learning_rate": 6.731503924858516e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 768 }, { "completion_length": 338.0364685058594, "epoch": 12.840336134453782, "grad_norm": 0.07450167852895073, "kl": 0.09326171875, "learning_rate": 6.621340157319996e-08, "loss": 0.0001, "reward": 0.18281251192092896, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0833333358168602, "step": 770 }, { "completion_length": 219.78125381469727, "epoch": 12.873949579831933, "grad_norm": 0.34157306279214383, "kl": 0.1063232421875, "learning_rate": 6.511947521539737e-08, "loss": 0.0001, "reward": 0.1885416842997074, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0885416716337204, "step": 772 }, { "completion_length": 271.07813262939453, "epoch": 12.907563025210084, "grad_norm": 0.514663185806984, "kl": 0.08892822265625, "learning_rate": 6.403330607413643e-08, "loss": 0.0001, "reward": 0.2562500275671482, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.15625, "step": 774 }, { "completion_length": 246.22396850585938, "epoch": 12.941176470588236, "grad_norm": 0.39608592745774945, "kl": 0.11279296875, "learning_rate": 6.295493972289903e-08, "loss": 0.0001, "reward": 0.2614583559334278, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583432674408, "step": 776 }, { "completion_length": 290.3333435058594, "epoch": 12.974789915966387, "grad_norm": 0.017535727888143934, "kl": 0.10650634765625, "learning_rate": 6.188442140777742e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 778 }, { "completion_length": 316.8611195882161, "epoch": 13.0, "grad_norm": 0.29523577694143666, "kl": 0.12190755208333333, "learning_rate": 6.082179604557616e-08, "loss": 0.0001, "reward": 0.3916666855414708, "reward_std": 0.16607532650232315, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2916666666666667, "step": 780 }, { "completion_length": 395.31250762939453, "epoch": 13.033613445378151, "grad_norm": 0.003660582613325097, "kl": 0.0721435546875, "learning_rate": 5.976710822192721e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 782 }, { "completion_length": 324.5677146911621, "epoch": 13.067226890756302, "grad_norm": 0.008821339257569614, "kl": 0.08148193359375, "learning_rate": 5.8720402189419286e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 784 }, { "completion_length": 263.7708396911621, "epoch": 13.100840336134453, "grad_norm": 0.00414360837717974, "kl": 0.1064453125, "learning_rate": 5.768172186574122e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 786 }, { "completion_length": 246.01042556762695, "epoch": 13.134453781512605, "grad_norm": 0.006612956119534821, "kl": 0.1407470703125, "learning_rate": 5.6651110831839046e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 788 }, { "completion_length": 310.0885467529297, "epoch": 13.168067226890756, "grad_norm": 0.27851474590106207, "kl": 0.10125732421875, "learning_rate": 5.5628612330087724e-08, "loss": 0.0001, "reward": 0.3083333522081375, "reward_std": 0.07013041898608208, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2083333358168602, "step": 790 }, { "completion_length": 246.76563262939453, "epoch": 13.201680672268907, "grad_norm": 0.011283830511360449, "kl": 0.1168212890625, "learning_rate": 5.461426926247639e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 792 }, { "completion_length": 275.8698043823242, "epoch": 13.235294117647058, "grad_norm": 0.8130282523724809, "kl": 0.12408447265625, "learning_rate": 5.360812418880883e-08, "loss": 0.0001, "reward": 0.329166691750288, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2291666716337204, "step": 794 }, { "completion_length": 267.6354217529297, "epoch": 13.268907563025211, "grad_norm": 0.010511330325395564, "kl": 0.1043701171875, "learning_rate": 5.261021932491713e-08, "loss": 0.0001, "reward": 0.2562500238418579, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1562500074505806, "step": 796 }, { "completion_length": 344.0520935058594, "epoch": 13.302521008403362, "grad_norm": 0.2739193322106394, "kl": 0.07659912109375, "learning_rate": 5.162059654089082e-08, "loss": 0.0001, "reward": 0.2666666842997074, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.16666666930541396, "step": 798 }, { "completion_length": 427.25001525878906, "epoch": 13.336134453781513, "grad_norm": 0.62938115807007, "kl": 0.06671142578125, "learning_rate": 5.0639297359319846e-08, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 800 }, { "completion_length": 235.60937881469727, "epoch": 13.369747899159664, "grad_norm": 0.010438495508474768, "kl": 0.10369873046875, "learning_rate": 4.9666362953552534e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 802 }, { "completion_length": 345.89584732055664, "epoch": 13.403361344537815, "grad_norm": 0.0068738775359263775, "kl": 0.0814208984375, "learning_rate": 4.870183414596793e-08, "loss": 0.0001, "reward": 0.18802084773778915, "reward_std": 0.05138041824102402, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.08854166697710752, "step": 804 }, { "completion_length": 305.16146087646484, "epoch": 13.436974789915967, "grad_norm": 0.006244598431241451, "kl": 0.100341796875, "learning_rate": 4.774575140626316e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 806 }, { "completion_length": 254.76562881469727, "epoch": 13.470588235294118, "grad_norm": 0.00880109127635552, "kl": 0.1104736328125, "learning_rate": 4.679815484975505e-08, "loss": 0.0001, "reward": 0.3239583447575569, "reward_std": 0.05810113437473774, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2239583432674408, "step": 808 }, { "completion_length": 255.0572967529297, "epoch": 13.504201680672269, "grad_norm": 0.003013564564233277, "kl": 0.0870361328125, "learning_rate": 4.5859084235697235e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 810 }, { "completion_length": 342.0833435058594, "epoch": 13.53781512605042, "grad_norm": 0.007070990246017409, "kl": 0.09552001953125, "learning_rate": 4.492857896561203e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 812 }, { "completion_length": 347.32813262939453, "epoch": 13.571428571428571, "grad_norm": 0.014370992884883369, "kl": 0.08935546875, "learning_rate": 4.4006678081636885e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 814 }, { "completion_length": 279.49480056762695, "epoch": 13.605042016806722, "grad_norm": 0.00850487735117035, "kl": 0.0908203125, "learning_rate": 4.309342026488652e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 816 }, { "completion_length": 240.33333778381348, "epoch": 13.638655462184873, "grad_norm": 0.9374858195956104, "kl": 0.1434326171875, "learning_rate": 4.218884383382987e-08, "loss": 0.0001, "reward": 0.3864583484828472, "reward_std": 0.09096375294029713, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2864583507180214, "step": 818 }, { "completion_length": 300.3958435058594, "epoch": 13.672268907563025, "grad_norm": 0.7034297108093118, "kl": 0.0877685546875, "learning_rate": 4.1292986742682254e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 820 }, { "completion_length": 251.66146850585938, "epoch": 13.705882352941176, "grad_norm": 0.011467423720761897, "kl": 0.11962890625, "learning_rate": 4.0405886579813006e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 822 }, { "completion_length": 294.9635543823242, "epoch": 13.739495798319329, "grad_norm": 0.0071764991111909916, "kl": 0.103515625, "learning_rate": 3.952758056616826e-08, "loss": 0.0001, "reward": 0.3500000089406967, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2500000074505806, "step": 824 }, { "completion_length": 276.4531364440918, "epoch": 13.77310924369748, "grad_norm": 0.008556811421937708, "kl": 0.102783203125, "learning_rate": 3.8658105553709353e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 826 }, { "completion_length": 296.17187881469727, "epoch": 13.806722689075631, "grad_norm": 0.4702624407411105, "kl": 0.09197998046875, "learning_rate": 3.7797498023866395e-08, "loss": 0.0001, "reward": 0.3447916731238365, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2447916716337204, "step": 828 }, { "completion_length": 275.7083435058594, "epoch": 13.840336134453782, "grad_norm": 0.33861168165432415, "kl": 0.1192626953125, "learning_rate": 3.6945794086007705e-08, "loss": 0.0001, "reward": 0.4177083484828472, "reward_std": 0.09096375480294228, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3177083432674408, "step": 830 }, { "completion_length": 347.51564025878906, "epoch": 13.873949579831933, "grad_norm": 0.004701312956686734, "kl": 0.0821533203125, "learning_rate": 3.6103029475924727e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 832 }, { "completion_length": 299.0937614440918, "epoch": 13.907563025210084, "grad_norm": 0.9933424183576781, "kl": 0.0992431640625, "learning_rate": 3.5269239554332556e-08, "loss": 0.0001, "reward": 0.19322917610406876, "reward_std": 0.07221375242806971, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.09375000046566129, "step": 834 }, { "completion_length": 252.39063262939453, "epoch": 13.941176470588236, "grad_norm": 0.08168179073438255, "kl": 0.1673583984375, "learning_rate": 3.4444459305386504e-08, "loss": 0.0002, "reward": 0.3489583544433117, "reward_std": 0.0028463751077651978, "rewards/format_reward_func": 0.09895834885537624, "rewards/solution_reward_func": 0.2500000074505806, "step": 836 }, { "completion_length": 283.2447967529297, "epoch": 13.974789915966387, "grad_norm": 0.00658960669367079, "kl": 0.0897216796875, "learning_rate": 3.362872333521388e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 838 }, { "completion_length": 339.4305674235026, "epoch": 14.0, "grad_norm": 0.014336116628560408, "kl": 0.06925455729166667, "learning_rate": 3.2822065870462215e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 840 }, { "completion_length": 247.82292556762695, "epoch": 14.033613445378151, "grad_norm": 0.09328828337303391, "kl": 0.128662109375, "learning_rate": 3.2024520756863236e-08, "loss": 0.0001, "reward": 0.33281252160668373, "reward_std": 0.052656359039247036, "rewards/format_reward_func": 0.09843751788139343, "rewards/solution_reward_func": 0.2343750074505806, "step": 842 }, { "completion_length": 343.55208587646484, "epoch": 14.067226890756302, "grad_norm": 0.4848405022736212, "kl": 0.0882568359375, "learning_rate": 3.1236121457812545e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 844 }, { "completion_length": 265.72917556762695, "epoch": 14.100840336134453, "grad_norm": 0.5664930247833133, "kl": 0.1453857421875, "learning_rate": 3.045690105296572e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 846 }, { "completion_length": 416.04689025878906, "epoch": 14.134453781512605, "grad_norm": 0.002942009683252044, "kl": 0.05035400390625, "learning_rate": 2.9686892236850336e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 848 }, { "completion_length": 260.52605056762695, "epoch": 14.168067226890756, "grad_norm": 0.8292313716577437, "kl": 0.1314697265625, "learning_rate": 2.892612731749414e-08, "loss": 0.0001, "reward": 0.4177083596587181, "reward_std": 0.06250000186264515, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3177083358168602, "step": 850 }, { "completion_length": 288.8593864440918, "epoch": 14.201680672268907, "grad_norm": 0.01027928579030803, "kl": 0.1077880859375, "learning_rate": 2.817463821506949e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 852 }, { "completion_length": 255.95313262939453, "epoch": 14.235294117647058, "grad_norm": 0.3027502470359931, "kl": 0.11279296875, "learning_rate": 2.7432456460553975e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 854 }, { "completion_length": 331.3489685058594, "epoch": 14.268907563025211, "grad_norm": 0.0692626108004871, "kl": 0.085205078125, "learning_rate": 2.6699613194407723e-08, "loss": 0.0001, "reward": 0.09947917237877846, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0, "step": 856 }, { "completion_length": 234.7760467529297, "epoch": 14.302521008403362, "grad_norm": 0.01995604364702659, "kl": 0.11376953125, "learning_rate": 2.5976139165266364e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 858 }, { "completion_length": 314.17188262939453, "epoch": 14.336134453781513, "grad_norm": 0.19946917145393075, "kl": 0.08721923828125, "learning_rate": 2.5262064728651194e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 860 }, { "completion_length": 217.1197967529297, "epoch": 14.369747899159664, "grad_norm": 0.8209078488360535, "kl": 0.11962890625, "learning_rate": 2.4557419845695427e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 862 }, { "completion_length": 344.73439025878906, "epoch": 14.403361344537815, "grad_norm": 0.5499154873275146, "kl": 0.12255859375, "learning_rate": 2.3862234081887033e-08, "loss": 0.0001, "reward": 0.24010418355464935, "reward_std": 0.06018446758389473, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.140625, "step": 864 }, { "completion_length": 333.04689025878906, "epoch": 14.436974789915967, "grad_norm": 0.0066587336233593294, "kl": 0.0692138671875, "learning_rate": 2.3176536605828438e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 866 }, { "completion_length": 311.61459159851074, "epoch": 14.470588235294118, "grad_norm": 0.008587341457266496, "kl": 0.1072998046875, "learning_rate": 2.250035618801241e-08, "loss": 0.0001, "reward": 0.2666666992008686, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 868 }, { "completion_length": 358.3489685058594, "epoch": 14.504201680672269, "grad_norm": 0.4455009178979699, "kl": 0.10162353515625, "learning_rate": 2.183372119961499e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 870 }, { "completion_length": 309.4271011352539, "epoch": 14.53781512605042, "grad_norm": 0.005214729019482053, "kl": 0.07476806640625, "learning_rate": 2.117665961130513e-08, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 872 }, { "completion_length": 200.9583396911621, "epoch": 14.571428571428571, "grad_norm": 0.6768082275345103, "kl": 0.1715087890625, "learning_rate": 2.05291989920712e-08, "loss": 0.0002, "reward": 0.2354166880249977, "reward_std": 0.06573155149817467, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 874 }, { "completion_length": 303.65626525878906, "epoch": 14.605042016806722, "grad_norm": 0.004267275125558004, "kl": 0.080810546875, "learning_rate": 1.9891366508064e-08, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 876 }, { "completion_length": 322.3021011352539, "epoch": 14.638655462184873, "grad_norm": 0.01667547007707851, "kl": 0.1243896484375, "learning_rate": 1.926318892145712e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 878 }, { "completion_length": 357.37500762939453, "epoch": 14.672268907563025, "grad_norm": 0.00886625832334433, "kl": 0.08367919921875, "learning_rate": 1.8644692589323967e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 880 }, { "completion_length": 322.29689025878906, "epoch": 14.705882352941176, "grad_norm": 0.004223851124021023, "kl": 0.0787353515625, "learning_rate": 1.803590346253195e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 882 }, { "completion_length": 347.9010543823242, "epoch": 14.739495798319329, "grad_norm": 0.012438169704799707, "kl": 0.08349609375, "learning_rate": 1.7436847084653456e-08, "loss": 0.0001, "reward": 0.12083334103226662, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.02083333395421505, "step": 884 }, { "completion_length": 365.4895935058594, "epoch": 14.77310924369748, "grad_norm": 0.015310140705621391, "kl": 0.0728759765625, "learning_rate": 1.6847548590894434e-08, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 886 }, { "completion_length": 353.7239685058594, "epoch": 14.806722689075631, "grad_norm": 0.0036206604944826965, "kl": 0.08245849609375, "learning_rate": 1.626803270703936e-08, "loss": 0.0001, "reward": 0.2458333522081375, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333358168602, "step": 888 }, { "completion_length": 253.05730056762695, "epoch": 14.840336134453782, "grad_norm": 0.8866521336808585, "kl": 0.1080322265625, "learning_rate": 1.5698323748414122e-08, "loss": 0.0001, "reward": 0.329166691750288, "reward_std": 0.09609274379909039, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2291666716337204, "step": 890 }, { "completion_length": 305.40105056762695, "epoch": 14.873949579831933, "grad_norm": 0.013258599882684454, "kl": 0.0771484375, "learning_rate": 1.513844561886554e-08, "loss": 0.0001, "reward": 0.16250001266598701, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0625, "step": 892 }, { "completion_length": 289.23438262939453, "epoch": 14.907563025210084, "grad_norm": 0.6602331403859858, "kl": 0.123046875, "learning_rate": 1.4588421809758639e-08, "loss": 0.0001, "reward": 0.1677083484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 894 }, { "completion_length": 244.92188262939453, "epoch": 14.941176470588236, "grad_norm": 0.0043349531115773705, "kl": 0.093505859375, "learning_rate": 1.4048275398990894e-08, "loss": 0.0001, "reward": 0.344791691750288, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2447916716337204, "step": 896 }, { "completion_length": 312.59375762939453, "epoch": 14.974789915966387, "grad_norm": 0.003016527454182693, "kl": 0.09222412109375, "learning_rate": 1.351802905002386e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 898 }, { "completion_length": 341.6180725097656, "epoch": 15.0, "grad_norm": 0.011517873196403568, "kl": 0.09358723958333333, "learning_rate": 1.2997705010932391e-08, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 900 }, { "completion_length": 337.5260543823242, "epoch": 15.033613445378151, "grad_norm": 0.6729615244671641, "kl": 0.11712646484375, "learning_rate": 1.248732511347103e-08, "loss": 0.0001, "reward": 0.4020833447575569, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3020833432674408, "step": 902 }, { "completion_length": 238.9479217529297, "epoch": 15.067226890756302, "grad_norm": 0.012826138098582018, "kl": 0.1363525390625, "learning_rate": 1.1986910772158105e-08, "loss": 0.0001, "reward": 0.344791691750288, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2447916716337204, "step": 904 }, { "completion_length": 320.8229217529297, "epoch": 15.100840336134453, "grad_norm": 0.004206686054131056, "kl": 0.0753173828125, "learning_rate": 1.1496482983377188e-08, "loss": 0.0001, "reward": 0.3395833633840084, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 906 }, { "completion_length": 329.78126525878906, "epoch": 15.134453781512605, "grad_norm": 0.42616998079489693, "kl": 0.09759521484375, "learning_rate": 1.1016062324496007e-08, "loss": 0.0001, "reward": 0.2510416880249977, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1510416716337204, "step": 908 }, { "completion_length": 340.2656364440918, "epoch": 15.168067226890756, "grad_norm": 0.33767444236796246, "kl": 0.10260009765625, "learning_rate": 1.054566895300324e-08, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 910 }, { "completion_length": 337.3229293823242, "epoch": 15.201680672268907, "grad_norm": 0.0052193594255629605, "kl": 0.068115234375, "learning_rate": 1.0085322605662666e-08, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 912 }, { "completion_length": 311.5885467529297, "epoch": 15.235294117647058, "grad_norm": 0.009822458136948056, "kl": 0.0919189453125, "learning_rate": 9.635042597685023e-09, "loss": 0.0001, "reward": 0.2666666880249977, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1666666716337204, "step": 914 }, { "completion_length": 338.4166831970215, "epoch": 15.268907563025211, "grad_norm": 0.6381106558231654, "kl": 0.0838623046875, "learning_rate": 9.194847821917623e-09, "loss": 0.0001, "reward": 0.1729166842997074, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0729166716337204, "step": 916 }, { "completion_length": 247.79167556762695, "epoch": 15.302521008403362, "grad_norm": 0.483496147349892, "kl": 0.1385498046875, "learning_rate": 8.764756748051661e-09, "loss": 0.0001, "reward": 0.5843750536441803, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.4843750149011612, "step": 918 }, { "completion_length": 376.38022232055664, "epoch": 15.336134453781513, "grad_norm": 0.009013346187797264, "kl": 0.07366943359375, "learning_rate": 8.344787421847216e-09, "loss": 0.0001, "reward": 0.1677083484828472, "reward_std": 0.033592741936445236, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0677083358168602, "step": 920 }, { "completion_length": 256.8593864440918, "epoch": 15.369747899159664, "grad_norm": 0.007979834986816158, "kl": 0.09619140625, "learning_rate": 7.934957464376058e-09, "loss": 0.0001, "reward": 0.11041667684912682, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.010416666977107525, "step": 922 }, { "completion_length": 278.84375381469727, "epoch": 15.403361344537815, "grad_norm": 0.010620959423456592, "kl": 0.10223388671875, "learning_rate": 7.535284071282455e-09, "loss": 0.0001, "reward": 0.4281250424683094, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3281250074505806, "step": 924 }, { "completion_length": 330.82813262939453, "epoch": 15.436974789915967, "grad_norm": 1.0124394946558697, "kl": 0.153564453125, "learning_rate": 7.145784012061423e-09, "loss": 0.0002, "reward": 0.33750002831220627, "reward_std": 0.06945833191275597, "rewards/format_reward_func": 0.09791668318212032, "rewards/solution_reward_func": 0.23958333861082792, "step": 926 }, { "completion_length": 280.7448043823242, "epoch": 15.470588235294118, "grad_norm": 0.01100254003630868, "kl": 0.1099853515625, "learning_rate": 6.766473629355452e-09, "loss": 0.0001, "reward": 0.3500000275671482, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2500000074505806, "step": 928 }, { "completion_length": 337.10938262939453, "epoch": 15.504201680672269, "grad_norm": 0.2294437286353692, "kl": 0.075439453125, "learning_rate": 6.397368838268496e-09, "loss": 0.0001, "reward": 0.2718750201165676, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.17187500512227416, "step": 930 }, { "completion_length": 287.1145896911621, "epoch": 15.53781512605042, "grad_norm": 0.006834647105213461, "kl": 0.0982666015625, "learning_rate": 6.038485125698295e-09, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 932 }, { "completion_length": 193.25000381469727, "epoch": 15.571428571428571, "grad_norm": 0.3002270960655947, "kl": 0.1285400390625, "learning_rate": 5.689837549686744e-09, "loss": 0.0001, "reward": 0.2458333559334278, "reward_std": 0.06072613410651684, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.14583333721384406, "step": 934 }, { "completion_length": 306.20834732055664, "epoch": 15.605042016806722, "grad_norm": 0.007243208215792406, "kl": 0.09375, "learning_rate": 5.3514407387877936e-09, "loss": 0.0001, "reward": 0.4125000163912773, "reward_std": 0.05442607402801514, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3125000149011612, "step": 936 }, { "completion_length": 260.57292556762695, "epoch": 15.638655462184873, "grad_norm": 0.004761304807713431, "kl": 0.115478515625, "learning_rate": 5.023308891453915e-09, "loss": 0.0001, "reward": 0.3395833671092987, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833358168602, "step": 938 }, { "completion_length": 266.81250762939453, "epoch": 15.672268907563025, "grad_norm": 0.0047137674453268386, "kl": 0.07830810546875, "learning_rate": 4.705455775440237e-09, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 940 }, { "completion_length": 273.5520935058594, "epoch": 15.705882352941176, "grad_norm": 0.2878444008029626, "kl": 0.106201171875, "learning_rate": 4.3978947272269305e-09, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 942 }, { "completion_length": 412.87501525878906, "epoch": 15.739495798319329, "grad_norm": 0.003461087288127568, "kl": 0.06109619140625, "learning_rate": 4.100638651459542e-09, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 944 }, { "completion_length": 354.3020896911621, "epoch": 15.77310924369748, "grad_norm": 0.3108817654882425, "kl": 0.07928466796875, "learning_rate": 3.813700020407706e-09, "loss": 0.0001, "reward": 0.1937500163912773, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.09375000279396772, "step": 946 }, { "completion_length": 202.9739646911621, "epoch": 15.806722689075631, "grad_norm": 0.006531613998872075, "kl": 0.149658203125, "learning_rate": 3.5370908734417006e-09, "loss": 0.0001, "reward": 0.3500000387430191, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2500000074505806, "step": 948 }, { "completion_length": 247.10937881469727, "epoch": 15.840336134453782, "grad_norm": 0.006013059884196403, "kl": 0.1055908203125, "learning_rate": 3.2708228165273244e-09, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 950 }, { "completion_length": 230.55209350585938, "epoch": 15.873949579831933, "grad_norm": 0.005585688903542125, "kl": 0.1065673828125, "learning_rate": 3.0149070217390106e-09, "loss": 0.0001, "reward": 0.23541668057441711, "reward_std": 0.0416666679084301, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1354166716337204, "step": 952 }, { "completion_length": 338.1823043823242, "epoch": 15.907563025210084, "grad_norm": 0.004945192084647359, "kl": 0.13848876953125, "learning_rate": 2.769354226790893e-09, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 954 }, { "completion_length": 298.95312881469727, "epoch": 15.941176470588236, "grad_norm": 0.005862658852345729, "kl": 0.07177734375, "learning_rate": 2.5341747345865026e-09, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 956 }, { "completion_length": 270.6822967529297, "epoch": 15.974789915966387, "grad_norm": 0.8733860181444384, "kl": 0.135009765625, "learning_rate": 2.3093784127863057e-09, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 958 }, { "completion_length": 342.18751017252606, "epoch": 16.0, "grad_norm": 0.21587686787790358, "kl": 0.08382161458333333, "learning_rate": 2.094974693393731e-09, "loss": 0.0001, "reward": 0.20416668057441711, "reward_std": 0.027777778605620067, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.10416666666666667, "step": 960 }, { "completion_length": 265.34896087646484, "epoch": 16.03361344537815, "grad_norm": 0.30195526039267395, "kl": 0.1190185546875, "learning_rate": 1.890972572359456e-09, "loss": 0.0001, "reward": 0.10520834103226662, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0052083334885537624, "step": 962 }, { "completion_length": 255.22917366027832, "epoch": 16.067226890756302, "grad_norm": 0.007238478919550304, "kl": 0.0931396484375, "learning_rate": 1.6973806092038523e-09, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 964 }, { "completion_length": 315.1458396911621, "epoch": 16.100840336134453, "grad_norm": 0.0071029011504459135, "kl": 0.107421875, "learning_rate": 1.514206926658046e-09, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 966 }, { "completion_length": 190.1458396911621, "epoch": 16.134453781512605, "grad_norm": 0.008558663696003357, "kl": 0.156982421875, "learning_rate": 1.3414592103228594e-09, "loss": 0.0002, "reward": 0.3500000424683094, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.25, "step": 968 }, { "completion_length": 339.3385543823242, "epoch": 16.168067226890756, "grad_norm": 0.004039709998008519, "kl": 0.09521484375, "learning_rate": 1.1791447083465133e-09, "loss": 0.0001, "reward": 0.18281251192092896, "reward_std": 0.0020833334419876337, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.0833333358168602, "step": 970 }, { "completion_length": 212.35938262939453, "epoch": 16.201680672268907, "grad_norm": 0.4578272594721116, "kl": 0.1202392578125, "learning_rate": 1.0272702311203695e-09, "loss": 0.0001, "reward": 0.2458333522081375, "reward_std": 0.03726780042052269, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1458333358168602, "step": 972 }, { "completion_length": 328.359375, "epoch": 16.235294117647058, "grad_norm": 0.8576024393145434, "kl": 0.1065673828125, "learning_rate": 8.858421509933823e-10, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 974 }, { "completion_length": 368.2135467529297, "epoch": 16.26890756302521, "grad_norm": 0.7461130696069282, "kl": 0.0911865234375, "learning_rate": 7.548664020045059e-10, "loss": 0.0001, "reward": 0.2614583522081375, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1614583358168602, "step": 976 }, { "completion_length": 327.42187881469727, "epoch": 16.30252100840336, "grad_norm": 0.0036313740351404077, "kl": 0.078125, "learning_rate": 6.343484796338394e-10, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 978 }, { "completion_length": 297.5052185058594, "epoch": 16.33613445378151, "grad_norm": 0.47099217001019605, "kl": 0.0927734375, "learning_rate": 5.242934405720878e-10, "loss": 0.0001, "reward": 0.3395833596587181, "reward_std": 0.054426075890660286, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.23958334093913436, "step": 980 }, { "completion_length": 327.76562881469727, "epoch": 16.369747899159663, "grad_norm": 0.009638487385220006, "kl": 0.122802734375, "learning_rate": 4.2470590250823223e-10, "loss": 0.0001, "reward": 0.2770833522081375, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.1770833358168602, "step": 982 }, { "completion_length": 314.75000762939453, "epoch": 16.403361344537814, "grad_norm": 0.0068183045876018655, "kl": 0.110595703125, "learning_rate": 3.355900439359072e-10, "loss": 0.0001, "reward": 0.344791691750288, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2447916716337204, "step": 984 }, { "completion_length": 319.4427146911621, "epoch": 16.436974789915965, "grad_norm": 0.21120482423734918, "kl": 0.071044921875, "learning_rate": 2.569496039780683e-10, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 986 }, { "completion_length": 330.7864646911621, "epoch": 16.470588235294116, "grad_norm": 0.012263931902891346, "kl": 0.0955810546875, "learning_rate": 1.8878788223009035e-10, "loss": 0.0001, "reward": 0.1833333484828472, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0833333358168602, "step": 988 }, { "completion_length": 230.11458587646484, "epoch": 16.504201680672267, "grad_norm": 0.6477456649296228, "kl": 0.1221923828125, "learning_rate": 1.3110773862126667e-10, "loss": 0.0001, "reward": 0.3291667066514492, "reward_std": 0.08288982696831226, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.22916667675599456, "step": 990 }, { "completion_length": 204.63542366027832, "epoch": 16.537815126050422, "grad_norm": 0.014236759711816102, "kl": 0.15869140625, "learning_rate": 8.391159329496079e-11, "loss": 0.0002, "reward": 0.4177083745598793, "reward_std": 0.04929708503186703, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.3177083432674408, "step": 992 }, { "completion_length": 351.54167556762695, "epoch": 16.571428571428573, "grad_norm": 0.0913225606659672, "kl": 0.08087158203125, "learning_rate": 4.7201426506854324e-11, "loss": 0.0001, "reward": 0.25052086263895035, "reward_std": 0.03567607537843287, "rewards/format_reward_func": 0.09947918355464935, "rewards/solution_reward_func": 0.1510416716337204, "step": 994 }, { "completion_length": 312.8958435058594, "epoch": 16.605042016806724, "grad_norm": 0.005916716333972388, "kl": 0.0887451171875, "learning_rate": 2.097877854204122e-11, "loss": 0.0001, "reward": 0.3395833633840084, "reward_std": 0.028463751077651978, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.2395833432674408, "step": 996 }, { "completion_length": 350.3385429382324, "epoch": 16.638655462184875, "grad_norm": 0.00927812311865424, "kl": 0.0775146484375, "learning_rate": 5.244749650301639e-12, "loss": 0.0001, "reward": 0.17812501266598701, "reward_std": 0.02083333395421505, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.078125, "step": 998 }, { "completion_length": 268.69792556762695, "epoch": 16.672268907563026, "grad_norm": 0.06515464559304185, "kl": 0.114013671875, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.10000000894069672, "reward_std": 0.0, "rewards/format_reward_func": 0.10000001639127731, "rewards/solution_reward_func": 0.0, "step": 1000 }, { "epoch": 16.672268907563026, "step": 1000, "total_flos": 0.0, "train_loss": 7.514761615893417e-05, "train_runtime": 37925.9573, "train_samples_per_second": 0.158, "train_steps_per_second": 0.026 } ], "logging_steps": 2, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 17, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }