{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2894906511927788, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 117.859375, "epoch": 0.0006447453255963894, "grad_norm": 11.706681251525879, "kl": 0.0, "learning_rate": 9.996776273372017e-07, "loss": 0.0, "reward": 0.9551776349544525, "reward_std": 0.313363179564476, "rewards/format_reward": 0.859375, "rewards/iou_timestamp_reward": 0.0958026759326458, "step": 1 }, { "completion_length": 119.921875, "epoch": 0.0012894906511927789, "grad_norm": 9.978516578674316, "kl": 0.0011749267578125, "learning_rate": 9.993552546744034e-07, "loss": 0.0, "reward": 0.9701357483863831, "reward_std": 0.3779118061065674, "rewards/format_reward": 0.875, "rewards/iou_timestamp_reward": 0.09513575211167336, "step": 2 }, { "completion_length": 121.859375, "epoch": 0.0019342359767891683, "grad_norm": 49.87274932861328, "kl": 0.001697540283203125, "learning_rate": 9.990328820116053e-07, "loss": 0.0001, "reward": 0.9885792136192322, "reward_std": 0.27833086252212524, "rewards/format_reward": 0.90625, "rewards/iou_timestamp_reward": 0.08232920989394188, "step": 3 }, { "completion_length": 119.546875, "epoch": 0.0025789813023855577, "grad_norm": 21.538257598876953, "kl": 0.003204345703125, "learning_rate": 9.987105093488073e-07, "loss": 0.0001, "reward": 1.1702584028244019, "reward_std": 0.3175881803035736, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.21713333204388618, "step": 4 }, { "completion_length": 116.640625, "epoch": 0.003223726627981947, "grad_norm": 6.138484477996826, "kl": 0.0047760009765625, "learning_rate": 9.98388136686009e-07, "loss": 0.0002, "reward": 1.1206515431404114, "reward_std": 0.24463944137096405, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.16752655059099197, "step": 5 }, { "completion_length": 117.625, "epoch": 0.0038684719535783366, "grad_norm": 16.45502471923828, "kl": 0.0115966796875, "learning_rate": 9.980657640232107e-07, "loss": 0.0005, "reward": 1.2238141894340515, "reward_std": 0.3482338488101959, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.25506413727998734, "step": 6 }, { "completion_length": 108.46875, "epoch": 0.004513217279174726, "grad_norm": 20.508726119995117, "kl": 0.014617919921875, "learning_rate": 9.977433913604127e-07, "loss": 0.0006, "reward": 1.1528931856155396, "reward_std": 0.295160673558712, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.19976822286844254, "step": 7 }, { "completion_length": 115.1875, "epoch": 0.0051579626047711154, "grad_norm": 7.711580276489258, "kl": 0.01885986328125, "learning_rate": 9.974210186976144e-07, "loss": 0.0008, "reward": 1.1179298758506775, "reward_std": 0.23363575339317322, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.1491798721253872, "step": 8 }, { "completion_length": 117.609375, "epoch": 0.005802707930367505, "grad_norm": 13.166901588439941, "kl": 0.02362060546875, "learning_rate": 9.970986460348161e-07, "loss": 0.0009, "reward": 1.3111140131950378, "reward_std": 0.28913620114326477, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.32673898339271545, "step": 9 }, { "completion_length": 116.890625, "epoch": 0.006447453255963894, "grad_norm": 36.41984176635742, "kl": 0.0179443359375, "learning_rate": 9.96776273372018e-07, "loss": 0.0007, "reward": 1.1912025213241577, "reward_std": 0.22346250712871552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.19120249524712563, "step": 10 }, { "completion_length": 100.515625, "epoch": 0.0070921985815602835, "grad_norm": 17.331663131713867, "kl": 0.02130126953125, "learning_rate": 9.964539007092198e-07, "loss": 0.0009, "reward": 1.1976637244224548, "reward_std": 0.20566103607416153, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.19766373187303543, "step": 11 }, { "completion_length": 108.25, "epoch": 0.007736943907156673, "grad_norm": 14.938997268676758, "kl": 0.0384521484375, "learning_rate": 9.961315280464217e-07, "loss": 0.0015, "reward": 1.0446301102638245, "reward_std": 0.2249654456973076, "rewards/format_reward": 0.9375, "rewards/iou_timestamp_reward": 0.10713006556034088, "step": 12 }, { "completion_length": 109.109375, "epoch": 0.008381689232753063, "grad_norm": 13.962553977966309, "kl": 0.02801513671875, "learning_rate": 9.958091553836235e-07, "loss": 0.0011, "reward": 1.2218610644340515, "reward_std": 0.23190322518348694, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.22186104953289032, "step": 13 }, { "completion_length": 112.421875, "epoch": 0.009026434558349452, "grad_norm": 12.863759994506836, "kl": 0.0321044921875, "learning_rate": 9.954867827208252e-07, "loss": 0.0013, "reward": 1.2887030243873596, "reward_std": 0.28984855115413666, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.30432797968387604, "step": 14 }, { "completion_length": 94.703125, "epoch": 0.009671179883945842, "grad_norm": 10.097845077514648, "kl": 0.0379638671875, "learning_rate": 9.95164410058027e-07, "loss": 0.0015, "reward": 1.289795994758606, "reward_std": 0.23819649964571, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.28979597985744476, "step": 15 }, { "completion_length": 109.71875, "epoch": 0.010315925209542231, "grad_norm": 14.919957160949707, "kl": 0.03289794921875, "learning_rate": 9.948420373952289e-07, "loss": 0.0013, "reward": 1.2771413326263428, "reward_std": 0.2302519753575325, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.27714137732982635, "step": 16 }, { "completion_length": 119.546875, "epoch": 0.01096067053513862, "grad_norm": 7.356471538543701, "kl": 0.034912109375, "learning_rate": 9.945196647324306e-07, "loss": 0.0014, "reward": 1.108040452003479, "reward_std": 0.19030947610735893, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.1236654780805111, "step": 17 }, { "completion_length": 112.640625, "epoch": 0.01160541586073501, "grad_norm": 8.942741394042969, "kl": 0.03021240234375, "learning_rate": 9.941972920696325e-07, "loss": 0.0012, "reward": 1.2421876192092896, "reward_std": 0.25605954229831696, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.27343761920928955, "step": 18 }, { "completion_length": 112.015625, "epoch": 0.012250161186331399, "grad_norm": 5.671343803405762, "kl": 0.037109375, "learning_rate": 9.938749194068343e-07, "loss": 0.0015, "reward": 1.2924596667289734, "reward_std": 0.17979642003774643, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2924596965312958, "step": 19 }, { "completion_length": 117.625, "epoch": 0.012894906511927788, "grad_norm": 7.245625019073486, "kl": 0.0355224609375, "learning_rate": 9.935525467440362e-07, "loss": 0.0014, "reward": 1.2731870412826538, "reward_std": 0.2262459546327591, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2731870636343956, "step": 20 }, { "completion_length": 110.90625, "epoch": 0.013539651837524178, "grad_norm": 7.730829238891602, "kl": 0.036376953125, "learning_rate": 9.93230174081238e-07, "loss": 0.0015, "reward": 1.2657849192619324, "reward_std": 0.25212879478931427, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2657849192619324, "step": 21 }, { "completion_length": 113.953125, "epoch": 0.014184397163120567, "grad_norm": 8.334545135498047, "kl": 0.038330078125, "learning_rate": 9.929078014184397e-07, "loss": 0.0015, "reward": 1.2092431783676147, "reward_std": 0.21151791512966156, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.20924309641122818, "step": 22 }, { "completion_length": 100.0625, "epoch": 0.014829142488716958, "grad_norm": 17.85267448425293, "kl": 0.0452880859375, "learning_rate": 9.925854287556414e-07, "loss": 0.0018, "reward": 1.247024118900299, "reward_std": 0.2600787580013275, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.24702415615320206, "step": 23 }, { "completion_length": 100.140625, "epoch": 0.015473887814313346, "grad_norm": 10.987561225891113, "kl": 0.048583984375, "learning_rate": 9.922630560928433e-07, "loss": 0.0019, "reward": 1.3182414770126343, "reward_std": 0.23921259492635727, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.31824155151844025, "step": 24 }, { "completion_length": 97.75, "epoch": 0.016118633139909737, "grad_norm": 14.050704956054688, "kl": 0.0531005859375, "learning_rate": 9.91940683430045e-07, "loss": 0.0021, "reward": 1.2669792175292969, "reward_std": 0.22845448553562164, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2669791951775551, "step": 25 }, { "completion_length": 101.078125, "epoch": 0.016763378465506126, "grad_norm": 12.309895515441895, "kl": 0.0396728515625, "learning_rate": 9.91618310767247e-07, "loss": 0.0016, "reward": 1.2797068357467651, "reward_std": 0.1980849951505661, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.27970677614212036, "step": 26 }, { "completion_length": 84.90625, "epoch": 0.017408123791102514, "grad_norm": 8.08408260345459, "kl": 0.0487060546875, "learning_rate": 9.912959381044487e-07, "loss": 0.0019, "reward": 1.2797399759292603, "reward_std": 0.1894526183605194, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.29536499083042145, "step": 27 }, { "completion_length": 100.75, "epoch": 0.018052869116698903, "grad_norm": 13.173178672790527, "kl": 0.048095703125, "learning_rate": 9.909735654416504e-07, "loss": 0.0019, "reward": 1.254739224910736, "reward_std": 0.19636224955320358, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.2703641951084137, "step": 28 }, { "completion_length": 90.0625, "epoch": 0.018697614442295292, "grad_norm": 14.946029663085938, "kl": 0.0457763671875, "learning_rate": 9.906511927788524e-07, "loss": 0.0018, "reward": 1.2612407207489014, "reward_std": 0.19902688264846802, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2612406611442566, "step": 29 }, { "completion_length": 92.875, "epoch": 0.019342359767891684, "grad_norm": 24.124683380126953, "kl": 0.04345703125, "learning_rate": 9.903288201160541e-07, "loss": 0.0017, "reward": 1.3272080421447754, "reward_std": 0.24419597536325455, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.32720808684825897, "step": 30 }, { "completion_length": 92.421875, "epoch": 0.019987105093488073, "grad_norm": 19.198226928710938, "kl": 0.1097412109375, "learning_rate": 9.900064474532558e-07, "loss": 0.0044, "reward": 1.3916027545928955, "reward_std": 0.2533900588750839, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4072277396917343, "step": 31 }, { "completion_length": 101.421875, "epoch": 0.020631850419084462, "grad_norm": 9.14734172821045, "kl": 0.0443115234375, "learning_rate": 9.896840747904578e-07, "loss": 0.0018, "reward": 1.247195303440094, "reward_std": 0.2349850833415985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2471952885389328, "step": 32 }, { "completion_length": 95.921875, "epoch": 0.02127659574468085, "grad_norm": 8.344867706298828, "kl": 0.037841796875, "learning_rate": 9.893617021276595e-07, "loss": 0.0015, "reward": 1.343221127986908, "reward_std": 0.20855669677257538, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.34322114288806915, "step": 33 }, { "completion_length": 93.359375, "epoch": 0.02192134107027724, "grad_norm": 19.839126586914062, "kl": 0.0396728515625, "learning_rate": 9.890393294648614e-07, "loss": 0.0016, "reward": 1.2119425535202026, "reward_std": 0.1833260953426361, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.21194250136613846, "step": 34 }, { "completion_length": 112.921875, "epoch": 0.02256608639587363, "grad_norm": 10.852544784545898, "kl": 0.0394287109375, "learning_rate": 9.887169568020632e-07, "loss": 0.0016, "reward": 1.2822151184082031, "reward_std": 0.19346018135547638, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.28221510350704193, "step": 35 }, { "completion_length": 110.859375, "epoch": 0.02321083172147002, "grad_norm": 66.1825942993164, "kl": 0.03253173828125, "learning_rate": 9.88394584139265e-07, "loss": 0.0013, "reward": 1.2645833492279053, "reward_std": 0.22069542109966278, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.26458336412906647, "step": 36 }, { "completion_length": 104.421875, "epoch": 0.02385557704706641, "grad_norm": 12.111766815185547, "kl": 0.0426025390625, "learning_rate": 9.880722114764668e-07, "loss": 0.0017, "reward": 1.2878879308700562, "reward_std": 0.21032921224832535, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.28788794577121735, "step": 37 }, { "completion_length": 110.65625, "epoch": 0.024500322372662798, "grad_norm": 11.206411361694336, "kl": 0.0400390625, "learning_rate": 9.877498388136686e-07, "loss": 0.0016, "reward": 1.3537410497665405, "reward_std": 0.2186509370803833, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3537410721182823, "step": 38 }, { "completion_length": 111.984375, "epoch": 0.025145067698259187, "grad_norm": 17.833251953125, "kl": 0.0408935546875, "learning_rate": 9.874274661508703e-07, "loss": 0.0016, "reward": 1.3208078145980835, "reward_std": 0.21878959983587265, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3208078145980835, "step": 39 }, { "completion_length": 119.75, "epoch": 0.025789813023855575, "grad_norm": 40.49732208251953, "kl": 0.334716796875, "learning_rate": 9.871050934880722e-07, "loss": 0.0134, "reward": 1.23319411277771, "reward_std": 0.2120913863182068, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.23319409787654877, "step": 40 }, { "completion_length": 104.25, "epoch": 0.026434558349451968, "grad_norm": 19.549007415771484, "kl": 0.050537109375, "learning_rate": 9.86782720825274e-07, "loss": 0.002, "reward": 1.307875633239746, "reward_std": 0.16077548265457153, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3078755736351013, "step": 41 }, { "completion_length": 105.921875, "epoch": 0.027079303675048357, "grad_norm": 29.055553436279297, "kl": 0.056640625, "learning_rate": 9.86460348162476e-07, "loss": 0.0023, "reward": 1.3375996351242065, "reward_std": 0.23000206798315048, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33759957551956177, "step": 42 }, { "completion_length": 103.140625, "epoch": 0.027724049000644745, "grad_norm": 7.140056610107422, "kl": 0.101318359375, "learning_rate": 9.861379754996776e-07, "loss": 0.0041, "reward": 1.2600332498550415, "reward_std": 0.2108185514807701, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2600333169102669, "step": 43 }, { "completion_length": 100.625, "epoch": 0.028368794326241134, "grad_norm": 36.15247344970703, "kl": 0.06201171875, "learning_rate": 9.858156028368794e-07, "loss": 0.0025, "reward": 1.2336326837539673, "reward_std": 0.17267921566963196, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2336326688528061, "step": 44 }, { "completion_length": 98.484375, "epoch": 0.029013539651837523, "grad_norm": 12.655134201049805, "kl": 0.052490234375, "learning_rate": 9.85493230174081e-07, "loss": 0.0021, "reward": 1.4318326711654663, "reward_std": 0.20957604050636292, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4318326860666275, "step": 45 }, { "completion_length": 102.71875, "epoch": 0.029658284977433915, "grad_norm": 25.280502319335938, "kl": 0.059326171875, "learning_rate": 9.85170857511283e-07, "loss": 0.0024, "reward": 1.3532536029815674, "reward_std": 0.21924903988838196, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3532535433769226, "step": 46 }, { "completion_length": 104.140625, "epoch": 0.030303030303030304, "grad_norm": 10.246079444885254, "kl": 0.0511474609375, "learning_rate": 9.848484848484847e-07, "loss": 0.002, "reward": 1.4125980138778687, "reward_std": 0.1969481185078621, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41259796917438507, "step": 47 }, { "completion_length": 96.484375, "epoch": 0.030947775628626693, "grad_norm": 32.44934844970703, "kl": 0.0533447265625, "learning_rate": 9.845261121856867e-07, "loss": 0.0021, "reward": 1.3488176465034485, "reward_std": 0.2110263928771019, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.3644426167011261, "step": 48 }, { "completion_length": 94.328125, "epoch": 0.03159252095422308, "grad_norm": 6.3295698165893555, "kl": 0.0740966796875, "learning_rate": 9.842037395228884e-07, "loss": 0.003, "reward": 1.3096264600753784, "reward_std": 0.1813429892063141, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3096264898777008, "step": 49 }, { "completion_length": 96.25, "epoch": 0.032237266279819474, "grad_norm": 12.89992618560791, "kl": 0.0531005859375, "learning_rate": 9.838813668600904e-07, "loss": 0.0021, "reward": 1.3683323860168457, "reward_std": 0.2907109409570694, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.39958229660987854, "step": 50 }, { "completion_length": 90.84375, "epoch": 0.03288201160541586, "grad_norm": 106.75090789794922, "kl": 0.0523681640625, "learning_rate": 9.83558994197292e-07, "loss": 0.0021, "reward": 1.4628892540931702, "reward_std": 0.22101883590221405, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46288928389549255, "step": 51 }, { "completion_length": 84.015625, "epoch": 0.03352675693101225, "grad_norm": 8.15224552154541, "kl": 0.060302734375, "learning_rate": 9.832366215344938e-07, "loss": 0.0024, "reward": 1.36749267578125, "reward_std": 0.22733061015605927, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3674926608800888, "step": 52 }, { "completion_length": 88.3125, "epoch": 0.03417150225660864, "grad_norm": 10.359516143798828, "kl": 0.055908203125, "learning_rate": 9.829142488716955e-07, "loss": 0.0022, "reward": 1.3939309120178223, "reward_std": 0.18551449477672577, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39393097162246704, "step": 53 }, { "completion_length": 82.8125, "epoch": 0.03481624758220503, "grad_norm": 23.30002212524414, "kl": 0.0635986328125, "learning_rate": 9.825918762088975e-07, "loss": 0.0025, "reward": 1.3875991702079773, "reward_std": 0.2218652218580246, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3875991553068161, "step": 54 }, { "completion_length": 84.015625, "epoch": 0.03546099290780142, "grad_norm": 9.210583686828613, "kl": 0.0657958984375, "learning_rate": 9.822695035460992e-07, "loss": 0.0026, "reward": 1.3397376537322998, "reward_std": 0.2256728559732437, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.3709876760840416, "step": 55 }, { "completion_length": 78.359375, "epoch": 0.036105738233397806, "grad_norm": 18.673093795776367, "kl": 0.074951171875, "learning_rate": 9.819471308833011e-07, "loss": 0.003, "reward": 1.3963087797164917, "reward_std": 0.2275141477584839, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3963087424635887, "step": 56 }, { "completion_length": 86.234375, "epoch": 0.0367504835589942, "grad_norm": 21.288368225097656, "kl": 0.091552734375, "learning_rate": 9.816247582205029e-07, "loss": 0.0037, "reward": 1.4477627873420715, "reward_std": 0.194952093064785, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44776275753974915, "step": 57 }, { "completion_length": 82.671875, "epoch": 0.037395228884590584, "grad_norm": 12.072967529296875, "kl": 0.079345703125, "learning_rate": 9.813023855577046e-07, "loss": 0.0032, "reward": 1.5084742307662964, "reward_std": 0.15730255097150803, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5084741413593292, "step": 58 }, { "completion_length": 76.5625, "epoch": 0.038039974210186976, "grad_norm": 12.807083129882812, "kl": 0.067626953125, "learning_rate": 9.809800128949065e-07, "loss": 0.0027, "reward": 1.396537721157074, "reward_std": 0.21208080649375916, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3965377062559128, "step": 59 }, { "completion_length": 78.0, "epoch": 0.03868471953578337, "grad_norm": 32.63819885253906, "kl": 0.074951171875, "learning_rate": 9.806576402321083e-07, "loss": 0.003, "reward": 1.4429082870483398, "reward_std": 0.19335371255874634, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44290828704833984, "step": 60 }, { "completion_length": 81.71875, "epoch": 0.039329464861379754, "grad_norm": 7.923285961151123, "kl": 0.0810546875, "learning_rate": 9.8033526756931e-07, "loss": 0.0032, "reward": 1.2859596014022827, "reward_std": 0.19862326979637146, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2859596461057663, "step": 61 }, { "completion_length": 73.09375, "epoch": 0.039974210186976146, "grad_norm": 45.514076232910156, "kl": 0.0791015625, "learning_rate": 9.80012894906512e-07, "loss": 0.0032, "reward": 1.3525552153587341, "reward_std": 0.1816207319498062, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3525552302598953, "step": 62 }, { "completion_length": 73.234375, "epoch": 0.04061895551257253, "grad_norm": 25.02358627319336, "kl": 0.0810546875, "learning_rate": 9.796905222437137e-07, "loss": 0.0032, "reward": 1.3032844066619873, "reward_std": 0.2011517509818077, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3032844811677933, "step": 63 }, { "completion_length": 77.375, "epoch": 0.041263700838168924, "grad_norm": 18.431140899658203, "kl": 0.092041015625, "learning_rate": 9.793681495809156e-07, "loss": 0.0037, "reward": 1.4467111825942993, "reward_std": 0.23784177750349045, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44671112298965454, "step": 64 }, { "completion_length": 76.78125, "epoch": 0.041908446163765316, "grad_norm": 10.069541931152344, "kl": 0.0888671875, "learning_rate": 9.790457769181173e-07, "loss": 0.0036, "reward": 1.4564274549484253, "reward_std": 0.24415691941976547, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4564273804426193, "step": 65 }, { "completion_length": 79.359375, "epoch": 0.0425531914893617, "grad_norm": 8.669953346252441, "kl": 0.09521484375, "learning_rate": 9.78723404255319e-07, "loss": 0.0038, "reward": 1.3141262531280518, "reward_std": 0.21810634434223175, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.32975125312805176, "step": 66 }, { "completion_length": 79.953125, "epoch": 0.04319793681495809, "grad_norm": 22.783241271972656, "kl": 0.080078125, "learning_rate": 9.78401031592521e-07, "loss": 0.0032, "reward": 1.4575133919715881, "reward_std": 0.18917211890220642, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4575134366750717, "step": 67 }, { "completion_length": 78.296875, "epoch": 0.04384268214055448, "grad_norm": 22.941936492919922, "kl": 0.09326171875, "learning_rate": 9.780786589297227e-07, "loss": 0.0037, "reward": 1.4472717642784119, "reward_std": 0.15321099758148193, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4472717046737671, "step": 68 }, { "completion_length": 78.640625, "epoch": 0.04448742746615087, "grad_norm": 17.53478240966797, "kl": 0.091552734375, "learning_rate": 9.777562862669244e-07, "loss": 0.0037, "reward": 1.265991985797882, "reward_std": 0.18441501259803772, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.28161704540252686, "step": 69 }, { "completion_length": 71.9375, "epoch": 0.04513217279174726, "grad_norm": 12.067954063415527, "kl": 0.1015625, "learning_rate": 9.774339136041264e-07, "loss": 0.0041, "reward": 1.4680742025375366, "reward_std": 0.17444680631160736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4680742621421814, "step": 70 }, { "completion_length": 73.640625, "epoch": 0.04577691811734365, "grad_norm": 12.765181541442871, "kl": 0.115478515625, "learning_rate": 9.771115409413281e-07, "loss": 0.0046, "reward": 1.384080708026886, "reward_std": 0.2056313455104828, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.39970576763153076, "step": 71 }, { "completion_length": 74.9375, "epoch": 0.04642166344294004, "grad_norm": 15.447208404541016, "kl": 0.088623046875, "learning_rate": 9.7678916827853e-07, "loss": 0.0035, "reward": 1.3486806750297546, "reward_std": 0.2008630558848381, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.34868063032627106, "step": 72 }, { "completion_length": 72.875, "epoch": 0.047066408768536426, "grad_norm": 12.092300415039062, "kl": 0.1162109375, "learning_rate": 9.764667956157318e-07, "loss": 0.0047, "reward": 1.413942575454712, "reward_std": 0.2041231021285057, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4139425456523895, "step": 73 }, { "completion_length": 89.46875, "epoch": 0.04771115409413282, "grad_norm": 16.925933837890625, "kl": 0.070068359375, "learning_rate": 9.761444229529335e-07, "loss": 0.0028, "reward": 1.3617019653320312, "reward_std": 0.16857799887657166, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3617018908262253, "step": 74 }, { "completion_length": 80.109375, "epoch": 0.048355899419729204, "grad_norm": 18.16619110107422, "kl": 0.0762939453125, "learning_rate": 9.758220502901352e-07, "loss": 0.0031, "reward": 1.3572936058044434, "reward_std": 0.22740349918603897, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.37291859090328217, "step": 75 }, { "completion_length": 77.640625, "epoch": 0.049000644745325596, "grad_norm": 40.9599609375, "kl": 0.0771484375, "learning_rate": 9.754996776273372e-07, "loss": 0.0031, "reward": 1.3504416346549988, "reward_std": 0.18997812271118164, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35044166445732117, "step": 76 }, { "completion_length": 77.640625, "epoch": 0.04964539007092199, "grad_norm": 9.37459945678711, "kl": 0.09033203125, "learning_rate": 9.75177304964539e-07, "loss": 0.0036, "reward": 1.375741422176361, "reward_std": 0.20171231776475906, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3757414370775223, "step": 77 }, { "completion_length": 83.421875, "epoch": 0.05029013539651837, "grad_norm": 8.924232482910156, "kl": 0.07861328125, "learning_rate": 9.748549323017408e-07, "loss": 0.0031, "reward": 1.3597639203071594, "reward_std": 0.21023260056972504, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.3753889948129654, "step": 78 }, { "completion_length": 83.703125, "epoch": 0.050934880722114766, "grad_norm": 10.10476016998291, "kl": 0.077880859375, "learning_rate": 9.745325596389426e-07, "loss": 0.0031, "reward": 1.273858666419983, "reward_std": 0.18880721181631088, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2738586515188217, "step": 79 }, { "completion_length": 79.109375, "epoch": 0.05157962604771115, "grad_norm": 11.688029289245605, "kl": 0.083251953125, "learning_rate": 9.742101869761445e-07, "loss": 0.0033, "reward": 1.4125596284866333, "reward_std": 0.16709274798631668, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4125596284866333, "step": 80 }, { "completion_length": 75.21875, "epoch": 0.05222437137330754, "grad_norm": 14.252153396606445, "kl": 0.085205078125, "learning_rate": 9.738878143133462e-07, "loss": 0.0034, "reward": 1.355860710144043, "reward_std": 0.25486576557159424, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3558606505393982, "step": 81 }, { "completion_length": 82.0625, "epoch": 0.052869116698903935, "grad_norm": 23.381254196166992, "kl": 0.089111328125, "learning_rate": 9.73565441650548e-07, "loss": 0.0036, "reward": 1.4577891826629639, "reward_std": 0.222563698887825, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45778918266296387, "step": 82 }, { "completion_length": 85.671875, "epoch": 0.05351386202450032, "grad_norm": 29.149097442626953, "kl": 0.076904296875, "learning_rate": 9.732430689877497e-07, "loss": 0.0031, "reward": 1.3724395632743835, "reward_std": 0.16367345303297043, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37243956327438354, "step": 83 }, { "completion_length": 86.78125, "epoch": 0.05415860735009671, "grad_norm": 11.676203727722168, "kl": 0.083984375, "learning_rate": 9.729206963249516e-07, "loss": 0.0034, "reward": 1.4838296175003052, "reward_std": 0.2798515185713768, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4994546175003052, "step": 84 }, { "completion_length": 77.171875, "epoch": 0.0548033526756931, "grad_norm": 9.494368553161621, "kl": 0.0966796875, "learning_rate": 9.725983236621534e-07, "loss": 0.0039, "reward": 1.3889164328575134, "reward_std": 0.18041953444480896, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.38891640305519104, "step": 85 }, { "completion_length": 79.171875, "epoch": 0.05544809800128949, "grad_norm": 15.85273265838623, "kl": 0.082763671875, "learning_rate": 9.722759509993553e-07, "loss": 0.0033, "reward": 1.2888696193695068, "reward_std": 0.18666045367717743, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2888696789741516, "step": 86 }, { "completion_length": 80.375, "epoch": 0.05609284332688588, "grad_norm": 33.20154571533203, "kl": 0.09130859375, "learning_rate": 9.71953578336557e-07, "loss": 0.0037, "reward": 1.4762005805969238, "reward_std": 0.20586346834897995, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.476200670003891, "step": 87 }, { "completion_length": 80.25, "epoch": 0.05673758865248227, "grad_norm": 23.280982971191406, "kl": 0.0849609375, "learning_rate": 9.716312056737588e-07, "loss": 0.0034, "reward": 1.3444839119911194, "reward_std": 0.18448061496019363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3444839417934418, "step": 88 }, { "completion_length": 86.203125, "epoch": 0.05738233397807866, "grad_norm": 7.881406307220459, "kl": 0.07373046875, "learning_rate": 9.713088330109607e-07, "loss": 0.003, "reward": 1.4067343473434448, "reward_std": 0.18169286847114563, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.406734362244606, "step": 89 }, { "completion_length": 81.0625, "epoch": 0.058027079303675046, "grad_norm": 13.038424491882324, "kl": 0.084228515625, "learning_rate": 9.709864603481624e-07, "loss": 0.0034, "reward": 1.4451138973236084, "reward_std": 0.21713434159755707, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4451138973236084, "step": 90 }, { "completion_length": 79.609375, "epoch": 0.05867182462927144, "grad_norm": 7.999779224395752, "kl": 0.081298828125, "learning_rate": 9.706640876853641e-07, "loss": 0.0033, "reward": 1.40691077709198, "reward_std": 0.2112068235874176, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4069107621908188, "step": 91 }, { "completion_length": 73.28125, "epoch": 0.05931656995486783, "grad_norm": 19.086753845214844, "kl": 0.11572265625, "learning_rate": 9.70341715022566e-07, "loss": 0.0046, "reward": 1.4588000178337097, "reward_std": 0.1944720521569252, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4588000625371933, "step": 92 }, { "completion_length": 83.25, "epoch": 0.059961315280464215, "grad_norm": 24.621259689331055, "kl": 0.101806640625, "learning_rate": 9.700193423597678e-07, "loss": 0.0041, "reward": 1.4452659487724304, "reward_std": 0.21456287056207657, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4452659338712692, "step": 93 }, { "completion_length": 76.28125, "epoch": 0.06060606060606061, "grad_norm": 18.733131408691406, "kl": 0.088623046875, "learning_rate": 9.696969696969698e-07, "loss": 0.0035, "reward": 1.4556202292442322, "reward_std": 0.11773604899644852, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.455620214343071, "step": 94 }, { "completion_length": 73.015625, "epoch": 0.06125080593165699, "grad_norm": 11.115130424499512, "kl": 0.096435546875, "learning_rate": 9.693745970341715e-07, "loss": 0.0039, "reward": 1.3447935581207275, "reward_std": 0.18323758244514465, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.34479353576898575, "step": 95 }, { "completion_length": 73.3125, "epoch": 0.061895551257253385, "grad_norm": 15.140289306640625, "kl": 0.093994140625, "learning_rate": 9.690522243713732e-07, "loss": 0.0038, "reward": 1.475332260131836, "reward_std": 0.23074129223823547, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4753323048353195, "step": 96 }, { "completion_length": 74.53125, "epoch": 0.06254029658284978, "grad_norm": 23.077402114868164, "kl": 0.10498046875, "learning_rate": 9.68729851708575e-07, "loss": 0.0042, "reward": 1.417439877986908, "reward_std": 0.17361952364444733, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41743984818458557, "step": 97 }, { "completion_length": 73.5, "epoch": 0.06318504190844616, "grad_norm": 13.471073150634766, "kl": 0.099853515625, "learning_rate": 9.684074790457769e-07, "loss": 0.004, "reward": 1.3859241008758545, "reward_std": 0.20852358639240265, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3859241306781769, "step": 98 }, { "completion_length": 72.25, "epoch": 0.06382978723404255, "grad_norm": 12.648615837097168, "kl": 0.093017578125, "learning_rate": 9.680851063829786e-07, "loss": 0.0037, "reward": 1.3480225801467896, "reward_std": 0.22543464601039886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.34802258014678955, "step": 99 }, { "completion_length": 71.875, "epoch": 0.06447453255963895, "grad_norm": 12.842320442199707, "kl": 0.080810546875, "learning_rate": 9.677627337201805e-07, "loss": 0.0032, "reward": 1.4175334572792053, "reward_std": 0.18312367051839828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4175334572792053, "step": 100 }, { "completion_length": 70.515625, "epoch": 0.06511927788523533, "grad_norm": 13.96192455291748, "kl": 0.106201171875, "learning_rate": 9.674403610573823e-07, "loss": 0.0042, "reward": 1.3348948955535889, "reward_std": 0.166929941624403, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3348948061466217, "step": 101 }, { "completion_length": 71.375, "epoch": 0.06576402321083172, "grad_norm": 12.400444984436035, "kl": 0.12841796875, "learning_rate": 9.671179883945842e-07, "loss": 0.0051, "reward": 1.4501676559448242, "reward_std": 0.180241659283638, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45016759634017944, "step": 102 }, { "completion_length": 71.3125, "epoch": 0.06640876853642812, "grad_norm": 23.09809112548828, "kl": 0.1064453125, "learning_rate": 9.66795615731786e-07, "loss": 0.0043, "reward": 1.4367554187774658, "reward_std": 0.12167369574308395, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43675535917282104, "step": 103 }, { "completion_length": 72.09375, "epoch": 0.0670535138620245, "grad_norm": 7.730201244354248, "kl": 0.104736328125, "learning_rate": 9.664732430689877e-07, "loss": 0.0042, "reward": 1.4592158794403076, "reward_std": 0.193368099629879, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4592158794403076, "step": 104 }, { "completion_length": 68.953125, "epoch": 0.06769825918762089, "grad_norm": 17.011564254760742, "kl": 0.1201171875, "learning_rate": 9.661508704061894e-07, "loss": 0.0048, "reward": 1.3875025510787964, "reward_std": 0.1890907734632492, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.38750262558460236, "step": 105 }, { "completion_length": 76.875, "epoch": 0.06834300451321727, "grad_norm": 20.02046012878418, "kl": 0.0947265625, "learning_rate": 9.658284977433913e-07, "loss": 0.0038, "reward": 1.5030284523963928, "reward_std": 0.16404563933610916, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5030284374952316, "step": 106 }, { "completion_length": 75.0, "epoch": 0.06898774983881367, "grad_norm": 13.054033279418945, "kl": 0.1015625, "learning_rate": 9.65506125080593e-07, "loss": 0.0041, "reward": 1.571265459060669, "reward_std": 0.16624636203050613, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5712654292583466, "step": 107 }, { "completion_length": 70.609375, "epoch": 0.06963249516441006, "grad_norm": 30.342504501342773, "kl": 0.11669921875, "learning_rate": 9.65183752417795e-07, "loss": 0.0047, "reward": 1.4218077063560486, "reward_std": 0.20463600754737854, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4218077063560486, "step": 108 }, { "completion_length": 75.953125, "epoch": 0.07027724049000644, "grad_norm": 14.21203899383545, "kl": 0.09912109375, "learning_rate": 9.648613797549967e-07, "loss": 0.004, "reward": 1.4889991879463196, "reward_std": 0.18232028931379318, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4889991283416748, "step": 109 }, { "completion_length": 77.125, "epoch": 0.07092198581560284, "grad_norm": 16.088598251342773, "kl": 0.1142578125, "learning_rate": 9.645390070921985e-07, "loss": 0.0046, "reward": 1.4165233969688416, "reward_std": 0.18773652613162994, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41652336716651917, "step": 110 }, { "completion_length": 73.53125, "epoch": 0.07156673114119923, "grad_norm": 13.502654075622559, "kl": 0.10009765625, "learning_rate": 9.642166344294004e-07, "loss": 0.004, "reward": 1.4663327932357788, "reward_std": 0.17598994821310043, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4663327932357788, "step": 111 }, { "completion_length": 70.34375, "epoch": 0.07221147646679561, "grad_norm": 14.628576278686523, "kl": 0.102294921875, "learning_rate": 9.638942617666021e-07, "loss": 0.0041, "reward": 1.4488741755485535, "reward_std": 0.17282917350530624, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4488741457462311, "step": 112 }, { "completion_length": 75.375, "epoch": 0.07285622179239201, "grad_norm": 10.183534622192383, "kl": 0.103271484375, "learning_rate": 9.635718891038038e-07, "loss": 0.0041, "reward": 1.390573263168335, "reward_std": 0.1438509002327919, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39057329297065735, "step": 113 }, { "completion_length": 72.375, "epoch": 0.0735009671179884, "grad_norm": 15.171917915344238, "kl": 0.130615234375, "learning_rate": 9.632495164410058e-07, "loss": 0.0052, "reward": 1.4449996948242188, "reward_std": 0.15270018205046654, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4449998140335083, "step": 114 }, { "completion_length": 67.640625, "epoch": 0.07414571244358478, "grad_norm": 15.202000617980957, "kl": 0.14111328125, "learning_rate": 9.629271437782075e-07, "loss": 0.0056, "reward": 1.4991153478622437, "reward_std": 0.1308480128645897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49911533296108246, "step": 115 }, { "completion_length": 72.390625, "epoch": 0.07479045776918117, "grad_norm": 11.967129707336426, "kl": 0.1357421875, "learning_rate": 9.626047711154095e-07, "loss": 0.0054, "reward": 1.4869657158851624, "reward_std": 0.1620316430926323, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4869658052921295, "step": 116 }, { "completion_length": 76.890625, "epoch": 0.07543520309477757, "grad_norm": 15.96191692352295, "kl": 0.1484375, "learning_rate": 9.622823984526112e-07, "loss": 0.0059, "reward": 1.5816780924797058, "reward_std": 0.18133046478033066, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.581678032875061, "step": 117 }, { "completion_length": 75.53125, "epoch": 0.07607994842037395, "grad_norm": 12.706347465515137, "kl": 0.11328125, "learning_rate": 9.61960025789813e-07, "loss": 0.0045, "reward": 1.5155820846557617, "reward_std": 0.13779311254620552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5155820995569229, "step": 118 }, { "completion_length": 75.921875, "epoch": 0.07672469374597034, "grad_norm": 10.711260795593262, "kl": 0.12451171875, "learning_rate": 9.616376531270149e-07, "loss": 0.005, "reward": 1.4439030289649963, "reward_std": 0.1384158730506897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44390299916267395, "step": 119 }, { "completion_length": 77.921875, "epoch": 0.07736943907156674, "grad_norm": 8.561759948730469, "kl": 0.118896484375, "learning_rate": 9.613152804642166e-07, "loss": 0.0048, "reward": 1.6015077829360962, "reward_std": 0.187755785882473, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.601507842540741, "step": 120 }, { "completion_length": 76.328125, "epoch": 0.07801418439716312, "grad_norm": 21.100255966186523, "kl": 0.4189453125, "learning_rate": 9.609929078014183e-07, "loss": 0.0168, "reward": 1.5190616846084595, "reward_std": 0.1306452825665474, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5190616846084595, "step": 121 }, { "completion_length": 75.78125, "epoch": 0.07865892972275951, "grad_norm": 32.30814743041992, "kl": 0.14404296875, "learning_rate": 9.606705351386202e-07, "loss": 0.0058, "reward": 1.394041121006012, "reward_std": 0.16053549945354462, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39404113590717316, "step": 122 }, { "completion_length": 71.28125, "epoch": 0.07930367504835589, "grad_norm": 18.817506790161133, "kl": 0.15673828125, "learning_rate": 9.60348162475822e-07, "loss": 0.0063, "reward": 1.4487716555595398, "reward_std": 0.24345475435256958, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.464396670460701, "step": 123 }, { "completion_length": 69.875, "epoch": 0.07994842037395229, "grad_norm": 16.40485382080078, "kl": 0.14306640625, "learning_rate": 9.60025789813024e-07, "loss": 0.0057, "reward": 1.5178059339523315, "reward_std": 0.14177018404006958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5178059637546539, "step": 124 }, { "completion_length": 72.15625, "epoch": 0.08059316569954868, "grad_norm": 9.994036674499512, "kl": 0.1435546875, "learning_rate": 9.597034171502256e-07, "loss": 0.0057, "reward": 1.3876937627792358, "reward_std": 0.16864847391843796, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3876937925815582, "step": 125 }, { "completion_length": 76.640625, "epoch": 0.08123791102514506, "grad_norm": 9.577360153198242, "kl": 0.19482421875, "learning_rate": 9.593810444874274e-07, "loss": 0.0078, "reward": 1.4948091506958008, "reward_std": 0.1558496206998825, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4948091357946396, "step": 126 }, { "completion_length": 78.28125, "epoch": 0.08188265635074146, "grad_norm": 136.7667236328125, "kl": 0.12109375, "learning_rate": 9.59058671824629e-07, "loss": 0.0048, "reward": 1.507809579372406, "reward_std": 0.21292894333600998, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.507809579372406, "step": 127 }, { "completion_length": 79.828125, "epoch": 0.08252740167633785, "grad_norm": 27.071666717529297, "kl": 0.114013671875, "learning_rate": 9.58736299161831e-07, "loss": 0.0046, "reward": 1.3414384722709656, "reward_std": 0.12231213599443436, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3414384722709656, "step": 128 }, { "completion_length": 75.515625, "epoch": 0.08317214700193423, "grad_norm": 18.19074058532715, "kl": 0.109619140625, "learning_rate": 9.584139264990328e-07, "loss": 0.0044, "reward": 1.5530925393104553, "reward_std": 0.1707591935992241, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5530924797058105, "step": 129 }, { "completion_length": 84.265625, "epoch": 0.08381689232753063, "grad_norm": 21.930219650268555, "kl": 0.104248046875, "learning_rate": 9.580915538362347e-07, "loss": 0.0042, "reward": 1.42109215259552, "reward_std": 0.2092544138431549, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.43671710789203644, "step": 130 }, { "completion_length": 85.125, "epoch": 0.08446163765312702, "grad_norm": 8.885394096374512, "kl": 0.120361328125, "learning_rate": 9.577691811734364e-07, "loss": 0.0048, "reward": 1.486854612827301, "reward_std": 0.1961679607629776, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5024795830249786, "step": 131 }, { "completion_length": 83.078125, "epoch": 0.0851063829787234, "grad_norm": 13.252328872680664, "kl": 0.200439453125, "learning_rate": 9.574468085106384e-07, "loss": 0.008, "reward": 1.4762264490127563, "reward_std": 0.16455548256635666, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4762263596057892, "step": 132 }, { "completion_length": 78.484375, "epoch": 0.08575112830431979, "grad_norm": 23.153955459594727, "kl": 0.1083984375, "learning_rate": 9.5712443584784e-07, "loss": 0.0043, "reward": 1.5278353691101074, "reward_std": 0.16748548299074173, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.527835339307785, "step": 133 }, { "completion_length": 86.25, "epoch": 0.08639587362991619, "grad_norm": 57.02205276489258, "kl": 0.11181640625, "learning_rate": 9.568020631850418e-07, "loss": 0.0045, "reward": 1.5299711227416992, "reward_std": 0.17784756422042847, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5299711525440216, "step": 134 }, { "completion_length": 81.5625, "epoch": 0.08704061895551257, "grad_norm": 12.712023735046387, "kl": 0.151611328125, "learning_rate": 9.564796905222435e-07, "loss": 0.0061, "reward": 1.3502278327941895, "reward_std": 0.13139504194259644, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3502279072999954, "step": 135 }, { "completion_length": 81.46875, "epoch": 0.08768536428110896, "grad_norm": 22.494131088256836, "kl": 0.112548828125, "learning_rate": 9.561573178594455e-07, "loss": 0.0045, "reward": 1.4433656334877014, "reward_std": 0.1883310079574585, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44336557388305664, "step": 136 }, { "completion_length": 82.84375, "epoch": 0.08833010960670536, "grad_norm": 16.634904861450195, "kl": 0.105712890625, "learning_rate": 9.558349451966472e-07, "loss": 0.0042, "reward": 1.470713198184967, "reward_std": 0.25595565140247345, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4863382577896118, "step": 137 }, { "completion_length": 79.125, "epoch": 0.08897485493230174, "grad_norm": 16.343263626098633, "kl": 0.10888671875, "learning_rate": 9.555125725338492e-07, "loss": 0.0044, "reward": 1.336100161075592, "reward_std": 0.1666717305779457, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33610019087791443, "step": 138 }, { "completion_length": 81.609375, "epoch": 0.08961960025789813, "grad_norm": 10.733060836791992, "kl": 0.10400390625, "learning_rate": 9.551901998710509e-07, "loss": 0.0042, "reward": 1.4843059182167053, "reward_std": 0.19644438475370407, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.49993085861206055, "step": 139 }, { "completion_length": 80.96875, "epoch": 0.09026434558349453, "grad_norm": 12.256318092346191, "kl": 0.114013671875, "learning_rate": 9.548678272082526e-07, "loss": 0.0046, "reward": 1.539031982421875, "reward_std": 0.21819938719272614, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5546570122241974, "step": 140 }, { "completion_length": 104.21875, "epoch": 0.09090909090909091, "grad_norm": 8.861552238464355, "kl": 0.0869140625, "learning_rate": 9.545454545454546e-07, "loss": 0.0035, "reward": 1.3945060968399048, "reward_std": 0.18161959201097488, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3945060670375824, "step": 141 }, { "completion_length": 90.609375, "epoch": 0.0915538362346873, "grad_norm": 16.709983825683594, "kl": 0.096923828125, "learning_rate": 9.542230818826563e-07, "loss": 0.0039, "reward": 1.5597324967384338, "reward_std": 0.2325654923915863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5597324520349503, "step": 142 }, { "completion_length": 83.5625, "epoch": 0.09219858156028368, "grad_norm": 20.069183349609375, "kl": 0.107666015625, "learning_rate": 9.53900709219858e-07, "loss": 0.0043, "reward": 1.458389699459076, "reward_std": 0.13381709158420563, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4583897143602371, "step": 143 }, { "completion_length": 83.6875, "epoch": 0.09284332688588008, "grad_norm": 82.17460632324219, "kl": 0.10888671875, "learning_rate": 9.535783365570598e-07, "loss": 0.0044, "reward": 1.3395763635635376, "reward_std": 0.19901933521032333, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.3552014008164406, "step": 144 }, { "completion_length": 88.375, "epoch": 0.09348807221147647, "grad_norm": 16.213743209838867, "kl": 0.09912109375, "learning_rate": 9.532559638942618e-07, "loss": 0.004, "reward": 1.4737032651901245, "reward_std": 0.20099236071109772, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4893282651901245, "step": 145 }, { "completion_length": 92.28125, "epoch": 0.09413281753707285, "grad_norm": 11.83709716796875, "kl": 0.121826171875, "learning_rate": 9.529335912314635e-07, "loss": 0.0049, "reward": 1.413508951663971, "reward_std": 0.181750126183033, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41350890696048737, "step": 146 }, { "completion_length": 84.640625, "epoch": 0.09477756286266925, "grad_norm": 17.66875648498535, "kl": 0.099609375, "learning_rate": 9.526112185686653e-07, "loss": 0.004, "reward": 1.4129161834716797, "reward_std": 0.13768525421619415, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4129162132740021, "step": 147 }, { "completion_length": 81.40625, "epoch": 0.09542230818826564, "grad_norm": 56.989559173583984, "kl": 0.112548828125, "learning_rate": 9.522888459058671e-07, "loss": 0.0045, "reward": 1.3882247805595398, "reward_std": 0.19382939487695694, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3882247507572174, "step": 148 }, { "completion_length": 80.625, "epoch": 0.09606705351386202, "grad_norm": 25.619226455688477, "kl": 0.12744140625, "learning_rate": 9.519664732430689e-07, "loss": 0.0051, "reward": 1.4231994152069092, "reward_std": 0.28173212707042694, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.45444950461387634, "step": 149 }, { "completion_length": 77.6875, "epoch": 0.09671179883945841, "grad_norm": 9.32150650024414, "kl": 0.115478515625, "learning_rate": 9.516441005802707e-07, "loss": 0.0046, "reward": 1.477019727230072, "reward_std": 0.23725295066833496, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4926447123289108, "step": 150 }, { "completion_length": 77.5, "epoch": 0.0973565441650548, "grad_norm": 31.08614730834961, "kl": 0.1240234375, "learning_rate": 9.513217279174726e-07, "loss": 0.005, "reward": 1.5814010500907898, "reward_std": 0.14891761541366577, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5814010798931122, "step": 151 }, { "completion_length": 79.125, "epoch": 0.09800128949065119, "grad_norm": 11.537618637084961, "kl": 0.119140625, "learning_rate": 9.509993552546743e-07, "loss": 0.0048, "reward": 1.5184857249259949, "reward_std": 0.1145397424697876, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5184857249259949, "step": 152 }, { "completion_length": 77.28125, "epoch": 0.09864603481624758, "grad_norm": 7.693099498748779, "kl": 0.122802734375, "learning_rate": 9.506769825918761e-07, "loss": 0.0049, "reward": 1.3786104917526245, "reward_std": 0.17887841165065765, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3786105662584305, "step": 153 }, { "completion_length": 75.015625, "epoch": 0.09929078014184398, "grad_norm": 16.618059158325195, "kl": 0.11962890625, "learning_rate": 9.50354609929078e-07, "loss": 0.0048, "reward": 1.597913384437561, "reward_std": 0.15385859459638596, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5979134142398834, "step": 154 }, { "completion_length": 76.71875, "epoch": 0.09993552546744036, "grad_norm": 182.9221649169922, "kl": 0.113037109375, "learning_rate": 9.500322372662798e-07, "loss": 0.0045, "reward": 1.3890660405158997, "reward_std": 0.13871842622756958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3890660032629967, "step": 155 }, { "completion_length": 75.84375, "epoch": 0.10058027079303675, "grad_norm": 16.91444969177246, "kl": 0.107177734375, "learning_rate": 9.497098646034815e-07, "loss": 0.0043, "reward": 1.5419670939445496, "reward_std": 0.13973253965377808, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5419670641422272, "step": 156 }, { "completion_length": 82.703125, "epoch": 0.10122501611863315, "grad_norm": 24.434553146362305, "kl": 0.1171875, "learning_rate": 9.493874919406834e-07, "loss": 0.0047, "reward": 1.370639443397522, "reward_std": 0.1973675936460495, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37063948065042496, "step": 157 }, { "completion_length": 82.765625, "epoch": 0.10186976144422953, "grad_norm": 25.902759552001953, "kl": 0.121337890625, "learning_rate": 9.490651192778852e-07, "loss": 0.0048, "reward": 1.4471543431282043, "reward_std": 0.1710544154047966, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44715429842472076, "step": 158 }, { "completion_length": 79.828125, "epoch": 0.10251450676982592, "grad_norm": 14.119280815124512, "kl": 0.10302734375, "learning_rate": 9.48742746615087e-07, "loss": 0.0041, "reward": 1.3451138734817505, "reward_std": 0.11641614139080048, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3451138883829117, "step": 159 }, { "completion_length": 76.1875, "epoch": 0.1031592520954223, "grad_norm": 13.816582679748535, "kl": 0.098876953125, "learning_rate": 9.484203739522888e-07, "loss": 0.004, "reward": 1.5856391787528992, "reward_std": 0.1286786124110222, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5856391489505768, "step": 160 }, { "completion_length": 77.171875, "epoch": 0.1038039974210187, "grad_norm": 8.023931503295898, "kl": 0.10791015625, "learning_rate": 9.480980012894906e-07, "loss": 0.0043, "reward": 1.607506811618805, "reward_std": 0.14821096509695053, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6075067818164825, "step": 161 }, { "completion_length": 76.65625, "epoch": 0.10444874274661509, "grad_norm": 15.326875686645508, "kl": 0.089111328125, "learning_rate": 9.477756286266923e-07, "loss": 0.0036, "reward": 1.5751556158065796, "reward_std": 0.16239239275455475, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5751556158065796, "step": 162 }, { "completion_length": 77.328125, "epoch": 0.10509348807221147, "grad_norm": 13.427696228027344, "kl": 0.118408203125, "learning_rate": 9.474532559638943e-07, "loss": 0.0047, "reward": 1.563441514968872, "reward_std": 0.13956208154559135, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5634415149688721, "step": 163 }, { "completion_length": 81.8125, "epoch": 0.10573823339780787, "grad_norm": 12.042512893676758, "kl": 0.109130859375, "learning_rate": 9.47130883301096e-07, "loss": 0.0044, "reward": 1.5153995752334595, "reward_std": 0.19614696502685547, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5153996497392654, "step": 164 }, { "completion_length": 87.84375, "epoch": 0.10638297872340426, "grad_norm": 60.0034065246582, "kl": 0.094970703125, "learning_rate": 9.468085106382978e-07, "loss": 0.0038, "reward": 1.5395575761795044, "reward_std": 0.1202857531607151, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.539557546377182, "step": 165 }, { "completion_length": 72.921875, "epoch": 0.10702772404900064, "grad_norm": 18.208284378051758, "kl": 0.128173828125, "learning_rate": 9.464861379754995e-07, "loss": 0.0051, "reward": 1.4151722192764282, "reward_std": 0.16611632704734802, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4307972192764282, "step": 166 }, { "completion_length": 74.015625, "epoch": 0.10767246937459704, "grad_norm": 19.364013671875, "kl": 0.100830078125, "learning_rate": 9.461637653127015e-07, "loss": 0.004, "reward": 1.5370397567749023, "reward_std": 0.1590474396944046, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5370397567749023, "step": 167 }, { "completion_length": 77.375, "epoch": 0.10831721470019343, "grad_norm": 10.157119750976562, "kl": 0.10986328125, "learning_rate": 9.458413926499032e-07, "loss": 0.0044, "reward": 1.3529521226882935, "reward_std": 0.14438863843679428, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35295212268829346, "step": 168 }, { "completion_length": 82.921875, "epoch": 0.10896196002578981, "grad_norm": 15.475646018981934, "kl": 0.126708984375, "learning_rate": 9.45519019987105e-07, "loss": 0.0051, "reward": 1.6346983909606934, "reward_std": 0.15077311173081398, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.634698361158371, "step": 169 }, { "completion_length": 76.34375, "epoch": 0.1096067053513862, "grad_norm": 106.46350860595703, "kl": 0.10498046875, "learning_rate": 9.451966473243068e-07, "loss": 0.0042, "reward": 1.4514729976654053, "reward_std": 0.12446143478155136, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45147302746772766, "step": 170 }, { "completion_length": 81.546875, "epoch": 0.1102514506769826, "grad_norm": 11.109757423400879, "kl": 0.091796875, "learning_rate": 9.448742746615087e-07, "loss": 0.0037, "reward": 1.4839178323745728, "reward_std": 0.12744403257966042, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48391780257225037, "step": 171 }, { "completion_length": 77.125, "epoch": 0.11089619600257898, "grad_norm": 10.060192108154297, "kl": 0.093994140625, "learning_rate": 9.445519019987105e-07, "loss": 0.0038, "reward": 1.4212124943733215, "reward_std": 0.17020346224308014, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4212125241756439, "step": 172 }, { "completion_length": 75.203125, "epoch": 0.11154094132817537, "grad_norm": 34.940547943115234, "kl": 0.09912109375, "learning_rate": 9.442295293359123e-07, "loss": 0.004, "reward": 1.5194954872131348, "reward_std": 0.14065240323543549, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5194954425096512, "step": 173 }, { "completion_length": 75.0625, "epoch": 0.11218568665377177, "grad_norm": 19.02269744873047, "kl": 0.099853515625, "learning_rate": 9.43907156673114e-07, "loss": 0.004, "reward": 1.39652419090271, "reward_std": 0.1206883117556572, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39652419090270996, "step": 174 }, { "completion_length": 74.203125, "epoch": 0.11283043197936815, "grad_norm": 10.494725227355957, "kl": 0.10986328125, "learning_rate": 9.435847840103159e-07, "loss": 0.0044, "reward": 1.5253658890724182, "reward_std": 0.1728542149066925, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5253658890724182, "step": 175 }, { "completion_length": 72.3125, "epoch": 0.11347517730496454, "grad_norm": 9.659781455993652, "kl": 0.10546875, "learning_rate": 9.432624113475178e-07, "loss": 0.0042, "reward": 1.389731228351593, "reward_std": 0.07970772497355938, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3897312879562378, "step": 176 }, { "completion_length": 70.90625, "epoch": 0.11411992263056092, "grad_norm": 14.460417747497559, "kl": 0.105224609375, "learning_rate": 9.429400386847195e-07, "loss": 0.0042, "reward": 1.516136884689331, "reward_std": 0.1931019425392151, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.516136884689331, "step": 177 }, { "completion_length": 70.484375, "epoch": 0.11476466795615732, "grad_norm": 23.858684539794922, "kl": 0.11328125, "learning_rate": 9.426176660219213e-07, "loss": 0.0045, "reward": 1.4906936287879944, "reward_std": 0.10323386639356613, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4906936585903168, "step": 178 }, { "completion_length": 75.1875, "epoch": 0.1154094132817537, "grad_norm": 11.778549194335938, "kl": 0.112060546875, "learning_rate": 9.422952933591231e-07, "loss": 0.0045, "reward": 1.5091403126716614, "reward_std": 0.15490031242370605, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5091403424739838, "step": 179 }, { "completion_length": 81.328125, "epoch": 0.11605415860735009, "grad_norm": 8.447686195373535, "kl": 0.1123046875, "learning_rate": 9.41972920696325e-07, "loss": 0.0045, "reward": 1.4372309446334839, "reward_std": 0.1335056535899639, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4372308552265167, "step": 180 }, { "completion_length": 78.96875, "epoch": 0.11669890393294649, "grad_norm": 91.26253509521484, "kl": 0.32568359375, "learning_rate": 9.416505480335267e-07, "loss": 0.0131, "reward": 1.4686675667762756, "reward_std": 0.16033166646957397, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46866756677627563, "step": 181 }, { "completion_length": 77.984375, "epoch": 0.11734364925854288, "grad_norm": 16.17279815673828, "kl": 0.109130859375, "learning_rate": 9.413281753707286e-07, "loss": 0.0044, "reward": 1.2890088558197021, "reward_std": 0.16506294906139374, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.28900881856679916, "step": 182 }, { "completion_length": 74.421875, "epoch": 0.11798839458413926, "grad_norm": 16.651771545410156, "kl": 0.11083984375, "learning_rate": 9.410058027079303e-07, "loss": 0.0044, "reward": 1.6036138534545898, "reward_std": 0.07109030708670616, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6036138236522675, "step": 183 }, { "completion_length": 82.359375, "epoch": 0.11863313990973566, "grad_norm": 14.208246231079102, "kl": 0.0908203125, "learning_rate": 9.406834300451322e-07, "loss": 0.0036, "reward": 1.5008456707000732, "reward_std": 0.13602961599826813, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5008457005023956, "step": 184 }, { "completion_length": 73.84375, "epoch": 0.11927788523533205, "grad_norm": 17.21881675720215, "kl": 0.098388671875, "learning_rate": 9.40361057382334e-07, "loss": 0.0039, "reward": 1.5844212174415588, "reward_std": 0.12729774415493011, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5844212472438812, "step": 185 }, { "completion_length": 86.96875, "epoch": 0.11992263056092843, "grad_norm": 14.471665382385254, "kl": 0.094970703125, "learning_rate": 9.400386847195358e-07, "loss": 0.0038, "reward": 1.3867006301879883, "reward_std": 0.1378549337387085, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3867005705833435, "step": 186 }, { "completion_length": 78.21875, "epoch": 0.12056737588652482, "grad_norm": 21.124332427978516, "kl": 0.113037109375, "learning_rate": 9.397163120567375e-07, "loss": 0.0045, "reward": 1.4724563956260681, "reward_std": 0.15684282779693604, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4724564105272293, "step": 187 }, { "completion_length": 74.8125, "epoch": 0.12121212121212122, "grad_norm": 16.912813186645508, "kl": 0.104736328125, "learning_rate": 9.393939393939395e-07, "loss": 0.0042, "reward": 1.4298964738845825, "reward_std": 0.136982262134552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4298964589834213, "step": 188 }, { "completion_length": 80.3125, "epoch": 0.1218568665377176, "grad_norm": 21.21564292907715, "kl": 0.104736328125, "learning_rate": 9.390715667311412e-07, "loss": 0.0042, "reward": 1.451830267906189, "reward_std": 0.14300556480884552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45183031260967255, "step": 189 }, { "completion_length": 76.140625, "epoch": 0.12250161186331399, "grad_norm": 13.693678855895996, "kl": 0.14208984375, "learning_rate": 9.38749194068343e-07, "loss": 0.0057, "reward": 1.476448655128479, "reward_std": 0.16905169934034348, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4764486402273178, "step": 190 }, { "completion_length": 98.3125, "epoch": 0.12314635718891039, "grad_norm": 12.33560562133789, "kl": 0.087890625, "learning_rate": 9.384268214055447e-07, "loss": 0.0035, "reward": 1.502742886543274, "reward_std": 0.17102345824241638, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.518367812037468, "step": 191 }, { "completion_length": 85.625, "epoch": 0.12379110251450677, "grad_norm": 17.925392150878906, "kl": 0.1318359375, "learning_rate": 9.381044487427466e-07, "loss": 0.0053, "reward": 1.5797561407089233, "reward_std": 0.16112536564469337, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5797560513019562, "step": 192 }, { "completion_length": 79.734375, "epoch": 0.12443584784010316, "grad_norm": 11.936005592346191, "kl": 0.109130859375, "learning_rate": 9.377820760799484e-07, "loss": 0.0044, "reward": 1.4955528378486633, "reward_std": 0.11603929847478867, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4955528676509857, "step": 193 }, { "completion_length": 78.375, "epoch": 0.12508059316569956, "grad_norm": 17.255386352539062, "kl": 0.113525390625, "learning_rate": 9.374597034171502e-07, "loss": 0.0045, "reward": 1.4783340692520142, "reward_std": 0.08274748548865318, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47833409905433655, "step": 194 }, { "completion_length": 84.6875, "epoch": 0.12572533849129594, "grad_norm": 25.271358489990234, "kl": 0.084228515625, "learning_rate": 9.37137330754352e-07, "loss": 0.0034, "reward": 1.3292394280433655, "reward_std": 0.19878728687763214, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3292394280433655, "step": 195 }, { "completion_length": 76.46875, "epoch": 0.12637008381689233, "grad_norm": 10.557123184204102, "kl": 0.10595703125, "learning_rate": 9.368149580915538e-07, "loss": 0.0042, "reward": 1.440163254737854, "reward_std": 0.17127015441656113, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4401632696390152, "step": 196 }, { "completion_length": 81.515625, "epoch": 0.1270148291424887, "grad_norm": 27.85223960876465, "kl": 0.11083984375, "learning_rate": 9.364925854287556e-07, "loss": 0.0044, "reward": 1.3243326544761658, "reward_std": 0.10718319937586784, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3243326246738434, "step": 197 }, { "completion_length": 86.25, "epoch": 0.1276595744680851, "grad_norm": 30.643444061279297, "kl": 0.090576171875, "learning_rate": 9.361702127659575e-07, "loss": 0.0036, "reward": 1.4653456211090088, "reward_std": 0.17089767009019852, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4653455317020416, "step": 198 }, { "completion_length": 89.03125, "epoch": 0.1283043197936815, "grad_norm": 10.595077514648438, "kl": 0.10888671875, "learning_rate": 9.358478401031592e-07, "loss": 0.0044, "reward": 1.3735358119010925, "reward_std": 0.17685279250144958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37353579699993134, "step": 199 }, { "completion_length": 87.375, "epoch": 0.1289490651192779, "grad_norm": 17.84724235534668, "kl": 0.1533203125, "learning_rate": 9.35525467440361e-07, "loss": 0.0061, "reward": 1.4756492972373962, "reward_std": 0.09536949545145035, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47564925253391266, "step": 200 }, { "completion_length": 79.09375, "epoch": 0.12959381044487428, "grad_norm": 13.18930435180664, "kl": 0.0927734375, "learning_rate": 9.352030947775629e-07, "loss": 0.0037, "reward": 1.512624442577362, "reward_std": 0.10898591205477715, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5126243680715561, "step": 201 }, { "completion_length": 82.21875, "epoch": 0.13023855577047067, "grad_norm": 20.932710647583008, "kl": 0.123046875, "learning_rate": 9.348807221147647e-07, "loss": 0.0049, "reward": 1.4443880915641785, "reward_std": 0.08235402405261993, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44438812136650085, "step": 202 }, { "completion_length": 82.234375, "epoch": 0.13088330109606705, "grad_norm": 10.726685523986816, "kl": 0.108642578125, "learning_rate": 9.345583494519664e-07, "loss": 0.0043, "reward": 1.6250720024108887, "reward_std": 0.12261708825826645, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6250719428062439, "step": 203 }, { "completion_length": 82.6875, "epoch": 0.13152804642166344, "grad_norm": 10.96521282196045, "kl": 0.1015625, "learning_rate": 9.342359767891683e-07, "loss": 0.0041, "reward": 1.6589622497558594, "reward_std": 0.13084128499031067, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6589623093605042, "step": 204 }, { "completion_length": 80.921875, "epoch": 0.13217279174725982, "grad_norm": 18.59563446044922, "kl": 0.1103515625, "learning_rate": 9.3391360412637e-07, "loss": 0.0044, "reward": 1.4989012479782104, "reward_std": 0.12866604328155518, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49890124797821045, "step": 205 }, { "completion_length": 79.140625, "epoch": 0.13281753707285623, "grad_norm": 26.62146759033203, "kl": 0.11962890625, "learning_rate": 9.335912314635719e-07, "loss": 0.0048, "reward": 1.557816207408905, "reward_std": 0.19459083676338196, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5578161776065826, "step": 206 }, { "completion_length": 87.34375, "epoch": 0.13346228239845262, "grad_norm": 15.918787956237793, "kl": 0.1044921875, "learning_rate": 9.332688588007737e-07, "loss": 0.0042, "reward": 1.459834098815918, "reward_std": 0.1547633334994316, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4598340541124344, "step": 207 }, { "completion_length": 92.734375, "epoch": 0.134107027724049, "grad_norm": 9.844905853271484, "kl": 0.110107421875, "learning_rate": 9.329464861379755e-07, "loss": 0.0044, "reward": 1.5493760108947754, "reward_std": 0.13573891669511795, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5493760406970978, "step": 208 }, { "completion_length": 82.203125, "epoch": 0.1347517730496454, "grad_norm": 9.640893936157227, "kl": 0.1474609375, "learning_rate": 9.326241134751772e-07, "loss": 0.0059, "reward": 1.4705365896224976, "reward_std": 0.117528036236763, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47053661942481995, "step": 209 }, { "completion_length": 84.71875, "epoch": 0.13539651837524178, "grad_norm": 20.125877380371094, "kl": 0.11279296875, "learning_rate": 9.323017408123792e-07, "loss": 0.0045, "reward": 1.5758726596832275, "reward_std": 0.12206336110830307, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5758726894855499, "step": 210 }, { "completion_length": 82.65625, "epoch": 0.13604126370083816, "grad_norm": 12.198143005371094, "kl": 0.13134765625, "learning_rate": 9.319793681495809e-07, "loss": 0.0052, "reward": 1.5487463474273682, "reward_std": 0.16695217788219452, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5487463772296906, "step": 211 }, { "completion_length": 86.25, "epoch": 0.13668600902643455, "grad_norm": 12.829062461853027, "kl": 0.111083984375, "learning_rate": 9.316569954867827e-07, "loss": 0.0044, "reward": 1.6886752843856812, "reward_std": 0.10757141932845116, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6886752545833588, "step": 212 }, { "completion_length": 79.265625, "epoch": 0.13733075435203096, "grad_norm": 25.454059600830078, "kl": 0.12109375, "learning_rate": 9.313346228239844e-07, "loss": 0.0048, "reward": 1.6044875383377075, "reward_std": 0.11038105189800262, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6044874787330627, "step": 213 }, { "completion_length": 72.921875, "epoch": 0.13797549967762734, "grad_norm": 19.579898834228516, "kl": 0.150390625, "learning_rate": 9.310122501611864e-07, "loss": 0.006, "reward": 1.3529685735702515, "reward_std": 0.06810446828603745, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35296855866909027, "step": 214 }, { "completion_length": 75.328125, "epoch": 0.13862024500322373, "grad_norm": 20.17354393005371, "kl": 0.110107421875, "learning_rate": 9.306898774983881e-07, "loss": 0.0044, "reward": 1.5281617641448975, "reward_std": 0.10443935543298721, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5281617343425751, "step": 215 }, { "completion_length": 79.3125, "epoch": 0.13926499032882012, "grad_norm": 18.318899154663086, "kl": 0.125, "learning_rate": 9.303675048355899e-07, "loss": 0.005, "reward": 1.4097517132759094, "reward_std": 0.12294080853462219, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.409751757979393, "step": 216 }, { "completion_length": 75.828125, "epoch": 0.1399097356544165, "grad_norm": 16.73103141784668, "kl": 0.14013671875, "learning_rate": 9.300451321727917e-07, "loss": 0.0056, "reward": 1.5454647541046143, "reward_std": 0.15932783484458923, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.545464813709259, "step": 217 }, { "completion_length": 77.5625, "epoch": 0.14055448098001289, "grad_norm": 16.411338806152344, "kl": 0.1240234375, "learning_rate": 9.297227595099935e-07, "loss": 0.005, "reward": 1.4458375573158264, "reward_std": 0.10837087035179138, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4458375424146652, "step": 218 }, { "completion_length": 80.828125, "epoch": 0.14119922630560927, "grad_norm": 18.790870666503906, "kl": 0.12255859375, "learning_rate": 9.294003868471953e-07, "loss": 0.0049, "reward": 1.6049268245697021, "reward_std": 0.15314558148384094, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6049268841743469, "step": 219 }, { "completion_length": 83.234375, "epoch": 0.14184397163120568, "grad_norm": 24.741748809814453, "kl": 0.114501953125, "learning_rate": 9.290780141843972e-07, "loss": 0.0046, "reward": 1.4110137820243835, "reward_std": 0.11946087330579758, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4110138416290283, "step": 220 }, { "completion_length": 77.90625, "epoch": 0.14248871695680207, "grad_norm": 17.31515884399414, "kl": 0.13330078125, "learning_rate": 9.287556415215989e-07, "loss": 0.0053, "reward": 1.4192990064620972, "reward_std": 0.13341504707932472, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4192989766597748, "step": 221 }, { "completion_length": 75.484375, "epoch": 0.14313346228239845, "grad_norm": 17.7659969329834, "kl": 0.118896484375, "learning_rate": 9.284332688588007e-07, "loss": 0.0048, "reward": 1.6054986715316772, "reward_std": 0.11470058560371399, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6054986715316772, "step": 222 }, { "completion_length": 77.8125, "epoch": 0.14377820760799484, "grad_norm": 17.46185302734375, "kl": 0.13720703125, "learning_rate": 9.281108961960026e-07, "loss": 0.0055, "reward": 1.4767987728118896, "reward_std": 0.20763688534498215, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47679875791072845, "step": 223 }, { "completion_length": 73.109375, "epoch": 0.14442295293359123, "grad_norm": 11.976152420043945, "kl": 0.137939453125, "learning_rate": 9.277885235332044e-07, "loss": 0.0055, "reward": 1.5722563862800598, "reward_std": 0.13132376223802567, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.572256326675415, "step": 224 }, { "completion_length": 78.484375, "epoch": 0.1450676982591876, "grad_norm": 9.927757263183594, "kl": 0.130859375, "learning_rate": 9.274661508704061e-07, "loss": 0.0052, "reward": 1.5556007027626038, "reward_std": 0.154083713889122, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.555600643157959, "step": 225 }, { "completion_length": 79.609375, "epoch": 0.14571244358478402, "grad_norm": 13.89825439453125, "kl": 0.146484375, "learning_rate": 9.27143778207608e-07, "loss": 0.0059, "reward": 1.6433384418487549, "reward_std": 0.14611933380365372, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6433383822441101, "step": 226 }, { "completion_length": 89.265625, "epoch": 0.1463571889103804, "grad_norm": 11.447097778320312, "kl": 0.1240234375, "learning_rate": 9.268214055448098e-07, "loss": 0.005, "reward": 1.4545153975486755, "reward_std": 0.14470210671424866, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4545154720544815, "step": 227 }, { "completion_length": 97.6875, "epoch": 0.1470019342359768, "grad_norm": 17.05489730834961, "kl": 0.122314453125, "learning_rate": 9.264990328820116e-07, "loss": 0.0049, "reward": 1.570904791355133, "reward_std": 0.11560512334108353, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5709047913551331, "step": 228 }, { "completion_length": 86.703125, "epoch": 0.14764667956157318, "grad_norm": 13.748475074768066, "kl": 0.103271484375, "learning_rate": 9.261766602192134e-07, "loss": 0.0041, "reward": 1.6213688850402832, "reward_std": 0.09734684601426125, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6213689148426056, "step": 229 }, { "completion_length": 81.921875, "epoch": 0.14829142488716957, "grad_norm": 16.89826774597168, "kl": 0.13720703125, "learning_rate": 9.258542875564152e-07, "loss": 0.0055, "reward": 1.439545214176178, "reward_std": 0.1173011064529419, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4395451992750168, "step": 230 }, { "completion_length": 90.84375, "epoch": 0.14893617021276595, "grad_norm": 13.908186912536621, "kl": 0.09033203125, "learning_rate": 9.255319148936169e-07, "loss": 0.0036, "reward": 1.4615440964698792, "reward_std": 0.17866834998130798, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46154411137104034, "step": 231 }, { "completion_length": 93.265625, "epoch": 0.14958091553836234, "grad_norm": 12.506692886352539, "kl": 0.09814453125, "learning_rate": 9.252095422308189e-07, "loss": 0.0039, "reward": 1.508992850780487, "reward_std": 0.10917293652892113, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5089928805828094, "step": 232 }, { "completion_length": 99.0625, "epoch": 0.15022566086395875, "grad_norm": 28.037565231323242, "kl": 0.12109375, "learning_rate": 9.248871695680206e-07, "loss": 0.0048, "reward": 1.5570703744888306, "reward_std": 0.13100450485944748, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5570703744888306, "step": 233 }, { "completion_length": 98.75, "epoch": 0.15087040618955513, "grad_norm": 18.369422912597656, "kl": 0.13671875, "learning_rate": 9.245647969052224e-07, "loss": 0.0055, "reward": 1.5361040830612183, "reward_std": 0.14577998965978622, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5361041128635406, "step": 234 }, { "completion_length": 102.828125, "epoch": 0.15151515151515152, "grad_norm": 22.57931137084961, "kl": 0.110595703125, "learning_rate": 9.242424242424241e-07, "loss": 0.0044, "reward": 1.6102051734924316, "reward_std": 0.13294027745723724, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6102051734924316, "step": 235 }, { "completion_length": 99.640625, "epoch": 0.1521598968407479, "grad_norm": 6.480432510375977, "kl": 0.106689453125, "learning_rate": 9.239200515796261e-07, "loss": 0.0043, "reward": 1.5354961156845093, "reward_std": 0.12877590954303741, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5354961007833481, "step": 236 }, { "completion_length": 104.828125, "epoch": 0.1528046421663443, "grad_norm": 9.36783504486084, "kl": 0.10791015625, "learning_rate": 9.235976789168278e-07, "loss": 0.0043, "reward": 1.506102979183197, "reward_std": 0.14209586381912231, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5061030089855194, "step": 237 }, { "completion_length": 101.828125, "epoch": 0.15344938749194068, "grad_norm": 12.477219581604004, "kl": 0.099853515625, "learning_rate": 9.232753062540296e-07, "loss": 0.004, "reward": 1.4833937287330627, "reward_std": 0.1376461237668991, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4833938032388687, "step": 238 }, { "completion_length": 98.625, "epoch": 0.15409413281753706, "grad_norm": 24.562990188598633, "kl": 0.1083984375, "learning_rate": 9.229529335912314e-07, "loss": 0.0043, "reward": 1.4200094938278198, "reward_std": 0.1750444434583187, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42000944912433624, "step": 239 }, { "completion_length": 106.125, "epoch": 0.15473887814313347, "grad_norm": 10.694780349731445, "kl": 0.1337890625, "learning_rate": 9.226305609284333e-07, "loss": 0.0054, "reward": 1.5045281648635864, "reward_std": 0.12367697432637215, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5045281946659088, "step": 240 }, { "completion_length": 100.09375, "epoch": 0.15538362346872986, "grad_norm": 14.116943359375, "kl": 0.108642578125, "learning_rate": 9.22308188265635e-07, "loss": 0.0043, "reward": 1.593424916267395, "reward_std": 0.1556602418422699, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5934248268604279, "step": 241 }, { "completion_length": 105.921875, "epoch": 0.15602836879432624, "grad_norm": 17.899860382080078, "kl": 0.12109375, "learning_rate": 9.219858156028369e-07, "loss": 0.0048, "reward": 1.5431109070777893, "reward_std": 0.1369476467370987, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5431108772754669, "step": 242 }, { "completion_length": 113.890625, "epoch": 0.15667311411992263, "grad_norm": 13.18708610534668, "kl": 0.09130859375, "learning_rate": 9.216634429400386e-07, "loss": 0.0037, "reward": 1.470144808292389, "reward_std": 0.13966727256774902, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47014474868774414, "step": 243 }, { "completion_length": 99.796875, "epoch": 0.15731785944551901, "grad_norm": 7.642358303070068, "kl": 0.158203125, "learning_rate": 9.213410702772404e-07, "loss": 0.0063, "reward": 1.3841314911842346, "reward_std": 0.11086063086986542, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.38413146138191223, "step": 244 }, { "completion_length": 108.578125, "epoch": 0.1579626047711154, "grad_norm": 16.818422317504883, "kl": 0.09423828125, "learning_rate": 9.210186976144423e-07, "loss": 0.0038, "reward": 1.4686157703399658, "reward_std": 0.09336278215050697, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4686158001422882, "step": 245 }, { "completion_length": 103.609375, "epoch": 0.15860735009671179, "grad_norm": 21.795143127441406, "kl": 0.1044921875, "learning_rate": 9.206963249516441e-07, "loss": 0.0042, "reward": 1.3981711864471436, "reward_std": 0.20313138514757156, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39817118644714355, "step": 246 }, { "completion_length": 109.046875, "epoch": 0.1592520954223082, "grad_norm": 17.059049606323242, "kl": 0.09765625, "learning_rate": 9.203739522888458e-07, "loss": 0.0039, "reward": 1.5794776678085327, "reward_std": 0.16649839282035828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5794776380062103, "step": 247 }, { "completion_length": 99.296875, "epoch": 0.15989684074790458, "grad_norm": 12.510836601257324, "kl": 0.1064453125, "learning_rate": 9.200515796260477e-07, "loss": 0.0043, "reward": 1.6266114115715027, "reward_std": 0.16728466749191284, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6266114115715027, "step": 248 }, { "completion_length": 95.390625, "epoch": 0.16054158607350097, "grad_norm": 21.999820709228516, "kl": 0.10498046875, "learning_rate": 9.197292069632495e-07, "loss": 0.0042, "reward": 1.7057658433914185, "reward_std": 0.15626183152198792, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7057657837867737, "step": 249 }, { "completion_length": 89.6875, "epoch": 0.16118633139909735, "grad_norm": 12.472075462341309, "kl": 0.094482421875, "learning_rate": 9.194068343004513e-07, "loss": 0.0038, "reward": 1.5567827224731445, "reward_std": 0.1044660434126854, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5567827373743057, "step": 250 }, { "completion_length": 99.546875, "epoch": 0.16183107672469374, "grad_norm": 10.50654125213623, "kl": 0.09619140625, "learning_rate": 9.19084461637653e-07, "loss": 0.0038, "reward": 1.5309272408485413, "reward_std": 0.12022281438112259, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5309272557497025, "step": 251 }, { "completion_length": 113.09375, "epoch": 0.16247582205029013, "grad_norm": 9.103067398071289, "kl": 0.0927734375, "learning_rate": 9.187620889748549e-07, "loss": 0.0037, "reward": 1.5212771892547607, "reward_std": 0.13080750405788422, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5212771892547607, "step": 252 }, { "completion_length": 101.203125, "epoch": 0.16312056737588654, "grad_norm": 11.426836013793945, "kl": 0.105224609375, "learning_rate": 9.184397163120567e-07, "loss": 0.0042, "reward": 1.569593608379364, "reward_std": 0.14871494472026825, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5695934891700745, "step": 253 }, { "completion_length": 98.203125, "epoch": 0.16376531270148292, "grad_norm": 9.3218355178833, "kl": 0.090576171875, "learning_rate": 9.181173436492586e-07, "loss": 0.0036, "reward": 1.530907690525055, "reward_std": 0.0898456946015358, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5309076905250549, "step": 254 }, { "completion_length": 100.234375, "epoch": 0.1644100580270793, "grad_norm": 12.731607437133789, "kl": 0.076171875, "learning_rate": 9.177949709864603e-07, "loss": 0.0031, "reward": 1.4146405458450317, "reward_std": 0.11831414513289928, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41464051604270935, "step": 255 }, { "completion_length": 93.109375, "epoch": 0.1650548033526757, "grad_norm": 17.456554412841797, "kl": 0.098876953125, "learning_rate": 9.174725983236621e-07, "loss": 0.004, "reward": 1.5557602643966675, "reward_std": 0.11968041211366653, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5557602941989899, "step": 256 }, { "completion_length": 100.140625, "epoch": 0.16569954867827208, "grad_norm": 12.351179122924805, "kl": 0.109375, "learning_rate": 9.171502256608638e-07, "loss": 0.0044, "reward": 1.5976706147193909, "reward_std": 0.1560915857553482, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5976706445217133, "step": 257 }, { "completion_length": 101.890625, "epoch": 0.16634429400386846, "grad_norm": 12.943246841430664, "kl": 0.089599609375, "learning_rate": 9.168278529980658e-07, "loss": 0.0036, "reward": 1.3898320198059082, "reward_std": 0.12752876058220863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.389832004904747, "step": 258 }, { "completion_length": 92.40625, "epoch": 0.16698903932946485, "grad_norm": 18.802291870117188, "kl": 0.094970703125, "learning_rate": 9.165054803352675e-07, "loss": 0.0038, "reward": 1.5527189373970032, "reward_std": 0.1755797415971756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.552718997001648, "step": 259 }, { "completion_length": 92.9375, "epoch": 0.16763378465506126, "grad_norm": 22.909547805786133, "kl": 0.1005859375, "learning_rate": 9.161831076724693e-07, "loss": 0.004, "reward": 1.3774397373199463, "reward_std": 0.17273323237895966, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3774396479129791, "step": 260 }, { "completion_length": 94.609375, "epoch": 0.16827852998065765, "grad_norm": 9.854948997497559, "kl": 0.088623046875, "learning_rate": 9.158607350096711e-07, "loss": 0.0035, "reward": 1.5603925585746765, "reward_std": 0.14747831225395203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5603925287723541, "step": 261 }, { "completion_length": 104.125, "epoch": 0.16892327530625403, "grad_norm": 12.83651351928711, "kl": 0.088134765625, "learning_rate": 9.15538362346873e-07, "loss": 0.0035, "reward": 1.5412468910217285, "reward_std": 0.14012236148118973, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5412469208240509, "step": 262 }, { "completion_length": 97.625, "epoch": 0.16956802063185042, "grad_norm": 11.614677429199219, "kl": 0.095703125, "learning_rate": 9.152159896840747e-07, "loss": 0.0038, "reward": 1.3001367449760437, "reward_std": 0.1370644010603428, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3001367151737213, "step": 263 }, { "completion_length": 97.625, "epoch": 0.1702127659574468, "grad_norm": 10.85827350616455, "kl": 0.107666015625, "learning_rate": 9.148936170212766e-07, "loss": 0.0043, "reward": 1.5808395147323608, "reward_std": 0.1318642795085907, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5808394998311996, "step": 264 }, { "completion_length": 94.0, "epoch": 0.1708575112830432, "grad_norm": 19.177623748779297, "kl": 0.0869140625, "learning_rate": 9.145712443584783e-07, "loss": 0.0035, "reward": 1.428877592086792, "reward_std": 0.1186145581305027, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.428877592086792, "step": 265 }, { "completion_length": 101.171875, "epoch": 0.17150225660863957, "grad_norm": 14.279739379882812, "kl": 0.1953125, "learning_rate": 9.142488716956802e-07, "loss": 0.0078, "reward": 1.5931092500686646, "reward_std": 0.1707104742527008, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5931092202663422, "step": 266 }, { "completion_length": 101.0, "epoch": 0.172147001934236, "grad_norm": 11.718372344970703, "kl": 0.085693359375, "learning_rate": 9.13926499032882e-07, "loss": 0.0034, "reward": 1.5582287907600403, "reward_std": 0.10395102947950363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5582287758588791, "step": 267 }, { "completion_length": 99.078125, "epoch": 0.17279174725983237, "grad_norm": 11.058600425720215, "kl": 0.092529296875, "learning_rate": 9.136041263700838e-07, "loss": 0.0037, "reward": 1.4419950246810913, "reward_std": 0.1245020180940628, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4419950246810913, "step": 268 }, { "completion_length": 98.3125, "epoch": 0.17343649258542876, "grad_norm": 18.12723731994629, "kl": 0.15478515625, "learning_rate": 9.132817537072855e-07, "loss": 0.0062, "reward": 1.470383882522583, "reward_std": 0.21054161339998245, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4703838974237442, "step": 269 }, { "completion_length": 104.53125, "epoch": 0.17408123791102514, "grad_norm": 10.150888442993164, "kl": 0.101806640625, "learning_rate": 9.129593810444874e-07, "loss": 0.0041, "reward": 1.4431976079940796, "reward_std": 0.20065002888441086, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4431975781917572, "step": 270 }, { "completion_length": 109.90625, "epoch": 0.17472598323662153, "grad_norm": 7.890026569366455, "kl": 0.082275390625, "learning_rate": 9.126370083816892e-07, "loss": 0.0033, "reward": 1.4148862957954407, "reward_std": 0.08163053542375565, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4148862734436989, "step": 271 }, { "completion_length": 95.640625, "epoch": 0.17537072856221791, "grad_norm": 10.980069160461426, "kl": 0.099853515625, "learning_rate": 9.12314635718891e-07, "loss": 0.004, "reward": 1.4476662278175354, "reward_std": 0.17421916499733925, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.447666272521019, "step": 272 }, { "completion_length": 107.625, "epoch": 0.1760154738878143, "grad_norm": 20.894821166992188, "kl": 0.0830078125, "learning_rate": 9.119922630560928e-07, "loss": 0.0033, "reward": 1.5147703289985657, "reward_std": 0.11893308907747269, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5147703588008881, "step": 273 }, { "completion_length": 97.5625, "epoch": 0.1766602192134107, "grad_norm": 17.869699478149414, "kl": 0.1171875, "learning_rate": 9.116698903932946e-07, "loss": 0.0047, "reward": 1.5422923564910889, "reward_std": 0.15741287916898727, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5422923564910889, "step": 274 }, { "completion_length": 95.734375, "epoch": 0.1773049645390071, "grad_norm": 15.127033233642578, "kl": 0.11328125, "learning_rate": 9.113475177304964e-07, "loss": 0.0045, "reward": 1.4402891397476196, "reward_std": 0.12576143443584442, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44028913974761963, "step": 275 }, { "completion_length": 116.8125, "epoch": 0.17794970986460348, "grad_norm": 31.529743194580078, "kl": 0.087890625, "learning_rate": 9.110251450676983e-07, "loss": 0.0035, "reward": 1.5826689004898071, "reward_std": 0.17237188667058945, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5826689004898071, "step": 276 }, { "completion_length": 99.359375, "epoch": 0.17859445519019987, "grad_norm": 24.593116760253906, "kl": 0.09521484375, "learning_rate": 9.107027724049e-07, "loss": 0.0038, "reward": 1.4288339614868164, "reward_std": 0.13205820694565773, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42883390188217163, "step": 277 }, { "completion_length": 108.5, "epoch": 0.17923920051579625, "grad_norm": 16.80621337890625, "kl": 0.090087890625, "learning_rate": 9.103803997421018e-07, "loss": 0.0036, "reward": 1.4728108644485474, "reward_std": 0.14833205938339233, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4728108048439026, "step": 278 }, { "completion_length": 99.421875, "epoch": 0.17988394584139264, "grad_norm": 11.30988597869873, "kl": 0.095947265625, "learning_rate": 9.100580270793036e-07, "loss": 0.0038, "reward": 1.5478020310401917, "reward_std": 0.10750558972358704, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5478019714355469, "step": 279 }, { "completion_length": 111.9375, "epoch": 0.18052869116698905, "grad_norm": 18.844890594482422, "kl": 0.091552734375, "learning_rate": 9.097356544165055e-07, "loss": 0.0037, "reward": 1.5706157684326172, "reward_std": 0.13911627978086472, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5706157982349396, "step": 280 }, { "completion_length": 115.703125, "epoch": 0.18117343649258544, "grad_norm": 58.691158294677734, "kl": 0.105712890625, "learning_rate": 9.094132817537072e-07, "loss": 0.0042, "reward": 1.4865358471870422, "reward_std": 0.11797124892473221, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48653586208820343, "step": 281 }, { "completion_length": 105.0625, "epoch": 0.18181818181818182, "grad_norm": 21.599842071533203, "kl": 0.096435546875, "learning_rate": 9.09090909090909e-07, "loss": 0.0039, "reward": 1.439337968826294, "reward_std": 0.13826286792755127, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43933798372745514, "step": 282 }, { "completion_length": 111.734375, "epoch": 0.1824629271437782, "grad_norm": 9.935829162597656, "kl": 0.08740234375, "learning_rate": 9.087685364281109e-07, "loss": 0.0035, "reward": 1.5908172130584717, "reward_std": 0.11852175369858742, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5908172726631165, "step": 283 }, { "completion_length": 110.484375, "epoch": 0.1831076724693746, "grad_norm": 10.622421264648438, "kl": 0.10888671875, "learning_rate": 9.084461637653127e-07, "loss": 0.0044, "reward": 1.4755669236183167, "reward_std": 0.12218554317951202, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47556689381599426, "step": 284 }, { "completion_length": 115.5625, "epoch": 0.18375241779497098, "grad_norm": 6.261775493621826, "kl": 0.090576171875, "learning_rate": 9.081237911025144e-07, "loss": 0.0036, "reward": 1.5844565033912659, "reward_std": 0.09432502463459969, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5844565629959106, "step": 285 }, { "completion_length": 109.703125, "epoch": 0.18439716312056736, "grad_norm": 15.250070571899414, "kl": 0.109375, "learning_rate": 9.078014184397163e-07, "loss": 0.0044, "reward": 1.5829523801803589, "reward_std": 0.15015285462141037, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5829523801803589, "step": 286 }, { "completion_length": 108.5, "epoch": 0.18504190844616378, "grad_norm": 8.247808456420898, "kl": 0.087890625, "learning_rate": 9.07479045776918e-07, "loss": 0.0035, "reward": 1.5334210991859436, "reward_std": 0.15494172275066376, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5334210991859436, "step": 287 }, { "completion_length": 111.765625, "epoch": 0.18568665377176016, "grad_norm": 45.70930099487305, "kl": 0.227783203125, "learning_rate": 9.071566731141199e-07, "loss": 0.0091, "reward": 1.478054404258728, "reward_std": 0.1481890268623829, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4780544489622116, "step": 288 }, { "completion_length": 105.546875, "epoch": 0.18633139909735655, "grad_norm": 11.30000114440918, "kl": 0.1005859375, "learning_rate": 9.068343004513217e-07, "loss": 0.004, "reward": 1.5877296924591064, "reward_std": 0.1937682181596756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5877296328544617, "step": 289 }, { "completion_length": 103.25, "epoch": 0.18697614442295293, "grad_norm": 10.429908752441406, "kl": 0.166259765625, "learning_rate": 9.065119277885235e-07, "loss": 0.0067, "reward": 1.6546611785888672, "reward_std": 0.15019819140434265, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6546612083911896, "step": 290 }, { "completion_length": 101.984375, "epoch": 0.18762088974854932, "grad_norm": 21.050172805786133, "kl": 0.1064453125, "learning_rate": 9.061895551257252e-07, "loss": 0.0043, "reward": 1.5703359842300415, "reward_std": 0.1420222818851471, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5703359246253967, "step": 291 }, { "completion_length": 95.890625, "epoch": 0.1882656350741457, "grad_norm": 9.279730796813965, "kl": 0.11181640625, "learning_rate": 9.058671824629272e-07, "loss": 0.0045, "reward": 1.480491280555725, "reward_std": 0.11640884727239609, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4804912358522415, "step": 292 }, { "completion_length": 104.796875, "epoch": 0.1889103803997421, "grad_norm": 13.045880317687988, "kl": 0.099365234375, "learning_rate": 9.055448098001289e-07, "loss": 0.004, "reward": 1.54798024892807, "reward_std": 0.15294693410396576, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5479801893234253, "step": 293 }, { "completion_length": 93.859375, "epoch": 0.1895551257253385, "grad_norm": 9.018101692199707, "kl": 0.104736328125, "learning_rate": 9.052224371373307e-07, "loss": 0.0042, "reward": 1.5038177371025085, "reward_std": 0.10080670937895775, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5038177520036697, "step": 294 }, { "completion_length": 107.171875, "epoch": 0.1901998710509349, "grad_norm": 8.838314056396484, "kl": 0.0869140625, "learning_rate": 9.049000644745325e-07, "loss": 0.0035, "reward": 1.5588614344596863, "reward_std": 0.11014912649989128, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5588614046573639, "step": 295 }, { "completion_length": 99.875, "epoch": 0.19084461637653127, "grad_norm": 12.840435981750488, "kl": 0.097900390625, "learning_rate": 9.045776918117344e-07, "loss": 0.0039, "reward": 1.5545161366462708, "reward_std": 0.12117831408977509, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5545161068439484, "step": 296 }, { "completion_length": 97.921875, "epoch": 0.19148936170212766, "grad_norm": 14.086722373962402, "kl": 0.095947265625, "learning_rate": 9.042553191489361e-07, "loss": 0.0038, "reward": 1.6154414415359497, "reward_std": 0.14917904138565063, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6154414117336273, "step": 297 }, { "completion_length": 100.515625, "epoch": 0.19213410702772404, "grad_norm": 13.001978874206543, "kl": 0.216064453125, "learning_rate": 9.03932946486138e-07, "loss": 0.0086, "reward": 1.6394542455673218, "reward_std": 0.17497223615646362, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6394542753696442, "step": 298 }, { "completion_length": 104.65625, "epoch": 0.19277885235332043, "grad_norm": 16.04650115966797, "kl": 0.09130859375, "learning_rate": 9.036105738233397e-07, "loss": 0.0037, "reward": 1.4991700649261475, "reward_std": 0.24890705943107605, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5147949606180191, "step": 299 }, { "completion_length": 104.203125, "epoch": 0.19342359767891681, "grad_norm": 19.988811492919922, "kl": 0.095458984375, "learning_rate": 9.032882011605415e-07, "loss": 0.0038, "reward": 1.547214686870575, "reward_std": 0.11229012906551361, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5472146570682526, "step": 300 }, { "completion_length": 97.921875, "epoch": 0.19406834300451323, "grad_norm": 15.380250930786133, "kl": 0.0888671875, "learning_rate": 9.029658284977433e-07, "loss": 0.0035, "reward": 1.6846535801887512, "reward_std": 0.11254275217652321, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6846535801887512, "step": 301 }, { "completion_length": 101.359375, "epoch": 0.1947130883301096, "grad_norm": 11.956006050109863, "kl": 0.091796875, "learning_rate": 9.026434558349452e-07, "loss": 0.0037, "reward": 1.584118127822876, "reward_std": 0.12083565443754196, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5841181576251984, "step": 302 }, { "completion_length": 107.328125, "epoch": 0.195357833655706, "grad_norm": 8.037882804870605, "kl": 0.0869140625, "learning_rate": 9.023210831721469e-07, "loss": 0.0035, "reward": 1.5585249662399292, "reward_std": 0.11393819749355316, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.558524951338768, "step": 303 }, { "completion_length": 93.875, "epoch": 0.19600257898130238, "grad_norm": 13.471504211425781, "kl": 0.105712890625, "learning_rate": 9.019987105093487e-07, "loss": 0.0042, "reward": 1.5376924872398376, "reward_std": 0.18834134191274643, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5376925766468048, "step": 304 }, { "completion_length": 92.96875, "epoch": 0.19664732430689877, "grad_norm": 10.125955581665039, "kl": 0.114501953125, "learning_rate": 9.016763378465506e-07, "loss": 0.0046, "reward": 1.5640528798103333, "reward_std": 0.14177117496728897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.564052939414978, "step": 305 }, { "completion_length": 98.96875, "epoch": 0.19729206963249515, "grad_norm": 9.222174644470215, "kl": 0.1259765625, "learning_rate": 9.013539651837524e-07, "loss": 0.005, "reward": 1.4195857048034668, "reward_std": 0.13147933036088943, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4195857346057892, "step": 306 }, { "completion_length": 87.921875, "epoch": 0.19793681495809157, "grad_norm": 11.84753704071045, "kl": 0.11865234375, "learning_rate": 9.010315925209541e-07, "loss": 0.0048, "reward": 1.5052205920219421, "reward_std": 0.15341130271553993, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5052205324172974, "step": 307 }, { "completion_length": 88.09375, "epoch": 0.19858156028368795, "grad_norm": 9.092905044555664, "kl": 0.113525390625, "learning_rate": 9.00709219858156e-07, "loss": 0.0045, "reward": 1.6300784349441528, "reward_std": 0.11801314726471901, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6300784051418304, "step": 308 }, { "completion_length": 89.203125, "epoch": 0.19922630560928434, "grad_norm": 13.911439895629883, "kl": 0.114990234375, "learning_rate": 9.003868471953578e-07, "loss": 0.0046, "reward": 1.4316323399543762, "reward_std": 0.19944678246974945, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4316323399543762, "step": 309 }, { "completion_length": 95.453125, "epoch": 0.19987105093488072, "grad_norm": 11.394854545593262, "kl": 0.097412109375, "learning_rate": 9.000644745325596e-07, "loss": 0.0039, "reward": 1.53744238615036, "reward_std": 0.15609167516231537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5374423861503601, "step": 310 }, { "completion_length": 90.5625, "epoch": 0.2005157962604771, "grad_norm": 43.32953643798828, "kl": 0.095458984375, "learning_rate": 8.997421018697614e-07, "loss": 0.0038, "reward": 1.5357195138931274, "reward_std": 0.14336901158094406, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5357194542884827, "step": 311 }, { "completion_length": 93.515625, "epoch": 0.2011605415860735, "grad_norm": 11.800814628601074, "kl": 0.0869140625, "learning_rate": 8.994197292069632e-07, "loss": 0.0035, "reward": 1.5805315971374512, "reward_std": 0.14307568967342377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5805316269397736, "step": 312 }, { "completion_length": 90.515625, "epoch": 0.20180528691166988, "grad_norm": 7.8137922286987305, "kl": 0.092529296875, "learning_rate": 8.990973565441649e-07, "loss": 0.0037, "reward": 1.6621150970458984, "reward_std": 0.09617417305707932, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6621150374412537, "step": 313 }, { "completion_length": 94.890625, "epoch": 0.2024500322372663, "grad_norm": 19.35686683654785, "kl": 0.28369140625, "learning_rate": 8.987749838813669e-07, "loss": 0.0113, "reward": 1.4318525791168213, "reward_std": 0.13407088071107864, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4318525493144989, "step": 314 }, { "completion_length": 96.921875, "epoch": 0.20309477756286268, "grad_norm": 9.18880558013916, "kl": 0.118408203125, "learning_rate": 8.984526112185686e-07, "loss": 0.0047, "reward": 1.4458122253417969, "reward_std": 0.11026818305253983, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4458121508359909, "step": 315 }, { "completion_length": 98.46875, "epoch": 0.20373952288845906, "grad_norm": 10.588865280151367, "kl": 0.10791015625, "learning_rate": 8.981302385557704e-07, "loss": 0.0043, "reward": 1.527498185634613, "reward_std": 0.13546320796012878, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5274982452392578, "step": 316 }, { "completion_length": 92.125, "epoch": 0.20438426821405545, "grad_norm": 57.36320877075195, "kl": 0.08984375, "learning_rate": 8.978078658929722e-07, "loss": 0.0036, "reward": 1.3975678086280823, "reward_std": 0.12371647357940674, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3975677937269211, "step": 317 }, { "completion_length": 91.84375, "epoch": 0.20502901353965183, "grad_norm": 10.51960563659668, "kl": 0.097900390625, "learning_rate": 8.974854932301741e-07, "loss": 0.0039, "reward": 1.5480912327766418, "reward_std": 0.06759429350495338, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5480912178754807, "step": 318 }, { "completion_length": 85.546875, "epoch": 0.20567375886524822, "grad_norm": 11.455018997192383, "kl": 0.11865234375, "learning_rate": 8.971631205673758e-07, "loss": 0.0047, "reward": 1.5190264582633972, "reward_std": 0.12998723611235619, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5190264880657196, "step": 319 }, { "completion_length": 88.8125, "epoch": 0.2063185041908446, "grad_norm": 11.84350299835205, "kl": 0.1015625, "learning_rate": 8.968407479045777e-07, "loss": 0.0041, "reward": 1.4973270297050476, "reward_std": 0.12015613168478012, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.497326985001564, "step": 320 }, { "completion_length": 91.34375, "epoch": 0.20696324951644102, "grad_norm": 16.877960205078125, "kl": 0.19580078125, "learning_rate": 8.965183752417794e-07, "loss": 0.0079, "reward": 1.4202344417572021, "reward_std": 0.12599121034145355, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4202345311641693, "step": 321 }, { "completion_length": 84.59375, "epoch": 0.2076079948420374, "grad_norm": 23.965744018554688, "kl": 0.1162109375, "learning_rate": 8.961960025789813e-07, "loss": 0.0047, "reward": 1.578860580921173, "reward_std": 0.10854113101959229, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5788605213165283, "step": 322 }, { "completion_length": 85.796875, "epoch": 0.2082527401676338, "grad_norm": 17.646039962768555, "kl": 0.104736328125, "learning_rate": 8.95873629916183e-07, "loss": 0.0042, "reward": 1.4003408551216125, "reward_std": 0.17851673811674118, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.40034082531929016, "step": 323 }, { "completion_length": 86.5625, "epoch": 0.20889748549323017, "grad_norm": 32.0064697265625, "kl": 0.09375, "learning_rate": 8.955512572533849e-07, "loss": 0.0038, "reward": 1.579477608203888, "reward_std": 0.1306827887892723, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5794776082038879, "step": 324 }, { "completion_length": 83.5625, "epoch": 0.20954223081882656, "grad_norm": 11.898215293884277, "kl": 0.099853515625, "learning_rate": 8.952288845905866e-07, "loss": 0.004, "reward": 1.4137381315231323, "reward_std": 0.09641918167471886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41373810172080994, "step": 325 }, { "completion_length": 83.96875, "epoch": 0.21018697614442294, "grad_norm": 11.983473777770996, "kl": 0.126220703125, "learning_rate": 8.949065119277884e-07, "loss": 0.0051, "reward": 1.5423098802566528, "reward_std": 0.16920984536409378, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5579348504543304, "step": 326 }, { "completion_length": 86.546875, "epoch": 0.21083172147001933, "grad_norm": 10.262345314025879, "kl": 0.111328125, "learning_rate": 8.945841392649903e-07, "loss": 0.0045, "reward": 1.535430669784546, "reward_std": 0.1486879587173462, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5354305803775787, "step": 327 }, { "completion_length": 82.140625, "epoch": 0.21147646679561574, "grad_norm": 21.612335205078125, "kl": 0.10498046875, "learning_rate": 8.942617666021921e-07, "loss": 0.0042, "reward": 1.548956573009491, "reward_std": 0.09943357855081558, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.548956573009491, "step": 328 }, { "completion_length": 81.609375, "epoch": 0.21212121212121213, "grad_norm": 16.632413864135742, "kl": 0.115234375, "learning_rate": 8.939393939393938e-07, "loss": 0.0046, "reward": 1.5397294163703918, "reward_std": 0.16385426372289658, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5397294163703918, "step": 329 }, { "completion_length": 81.546875, "epoch": 0.2127659574468085, "grad_norm": 20.90030860900879, "kl": 0.13818359375, "learning_rate": 8.936170212765957e-07, "loss": 0.0055, "reward": 1.507832646369934, "reward_std": 0.16199873387813568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5078326165676117, "step": 330 }, { "completion_length": 88.640625, "epoch": 0.2134107027724049, "grad_norm": 13.776839256286621, "kl": 0.107421875, "learning_rate": 8.932946486137975e-07, "loss": 0.0043, "reward": 1.618999719619751, "reward_std": 0.1200912743806839, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6189996898174286, "step": 331 }, { "completion_length": 83.625, "epoch": 0.21405544809800128, "grad_norm": 43.87213134765625, "kl": 0.11083984375, "learning_rate": 8.929722759509993e-07, "loss": 0.0044, "reward": 1.6609066128730774, "reward_std": 0.152876615524292, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.660906583070755, "step": 332 }, { "completion_length": 92.03125, "epoch": 0.21470019342359767, "grad_norm": 13.790566444396973, "kl": 0.12109375, "learning_rate": 8.926499032882011e-07, "loss": 0.0048, "reward": 1.4482386112213135, "reward_std": 0.14547474682331085, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4482386112213135, "step": 333 }, { "completion_length": 84.859375, "epoch": 0.21534493874919408, "grad_norm": 14.70999813079834, "kl": 0.12548828125, "learning_rate": 8.923275306254029e-07, "loss": 0.005, "reward": 1.565877914428711, "reward_std": 0.1912159025669098, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5971279144287109, "step": 334 }, { "completion_length": 87.53125, "epoch": 0.21598968407479047, "grad_norm": 26.293325424194336, "kl": 0.115478515625, "learning_rate": 8.920051579626047e-07, "loss": 0.0046, "reward": 1.5629397630691528, "reward_std": 0.19451142847537994, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5785647630691528, "step": 335 }, { "completion_length": 87.296875, "epoch": 0.21663442940038685, "grad_norm": 20.95355796813965, "kl": 0.122314453125, "learning_rate": 8.916827852998066e-07, "loss": 0.0049, "reward": 1.6097609996795654, "reward_std": 0.14318327605724335, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6097610294818878, "step": 336 }, { "completion_length": 83.875, "epoch": 0.21727917472598324, "grad_norm": 19.546236038208008, "kl": 0.117431640625, "learning_rate": 8.913604126370083e-07, "loss": 0.0047, "reward": 1.6258500814437866, "reward_std": 0.09713875502347946, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6258500814437866, "step": 337 }, { "completion_length": 86.09375, "epoch": 0.21792392005157962, "grad_norm": 31.88779067993164, "kl": 0.108642578125, "learning_rate": 8.910380399742101e-07, "loss": 0.0043, "reward": 1.4809694290161133, "reward_std": 0.12653079256415367, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48096954822540283, "step": 338 }, { "completion_length": 88.796875, "epoch": 0.218568665377176, "grad_norm": 13.355061531066895, "kl": 0.12353515625, "learning_rate": 8.907156673114119e-07, "loss": 0.0049, "reward": 1.510122537612915, "reward_std": 0.180454570800066, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.525747537612915, "step": 339 }, { "completion_length": 88.578125, "epoch": 0.2192134107027724, "grad_norm": 12.257147789001465, "kl": 0.135009765625, "learning_rate": 8.903932946486138e-07, "loss": 0.0054, "reward": 1.6074494123458862, "reward_std": 0.1379043161869049, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6074493527412415, "step": 340 }, { "completion_length": 96.953125, "epoch": 0.2198581560283688, "grad_norm": 7.543994426727295, "kl": 0.11181640625, "learning_rate": 8.900709219858155e-07, "loss": 0.0045, "reward": 1.6192185282707214, "reward_std": 0.10273057222366333, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6192185282707214, "step": 341 }, { "completion_length": 93.640625, "epoch": 0.2205029013539652, "grad_norm": 16.90849494934082, "kl": 0.09912109375, "learning_rate": 8.897485493230174e-07, "loss": 0.004, "reward": 1.464130938053131, "reward_std": 0.185988649725914, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4641309380531311, "step": 342 }, { "completion_length": 87.296875, "epoch": 0.22114764667956158, "grad_norm": 10.599559783935547, "kl": 0.118896484375, "learning_rate": 8.894261766602191e-07, "loss": 0.0047, "reward": 1.5260052680969238, "reward_std": 0.14786972850561142, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5260052978992462, "step": 343 }, { "completion_length": 94.296875, "epoch": 0.22179239200515796, "grad_norm": 12.245553970336914, "kl": 0.11279296875, "learning_rate": 8.89103803997421e-07, "loss": 0.0045, "reward": 1.62826669216156, "reward_std": 0.11454515159130096, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6282666921615601, "step": 344 }, { "completion_length": 91.0, "epoch": 0.22243713733075435, "grad_norm": 16.552492141723633, "kl": 0.11572265625, "learning_rate": 8.887814313346229e-07, "loss": 0.0046, "reward": 1.4791878461837769, "reward_std": 0.13600347191095352, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47918783128261566, "step": 345 }, { "completion_length": 99.8125, "epoch": 0.22308188265635073, "grad_norm": 8.797539710998535, "kl": 0.1162109375, "learning_rate": 8.884590586718246e-07, "loss": 0.0046, "reward": 1.4839099049568176, "reward_std": 0.12259767949581146, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4839099049568176, "step": 346 }, { "completion_length": 94.953125, "epoch": 0.22372662798194712, "grad_norm": 20.40011978149414, "kl": 0.121337890625, "learning_rate": 8.881366860090263e-07, "loss": 0.0048, "reward": 1.41604483127594, "reward_std": 0.19619542360305786, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4160449057817459, "step": 347 }, { "completion_length": 99.390625, "epoch": 0.22437137330754353, "grad_norm": 10.037973403930664, "kl": 0.16357421875, "learning_rate": 8.878143133462282e-07, "loss": 0.0065, "reward": 1.6190119981765747, "reward_std": 0.10121146589517593, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6190120130777359, "step": 348 }, { "completion_length": 100.984375, "epoch": 0.22501611863313992, "grad_norm": 35.19863510131836, "kl": 0.1494140625, "learning_rate": 8.874919406834301e-07, "loss": 0.006, "reward": 1.537758231163025, "reward_std": 0.13214527815580368, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5377582460641861, "step": 349 }, { "completion_length": 110.046875, "epoch": 0.2256608639587363, "grad_norm": 42.346046447753906, "kl": 0.12353515625, "learning_rate": 8.871695680206318e-07, "loss": 0.0049, "reward": 1.5095826387405396, "reward_std": 0.11873700842261314, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5095826536417007, "step": 350 }, { "completion_length": 99.984375, "epoch": 0.2263056092843327, "grad_norm": 14.766871452331543, "kl": 0.117431640625, "learning_rate": 8.868471953578336e-07, "loss": 0.0047, "reward": 1.5179364085197449, "reward_std": 0.13833826035261154, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5179364234209061, "step": 351 }, { "completion_length": 111.25, "epoch": 0.22695035460992907, "grad_norm": 10.721341133117676, "kl": 0.117431640625, "learning_rate": 8.865248226950354e-07, "loss": 0.0047, "reward": 1.4805157780647278, "reward_std": 0.10306541621685028, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4805157780647278, "step": 352 }, { "completion_length": 116.0625, "epoch": 0.22759509993552546, "grad_norm": 20.140018463134766, "kl": 0.11474609375, "learning_rate": 8.862024500322373e-07, "loss": 0.0046, "reward": 1.5178296566009521, "reward_std": 0.1821778416633606, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5334546715021133, "step": 353 }, { "completion_length": 93.765625, "epoch": 0.22823984526112184, "grad_norm": 19.487287521362305, "kl": 0.13232421875, "learning_rate": 8.85880077369439e-07, "loss": 0.0053, "reward": 1.6278626918792725, "reward_std": 0.20768529176712036, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6434876918792725, "step": 354 }, { "completion_length": 99.484375, "epoch": 0.22888459058671826, "grad_norm": 10.036696434020996, "kl": 0.16455078125, "learning_rate": 8.855577047066409e-07, "loss": 0.0066, "reward": 1.4639315605163574, "reward_std": 0.11769643425941467, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4639315754175186, "step": 355 }, { "completion_length": 97.75, "epoch": 0.22952933591231464, "grad_norm": 9.284832954406738, "kl": 0.13671875, "learning_rate": 8.852353320438426e-07, "loss": 0.0055, "reward": 1.588053584098816, "reward_std": 0.17258089780807495, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5880535244941711, "step": 356 }, { "completion_length": 102.859375, "epoch": 0.23017408123791103, "grad_norm": 17.785030364990234, "kl": 0.158203125, "learning_rate": 8.849129593810445e-07, "loss": 0.0063, "reward": 1.5744596719741821, "reward_std": 0.11283202096819878, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5744596570730209, "step": 357 }, { "completion_length": 112.25, "epoch": 0.2308188265635074, "grad_norm": 8.444705963134766, "kl": 0.109375, "learning_rate": 8.845905867182463e-07, "loss": 0.0044, "reward": 1.6061638593673706, "reward_std": 0.15277552604675293, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6061639040708542, "step": 358 }, { "completion_length": 116.953125, "epoch": 0.2314635718891038, "grad_norm": 91.60777282714844, "kl": 0.1181640625, "learning_rate": 8.842682140554481e-07, "loss": 0.0047, "reward": 1.6188980340957642, "reward_std": 0.18260718137025833, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6345230638980865, "step": 359 }, { "completion_length": 110.375, "epoch": 0.23210831721470018, "grad_norm": 8.044037818908691, "kl": 0.129150390625, "learning_rate": 8.839458413926498e-07, "loss": 0.0052, "reward": 1.4037490487098694, "reward_std": 0.1555328145623207, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.40374905616045, "step": 360 }, { "completion_length": 109.671875, "epoch": 0.2327530625402966, "grad_norm": 10.870550155639648, "kl": 0.134033203125, "learning_rate": 8.836234687298518e-07, "loss": 0.0054, "reward": 1.46833074092865, "reward_std": 0.13861528784036636, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4683307707309723, "step": 361 }, { "completion_length": 100.71875, "epoch": 0.23339780786589298, "grad_norm": 22.924345016479492, "kl": 0.1484375, "learning_rate": 8.833010960670535e-07, "loss": 0.0059, "reward": 1.3982949256896973, "reward_std": 0.19406422972679138, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3982948958873749, "step": 362 }, { "completion_length": 115.578125, "epoch": 0.23404255319148937, "grad_norm": 10.494964599609375, "kl": 0.14404296875, "learning_rate": 8.829787234042553e-07, "loss": 0.0058, "reward": 1.4072777032852173, "reward_std": 0.126494862139225, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4229026734828949, "step": 363 }, { "completion_length": 108.265625, "epoch": 0.23468729851708575, "grad_norm": 21.195520401000977, "kl": 0.1591796875, "learning_rate": 8.82656350741457e-07, "loss": 0.0063, "reward": 1.6603394746780396, "reward_std": 0.18765471875667572, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6603395640850067, "step": 364 }, { "completion_length": 101.9375, "epoch": 0.23533204384268214, "grad_norm": 13.110071182250977, "kl": 0.13330078125, "learning_rate": 8.823339780786589e-07, "loss": 0.0053, "reward": 1.6690788865089417, "reward_std": 0.13631463050842285, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6690788567066193, "step": 365 }, { "completion_length": 120.15625, "epoch": 0.23597678916827852, "grad_norm": 10.89026165008545, "kl": 0.112060546875, "learning_rate": 8.820116054158607e-07, "loss": 0.0045, "reward": 1.3980477452278137, "reward_std": 0.19040542095899582, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39804771542549133, "step": 366 }, { "completion_length": 111.578125, "epoch": 0.2366215344938749, "grad_norm": 37.69853210449219, "kl": 0.144775390625, "learning_rate": 8.816892327530626e-07, "loss": 0.0058, "reward": 1.5169401168823242, "reward_std": 0.1048840656876564, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5169401168823242, "step": 367 }, { "completion_length": 127.703125, "epoch": 0.23726627981947132, "grad_norm": 8.726611137390137, "kl": 0.110107421875, "learning_rate": 8.813668600902643e-07, "loss": 0.0044, "reward": 1.6172541975975037, "reward_std": 0.15612297505140305, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6172541975975037, "step": 368 }, { "completion_length": 127.125, "epoch": 0.2379110251450677, "grad_norm": 9.400821685791016, "kl": 0.14892578125, "learning_rate": 8.810444874274661e-07, "loss": 0.006, "reward": 1.5110276341438293, "reward_std": 0.20710352808237076, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5266527086496353, "step": 369 }, { "completion_length": 116.203125, "epoch": 0.2385557704706641, "grad_norm": 10.621955871582031, "kl": 0.11181640625, "learning_rate": 8.807221147646679e-07, "loss": 0.0045, "reward": 1.5545547008514404, "reward_std": 0.1288330778479576, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5545547008514404, "step": 370 }, { "completion_length": 111.578125, "epoch": 0.23920051579626048, "grad_norm": 8.831900596618652, "kl": 0.122314453125, "learning_rate": 8.803997421018698e-07, "loss": 0.0049, "reward": 1.5220544338226318, "reward_std": 0.12251096218824387, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5220544338226318, "step": 371 }, { "completion_length": 122.109375, "epoch": 0.23984526112185686, "grad_norm": 25.54118537902832, "kl": 0.1123046875, "learning_rate": 8.800773694390715e-07, "loss": 0.0045, "reward": 1.5144230723381042, "reward_std": 0.17132049053907394, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.514423057436943, "step": 372 }, { "completion_length": 117.546875, "epoch": 0.24049000644745325, "grad_norm": 13.058128356933594, "kl": 0.146484375, "learning_rate": 8.797549967762733e-07, "loss": 0.0059, "reward": 1.5927224159240723, "reward_std": 0.11529594287276268, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5927222967147827, "step": 373 }, { "completion_length": 120.109375, "epoch": 0.24113475177304963, "grad_norm": 9.465697288513184, "kl": 0.1240234375, "learning_rate": 8.794326241134752e-07, "loss": 0.005, "reward": 1.5129834413528442, "reward_std": 0.14658623188734055, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.512983426451683, "step": 374 }, { "completion_length": 115.234375, "epoch": 0.24177949709864605, "grad_norm": 14.327066421508789, "kl": 0.116943359375, "learning_rate": 8.79110251450677e-07, "loss": 0.0047, "reward": 1.5295900106430054, "reward_std": 0.125333983451128, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5295900255441666, "step": 375 }, { "completion_length": 109.453125, "epoch": 0.24242424242424243, "grad_norm": 13.450315475463867, "kl": 0.1328125, "learning_rate": 8.787878787878787e-07, "loss": 0.0053, "reward": 1.5274213552474976, "reward_std": 0.1867852807044983, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5274212658405304, "step": 376 }, { "completion_length": 113.0, "epoch": 0.24306898774983882, "grad_norm": 38.73995590209961, "kl": 0.1494140625, "learning_rate": 8.784655061250806e-07, "loss": 0.006, "reward": 1.5954452753067017, "reward_std": 0.16821444034576416, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5954452753067017, "step": 377 }, { "completion_length": 113.75, "epoch": 0.2437137330754352, "grad_norm": 8.51379108428955, "kl": 0.1337890625, "learning_rate": 8.781431334622823e-07, "loss": 0.0054, "reward": 1.4244216084480286, "reward_std": 0.08954451978206635, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42442163825035095, "step": 378 }, { "completion_length": 118.125, "epoch": 0.2443584784010316, "grad_norm": 12.981401443481445, "kl": 0.121337890625, "learning_rate": 8.778207607994842e-07, "loss": 0.0049, "reward": 1.536249577999115, "reward_std": 0.20471828430891037, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.536249577999115, "step": 379 }, { "completion_length": 118.390625, "epoch": 0.24500322372662797, "grad_norm": 17.424579620361328, "kl": 0.1240234375, "learning_rate": 8.77498388136686e-07, "loss": 0.005, "reward": 1.4686583876609802, "reward_std": 0.12933894246816635, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4686584174633026, "step": 380 }, { "completion_length": 103.21875, "epoch": 0.24564796905222436, "grad_norm": 13.15453815460205, "kl": 0.1240234375, "learning_rate": 8.771760154738878e-07, "loss": 0.0049, "reward": 1.6250154376029968, "reward_std": 0.09519154578447342, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6250154674053192, "step": 381 }, { "completion_length": 116.5625, "epoch": 0.24629271437782077, "grad_norm": 10.82996654510498, "kl": 0.1396484375, "learning_rate": 8.768536428110895e-07, "loss": 0.0056, "reward": 1.631207287311554, "reward_std": 0.1291571594774723, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.631207287311554, "step": 382 }, { "completion_length": 111.671875, "epoch": 0.24693745970341716, "grad_norm": 13.207661628723145, "kl": 0.1298828125, "learning_rate": 8.765312701482915e-07, "loss": 0.0052, "reward": 1.374193549156189, "reward_std": 0.13640347868204117, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37419362366199493, "step": 383 }, { "completion_length": 109.421875, "epoch": 0.24758220502901354, "grad_norm": 11.31387996673584, "kl": 0.12451171875, "learning_rate": 8.762088974854932e-07, "loss": 0.005, "reward": 1.560090184211731, "reward_std": 0.21742689609527588, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5757152438163757, "step": 384 }, { "completion_length": 105.734375, "epoch": 0.24822695035460993, "grad_norm": 46.513221740722656, "kl": 0.124267578125, "learning_rate": 8.75886524822695e-07, "loss": 0.005, "reward": 1.525193452835083, "reward_std": 0.1469801962375641, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.525193452835083, "step": 385 }, { "completion_length": 110.765625, "epoch": 0.2488716956802063, "grad_norm": 15.489583015441895, "kl": 0.122314453125, "learning_rate": 8.755641521598968e-07, "loss": 0.0049, "reward": 1.5546384453773499, "reward_std": 0.10428916290402412, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5546384751796722, "step": 386 }, { "completion_length": 113.890625, "epoch": 0.2495164410058027, "grad_norm": 14.360328674316406, "kl": 0.272216796875, "learning_rate": 8.752417794970987e-07, "loss": 0.011, "reward": 1.5682534575462341, "reward_std": 0.15514999628067017, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5838784277439117, "step": 387 }, { "completion_length": 119.609375, "epoch": 0.2501611863313991, "grad_norm": 8.095773696899414, "kl": 0.109619140625, "learning_rate": 8.749194068343004e-07, "loss": 0.0044, "reward": 1.521271824836731, "reward_std": 0.10073830187320709, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5212718695402145, "step": 388 }, { "completion_length": 116.34375, "epoch": 0.2508059316569955, "grad_norm": 10.134056091308594, "kl": 0.11962890625, "learning_rate": 8.745970341715023e-07, "loss": 0.0048, "reward": 1.4365845918655396, "reward_std": 0.13728945702314377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43658460676670074, "step": 389 }, { "completion_length": 104.21875, "epoch": 0.2514506769825919, "grad_norm": 24.610681533813477, "kl": 0.1904296875, "learning_rate": 8.74274661508704e-07, "loss": 0.0076, "reward": 1.4568537473678589, "reward_std": 0.12321500480175018, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4724787175655365, "step": 390 }, { "completion_length": 114.578125, "epoch": 0.25209542230818827, "grad_norm": 12.346039772033691, "kl": 0.117431640625, "learning_rate": 8.739522888459059e-07, "loss": 0.0047, "reward": 1.5427250862121582, "reward_std": 0.11714871600270271, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5427251160144806, "step": 391 }, { "completion_length": 117.171875, "epoch": 0.25274016763378465, "grad_norm": 10.353236198425293, "kl": 0.109619140625, "learning_rate": 8.736299161831076e-07, "loss": 0.0044, "reward": 1.7059763073921204, "reward_std": 0.16882120072841644, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7216013371944427, "step": 392 }, { "completion_length": 117.4375, "epoch": 0.25338491295938104, "grad_norm": 28.252819061279297, "kl": 0.10791015625, "learning_rate": 8.733075435203095e-07, "loss": 0.0043, "reward": 1.5156517624855042, "reward_std": 0.1431019902229309, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5156517624855042, "step": 393 }, { "completion_length": 114.4375, "epoch": 0.2540296582849774, "grad_norm": 45.27177429199219, "kl": 0.117431640625, "learning_rate": 8.729851708575112e-07, "loss": 0.0047, "reward": 1.4806265234947205, "reward_std": 0.22300370782613754, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.5275015234947205, "step": 394 }, { "completion_length": 113.390625, "epoch": 0.2546744036105738, "grad_norm": 14.209746360778809, "kl": 0.12060546875, "learning_rate": 8.72662798194713e-07, "loss": 0.0048, "reward": 1.585417628288269, "reward_std": 0.14653317630290985, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6010426878929138, "step": 395 }, { "completion_length": 123.015625, "epoch": 0.2553191489361702, "grad_norm": 6.456853866577148, "kl": 0.114501953125, "learning_rate": 8.723404255319149e-07, "loss": 0.0046, "reward": 1.694661259651184, "reward_std": 0.17132965475320816, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7102862596511841, "step": 396 }, { "completion_length": 133.40625, "epoch": 0.2559638942617666, "grad_norm": 11.757306098937988, "kl": 0.12060546875, "learning_rate": 8.720180528691167e-07, "loss": 0.0048, "reward": 1.5562049746513367, "reward_std": 0.2403947338461876, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5874549746513367, "step": 397 }, { "completion_length": 124.859375, "epoch": 0.256608639587363, "grad_norm": 15.601590156555176, "kl": 0.1201171875, "learning_rate": 8.716956802063184e-07, "loss": 0.0048, "reward": 1.3463094234466553, "reward_std": 0.255843885242939, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.3931843936443329, "step": 398 }, { "completion_length": 124.671875, "epoch": 0.2572533849129594, "grad_norm": 8.446572303771973, "kl": 0.1142578125, "learning_rate": 8.713733075435203e-07, "loss": 0.0046, "reward": 1.5371050834655762, "reward_std": 0.1252804920077324, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5527301132678986, "step": 399 }, { "completion_length": 123.171875, "epoch": 0.2578981302385558, "grad_norm": 9.509588241577148, "kl": 0.142578125, "learning_rate": 8.710509348807221e-07, "loss": 0.0057, "reward": 1.6943880319595337, "reward_std": 0.09968248754739761, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6943880319595337, "step": 400 }, { "completion_length": 115.953125, "epoch": 0.2585428755641522, "grad_norm": 26.3836612701416, "kl": 0.1240234375, "learning_rate": 8.707285622179239e-07, "loss": 0.005, "reward": 1.5159879922866821, "reward_std": 0.16381509974598885, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5316129773855209, "step": 401 }, { "completion_length": 134.421875, "epoch": 0.25918762088974856, "grad_norm": 12.983834266662598, "kl": 0.105712890625, "learning_rate": 8.704061895551257e-07, "loss": 0.0042, "reward": 1.4239293932914734, "reward_std": 0.19441810250282288, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.4551793485879898, "step": 402 }, { "completion_length": 119.921875, "epoch": 0.25983236621534495, "grad_norm": 19.93628692626953, "kl": 0.114013671875, "learning_rate": 8.700838168923275e-07, "loss": 0.0046, "reward": 1.525950849056244, "reward_std": 0.1031199786812067, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5415758490562439, "step": 403 }, { "completion_length": 119.359375, "epoch": 0.26047711154094133, "grad_norm": 9.794746398925781, "kl": 0.115966796875, "learning_rate": 8.697614442295293e-07, "loss": 0.0046, "reward": 1.4721354842185974, "reward_std": 0.19477830827236176, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4877604842185974, "step": 404 }, { "completion_length": 128.75, "epoch": 0.2611218568665377, "grad_norm": 11.187701225280762, "kl": 0.105224609375, "learning_rate": 8.694390715667312e-07, "loss": 0.0042, "reward": 1.5111178755760193, "reward_std": 0.13802172988653183, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5111178755760193, "step": 405 }, { "completion_length": 130.578125, "epoch": 0.2617666021921341, "grad_norm": 11.691713333129883, "kl": 0.0966796875, "learning_rate": 8.691166989039329e-07, "loss": 0.0039, "reward": 1.3761667013168335, "reward_std": 0.1583014875650406, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.3917917311191559, "step": 406 }, { "completion_length": 128.6875, "epoch": 0.2624113475177305, "grad_norm": 10.686534881591797, "kl": 0.103759765625, "learning_rate": 8.687943262411347e-07, "loss": 0.0041, "reward": 1.5369993448257446, "reward_std": 0.09643547236919403, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5369993597269058, "step": 407 }, { "completion_length": 117.6875, "epoch": 0.26305609284332687, "grad_norm": 39.99586868286133, "kl": 0.10400390625, "learning_rate": 8.684719535783365e-07, "loss": 0.0042, "reward": 1.606239914894104, "reward_std": 0.12021593004465103, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.606239914894104, "step": 408 }, { "completion_length": 117.03125, "epoch": 0.26370083816892326, "grad_norm": 10.232660293579102, "kl": 0.111572265625, "learning_rate": 8.681495809155384e-07, "loss": 0.0045, "reward": 1.7151300311088562, "reward_std": 0.09283937886357307, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7151300609111786, "step": 409 }, { "completion_length": 128.359375, "epoch": 0.26434558349451964, "grad_norm": 5.87454080581665, "kl": 0.10498046875, "learning_rate": 8.678272082527401e-07, "loss": 0.0042, "reward": 1.7446652054786682, "reward_std": 0.1887577325105667, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7602902054786682, "step": 410 }, { "completion_length": 126.859375, "epoch": 0.26499032882011603, "grad_norm": 16.121383666992188, "kl": 0.1181640625, "learning_rate": 8.67504835589942e-07, "loss": 0.0047, "reward": 1.6039236783981323, "reward_std": 0.10203766822814941, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6039237380027771, "step": 411 }, { "completion_length": 119.734375, "epoch": 0.26563507414571247, "grad_norm": 17.275545120239258, "kl": 0.14306640625, "learning_rate": 8.671824629271437e-07, "loss": 0.0057, "reward": 1.4978615641593933, "reward_std": 0.2517547532916069, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5134866237640381, "step": 412 }, { "completion_length": 135.46875, "epoch": 0.26627981947130885, "grad_norm": 15.758459091186523, "kl": 0.111328125, "learning_rate": 8.668600902643456e-07, "loss": 0.0045, "reward": 1.525922417640686, "reward_std": 0.18179912865161896, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5415474027395248, "step": 413 }, { "completion_length": 120.140625, "epoch": 0.26692456479690524, "grad_norm": 23.685686111450195, "kl": 0.1123046875, "learning_rate": 8.665377176015473e-07, "loss": 0.0045, "reward": 1.5499253273010254, "reward_std": 0.20756025612354279, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.565550372004509, "step": 414 }, { "completion_length": 107.4375, "epoch": 0.2675693101225016, "grad_norm": 17.447803497314453, "kl": 0.1376953125, "learning_rate": 8.662153449387492e-07, "loss": 0.0055, "reward": 1.4381148219108582, "reward_std": 0.21692107617855072, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.46936482191085815, "step": 415 }, { "completion_length": 110.953125, "epoch": 0.268214055448098, "grad_norm": 22.098020553588867, "kl": 0.1083984375, "learning_rate": 8.658929722759509e-07, "loss": 0.0043, "reward": 1.5825045108795166, "reward_std": 0.15497270971536636, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5981295108795166, "step": 416 }, { "completion_length": 120.234375, "epoch": 0.2688588007736944, "grad_norm": 18.107759475708008, "kl": 0.109375, "learning_rate": 8.655705996131528e-07, "loss": 0.0044, "reward": 1.4336537718772888, "reward_std": 0.12037287652492523, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4336537718772888, "step": 417 }, { "completion_length": 119.78125, "epoch": 0.2695035460992908, "grad_norm": 16.09417724609375, "kl": 0.119140625, "learning_rate": 8.652482269503546e-07, "loss": 0.0048, "reward": 1.4209988117218018, "reward_std": 0.22898180782794952, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.45224879682064056, "step": 418 }, { "completion_length": 118.421875, "epoch": 0.27014829142488717, "grad_norm": 12.481720924377441, "kl": 0.110107421875, "learning_rate": 8.649258542875564e-07, "loss": 0.0044, "reward": 1.3904073238372803, "reward_std": 0.14730828255414963, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4060323089361191, "step": 419 }, { "completion_length": 116.46875, "epoch": 0.27079303675048355, "grad_norm": 15.177681922912598, "kl": 0.1171875, "learning_rate": 8.646034816247581e-07, "loss": 0.0047, "reward": 1.521640121936798, "reward_std": 0.14473527297377586, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5372650921344757, "step": 420 }, { "completion_length": 106.96875, "epoch": 0.27143778207607994, "grad_norm": 15.35623550415039, "kl": 0.107421875, "learning_rate": 8.6428110896196e-07, "loss": 0.0043, "reward": 1.4848104119300842, "reward_std": 0.17813602089881897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48481038212776184, "step": 421 }, { "completion_length": 115.203125, "epoch": 0.2720825274016763, "grad_norm": 8.478485107421875, "kl": 0.17919921875, "learning_rate": 8.639587362991618e-07, "loss": 0.0071, "reward": 1.492174744606018, "reward_std": 0.10588141530752182, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49217478930950165, "step": 422 }, { "completion_length": 116.78125, "epoch": 0.2727272727272727, "grad_norm": 20.47816276550293, "kl": 0.106201171875, "learning_rate": 8.636363636363636e-07, "loss": 0.0042, "reward": 1.6990843415260315, "reward_std": 0.08219916746020317, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6990842819213867, "step": 423 }, { "completion_length": 108.609375, "epoch": 0.2733720180528691, "grad_norm": 15.236698150634766, "kl": 0.102294921875, "learning_rate": 8.633139909735654e-07, "loss": 0.0041, "reward": 1.5470861196517944, "reward_std": 0.18373820185661316, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5470860600471497, "step": 424 }, { "completion_length": 116.09375, "epoch": 0.27401676337846553, "grad_norm": 41.5452880859375, "kl": 0.10888671875, "learning_rate": 8.629916183107672e-07, "loss": 0.0044, "reward": 1.6228560209274292, "reward_std": 0.15928027778863907, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6228560507297516, "step": 425 }, { "completion_length": 106.75, "epoch": 0.2746615087040619, "grad_norm": 11.794381141662598, "kl": 0.110107421875, "learning_rate": 8.62669245647969e-07, "loss": 0.0044, "reward": 1.5028586387634277, "reward_std": 0.12982606142759323, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5028586536645889, "step": 426 }, { "completion_length": 104.59375, "epoch": 0.2753062540296583, "grad_norm": 14.104724884033203, "kl": 0.107177734375, "learning_rate": 8.623468729851709e-07, "loss": 0.0043, "reward": 1.6256231665611267, "reward_std": 0.07264291495084763, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6256231665611267, "step": 427 }, { "completion_length": 95.453125, "epoch": 0.2759509993552547, "grad_norm": 12.817341804504395, "kl": 0.102783203125, "learning_rate": 8.620245003223726e-07, "loss": 0.0041, "reward": 1.4218062162399292, "reward_std": 0.21968971192836761, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4218061864376068, "step": 428 }, { "completion_length": 107.609375, "epoch": 0.2765957446808511, "grad_norm": 65.80889129638672, "kl": 0.10986328125, "learning_rate": 8.617021276595744e-07, "loss": 0.0044, "reward": 1.4983304142951965, "reward_std": 0.1709209829568863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49833041429519653, "step": 429 }, { "completion_length": 108.625, "epoch": 0.27724049000644746, "grad_norm": 9.083003044128418, "kl": 0.11669921875, "learning_rate": 8.613797549967763e-07, "loss": 0.0047, "reward": 1.3649240136146545, "reward_std": 0.09722113236784935, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.36492396891117096, "step": 430 }, { "completion_length": 108.3125, "epoch": 0.27788523533204385, "grad_norm": 13.82675838470459, "kl": 0.10009765625, "learning_rate": 8.610573823339781e-07, "loss": 0.004, "reward": 1.5768791437149048, "reward_std": 0.17914673686027527, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5925041735172272, "step": 431 }, { "completion_length": 112.28125, "epoch": 0.27852998065764023, "grad_norm": 8.703876495361328, "kl": 0.099365234375, "learning_rate": 8.607350096711798e-07, "loss": 0.004, "reward": 1.6096728444099426, "reward_std": 0.11242663115262985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6096728146076202, "step": 432 }, { "completion_length": 92.0, "epoch": 0.2791747259832366, "grad_norm": 21.1232852935791, "kl": 0.110107421875, "learning_rate": 8.604126370083817e-07, "loss": 0.0044, "reward": 1.66538667678833, "reward_std": 0.11511220782995224, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6653867065906525, "step": 433 }, { "completion_length": 114.4375, "epoch": 0.279819471308833, "grad_norm": 10.83615493774414, "kl": 0.13525390625, "learning_rate": 8.600902643455834e-07, "loss": 0.0054, "reward": 1.6429526805877686, "reward_std": 0.12804623320698738, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6429526805877686, "step": 434 }, { "completion_length": 93.515625, "epoch": 0.2804642166344294, "grad_norm": 286.1723327636719, "kl": 0.116943359375, "learning_rate": 8.597678916827853e-07, "loss": 0.0047, "reward": 1.4603025913238525, "reward_std": 0.12294166535139084, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4603026509284973, "step": 435 }, { "completion_length": 103.96875, "epoch": 0.28110896196002577, "grad_norm": 18.53604507446289, "kl": 0.1162109375, "learning_rate": 8.59445519019987e-07, "loss": 0.0047, "reward": 1.463842749595642, "reward_std": 0.13263465836644173, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4638426899909973, "step": 436 }, { "completion_length": 109.703125, "epoch": 0.28175370728562216, "grad_norm": 7.604578495025635, "kl": 0.0927734375, "learning_rate": 8.591231463571889e-07, "loss": 0.0037, "reward": 1.6767799854278564, "reward_std": 0.1197793260216713, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6767799556255341, "step": 437 }, { "completion_length": 110.15625, "epoch": 0.28239845261121854, "grad_norm": 12.251392364501953, "kl": 0.114013671875, "learning_rate": 8.588007736943906e-07, "loss": 0.0046, "reward": 1.6475961804389954, "reward_std": 0.16608282178640366, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.647596150636673, "step": 438 }, { "completion_length": 101.640625, "epoch": 0.283043197936815, "grad_norm": 45.992103576660156, "kl": 0.099609375, "learning_rate": 8.584784010315925e-07, "loss": 0.004, "reward": 1.530308485031128, "reward_std": 0.10350951924920082, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5303084552288055, "step": 439 }, { "completion_length": 112.5, "epoch": 0.28368794326241137, "grad_norm": 21.31104850769043, "kl": 0.102294921875, "learning_rate": 8.581560283687943e-07, "loss": 0.0041, "reward": 1.5717370510101318, "reward_std": 0.1433250457048416, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5717370808124542, "step": 440 }, { "completion_length": 116.625, "epoch": 0.28433268858800775, "grad_norm": 12.857828140258789, "kl": 0.1220703125, "learning_rate": 8.578336557059961e-07, "loss": 0.0049, "reward": 1.6379773020744324, "reward_std": 0.10524650290608406, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6379773616790771, "step": 441 }, { "completion_length": 120.796875, "epoch": 0.28497743391360414, "grad_norm": 14.907901763916016, "kl": 0.12646484375, "learning_rate": 8.575112830431978e-07, "loss": 0.005, "reward": 1.6316999793052673, "reward_std": 0.10639721155166626, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.631699949502945, "step": 442 }, { "completion_length": 110.0, "epoch": 0.2856221792392005, "grad_norm": 13.865013122558594, "kl": 0.1142578125, "learning_rate": 8.571889103803998e-07, "loss": 0.0046, "reward": 1.489943504333496, "reward_std": 0.08053775876760483, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48994359374046326, "step": 443 }, { "completion_length": 121.109375, "epoch": 0.2862669245647969, "grad_norm": 10.000602722167969, "kl": 0.103271484375, "learning_rate": 8.568665377176015e-07, "loss": 0.0041, "reward": 1.5654188394546509, "reward_std": 0.11091581359505653, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5654188394546509, "step": 444 }, { "completion_length": 121.9375, "epoch": 0.2869116698903933, "grad_norm": 15.2792329788208, "kl": 0.1064453125, "learning_rate": 8.565441650548033e-07, "loss": 0.0043, "reward": 1.6336507201194763, "reward_std": 0.10441675782203674, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6336507201194763, "step": 445 }, { "completion_length": 124.203125, "epoch": 0.2875564152159897, "grad_norm": 9.192334175109863, "kl": 0.1162109375, "learning_rate": 8.562217923920051e-07, "loss": 0.0046, "reward": 1.5857059359550476, "reward_std": 0.083042461425066, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5857059359550476, "step": 446 }, { "completion_length": 126.328125, "epoch": 0.28820116054158607, "grad_norm": 12.716821670532227, "kl": 0.0986328125, "learning_rate": 8.558994197292069e-07, "loss": 0.0039, "reward": 1.3910475969314575, "reward_std": 0.20336882025003433, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3910476565361023, "step": 447 }, { "completion_length": 98.984375, "epoch": 0.28884590586718245, "grad_norm": 19.093204498291016, "kl": 0.111328125, "learning_rate": 8.555770470664087e-07, "loss": 0.0044, "reward": 1.4914937615394592, "reward_std": 0.07705500349402428, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49149373173713684, "step": 448 }, { "completion_length": 117.171875, "epoch": 0.28949065119277884, "grad_norm": 10.103181838989258, "kl": 0.111328125, "learning_rate": 8.552546744036106e-07, "loss": 0.0045, "reward": 1.586841642856598, "reward_std": 0.15257032215595245, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5868416428565979, "step": 449 }, { "completion_length": 107.703125, "epoch": 0.2901353965183752, "grad_norm": 13.13296127319336, "kl": 0.111328125, "learning_rate": 8.549323017408123e-07, "loss": 0.0045, "reward": 1.52940034866333, "reward_std": 0.18080459535121918, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5294004082679749, "step": 450 }, { "completion_length": 110.34375, "epoch": 0.2907801418439716, "grad_norm": 12.052055358886719, "kl": 0.124755859375, "learning_rate": 8.546099290780141e-07, "loss": 0.005, "reward": 1.4507166743278503, "reward_std": 0.19025427103042603, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45071670413017273, "step": 451 }, { "completion_length": 104.078125, "epoch": 0.29142488716956805, "grad_norm": 12.829391479492188, "kl": 0.1220703125, "learning_rate": 8.54287556415216e-07, "loss": 0.0049, "reward": 1.6305707693099976, "reward_std": 0.18986354023218155, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.6618208289146423, "step": 452 }, { "completion_length": 103.703125, "epoch": 0.29206963249516443, "grad_norm": 8.33245849609375, "kl": 0.144775390625, "learning_rate": 8.539651837524178e-07, "loss": 0.0058, "reward": 1.596256971359253, "reward_std": 0.12559916824102402, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5962570309638977, "step": 453 }, { "completion_length": 95.34375, "epoch": 0.2927143778207608, "grad_norm": 19.02564239501953, "kl": 0.15234375, "learning_rate": 8.536428110896195e-07, "loss": 0.0061, "reward": 1.6366922855377197, "reward_std": 0.18177789449691772, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6366923153400421, "step": 454 }, { "completion_length": 107.3125, "epoch": 0.2933591231463572, "grad_norm": 109.59540557861328, "kl": 0.129150390625, "learning_rate": 8.533204384268214e-07, "loss": 0.0052, "reward": 1.5032501220703125, "reward_std": 0.14116708934307098, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5032500624656677, "step": 455 }, { "completion_length": 109.53125, "epoch": 0.2940038684719536, "grad_norm": 53.494632720947266, "kl": 0.11865234375, "learning_rate": 8.529980657640232e-07, "loss": 0.0047, "reward": 1.5563023090362549, "reward_std": 0.11787058785557747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5563022345304489, "step": 456 }, { "completion_length": 111.515625, "epoch": 0.29464861379755, "grad_norm": 23.822132110595703, "kl": 0.127197265625, "learning_rate": 8.52675693101225e-07, "loss": 0.0051, "reward": 1.4594323635101318, "reward_std": 0.05473834462463856, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45943234860897064, "step": 457 }, { "completion_length": 98.34375, "epoch": 0.29529335912314636, "grad_norm": 24.85257339477539, "kl": 0.13916015625, "learning_rate": 8.523533204384267e-07, "loss": 0.0056, "reward": 1.4711275696754456, "reward_std": 0.10964871942996979, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.471127524971962, "step": 458 }, { "completion_length": 105.453125, "epoch": 0.29593810444874274, "grad_norm": 14.3064546585083, "kl": 0.158203125, "learning_rate": 8.520309477756286e-07, "loss": 0.0063, "reward": 1.6756532788276672, "reward_std": 0.09594709798693657, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.675653338432312, "step": 459 }, { "completion_length": 105.96875, "epoch": 0.29658284977433913, "grad_norm": 32.8977165222168, "kl": 0.12890625, "learning_rate": 8.517085751128303e-07, "loss": 0.0052, "reward": 1.3314522504806519, "reward_std": 0.08916751854121685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33145226538181305, "step": 460 }, { "completion_length": 98.984375, "epoch": 0.2972275950999355, "grad_norm": 8.08194351196289, "kl": 0.125, "learning_rate": 8.513862024500322e-07, "loss": 0.005, "reward": 1.6798730492591858, "reward_std": 0.13914865255355835, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6798729598522186, "step": 461 }, { "completion_length": 104.1875, "epoch": 0.2978723404255319, "grad_norm": 17.793569564819336, "kl": 0.11328125, "learning_rate": 8.51063829787234e-07, "loss": 0.0045, "reward": 1.5144559741020203, "reward_std": 0.14659196138381958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5144559442996979, "step": 462 }, { "completion_length": 106.453125, "epoch": 0.2985170857511283, "grad_norm": 59.80522155761719, "kl": 0.171875, "learning_rate": 8.507414571244358e-07, "loss": 0.0069, "reward": 1.659844696521759, "reward_std": 0.1038866862654686, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6598446369171143, "step": 463 }, { "completion_length": 108.96875, "epoch": 0.29916183107672467, "grad_norm": 9.849380493164062, "kl": 0.11572265625, "learning_rate": 8.504190844616375e-07, "loss": 0.0046, "reward": 1.5677456259727478, "reward_std": 0.18492824956774712, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5833705961704254, "step": 464 }, { "completion_length": 102.890625, "epoch": 0.29980657640232106, "grad_norm": 9.406426429748535, "kl": 0.1396484375, "learning_rate": 8.500967117988395e-07, "loss": 0.0056, "reward": 1.5041118264198303, "reward_std": 0.13810846023261547, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5197368264198303, "step": 465 }, { "completion_length": 116.59375, "epoch": 0.3004513217279175, "grad_norm": 17.882394790649414, "kl": 0.126708984375, "learning_rate": 8.497743391360412e-07, "loss": 0.0051, "reward": 1.5210480093955994, "reward_std": 0.15733475238084793, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.521048054099083, "step": 466 }, { "completion_length": 102.96875, "epoch": 0.3010960670535139, "grad_norm": 17.0341854095459, "kl": 0.126220703125, "learning_rate": 8.49451966473243e-07, "loss": 0.0051, "reward": 1.420632779598236, "reward_std": 0.1144905835390091, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4362577497959137, "step": 467 }, { "completion_length": 94.65625, "epoch": 0.30174081237911027, "grad_norm": 14.085397720336914, "kl": 0.18115234375, "learning_rate": 8.491295938104448e-07, "loss": 0.0073, "reward": 1.5675341486930847, "reward_std": 0.19463778287172318, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5987841784954071, "step": 468 }, { "completion_length": 106.734375, "epoch": 0.30238555770470665, "grad_norm": 10.348201751708984, "kl": 0.136962890625, "learning_rate": 8.488072211476467e-07, "loss": 0.0055, "reward": 1.656339168548584, "reward_std": 0.2050868570804596, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6719641387462616, "step": 469 }, { "completion_length": 106.46875, "epoch": 0.30303030303030304, "grad_norm": 46.44258499145508, "kl": 0.123779296875, "learning_rate": 8.484848484848484e-07, "loss": 0.0049, "reward": 1.5097472667694092, "reward_std": 0.0910700261592865, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5097472071647644, "step": 470 }, { "completion_length": 99.203125, "epoch": 0.3036750483558994, "grad_norm": 25.939085006713867, "kl": 0.121337890625, "learning_rate": 8.481624758220503e-07, "loss": 0.0049, "reward": 1.5171257257461548, "reward_std": 0.14139147475361824, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.517125815153122, "step": 471 }, { "completion_length": 101.703125, "epoch": 0.3043197936814958, "grad_norm": 18.568218231201172, "kl": 0.116943359375, "learning_rate": 8.47840103159252e-07, "loss": 0.0047, "reward": 1.6284334659576416, "reward_std": 0.1252906247973442, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6284334659576416, "step": 472 }, { "completion_length": 112.78125, "epoch": 0.3049645390070922, "grad_norm": 21.57918357849121, "kl": 0.14306640625, "learning_rate": 8.475177304964538e-07, "loss": 0.0057, "reward": 1.4556211829185486, "reward_std": 0.08510977774858475, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4556211531162262, "step": 473 }, { "completion_length": 101.3125, "epoch": 0.3056092843326886, "grad_norm": 9.105512619018555, "kl": 0.125732421875, "learning_rate": 8.471953578336557e-07, "loss": 0.005, "reward": 1.4934914708137512, "reward_std": 0.08908221870660782, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4934914708137512, "step": 474 }, { "completion_length": 101.609375, "epoch": 0.30625402965828497, "grad_norm": 88.50373840332031, "kl": 0.1494140625, "learning_rate": 8.468729851708575e-07, "loss": 0.006, "reward": 1.3966038823127747, "reward_std": 0.14134737104177475, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39660392701625824, "step": 475 }, { "completion_length": 98.15625, "epoch": 0.30689877498388135, "grad_norm": 13.56151294708252, "kl": 0.13232421875, "learning_rate": 8.465506125080592e-07, "loss": 0.0053, "reward": 1.652710199356079, "reward_std": 0.12360907346010208, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6527101993560791, "step": 476 }, { "completion_length": 92.921875, "epoch": 0.30754352030947774, "grad_norm": 12.644444465637207, "kl": 0.14453125, "learning_rate": 8.46228239845261e-07, "loss": 0.0058, "reward": 1.5489562153816223, "reward_std": 0.10549729317426682, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5489561706781387, "step": 477 }, { "completion_length": 89.375, "epoch": 0.3081882656350741, "grad_norm": 35.62141036987305, "kl": 0.1298828125, "learning_rate": 8.459058671824629e-07, "loss": 0.0052, "reward": 1.4552032351493835, "reward_std": 0.1214294545352459, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45520325005054474, "step": 478 }, { "completion_length": 97.40625, "epoch": 0.30883301096067056, "grad_norm": 11.729783058166504, "kl": 0.13671875, "learning_rate": 8.455834945196647e-07, "loss": 0.0055, "reward": 1.6263262629508972, "reward_std": 0.16124407947063446, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6419512629508972, "step": 479 }, { "completion_length": 98.84375, "epoch": 0.30947775628626695, "grad_norm": 8.603642463684082, "kl": 0.14453125, "learning_rate": 8.452611218568664e-07, "loss": 0.0058, "reward": 1.4429956674575806, "reward_std": 0.15181055292487144, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44299568235874176, "step": 480 }, { "completion_length": 98.375, "epoch": 0.31012250161186333, "grad_norm": 19.082008361816406, "kl": 0.1376953125, "learning_rate": 8.449387491940683e-07, "loss": 0.0055, "reward": 1.5683542490005493, "reward_std": 0.1690295785665512, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5683543235063553, "step": 481 }, { "completion_length": 92.046875, "epoch": 0.3107672469374597, "grad_norm": 13.106690406799316, "kl": 0.1396484375, "learning_rate": 8.446163765312701e-07, "loss": 0.0056, "reward": 1.637206792831421, "reward_std": 0.11582134664058685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6372068524360657, "step": 482 }, { "completion_length": 94.5625, "epoch": 0.3114119922630561, "grad_norm": 14.102886199951172, "kl": 0.16455078125, "learning_rate": 8.44294003868472e-07, "loss": 0.0066, "reward": 1.555085003376007, "reward_std": 0.14258688688278198, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5707099884748459, "step": 483 }, { "completion_length": 97.15625, "epoch": 0.3120567375886525, "grad_norm": 15.093389511108398, "kl": 0.1337890625, "learning_rate": 8.439716312056737e-07, "loss": 0.0054, "reward": 1.6329968571662903, "reward_std": 0.13173328340053558, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6329968869686127, "step": 484 }, { "completion_length": 90.921875, "epoch": 0.3127014829142489, "grad_norm": 8.526525497436523, "kl": 0.185546875, "learning_rate": 8.436492585428755e-07, "loss": 0.0074, "reward": 1.5760527849197388, "reward_std": 0.12339088693261147, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5760527849197388, "step": 485 }, { "completion_length": 103.671875, "epoch": 0.31334622823984526, "grad_norm": 11.255599975585938, "kl": 0.16845703125, "learning_rate": 8.433268858800772e-07, "loss": 0.0068, "reward": 1.6047207713127136, "reward_std": 0.1114160493016243, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6047207415103912, "step": 486 }, { "completion_length": 108.015625, "epoch": 0.31399097356544164, "grad_norm": 18.66996955871582, "kl": 0.12158203125, "learning_rate": 8.430045132172792e-07, "loss": 0.0049, "reward": 1.6641413569450378, "reward_std": 0.0755637101829052, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6641413271427155, "step": 487 }, { "completion_length": 106.1875, "epoch": 0.31463571889103803, "grad_norm": 17.2938232421875, "kl": 0.12646484375, "learning_rate": 8.426821405544809e-07, "loss": 0.005, "reward": 1.3134232759475708, "reward_std": 0.09148961305618286, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.313423290848732, "step": 488 }, { "completion_length": 103.21875, "epoch": 0.3152804642166344, "grad_norm": 14.192865371704102, "kl": 0.16552734375, "learning_rate": 8.423597678916827e-07, "loss": 0.0066, "reward": 1.5219502449035645, "reward_std": 0.13911869004368782, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5375751405954361, "step": 489 }, { "completion_length": 104.734375, "epoch": 0.3159252095422308, "grad_norm": 8.525148391723633, "kl": 0.13623046875, "learning_rate": 8.420373952288845e-07, "loss": 0.0054, "reward": 1.4910805821418762, "reward_std": 0.14263714104890823, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4910805821418762, "step": 490 }, { "completion_length": 104.96875, "epoch": 0.3165699548678272, "grad_norm": 15.304465293884277, "kl": 0.16943359375, "learning_rate": 8.417150225660864e-07, "loss": 0.0068, "reward": 1.374679982662201, "reward_std": 0.0994303934276104, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37467992305755615, "step": 491 }, { "completion_length": 102.375, "epoch": 0.31721470019342357, "grad_norm": 28.216218948364258, "kl": 0.173828125, "learning_rate": 8.413926499032881e-07, "loss": 0.0069, "reward": 1.5741727948188782, "reward_std": 0.06946994923055172, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5741727650165558, "step": 492 }, { "completion_length": 101.3125, "epoch": 0.31785944551902, "grad_norm": 62.535240173339844, "kl": 0.15380859375, "learning_rate": 8.4107027724049e-07, "loss": 0.0061, "reward": 1.570895254611969, "reward_std": 0.16077566146850586, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5708952844142914, "step": 493 }, { "completion_length": 98.5625, "epoch": 0.3185041908446164, "grad_norm": 10.88364028930664, "kl": 0.162109375, "learning_rate": 8.407479045776917e-07, "loss": 0.0065, "reward": 1.5242938995361328, "reward_std": 0.12571700662374496, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.524293839931488, "step": 494 }, { "completion_length": 115.046875, "epoch": 0.3191489361702128, "grad_norm": 13.604912757873535, "kl": 0.140625, "learning_rate": 8.404255319148936e-07, "loss": 0.0056, "reward": 1.3118263483047485, "reward_std": 0.17000606656074524, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.32745136320590973, "step": 495 }, { "completion_length": 105.34375, "epoch": 0.31979368149580917, "grad_norm": 12.397253036499023, "kl": 0.1357421875, "learning_rate": 8.401031592520954e-07, "loss": 0.0054, "reward": 1.6170586347579956, "reward_std": 0.11738874390721321, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6170586943626404, "step": 496 }, { "completion_length": 100.828125, "epoch": 0.32043842682140555, "grad_norm": 34.156211853027344, "kl": 0.146484375, "learning_rate": 8.397807865892972e-07, "loss": 0.0058, "reward": 1.6565853357315063, "reward_std": 0.1069246418774128, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6565854251384735, "step": 497 }, { "completion_length": 103.625, "epoch": 0.32108317214700194, "grad_norm": 22.407413482666016, "kl": 0.1533203125, "learning_rate": 8.394584139264989e-07, "loss": 0.0061, "reward": 1.5076429843902588, "reward_std": 0.19160231947898865, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.52326799929142, "step": 498 }, { "completion_length": 100.53125, "epoch": 0.3217279174725983, "grad_norm": 12.296113967895508, "kl": 0.17041015625, "learning_rate": 8.391360412637009e-07, "loss": 0.0068, "reward": 1.560167908668518, "reward_std": 0.09707137942314148, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5601678788661957, "step": 499 }, { "completion_length": 109.15625, "epoch": 0.3223726627981947, "grad_norm": 8.46380615234375, "kl": 0.16357421875, "learning_rate": 8.388136686009026e-07, "loss": 0.0065, "reward": 1.460886836051941, "reward_std": 0.06691646948456764, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4608868658542633, "step": 500 }, { "completion_length": 103.703125, "epoch": 0.3230174081237911, "grad_norm": 57.8582763671875, "kl": 0.1689453125, "learning_rate": 8.384912959381044e-07, "loss": 0.0068, "reward": 1.6686757802963257, "reward_std": 0.12994926422834396, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6686758100986481, "step": 501 }, { "completion_length": 106.484375, "epoch": 0.3236621534493875, "grad_norm": 40.78071212768555, "kl": 0.14990234375, "learning_rate": 8.381689232753061e-07, "loss": 0.006, "reward": 1.4987604022026062, "reward_std": 0.11321312561631203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4987604022026062, "step": 502 }, { "completion_length": 112.4375, "epoch": 0.32430689877498386, "grad_norm": 14.115378379821777, "kl": 0.181640625, "learning_rate": 8.37846550612508e-07, "loss": 0.0073, "reward": 1.4107964038848877, "reward_std": 0.14325329661369324, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4107963293790817, "step": 503 }, { "completion_length": 95.15625, "epoch": 0.32495164410058025, "grad_norm": 12.940362930297852, "kl": 0.1630859375, "learning_rate": 8.375241779497098e-07, "loss": 0.0065, "reward": 1.4768180847167969, "reward_std": 0.17815791070461273, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4924430698156357, "step": 504 }, { "completion_length": 97.046875, "epoch": 0.32559638942617664, "grad_norm": 24.559680938720703, "kl": 0.173828125, "learning_rate": 8.372018052869116e-07, "loss": 0.007, "reward": 1.6828228831291199, "reward_std": 0.12030645087361336, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6828228235244751, "step": 505 }, { "completion_length": 103.046875, "epoch": 0.3262411347517731, "grad_norm": 9.820699691772461, "kl": 0.16650390625, "learning_rate": 8.368794326241134e-07, "loss": 0.0067, "reward": 1.6893452405929565, "reward_std": 0.057600412517786026, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6893452405929565, "step": 506 }, { "completion_length": 108.46875, "epoch": 0.32688588007736946, "grad_norm": 31.632915496826172, "kl": 0.1943359375, "learning_rate": 8.365570599613152e-07, "loss": 0.0078, "reward": 1.5965908765792847, "reward_std": 0.11272242292761803, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5965908467769623, "step": 507 }, { "completion_length": 92.796875, "epoch": 0.32753062540296585, "grad_norm": 13.336411476135254, "kl": 0.1728515625, "learning_rate": 8.36234687298517e-07, "loss": 0.0069, "reward": 1.5450761318206787, "reward_std": 0.08154022693634033, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5450761616230011, "step": 508 }, { "completion_length": 103.265625, "epoch": 0.32817537072856223, "grad_norm": 25.839597702026367, "kl": 0.1689453125, "learning_rate": 8.359123146357189e-07, "loss": 0.0067, "reward": 1.5176383256912231, "reward_std": 0.17332381755113602, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.517638310790062, "step": 509 }, { "completion_length": 95.1875, "epoch": 0.3288201160541586, "grad_norm": 12.781591415405273, "kl": 0.19091796875, "learning_rate": 8.355899419729206e-07, "loss": 0.0076, "reward": 1.3962189555168152, "reward_std": 0.07649979926645756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39621901512145996, "step": 510 }, { "completion_length": 100.734375, "epoch": 0.329464861379755, "grad_norm": 7.244078159332275, "kl": 0.17138671875, "learning_rate": 8.352675693101224e-07, "loss": 0.0069, "reward": 1.4728859066963196, "reward_std": 0.1124839149415493, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4885109066963196, "step": 511 }, { "completion_length": 100.515625, "epoch": 0.3301096067053514, "grad_norm": 22.949989318847656, "kl": 0.14453125, "learning_rate": 8.349451966473244e-07, "loss": 0.0058, "reward": 1.4569940567016602, "reward_std": 0.07238898053765297, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4569939970970154, "step": 512 }, { "completion_length": 96.140625, "epoch": 0.3307543520309478, "grad_norm": 11.172731399536133, "kl": 0.21484375, "learning_rate": 8.346228239845261e-07, "loss": 0.0086, "reward": 1.511184811592102, "reward_std": 0.12513361498713493, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5111847519874573, "step": 513 }, { "completion_length": 100.078125, "epoch": 0.33139909735654416, "grad_norm": 12.96349048614502, "kl": 0.1220703125, "learning_rate": 8.343004513217278e-07, "loss": 0.0049, "reward": 1.473563313484192, "reward_std": 0.1395437940955162, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4735632687807083, "step": 514 }, { "completion_length": 88.875, "epoch": 0.33204384268214054, "grad_norm": 12.410280227661133, "kl": 0.13525390625, "learning_rate": 8.339780786589297e-07, "loss": 0.0054, "reward": 1.6107569932937622, "reward_std": 0.13490690663456917, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6107569634914398, "step": 515 }, { "completion_length": 91.40625, "epoch": 0.33268858800773693, "grad_norm": 61.04192352294922, "kl": 0.1943359375, "learning_rate": 8.336557059961314e-07, "loss": 0.0078, "reward": 1.4572083353996277, "reward_std": 0.08159304037690163, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4572083503007889, "step": 516 }, { "completion_length": 108.03125, "epoch": 0.3333333333333333, "grad_norm": 18.190624237060547, "kl": 0.12451171875, "learning_rate": 8.333333333333333e-07, "loss": 0.005, "reward": 1.6369551420211792, "reward_std": 0.16067005693912506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6369551718235016, "step": 517 }, { "completion_length": 105.765625, "epoch": 0.3339780786589297, "grad_norm": 19.701316833496094, "kl": 0.12353515625, "learning_rate": 8.330109606705352e-07, "loss": 0.005, "reward": 1.5677099823951721, "reward_std": 0.13132942840456963, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5833349823951721, "step": 518 }, { "completion_length": 107.5, "epoch": 0.3346228239845261, "grad_norm": 11.90446949005127, "kl": 0.117431640625, "learning_rate": 8.326885880077369e-07, "loss": 0.0047, "reward": 1.6299498081207275, "reward_std": 0.10089965909719467, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6299497783184052, "step": 519 }, { "completion_length": 101.84375, "epoch": 0.3352675693101225, "grad_norm": 18.77952766418457, "kl": 0.111328125, "learning_rate": 8.323662153449386e-07, "loss": 0.0045, "reward": 1.4982839822769165, "reward_std": 0.0839691199362278, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4982839822769165, "step": 520 }, { "completion_length": 105.78125, "epoch": 0.3359123146357189, "grad_norm": 12.040685653686523, "kl": 0.102294921875, "learning_rate": 8.320438426821406e-07, "loss": 0.0041, "reward": 1.4064577221870422, "reward_std": 0.10008003562688828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4064577519893646, "step": 521 }, { "completion_length": 100.203125, "epoch": 0.3365570599613153, "grad_norm": 6.722482204437256, "kl": 0.13037109375, "learning_rate": 8.317214700193424e-07, "loss": 0.0052, "reward": 1.6532670259475708, "reward_std": 0.07082688249647617, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6532670557498932, "step": 522 }, { "completion_length": 102.390625, "epoch": 0.3372018052869117, "grad_norm": 20.486141204833984, "kl": 0.11865234375, "learning_rate": 8.313990973565441e-07, "loss": 0.0047, "reward": 1.6709571480751038, "reward_std": 0.1421183981001377, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6865821480751038, "step": 523 }, { "completion_length": 102.953125, "epoch": 0.33784655061250807, "grad_norm": 19.00589942932129, "kl": 0.13720703125, "learning_rate": 8.31076724693746e-07, "loss": 0.0055, "reward": 1.3803046345710754, "reward_std": 0.13267886638641357, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.395929679274559, "step": 524 }, { "completion_length": 121.15625, "epoch": 0.33849129593810445, "grad_norm": 28.536258697509766, "kl": 0.1142578125, "learning_rate": 8.307543520309478e-07, "loss": 0.0046, "reward": 1.4973403811454773, "reward_std": 0.13099077716469765, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4973403811454773, "step": 525 }, { "completion_length": 111.953125, "epoch": 0.33913604126370084, "grad_norm": 10.45938491821289, "kl": 0.09765625, "learning_rate": 8.304319793681496e-07, "loss": 0.0039, "reward": 1.5662608742713928, "reward_std": 0.13913103193044662, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5818859040737152, "step": 526 }, { "completion_length": 113.4375, "epoch": 0.3397807865892972, "grad_norm": 13.307291030883789, "kl": 0.24853515625, "learning_rate": 8.301096067053513e-07, "loss": 0.0099, "reward": 1.590022087097168, "reward_std": 0.18774433434009552, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.605647087097168, "step": 527 }, { "completion_length": 117.71875, "epoch": 0.3404255319148936, "grad_norm": 31.09686851501465, "kl": 0.101806640625, "learning_rate": 8.297872340425532e-07, "loss": 0.0041, "reward": 1.625194489955902, "reward_std": 0.14474449306726456, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6251944303512573, "step": 528 }, { "completion_length": 112.15625, "epoch": 0.34107027724049, "grad_norm": 13.974041938781738, "kl": 0.12744140625, "learning_rate": 8.294648613797549e-07, "loss": 0.0051, "reward": 1.5699716806411743, "reward_std": 0.1584780141711235, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5699716806411743, "step": 529 }, { "completion_length": 119.21875, "epoch": 0.3417150225660864, "grad_norm": 15.859521865844727, "kl": 0.102783203125, "learning_rate": 8.291424887169568e-07, "loss": 0.0041, "reward": 1.6906739473342896, "reward_std": 0.132080078125, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6906739473342896, "step": 530 }, { "completion_length": 124.0, "epoch": 0.34235976789168276, "grad_norm": 18.77686882019043, "kl": 0.1025390625, "learning_rate": 8.288201160541586e-07, "loss": 0.0041, "reward": 1.6462790966033936, "reward_std": 0.09448760747909546, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6462791562080383, "step": 531 }, { "completion_length": 124.125, "epoch": 0.34300451321727915, "grad_norm": 11.553679466247559, "kl": 0.11083984375, "learning_rate": 8.284977433913604e-07, "loss": 0.0044, "reward": 1.4024157524108887, "reward_std": 0.14840160310268402, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4024156928062439, "step": 532 }, { "completion_length": 118.765625, "epoch": 0.3436492585428756, "grad_norm": 39.30613708496094, "kl": 0.102783203125, "learning_rate": 8.281753707285621e-07, "loss": 0.0041, "reward": 1.5651786923408508, "reward_std": 0.14602860994637012, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.565178707242012, "step": 533 }, { "completion_length": 131.0, "epoch": 0.344294003868472, "grad_norm": 10.37143325805664, "kl": 0.119384765625, "learning_rate": 8.278529980657641e-07, "loss": 0.0048, "reward": 1.6771512627601624, "reward_std": 0.12305908650159836, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6771512627601624, "step": 534 }, { "completion_length": 106.59375, "epoch": 0.34493874919406836, "grad_norm": 7.9928202629089355, "kl": 0.106689453125, "learning_rate": 8.275306254029658e-07, "loss": 0.0043, "reward": 1.5217041969299316, "reward_std": 0.06747284345328808, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5217041969299316, "step": 535 }, { "completion_length": 115.75, "epoch": 0.34558349451966475, "grad_norm": 14.964887619018555, "kl": 0.109375, "learning_rate": 8.272082527401676e-07, "loss": 0.0044, "reward": 1.5932498574256897, "reward_std": 0.11327129602432251, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5932497978210449, "step": 536 }, { "completion_length": 108.9375, "epoch": 0.34622823984526113, "grad_norm": 13.483436584472656, "kl": 0.1064453125, "learning_rate": 8.268858800773694e-07, "loss": 0.0043, "reward": 1.4369852542877197, "reward_std": 0.10051735490560532, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4369853138923645, "step": 537 }, { "completion_length": 125.0625, "epoch": 0.3468729851708575, "grad_norm": 27.211544036865234, "kl": 0.11083984375, "learning_rate": 8.265635074145713e-07, "loss": 0.0044, "reward": 1.6145740747451782, "reward_std": 0.11309744045138359, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6145740449428558, "step": 538 }, { "completion_length": 121.9375, "epoch": 0.3475177304964539, "grad_norm": 11.06650447845459, "kl": 0.129150390625, "learning_rate": 8.26241134751773e-07, "loss": 0.0052, "reward": 1.518451452255249, "reward_std": 0.10257099196314812, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5184515118598938, "step": 539 }, { "completion_length": 120.546875, "epoch": 0.3481624758220503, "grad_norm": 27.978939056396484, "kl": 0.10546875, "learning_rate": 8.259187620889749e-07, "loss": 0.0042, "reward": 1.5805214643478394, "reward_std": 0.10257112607359886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5805214643478394, "step": 540 }, { "completion_length": 122.21875, "epoch": 0.3488072211476467, "grad_norm": 10.80872631072998, "kl": 0.109619140625, "learning_rate": 8.255963894261766e-07, "loss": 0.0044, "reward": 1.4624775052070618, "reward_std": 0.09696153551340103, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4624774754047394, "step": 541 }, { "completion_length": 109.28125, "epoch": 0.34945196647324306, "grad_norm": 12.653077125549316, "kl": 0.133544921875, "learning_rate": 8.252740167633784e-07, "loss": 0.0053, "reward": 1.588198721408844, "reward_std": 0.11155318468809128, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.588198721408844, "step": 542 }, { "completion_length": 120.09375, "epoch": 0.35009671179883944, "grad_norm": 22.150753021240234, "kl": 0.118408203125, "learning_rate": 8.249516441005803e-07, "loss": 0.0047, "reward": 1.56528639793396, "reward_std": 0.10561125725507736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.565286323428154, "step": 543 }, { "completion_length": 125.9375, "epoch": 0.35074145712443583, "grad_norm": 16.026426315307617, "kl": 0.119140625, "learning_rate": 8.246292714377821e-07, "loss": 0.0048, "reward": 1.427088737487793, "reward_std": 0.17949910461902618, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.44271376729011536, "step": 544 }, { "completion_length": 125.15625, "epoch": 0.3513862024500322, "grad_norm": 14.44304370880127, "kl": 0.11181640625, "learning_rate": 8.243068987749838e-07, "loss": 0.0045, "reward": 1.582537829875946, "reward_std": 0.09015891328454018, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5825379192829132, "step": 545 }, { "completion_length": 126.25, "epoch": 0.3520309477756286, "grad_norm": 17.329204559326172, "kl": 0.10498046875, "learning_rate": 8.239845261121857e-07, "loss": 0.0042, "reward": 1.6583821773529053, "reward_std": 0.13926228135824203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6583821475505829, "step": 546 }, { "completion_length": 123.859375, "epoch": 0.35267569310122504, "grad_norm": 24.32595443725586, "kl": 0.112548828125, "learning_rate": 8.236621534493875e-07, "loss": 0.0045, "reward": 1.613775610923767, "reward_std": 0.11163278669118881, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6137756109237671, "step": 547 }, { "completion_length": 120.5625, "epoch": 0.3533204384268214, "grad_norm": 13.765203475952148, "kl": 0.112548828125, "learning_rate": 8.233397807865893e-07, "loss": 0.0045, "reward": 1.637283205986023, "reward_std": 0.12223464995622635, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.637283205986023, "step": 548 }, { "completion_length": 128.75, "epoch": 0.3539651837524178, "grad_norm": 7.839471340179443, "kl": 0.140625, "learning_rate": 8.23017408123791e-07, "loss": 0.0056, "reward": 1.7445607781410217, "reward_std": 0.1211717426776886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7445608079433441, "step": 549 }, { "completion_length": 128.96875, "epoch": 0.3546099290780142, "grad_norm": 10.117485046386719, "kl": 0.11376953125, "learning_rate": 8.226950354609929e-07, "loss": 0.0045, "reward": 1.4675299525260925, "reward_std": 0.12889251857995987, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4675299674272537, "step": 550 }, { "completion_length": 114.296875, "epoch": 0.3552546744036106, "grad_norm": 26.474218368530273, "kl": 0.132568359375, "learning_rate": 8.223726627981947e-07, "loss": 0.0053, "reward": 1.5867825746536255, "reward_std": 0.10010011121630669, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5867825746536255, "step": 551 }, { "completion_length": 130.15625, "epoch": 0.35589941972920697, "grad_norm": 8.115386962890625, "kl": 0.109375, "learning_rate": 8.220502901353965e-07, "loss": 0.0044, "reward": 1.4267299175262451, "reward_std": 0.06233101151883602, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4267299324274063, "step": 552 }, { "completion_length": 130.5625, "epoch": 0.35654416505480335, "grad_norm": 31.162986755371094, "kl": 0.125244140625, "learning_rate": 8.217279174725983e-07, "loss": 0.005, "reward": 1.4682827591896057, "reward_std": 0.19038020819425583, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.48390769958496094, "step": 553 }, { "completion_length": 123.96875, "epoch": 0.35718891038039974, "grad_norm": 9.136853218078613, "kl": 0.113525390625, "learning_rate": 8.214055448098001e-07, "loss": 0.0045, "reward": 1.6539664268493652, "reward_std": 0.09818685799837112, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6539664268493652, "step": 554 }, { "completion_length": 122.3125, "epoch": 0.3578336557059961, "grad_norm": 13.743880271911621, "kl": 0.11962890625, "learning_rate": 8.210831721470018e-07, "loss": 0.0048, "reward": 1.4207700490951538, "reward_std": 0.11169535666704178, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4207700490951538, "step": 555 }, { "completion_length": 132.734375, "epoch": 0.3584784010315925, "grad_norm": 19.605443954467773, "kl": 0.12841796875, "learning_rate": 8.207607994842038e-07, "loss": 0.0051, "reward": 1.6942888498306274, "reward_std": 0.1028636246919632, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6942889094352722, "step": 556 }, { "completion_length": 130.140625, "epoch": 0.3591231463571889, "grad_norm": 10.357660293579102, "kl": 0.107666015625, "learning_rate": 8.204384268214055e-07, "loss": 0.0043, "reward": 1.49127995967865, "reward_std": 0.11323336511850357, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49128003418445587, "step": 557 }, { "completion_length": 149.734375, "epoch": 0.3597678916827853, "grad_norm": 8.94410514831543, "kl": 0.1181640625, "learning_rate": 8.201160541586073e-07, "loss": 0.0047, "reward": 1.739545226097107, "reward_std": 0.0977691225707531, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7395451962947845, "step": 558 }, { "completion_length": 142.34375, "epoch": 0.36041263700838166, "grad_norm": 9.473371505737305, "kl": 0.109619140625, "learning_rate": 8.197936814958091e-07, "loss": 0.0044, "reward": 1.619294822216034, "reward_std": 0.10096556693315506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6192947924137115, "step": 559 }, { "completion_length": 124.953125, "epoch": 0.3610573823339781, "grad_norm": 14.380293846130371, "kl": 0.1201171875, "learning_rate": 8.19471308833011e-07, "loss": 0.0048, "reward": 1.418857455253601, "reward_std": 0.08556991256773472, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41885746270418167, "step": 560 }, { "completion_length": 136.6875, "epoch": 0.3617021276595745, "grad_norm": 9.324554443359375, "kl": 0.114013671875, "learning_rate": 8.191489361702127e-07, "loss": 0.0046, "reward": 1.5292004346847534, "reward_std": 0.07976378127932549, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5292005091905594, "step": 561 }, { "completion_length": 155.71875, "epoch": 0.3623468729851709, "grad_norm": 10.505910873413086, "kl": 0.131103515625, "learning_rate": 8.188265635074146e-07, "loss": 0.0053, "reward": 1.5651956796646118, "reward_std": 0.11247077211737633, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5651956498622894, "step": 562 }, { "completion_length": 146.3125, "epoch": 0.36299161831076726, "grad_norm": 9.546722412109375, "kl": 0.1083984375, "learning_rate": 8.185041908446163e-07, "loss": 0.0043, "reward": 1.4872339367866516, "reward_std": 0.09367235377430916, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4872339963912964, "step": 563 }, { "completion_length": 131.546875, "epoch": 0.36363636363636365, "grad_norm": 13.24105453491211, "kl": 0.130859375, "learning_rate": 8.181818181818182e-07, "loss": 0.0052, "reward": 1.458554744720459, "reward_std": 0.10819820687174797, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.458554744720459, "step": 564 }, { "completion_length": 149.265625, "epoch": 0.36428110896196003, "grad_norm": 7.181205749511719, "kl": 0.122802734375, "learning_rate": 8.1785944551902e-07, "loss": 0.0049, "reward": 1.4140058159828186, "reward_std": 0.10015027970075607, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4140057861804962, "step": 565 }, { "completion_length": 139.8125, "epoch": 0.3649258542875564, "grad_norm": 13.35418701171875, "kl": 0.197265625, "learning_rate": 8.175370728562218e-07, "loss": 0.0079, "reward": 1.561979353427887, "reward_std": 0.21178478002548218, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.577604353427887, "step": 566 }, { "completion_length": 128.84375, "epoch": 0.3655705996131528, "grad_norm": 51.9110107421875, "kl": 0.114013671875, "learning_rate": 8.172147001934235e-07, "loss": 0.0046, "reward": 1.7140253782272339, "reward_std": 0.16923753917217255, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7140253484249115, "step": 567 }, { "completion_length": 135.4375, "epoch": 0.3662153449387492, "grad_norm": 8.429132461547852, "kl": 0.1259765625, "learning_rate": 8.168923275306254e-07, "loss": 0.005, "reward": 1.4810205698013306, "reward_std": 0.10634732991456985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4810205101966858, "step": 568 }, { "completion_length": 142.625, "epoch": 0.3668600902643456, "grad_norm": 11.647993087768555, "kl": 0.12451171875, "learning_rate": 8.165699548678272e-07, "loss": 0.005, "reward": 1.3584343194961548, "reward_std": 0.07176856696605682, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3584343045949936, "step": 569 }, { "completion_length": 129.375, "epoch": 0.36750483558994196, "grad_norm": 7.475222110748291, "kl": 0.13818359375, "learning_rate": 8.16247582205029e-07, "loss": 0.0055, "reward": 1.5198262929916382, "reward_std": 0.12593180686235428, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.519826278090477, "step": 570 }, { "completion_length": 121.375, "epoch": 0.36814958091553834, "grad_norm": 10.700190544128418, "kl": 0.123046875, "learning_rate": 8.159252095422307e-07, "loss": 0.0049, "reward": 1.5901803374290466, "reward_std": 0.1543055735528469, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.590180367231369, "step": 571 }, { "completion_length": 129.015625, "epoch": 0.36879432624113473, "grad_norm": 11.171639442443848, "kl": 0.111083984375, "learning_rate": 8.156028368794326e-07, "loss": 0.0044, "reward": 1.3946861624717712, "reward_std": 0.11192984879016876, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.39468614757061005, "step": 572 }, { "completion_length": 124.875, "epoch": 0.3694390715667311, "grad_norm": 8.263480186462402, "kl": 0.1396484375, "learning_rate": 8.152804642166344e-07, "loss": 0.0056, "reward": 1.6307023167610168, "reward_std": 0.18546786159276962, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.6775772869586945, "step": 573 }, { "completion_length": 123.46875, "epoch": 0.37008381689232756, "grad_norm": 26.08427619934082, "kl": 0.1220703125, "learning_rate": 8.149580915538362e-07, "loss": 0.0049, "reward": 1.4789862036705017, "reward_std": 0.15343213826417923, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4946112334728241, "step": 574 }, { "completion_length": 135.953125, "epoch": 0.37072856221792394, "grad_norm": 10.028849601745605, "kl": 0.114990234375, "learning_rate": 8.14635718891038e-07, "loss": 0.0046, "reward": 1.5295458436012268, "reward_std": 0.12414504960179329, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5295458734035492, "step": 575 }, { "completion_length": 128.015625, "epoch": 0.3713733075435203, "grad_norm": 13.812504768371582, "kl": 0.112060546875, "learning_rate": 8.143133462282398e-07, "loss": 0.0045, "reward": 1.5613334774971008, "reward_std": 0.10750623792409897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5613334625959396, "step": 576 }, { "completion_length": 149.625, "epoch": 0.3720180528691167, "grad_norm": 27.1822509765625, "kl": 0.102294921875, "learning_rate": 8.139909735654416e-07, "loss": 0.0041, "reward": 1.6229792833328247, "reward_std": 0.11508912220597267, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6229793429374695, "step": 577 }, { "completion_length": 124.609375, "epoch": 0.3726627981947131, "grad_norm": 9.831287384033203, "kl": 0.12109375, "learning_rate": 8.136686009026435e-07, "loss": 0.0048, "reward": 1.601158857345581, "reward_std": 0.11256638541817665, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.601158857345581, "step": 578 }, { "completion_length": 134.984375, "epoch": 0.3733075435203095, "grad_norm": 26.67717170715332, "kl": 0.1416015625, "learning_rate": 8.133462282398452e-07, "loss": 0.0057, "reward": 1.4907135963439941, "reward_std": 0.1917910873889923, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5063386261463165, "step": 579 }, { "completion_length": 131.421875, "epoch": 0.37395228884590587, "grad_norm": 6.938395023345947, "kl": 0.111328125, "learning_rate": 8.13023855577047e-07, "loss": 0.0045, "reward": 1.559043526649475, "reward_std": 0.10559235885739326, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5590435266494751, "step": 580 }, { "completion_length": 138.9375, "epoch": 0.37459703417150225, "grad_norm": 21.588024139404297, "kl": 0.113525390625, "learning_rate": 8.127014829142488e-07, "loss": 0.0045, "reward": 1.5905563831329346, "reward_std": 0.09706471487879753, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5905563831329346, "step": 581 }, { "completion_length": 134.515625, "epoch": 0.37524177949709864, "grad_norm": 17.881168365478516, "kl": 0.119140625, "learning_rate": 8.123791102514507e-07, "loss": 0.0048, "reward": 1.543268859386444, "reward_std": 0.18078606575727463, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5432688891887665, "step": 582 }, { "completion_length": 145.953125, "epoch": 0.375886524822695, "grad_norm": 29.037656784057617, "kl": 0.10107421875, "learning_rate": 8.120567375886524e-07, "loss": 0.004, "reward": 1.6184459328651428, "reward_std": 0.05552523583173752, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6184459626674652, "step": 583 }, { "completion_length": 145.234375, "epoch": 0.3765312701482914, "grad_norm": 15.09101676940918, "kl": 0.13525390625, "learning_rate": 8.117343649258543e-07, "loss": 0.0054, "reward": 1.487252116203308, "reward_std": 0.1185568068176508, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4872521460056305, "step": 584 }, { "completion_length": 125.8125, "epoch": 0.3771760154738878, "grad_norm": 11.656231880187988, "kl": 0.1201171875, "learning_rate": 8.11411992263056e-07, "loss": 0.0048, "reward": 1.4543855786323547, "reward_std": 0.09149051457643509, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45438555628061295, "step": 585 }, { "completion_length": 134.234375, "epoch": 0.3778207607994842, "grad_norm": 28.694446563720703, "kl": 0.100341796875, "learning_rate": 8.110896196002579e-07, "loss": 0.004, "reward": 1.495825171470642, "reward_std": 0.14905067533254623, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4958251863718033, "step": 586 }, { "completion_length": 125.84375, "epoch": 0.3784655061250806, "grad_norm": 12.141417503356934, "kl": 0.136962890625, "learning_rate": 8.107672469374597e-07, "loss": 0.0055, "reward": 1.6799025535583496, "reward_std": 0.15780113637447357, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6799026131629944, "step": 587 }, { "completion_length": 145.375, "epoch": 0.379110251450677, "grad_norm": 8.39338207244873, "kl": 0.1220703125, "learning_rate": 8.104448742746615e-07, "loss": 0.0049, "reward": 1.6171015501022339, "reward_std": 0.20077940821647644, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6327264904975891, "step": 588 }, { "completion_length": 125.796875, "epoch": 0.3797549967762734, "grad_norm": 10.23801040649414, "kl": 0.110595703125, "learning_rate": 8.101225016118632e-07, "loss": 0.0044, "reward": 1.3968492150306702, "reward_std": 0.11241466552019119, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3968491703271866, "step": 589 }, { "completion_length": 137.03125, "epoch": 0.3803997421018698, "grad_norm": 11.222281455993652, "kl": 0.142578125, "learning_rate": 8.098001289490652e-07, "loss": 0.0057, "reward": 1.656241774559021, "reward_std": 0.11266876012086868, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6562418043613434, "step": 590 }, { "completion_length": 133.859375, "epoch": 0.38104448742746616, "grad_norm": 14.622573852539062, "kl": 0.112060546875, "learning_rate": 8.094777562862669e-07, "loss": 0.0045, "reward": 1.6198719143867493, "reward_std": 0.14295167103409767, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6198718845844269, "step": 591 }, { "completion_length": 138.921875, "epoch": 0.38168923275306255, "grad_norm": 14.126190185546875, "kl": 0.124267578125, "learning_rate": 8.091553836234687e-07, "loss": 0.005, "reward": 1.544808268547058, "reward_std": 0.16154202073812485, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5448082685470581, "step": 592 }, { "completion_length": 140.140625, "epoch": 0.38233397807865893, "grad_norm": 10.186321258544922, "kl": 0.10791015625, "learning_rate": 8.088330109606704e-07, "loss": 0.0043, "reward": 1.633695662021637, "reward_std": 0.09759926423430443, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.633695662021637, "step": 593 }, { "completion_length": 129.8125, "epoch": 0.3829787234042553, "grad_norm": 13.927589416503906, "kl": 0.108642578125, "learning_rate": 8.085106382978723e-07, "loss": 0.0043, "reward": 1.5086371898651123, "reward_std": 0.12473899126052856, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5086371302604675, "step": 594 }, { "completion_length": 125.625, "epoch": 0.3836234687298517, "grad_norm": 13.513331413269043, "kl": 0.119140625, "learning_rate": 8.081882656350741e-07, "loss": 0.0048, "reward": 1.7253805994987488, "reward_std": 0.16587653756141663, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7410056293010712, "step": 595 }, { "completion_length": 135.125, "epoch": 0.3842682140554481, "grad_norm": 9.143721580505371, "kl": 0.10986328125, "learning_rate": 8.07865892972276e-07, "loss": 0.0044, "reward": 1.5537520051002502, "reward_std": 0.09487171098589897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5537519752979279, "step": 596 }, { "completion_length": 126.421875, "epoch": 0.3849129593810445, "grad_norm": 10.566431045532227, "kl": 0.10302734375, "learning_rate": 8.075435203094777e-07, "loss": 0.0041, "reward": 1.5097746849060059, "reward_std": 0.07031143084168434, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5097746551036835, "step": 597 }, { "completion_length": 130.84375, "epoch": 0.38555770470664086, "grad_norm": 8.138565063476562, "kl": 0.119140625, "learning_rate": 8.072211476466795e-07, "loss": 0.0048, "reward": 1.6204659342765808, "reward_std": 0.1338505893945694, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6360909044742584, "step": 598 }, { "completion_length": 136.28125, "epoch": 0.38620245003223724, "grad_norm": 10.86203670501709, "kl": 0.11865234375, "learning_rate": 8.068987749838813e-07, "loss": 0.0047, "reward": 1.634830355644226, "reward_std": 0.08793558552861214, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6348303556442261, "step": 599 }, { "completion_length": 130.4375, "epoch": 0.38684719535783363, "grad_norm": 7.916982650756836, "kl": 0.11376953125, "learning_rate": 8.065764023210832e-07, "loss": 0.0045, "reward": 1.5632728338241577, "reward_std": 0.10517224669456482, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5632728040218353, "step": 600 }, { "completion_length": 155.3125, "epoch": 0.38749194068343007, "grad_norm": 13.323336601257324, "kl": 0.11083984375, "learning_rate": 8.062540296582849e-07, "loss": 0.0044, "reward": 1.4888296127319336, "reward_std": 0.11279592663049698, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4888295754790306, "step": 601 }, { "completion_length": 134.046875, "epoch": 0.38813668600902645, "grad_norm": 11.745083808898926, "kl": 0.11767578125, "learning_rate": 8.059316569954867e-07, "loss": 0.0047, "reward": 1.597053587436676, "reward_std": 0.1300004981458187, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.597053587436676, "step": 602 }, { "completion_length": 145.640625, "epoch": 0.38878143133462284, "grad_norm": 8.197556495666504, "kl": 0.1171875, "learning_rate": 8.056092843326886e-07, "loss": 0.0047, "reward": 1.5338327884674072, "reward_std": 0.15634901821613312, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5338327586650848, "step": 603 }, { "completion_length": 142.859375, "epoch": 0.3894261766602192, "grad_norm": 19.091354370117188, "kl": 0.111572265625, "learning_rate": 8.052869116698904e-07, "loss": 0.0045, "reward": 1.6582316160202026, "reward_std": 0.11573278903961182, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6582316756248474, "step": 604 }, { "completion_length": 147.296875, "epoch": 0.3900709219858156, "grad_norm": 28.295654296875, "kl": 0.128662109375, "learning_rate": 8.049645390070921e-07, "loss": 0.0051, "reward": 1.4338959455490112, "reward_std": 0.10549728572368622, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43389593064785004, "step": 605 }, { "completion_length": 137.96875, "epoch": 0.390715667311412, "grad_norm": 137.10951232910156, "kl": 0.112060546875, "learning_rate": 8.04642166344294e-07, "loss": 0.0045, "reward": 1.6279088258743286, "reward_std": 0.1394149400293827, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6279088854789734, "step": 606 }, { "completion_length": 132.046875, "epoch": 0.3913604126370084, "grad_norm": 18.1970272064209, "kl": 0.13427734375, "learning_rate": 8.043197936814957e-07, "loss": 0.0054, "reward": 1.4707345366477966, "reward_std": 0.10406222194433212, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47073453664779663, "step": 607 }, { "completion_length": 147.9375, "epoch": 0.39200515796260477, "grad_norm": 9.023652076721191, "kl": 0.112060546875, "learning_rate": 8.039974210186976e-07, "loss": 0.0045, "reward": 1.6449916362762451, "reward_std": 0.22956346720457077, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6606166958808899, "step": 608 }, { "completion_length": 134.5, "epoch": 0.39264990328820115, "grad_norm": 12.73431396484375, "kl": 0.102783203125, "learning_rate": 8.036750483558994e-07, "loss": 0.0041, "reward": 1.4050042033195496, "reward_std": 0.09452563524246216, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4050041437149048, "step": 609 }, { "completion_length": 123.96875, "epoch": 0.39329464861379754, "grad_norm": 15.1465425491333, "kl": 0.11572265625, "learning_rate": 8.033526756931012e-07, "loss": 0.0046, "reward": 1.5025982856750488, "reward_std": 0.1606716513633728, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5182233303785324, "step": 610 }, { "completion_length": 128.84375, "epoch": 0.3939393939393939, "grad_norm": 10.938918113708496, "kl": 0.11865234375, "learning_rate": 8.030303030303029e-07, "loss": 0.0047, "reward": 1.56488835811615, "reward_std": 0.10107577964663506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5648883879184723, "step": 611 }, { "completion_length": 130.546875, "epoch": 0.3945841392649903, "grad_norm": 10.713906288146973, "kl": 0.2900390625, "learning_rate": 8.027079303675049e-07, "loss": 0.0116, "reward": 1.6501939296722412, "reward_std": 0.1356111727654934, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6501939296722412, "step": 612 }, { "completion_length": 129.109375, "epoch": 0.3952288845905867, "grad_norm": 12.522553443908691, "kl": 0.101806640625, "learning_rate": 8.023855577047066e-07, "loss": 0.0041, "reward": 1.6621387004852295, "reward_std": 0.09056547284126282, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6621387004852295, "step": 613 }, { "completion_length": 114.875, "epoch": 0.39587362991618313, "grad_norm": 14.493385314941406, "kl": 0.108642578125, "learning_rate": 8.020631850419084e-07, "loss": 0.0043, "reward": 1.5702993869781494, "reward_std": 0.09657874330878258, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5702993869781494, "step": 614 }, { "completion_length": 130.0625, "epoch": 0.3965183752417795, "grad_norm": 14.518086433410645, "kl": 0.092041015625, "learning_rate": 8.017408123791101e-07, "loss": 0.0037, "reward": 1.5168561935424805, "reward_std": 0.12171942368149757, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5168561637401581, "step": 615 }, { "completion_length": 114.875, "epoch": 0.3971631205673759, "grad_norm": 13.475616455078125, "kl": 0.10693359375, "learning_rate": 8.014184397163121e-07, "loss": 0.0043, "reward": 1.5117144584655762, "reward_std": 0.16558092087507248, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5117143988609314, "step": 616 }, { "completion_length": 132.4375, "epoch": 0.3978078658929723, "grad_norm": 11.467823028564453, "kl": 0.097900390625, "learning_rate": 8.010960670535138e-07, "loss": 0.0039, "reward": 1.4913342595100403, "reward_std": 0.17832869291305542, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5069593191146851, "step": 617 }, { "completion_length": 115.921875, "epoch": 0.3984526112185687, "grad_norm": 14.274715423583984, "kl": 0.107666015625, "learning_rate": 8.007736943907156e-07, "loss": 0.0043, "reward": 1.3526421785354614, "reward_std": 0.07542551681399345, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3526422083377838, "step": 618 }, { "completion_length": 116.875, "epoch": 0.39909735654416506, "grad_norm": 9.649855613708496, "kl": 0.1298828125, "learning_rate": 8.004513217279174e-07, "loss": 0.0052, "reward": 1.565634310245514, "reward_std": 0.1185467280447483, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5656343698501587, "step": 619 }, { "completion_length": 118.9375, "epoch": 0.39974210186976145, "grad_norm": 22.857391357421875, "kl": 0.133544921875, "learning_rate": 8.001289490651193e-07, "loss": 0.0053, "reward": 1.5444243550300598, "reward_std": 0.09491568803787231, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5444244742393494, "step": 620 }, { "completion_length": 119.453125, "epoch": 0.40038684719535783, "grad_norm": 20.537433624267578, "kl": 0.114013671875, "learning_rate": 7.99806576402321e-07, "loss": 0.0046, "reward": 1.4165579080581665, "reward_std": 0.16316518932580948, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4321829378604889, "step": 621 }, { "completion_length": 115.0625, "epoch": 0.4010315925209542, "grad_norm": 22.098909378051758, "kl": 0.1640625, "learning_rate": 7.994842037395229e-07, "loss": 0.0066, "reward": 1.5407551527023315, "reward_std": 0.17422685772180557, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5407552272081375, "step": 622 }, { "completion_length": 118.734375, "epoch": 0.4016763378465506, "grad_norm": 48.7105827331543, "kl": 0.107421875, "learning_rate": 7.991618310767246e-07, "loss": 0.0043, "reward": 1.6025624871253967, "reward_std": 0.10630748048424721, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.602562427520752, "step": 623 }, { "completion_length": 119.59375, "epoch": 0.402321083172147, "grad_norm": 17.388961791992188, "kl": 0.095458984375, "learning_rate": 7.988394584139264e-07, "loss": 0.0038, "reward": 1.6318548321723938, "reward_std": 0.1776801347732544, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6318548321723938, "step": 624 }, { "completion_length": 121.046875, "epoch": 0.4029658284977434, "grad_norm": 7.685291290283203, "kl": 0.110107421875, "learning_rate": 7.985170857511283e-07, "loss": 0.0044, "reward": 1.5066440105438232, "reward_std": 0.10979078710079193, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.506644070148468, "step": 625 }, { "completion_length": 123.765625, "epoch": 0.40361057382333976, "grad_norm": 20.131128311157227, "kl": 0.11767578125, "learning_rate": 7.981947130883301e-07, "loss": 0.0047, "reward": 1.6256684064865112, "reward_std": 0.1347283013164997, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6256683468818665, "step": 626 }, { "completion_length": 112.890625, "epoch": 0.40425531914893614, "grad_norm": 7.655572891235352, "kl": 0.119873046875, "learning_rate": 7.978723404255318e-07, "loss": 0.0048, "reward": 1.525473415851593, "reward_std": 0.1499604694545269, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5254734456539154, "step": 627 }, { "completion_length": 130.328125, "epoch": 0.4049000644745326, "grad_norm": 95.7166519165039, "kl": 0.107666015625, "learning_rate": 7.975499677627337e-07, "loss": 0.0043, "reward": 1.5183069109916687, "reward_std": 0.12149505317211151, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5183069556951523, "step": 628 }, { "completion_length": 136.890625, "epoch": 0.40554480980012897, "grad_norm": 16.36639404296875, "kl": 0.10693359375, "learning_rate": 7.972275950999355e-07, "loss": 0.0043, "reward": 1.6043989658355713, "reward_std": 0.1741345077753067, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6043989807367325, "step": 629 }, { "completion_length": 132.0625, "epoch": 0.40618955512572535, "grad_norm": 9.136812210083008, "kl": 0.10986328125, "learning_rate": 7.969052224371373e-07, "loss": 0.0044, "reward": 1.3735297918319702, "reward_std": 0.13789300620555878, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.373529776930809, "step": 630 }, { "completion_length": 115.65625, "epoch": 0.40683430045132174, "grad_norm": 7.693540573120117, "kl": 0.1201171875, "learning_rate": 7.965828497743391e-07, "loss": 0.0048, "reward": 1.6083815693855286, "reward_std": 0.1517021358013153, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6083815693855286, "step": 631 }, { "completion_length": 125.515625, "epoch": 0.4074790457769181, "grad_norm": 7.784499645233154, "kl": 0.11962890625, "learning_rate": 7.962604771115409e-07, "loss": 0.0048, "reward": 1.487956166267395, "reward_std": 0.16370031237602234, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48795612156391144, "step": 632 }, { "completion_length": 108.140625, "epoch": 0.4081237911025145, "grad_norm": 6.592469215393066, "kl": 0.123779296875, "learning_rate": 7.959381044487427e-07, "loss": 0.005, "reward": 1.7070671916007996, "reward_std": 0.08668256551027298, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7070672810077667, "step": 633 }, { "completion_length": 113.203125, "epoch": 0.4087685364281109, "grad_norm": 10.177643775939941, "kl": 0.113525390625, "learning_rate": 7.956157317859446e-07, "loss": 0.0045, "reward": 1.6079189777374268, "reward_std": 0.12942004203796387, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6079188883304596, "step": 634 }, { "completion_length": 120.109375, "epoch": 0.4094132817537073, "grad_norm": 8.311883926391602, "kl": 0.1103515625, "learning_rate": 7.952933591231463e-07, "loss": 0.0044, "reward": 1.634382724761963, "reward_std": 0.19498103857040405, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.63438281416893, "step": 635 }, { "completion_length": 105.84375, "epoch": 0.41005802707930367, "grad_norm": 15.116155624389648, "kl": 0.104736328125, "learning_rate": 7.949709864603481e-07, "loss": 0.0042, "reward": 1.5293989777565002, "reward_std": 0.11750294640660286, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5293989479541779, "step": 636 }, { "completion_length": 103.53125, "epoch": 0.41070277240490005, "grad_norm": 8.729680061340332, "kl": 0.11474609375, "learning_rate": 7.946486137975498e-07, "loss": 0.0046, "reward": 1.5659652352333069, "reward_std": 0.15115171670913696, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5659652352333069, "step": 637 }, { "completion_length": 108.03125, "epoch": 0.41134751773049644, "grad_norm": 47.11723327636719, "kl": 0.1240234375, "learning_rate": 7.943262411347518e-07, "loss": 0.005, "reward": 1.554872214794159, "reward_std": 0.1266670562326908, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5548722743988037, "step": 638 }, { "completion_length": 114.296875, "epoch": 0.4119922630560928, "grad_norm": 9.169736862182617, "kl": 0.1259765625, "learning_rate": 7.940038684719535e-07, "loss": 0.005, "reward": 1.306950032711029, "reward_std": 0.07547219470143318, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.30694999545812607, "step": 639 }, { "completion_length": 107.890625, "epoch": 0.4126370083816892, "grad_norm": 9.10857105255127, "kl": 0.10693359375, "learning_rate": 7.936814958091553e-07, "loss": 0.0043, "reward": 1.6737666726112366, "reward_std": 0.0729268491268158, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6737666428089142, "step": 640 }, { "completion_length": 107.0625, "epoch": 0.41328175370728565, "grad_norm": 21.960092544555664, "kl": 0.1416015625, "learning_rate": 7.933591231463571e-07, "loss": 0.0057, "reward": 1.5458187460899353, "reward_std": 0.13090715557336807, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5458187162876129, "step": 641 }, { "completion_length": 118.515625, "epoch": 0.41392649903288203, "grad_norm": 11.414142608642578, "kl": 0.140625, "learning_rate": 7.93036750483559e-07, "loss": 0.0056, "reward": 1.5467508435249329, "reward_std": 0.1591968908905983, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5623758435249329, "step": 642 }, { "completion_length": 103.515625, "epoch": 0.4145712443584784, "grad_norm": 8.753649711608887, "kl": 0.11572265625, "learning_rate": 7.927143778207607e-07, "loss": 0.0046, "reward": 1.463947594165802, "reward_std": 0.10150327160954475, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.463947594165802, "step": 643 }, { "completion_length": 110.328125, "epoch": 0.4152159896840748, "grad_norm": 16.589797973632812, "kl": 0.12060546875, "learning_rate": 7.923920051579626e-07, "loss": 0.0048, "reward": 1.4456722736358643, "reward_std": 0.16995223611593246, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.46129727363586426, "step": 644 }, { "completion_length": 109.5625, "epoch": 0.4158607350096712, "grad_norm": 11.583199501037598, "kl": 0.1083984375, "learning_rate": 7.920696324951643e-07, "loss": 0.0043, "reward": 1.4806134700775146, "reward_std": 0.1219950020313263, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48061345517635345, "step": 645 }, { "completion_length": 111.65625, "epoch": 0.4165054803352676, "grad_norm": 22.0245418548584, "kl": 0.11181640625, "learning_rate": 7.917472598323662e-07, "loss": 0.0045, "reward": 1.457335352897644, "reward_std": 0.07867556810379028, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45733533799648285, "step": 646 }, { "completion_length": 100.890625, "epoch": 0.41715022566086396, "grad_norm": 25.137718200683594, "kl": 0.14208984375, "learning_rate": 7.91424887169568e-07, "loss": 0.0057, "reward": 1.6399447917938232, "reward_std": 0.1750987023115158, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6555697917938232, "step": 647 }, { "completion_length": 115.546875, "epoch": 0.41779497098646035, "grad_norm": 23.508705139160156, "kl": 0.122314453125, "learning_rate": 7.911025145067698e-07, "loss": 0.0049, "reward": 1.4861845970153809, "reward_std": 0.052020519971847534, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4861845374107361, "step": 648 }, { "completion_length": 106.359375, "epoch": 0.41843971631205673, "grad_norm": 11.12207317352295, "kl": 0.11083984375, "learning_rate": 7.907801418439715e-07, "loss": 0.0044, "reward": 1.5252867937088013, "reward_std": 0.19501442462205887, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5409118235111237, "step": 649 }, { "completion_length": 104.046875, "epoch": 0.4190844616376531, "grad_norm": 10.382707595825195, "kl": 0.12890625, "learning_rate": 7.904577691811734e-07, "loss": 0.0052, "reward": 1.5812249779701233, "reward_std": 0.15144172310829163, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5812250077724457, "step": 650 }, { "completion_length": 104.640625, "epoch": 0.4197292069632495, "grad_norm": 27.51158332824707, "kl": 0.11572265625, "learning_rate": 7.901353965183752e-07, "loss": 0.0046, "reward": 1.6362160444259644, "reward_std": 0.14013683423399925, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6362160742282867, "step": 651 }, { "completion_length": 104.59375, "epoch": 0.4203739522888459, "grad_norm": 16.1353816986084, "kl": 0.1357421875, "learning_rate": 7.89813023855577e-07, "loss": 0.0054, "reward": 1.453727900981903, "reward_std": 0.12474618852138519, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4537279009819031, "step": 652 }, { "completion_length": 109.765625, "epoch": 0.42101869761444227, "grad_norm": 10.052741050720215, "kl": 0.131591796875, "learning_rate": 7.894906511927788e-07, "loss": 0.0053, "reward": 1.5141014456748962, "reward_std": 0.09088888391852379, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5141013860702515, "step": 653 }, { "completion_length": 105.71875, "epoch": 0.42166344294003866, "grad_norm": 13.285776138305664, "kl": 0.1083984375, "learning_rate": 7.891682785299806e-07, "loss": 0.0043, "reward": 1.5992873907089233, "reward_std": 0.1335230953991413, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5992873311042786, "step": 654 }, { "completion_length": 116.6875, "epoch": 0.4223081882656351, "grad_norm": 14.342023849487305, "kl": 0.138427734375, "learning_rate": 7.888459058671824e-07, "loss": 0.0055, "reward": 1.3910069465637207, "reward_std": 0.09431485831737518, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3910069465637207, "step": 655 }, { "completion_length": 98.203125, "epoch": 0.4229529335912315, "grad_norm": 13.409884452819824, "kl": 0.104248046875, "learning_rate": 7.885235332043843e-07, "loss": 0.0042, "reward": 1.4601246118545532, "reward_std": 0.07733478024601936, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46012458205223083, "step": 656 }, { "completion_length": 99.015625, "epoch": 0.42359767891682787, "grad_norm": 13.980112075805664, "kl": 0.135498046875, "learning_rate": 7.88201160541586e-07, "loss": 0.0054, "reward": 1.7138119339942932, "reward_std": 0.21042115986347198, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7138119041919708, "step": 657 }, { "completion_length": 104.5625, "epoch": 0.42424242424242425, "grad_norm": 14.612245559692383, "kl": 0.093994140625, "learning_rate": 7.878787878787878e-07, "loss": 0.0038, "reward": 1.4366269707679749, "reward_std": 0.07526460662484169, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43662700057029724, "step": 658 }, { "completion_length": 100.453125, "epoch": 0.42488716956802064, "grad_norm": 50.96489715576172, "kl": 0.1083984375, "learning_rate": 7.875564152159897e-07, "loss": 0.0043, "reward": 1.4827229976654053, "reward_std": 0.164472047239542, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4827229678630829, "step": 659 }, { "completion_length": 104.296875, "epoch": 0.425531914893617, "grad_norm": 55.485740661621094, "kl": 0.16552734375, "learning_rate": 7.872340425531915e-07, "loss": 0.0066, "reward": 1.5874009132385254, "reward_std": 0.17585615068674088, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5874009132385254, "step": 660 }, { "completion_length": 89.46875, "epoch": 0.4261766602192134, "grad_norm": 10.867607116699219, "kl": 0.119140625, "learning_rate": 7.869116698903932e-07, "loss": 0.0048, "reward": 1.6328694224357605, "reward_std": 0.12034214287996292, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6328694224357605, "step": 661 }, { "completion_length": 101.515625, "epoch": 0.4268214055448098, "grad_norm": 44.72108840942383, "kl": 0.12548828125, "learning_rate": 7.86589297227595e-07, "loss": 0.005, "reward": 1.4442914724349976, "reward_std": 0.12755419686436653, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44429148733615875, "step": 662 }, { "completion_length": 104.09375, "epoch": 0.4274661508704062, "grad_norm": 19.304611206054688, "kl": 0.1318359375, "learning_rate": 7.862669245647968e-07, "loss": 0.0053, "reward": 1.5890682339668274, "reward_std": 0.15507179126143456, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5890682637691498, "step": 663 }, { "completion_length": 81.5, "epoch": 0.42811089619600257, "grad_norm": 14.729875564575195, "kl": 0.1669921875, "learning_rate": 7.859445519019987e-07, "loss": 0.0067, "reward": 1.5029385685920715, "reward_std": 0.19087035208940506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5029385983943939, "step": 664 }, { "completion_length": 95.34375, "epoch": 0.42875564152159895, "grad_norm": 15.593925476074219, "kl": 0.12890625, "learning_rate": 7.856221792392004e-07, "loss": 0.0052, "reward": 1.4721840620040894, "reward_std": 0.1463918462395668, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47218403220176697, "step": 665 }, { "completion_length": 106.125, "epoch": 0.42940038684719534, "grad_norm": 22.686920166015625, "kl": 0.12060546875, "learning_rate": 7.852998065764023e-07, "loss": 0.0048, "reward": 1.6089572310447693, "reward_std": 0.1527450978755951, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6089572459459305, "step": 666 }, { "completion_length": 95.109375, "epoch": 0.4300451321727917, "grad_norm": 10.141805648803711, "kl": 0.11474609375, "learning_rate": 7.84977433913604e-07, "loss": 0.0046, "reward": 1.532123863697052, "reward_std": 0.12704012170433998, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5321238338947296, "step": 667 }, { "completion_length": 95.859375, "epoch": 0.43068987749838816, "grad_norm": 9.21531867980957, "kl": 0.1357421875, "learning_rate": 7.846550612508059e-07, "loss": 0.0054, "reward": 1.5557440519332886, "reward_std": 0.15990904718637466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5557440519332886, "step": 668 }, { "completion_length": 91.984375, "epoch": 0.43133462282398455, "grad_norm": 15.773168563842773, "kl": 0.134033203125, "learning_rate": 7.843326885880077e-07, "loss": 0.0054, "reward": 1.729407250881195, "reward_std": 0.11419141665101051, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7294073700904846, "step": 669 }, { "completion_length": 95.765625, "epoch": 0.43197936814958093, "grad_norm": 17.692100524902344, "kl": 0.114013671875, "learning_rate": 7.840103159252095e-07, "loss": 0.0046, "reward": 1.602592945098877, "reward_std": 0.1361280083656311, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6025929152965546, "step": 670 }, { "completion_length": 86.0, "epoch": 0.4326241134751773, "grad_norm": 18.247699737548828, "kl": 0.12939453125, "learning_rate": 7.836879432624112e-07, "loss": 0.0052, "reward": 1.5443543195724487, "reward_std": 0.10458062216639519, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5443542748689651, "step": 671 }, { "completion_length": 97.546875, "epoch": 0.4332688588007737, "grad_norm": 22.875259399414062, "kl": 0.1044921875, "learning_rate": 7.833655705996132e-07, "loss": 0.0042, "reward": 1.4106297492980957, "reward_std": 0.07918915897607803, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4106297940015793, "step": 672 }, { "completion_length": 96.734375, "epoch": 0.4339136041263701, "grad_norm": 16.27412223815918, "kl": 0.1396484375, "learning_rate": 7.830431979368149e-07, "loss": 0.0056, "reward": 1.5767380595207214, "reward_std": 0.09661797434091568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5767380446195602, "step": 673 }, { "completion_length": 99.25, "epoch": 0.4345583494519665, "grad_norm": 216.69961547851562, "kl": 0.13134765625, "learning_rate": 7.827208252740167e-07, "loss": 0.0052, "reward": 1.5629220008850098, "reward_std": 0.11474324017763138, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5629220306873322, "step": 674 }, { "completion_length": 99.34375, "epoch": 0.43520309477756286, "grad_norm": 11.57163143157959, "kl": 0.118408203125, "learning_rate": 7.823984526112185e-07, "loss": 0.0047, "reward": 1.3828779458999634, "reward_std": 0.08658572658896446, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.38287800550460815, "step": 675 }, { "completion_length": 94.59375, "epoch": 0.43584784010315925, "grad_norm": 31.561555862426758, "kl": 0.115966796875, "learning_rate": 7.820760799484203e-07, "loss": 0.0046, "reward": 1.5925236344337463, "reward_std": 0.10616682469844818, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5925236642360687, "step": 676 }, { "completion_length": 95.0625, "epoch": 0.43649258542875563, "grad_norm": 8.750934600830078, "kl": 0.109130859375, "learning_rate": 7.817537072856221e-07, "loss": 0.0044, "reward": 1.5274120569229126, "reward_std": 0.15606505796313286, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.527412086725235, "step": 677 }, { "completion_length": 101.875, "epoch": 0.437137330754352, "grad_norm": 144.177978515625, "kl": 0.116943359375, "learning_rate": 7.81431334622824e-07, "loss": 0.0047, "reward": 1.5111930966377258, "reward_std": 0.1961452066898346, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5268180966377258, "step": 678 }, { "completion_length": 87.703125, "epoch": 0.4377820760799484, "grad_norm": 13.244322776794434, "kl": 0.123779296875, "learning_rate": 7.811089619600257e-07, "loss": 0.005, "reward": 1.52597177028656, "reward_std": 0.12621500715613365, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5259717255830765, "step": 679 }, { "completion_length": 102.34375, "epoch": 0.4384268214055448, "grad_norm": 11.01314926147461, "kl": 0.11767578125, "learning_rate": 7.807865892972275e-07, "loss": 0.0047, "reward": 1.4737398028373718, "reward_std": 0.10230442509055138, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4737398326396942, "step": 680 }, { "completion_length": 90.6875, "epoch": 0.43907156673114117, "grad_norm": 13.169625282287598, "kl": 0.13720703125, "learning_rate": 7.804642166344294e-07, "loss": 0.0055, "reward": 1.5600587129592896, "reward_std": 0.13433203101158142, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5756836533546448, "step": 681 }, { "completion_length": 96.671875, "epoch": 0.4397163120567376, "grad_norm": 19.183500289916992, "kl": 0.1298828125, "learning_rate": 7.801418439716312e-07, "loss": 0.0052, "reward": 1.52139014005661, "reward_std": 0.1388275846838951, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5213901400566101, "step": 682 }, { "completion_length": 119.015625, "epoch": 0.440361057382334, "grad_norm": 9.817489624023438, "kl": 0.103515625, "learning_rate": 7.798194713088329e-07, "loss": 0.0041, "reward": 1.6039352416992188, "reward_std": 0.0923042930662632, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6039351522922516, "step": 683 }, { "completion_length": 92.75, "epoch": 0.4410058027079304, "grad_norm": 17.6253662109375, "kl": 0.1904296875, "learning_rate": 7.794970986460347e-07, "loss": 0.0076, "reward": 1.553667962551117, "reward_std": 0.14492272213101387, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5536679923534393, "step": 684 }, { "completion_length": 97.59375, "epoch": 0.44165054803352677, "grad_norm": 21.793113708496094, "kl": 0.13134765625, "learning_rate": 7.791747259832367e-07, "loss": 0.0052, "reward": 1.608609676361084, "reward_std": 0.1321398764848709, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6086097061634064, "step": 685 }, { "completion_length": 91.28125, "epoch": 0.44229529335912315, "grad_norm": 15.338560104370117, "kl": 0.12939453125, "learning_rate": 7.788523533204384e-07, "loss": 0.0052, "reward": 1.4791777729988098, "reward_std": 0.06188085302710533, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4791778028011322, "step": 686 }, { "completion_length": 95.234375, "epoch": 0.44294003868471954, "grad_norm": 56.819026947021484, "kl": 0.11328125, "learning_rate": 7.785299806576401e-07, "loss": 0.0045, "reward": 1.608103096485138, "reward_std": 0.08347269147634506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6081030368804932, "step": 687 }, { "completion_length": 97.890625, "epoch": 0.4435847840103159, "grad_norm": 38.48658752441406, "kl": 0.129638671875, "learning_rate": 7.78207607994842e-07, "loss": 0.0052, "reward": 1.4730200171470642, "reward_std": 0.12290628626942635, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4730200171470642, "step": 688 }, { "completion_length": 99.21875, "epoch": 0.4442295293359123, "grad_norm": 8.553914070129395, "kl": 0.1357421875, "learning_rate": 7.778852353320437e-07, "loss": 0.0054, "reward": 1.4979106187820435, "reward_std": 0.07518330216407776, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49791061878204346, "step": 689 }, { "completion_length": 96.625, "epoch": 0.4448742746615087, "grad_norm": 13.162261009216309, "kl": 0.123291015625, "learning_rate": 7.775628626692456e-07, "loss": 0.0049, "reward": 1.5289295315742493, "reward_std": 0.13452866673469543, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5289295315742493, "step": 690 }, { "completion_length": 97.515625, "epoch": 0.4455190199871051, "grad_norm": 33.99656295776367, "kl": 0.126953125, "learning_rate": 7.772404900064475e-07, "loss": 0.0051, "reward": 1.5598184466362, "reward_std": 0.07671795412898064, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5598184615373611, "step": 691 }, { "completion_length": 112.625, "epoch": 0.44616376531270147, "grad_norm": 48.81503677368164, "kl": 0.1162109375, "learning_rate": 7.769181173436492e-07, "loss": 0.0046, "reward": 1.4112409949302673, "reward_std": 0.18779970705509186, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.42686596512794495, "step": 692 }, { "completion_length": 97.609375, "epoch": 0.44680851063829785, "grad_norm": 26.25737762451172, "kl": 0.13232421875, "learning_rate": 7.765957446808509e-07, "loss": 0.0053, "reward": 1.6583746671676636, "reward_std": 0.12108327448368073, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6583746075630188, "step": 693 }, { "completion_length": 100.453125, "epoch": 0.44745325596389424, "grad_norm": 15.020151138305664, "kl": 0.10986328125, "learning_rate": 7.762733720180529e-07, "loss": 0.0044, "reward": 1.4644600749015808, "reward_std": 0.13533856347203255, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4644601047039032, "step": 694 }, { "completion_length": 95.71875, "epoch": 0.4480980012894907, "grad_norm": 21.066251754760742, "kl": 0.106689453125, "learning_rate": 7.759509993552547e-07, "loss": 0.0043, "reward": 1.6312799453735352, "reward_std": 0.10122665017843246, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6312799155712128, "step": 695 }, { "completion_length": 103.28125, "epoch": 0.44874274661508706, "grad_norm": 11.778822898864746, "kl": 0.11962890625, "learning_rate": 7.756286266924564e-07, "loss": 0.0048, "reward": 1.6831336617469788, "reward_std": 0.06576647609472275, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6831336915493011, "step": 696 }, { "completion_length": 97.890625, "epoch": 0.44938749194068345, "grad_norm": 24.91705894470215, "kl": 0.12060546875, "learning_rate": 7.753062540296583e-07, "loss": 0.0048, "reward": 1.7192001342773438, "reward_std": 0.0967547558248043, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7192001640796661, "step": 697 }, { "completion_length": 93.796875, "epoch": 0.45003223726627983, "grad_norm": 10.502815246582031, "kl": 0.138671875, "learning_rate": 7.749838813668601e-07, "loss": 0.0055, "reward": 1.5451446175575256, "reward_std": 0.10906476899981499, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5451445579528809, "step": 698 }, { "completion_length": 91.296875, "epoch": 0.4506769825918762, "grad_norm": 19.44222068786621, "kl": 0.1357421875, "learning_rate": 7.746615087040619e-07, "loss": 0.0054, "reward": 1.5762072801589966, "reward_std": 0.09864221513271332, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5762072950601578, "step": 699 }, { "completion_length": 97.125, "epoch": 0.4513217279174726, "grad_norm": 22.833763122558594, "kl": 0.129150390625, "learning_rate": 7.743391360412637e-07, "loss": 0.0052, "reward": 1.4687912464141846, "reward_std": 0.09121186658740044, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4687913507223129, "step": 700 }, { "completion_length": 91.734375, "epoch": 0.451966473243069, "grad_norm": 30.23716163635254, "kl": 0.1455078125, "learning_rate": 7.740167633784655e-07, "loss": 0.0058, "reward": 1.6057734489440918, "reward_std": 0.0660810861736536, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6057734489440918, "step": 701 }, { "completion_length": 88.265625, "epoch": 0.4526112185686654, "grad_norm": 15.325862884521484, "kl": 0.11572265625, "learning_rate": 7.736943907156672e-07, "loss": 0.0046, "reward": 1.4314576387405396, "reward_std": 0.08893175795674324, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43145765364170074, "step": 702 }, { "completion_length": 95.75, "epoch": 0.45325596389426176, "grad_norm": 12.580069541931152, "kl": 0.13623046875, "learning_rate": 7.733720180528692e-07, "loss": 0.0054, "reward": 1.5887573957443237, "reward_std": 0.07212099991738796, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5887573659420013, "step": 703 }, { "completion_length": 92.375, "epoch": 0.45390070921985815, "grad_norm": 16.807231903076172, "kl": 0.109375, "learning_rate": 7.730496453900709e-07, "loss": 0.0044, "reward": 1.5106210708618164, "reward_std": 0.08820386417210102, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5106211006641388, "step": 704 }, { "completion_length": 93.21875, "epoch": 0.45454545454545453, "grad_norm": 519.1197509765625, "kl": 0.106689453125, "learning_rate": 7.727272727272727e-07, "loss": 0.0043, "reward": 1.5637730956077576, "reward_std": 0.10183101892471313, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.56377312541008, "step": 705 }, { "completion_length": 83.203125, "epoch": 0.4551901998710509, "grad_norm": 46.5788688659668, "kl": 0.15185546875, "learning_rate": 7.724049000644744e-07, "loss": 0.0061, "reward": 1.6154998540878296, "reward_std": 0.10929432883858681, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6154997944831848, "step": 706 }, { "completion_length": 86.953125, "epoch": 0.4558349451966473, "grad_norm": 15.66248893737793, "kl": 0.133544921875, "learning_rate": 7.720825274016764e-07, "loss": 0.0053, "reward": 1.4980468153953552, "reward_std": 0.061294110491871834, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49804680049419403, "step": 707 }, { "completion_length": 102.515625, "epoch": 0.4564796905222437, "grad_norm": 16.94169807434082, "kl": 0.1357421875, "learning_rate": 7.717601547388781e-07, "loss": 0.0054, "reward": 1.716218113899231, "reward_std": 0.09692734852433205, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7162180542945862, "step": 708 }, { "completion_length": 97.265625, "epoch": 0.4571244358478401, "grad_norm": 10.600395202636719, "kl": 0.114501953125, "learning_rate": 7.7143778207608e-07, "loss": 0.0046, "reward": 1.5950488448143005, "reward_std": 0.0715242438018322, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5950488448143005, "step": 709 }, { "completion_length": 87.515625, "epoch": 0.4577691811734365, "grad_norm": 11.393702507019043, "kl": 0.1279296875, "learning_rate": 7.711154094132817e-07, "loss": 0.0051, "reward": 1.577049434185028, "reward_std": 0.09067998081445694, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5770494043827057, "step": 710 }, { "completion_length": 94.21875, "epoch": 0.4584139264990329, "grad_norm": 17.261930465698242, "kl": 0.111572265625, "learning_rate": 7.707930367504836e-07, "loss": 0.0045, "reward": 1.6038063764572144, "reward_std": 0.10224081575870514, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6038063764572144, "step": 711 }, { "completion_length": 101.921875, "epoch": 0.4590586718246293, "grad_norm": 14.559025764465332, "kl": 0.112548828125, "learning_rate": 7.704706640876853e-07, "loss": 0.0045, "reward": 1.4666990041732788, "reward_std": 0.11419890448451042, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4666990488767624, "step": 712 }, { "completion_length": 98.171875, "epoch": 0.45970341715022567, "grad_norm": 15.404255867004395, "kl": 0.148681640625, "learning_rate": 7.701482914248872e-07, "loss": 0.006, "reward": 1.6305444836616516, "reward_std": 0.11607625335454941, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6305445432662964, "step": 713 }, { "completion_length": 101.125, "epoch": 0.46034816247582205, "grad_norm": 19.92659568786621, "kl": 0.125, "learning_rate": 7.698259187620889e-07, "loss": 0.005, "reward": 1.4447981715202332, "reward_std": 0.1305028274655342, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44479817152023315, "step": 714 }, { "completion_length": 100.625, "epoch": 0.46099290780141844, "grad_norm": 11.078210830688477, "kl": 0.1474609375, "learning_rate": 7.695035460992907e-07, "loss": 0.0059, "reward": 1.4863204956054688, "reward_std": 0.09369765594601631, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48632049560546875, "step": 715 }, { "completion_length": 94.859375, "epoch": 0.4616376531270148, "grad_norm": 16.100868225097656, "kl": 0.13134765625, "learning_rate": 7.691811734364926e-07, "loss": 0.0053, "reward": 1.611817181110382, "reward_std": 0.09366685152053833, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6118170917034149, "step": 716 }, { "completion_length": 101.265625, "epoch": 0.4622823984526112, "grad_norm": 21.628129959106445, "kl": 0.110595703125, "learning_rate": 7.688588007736944e-07, "loss": 0.0044, "reward": 1.6645547151565552, "reward_std": 0.1055876649916172, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6645546853542328, "step": 717 }, { "completion_length": 102.21875, "epoch": 0.4629271437782076, "grad_norm": 20.826440811157227, "kl": 0.132568359375, "learning_rate": 7.685364281108961e-07, "loss": 0.0053, "reward": 1.5520586371421814, "reward_std": 0.09113192185759544, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5520585924386978, "step": 718 }, { "completion_length": 99.46875, "epoch": 0.463571889103804, "grad_norm": 16.853395462036133, "kl": 0.13525390625, "learning_rate": 7.68214055448098e-07, "loss": 0.0054, "reward": 1.5113345980644226, "reward_std": 0.11239676922559738, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5113345384597778, "step": 719 }, { "completion_length": 103.890625, "epoch": 0.46421663442940037, "grad_norm": 12.532203674316406, "kl": 0.13525390625, "learning_rate": 7.678916827852998e-07, "loss": 0.0054, "reward": 1.4734601974487305, "reward_std": 0.06641046330332756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47346019744873047, "step": 720 }, { "completion_length": 110.796875, "epoch": 0.46486137975499675, "grad_norm": 10.890167236328125, "kl": 0.12841796875, "learning_rate": 7.675693101225016e-07, "loss": 0.0051, "reward": 1.4852750301361084, "reward_std": 0.061814336106181145, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4852750599384308, "step": 721 }, { "completion_length": 116.859375, "epoch": 0.4655061250805932, "grad_norm": 10.753568649291992, "kl": 0.1123046875, "learning_rate": 7.672469374597034e-07, "loss": 0.0045, "reward": 1.4886300563812256, "reward_std": 0.08008428663015366, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4886299967765808, "step": 722 }, { "completion_length": 109.625, "epoch": 0.4661508704061896, "grad_norm": 10.12906551361084, "kl": 0.12255859375, "learning_rate": 7.669245647969052e-07, "loss": 0.0049, "reward": 1.503622591495514, "reward_std": 0.07655685395002365, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5036226361989975, "step": 723 }, { "completion_length": 118.53125, "epoch": 0.46679561573178596, "grad_norm": 17.245555877685547, "kl": 0.126220703125, "learning_rate": 7.66602192134107e-07, "loss": 0.0051, "reward": 1.6156468391418457, "reward_std": 0.11876443773508072, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6156468689441681, "step": 724 }, { "completion_length": 125.328125, "epoch": 0.46744036105738235, "grad_norm": 7.821877956390381, "kl": 0.1220703125, "learning_rate": 7.662798194713089e-07, "loss": 0.0049, "reward": 1.6727633476257324, "reward_std": 0.12070934474468231, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6727633774280548, "step": 725 }, { "completion_length": 127.796875, "epoch": 0.46808510638297873, "grad_norm": 18.543737411499023, "kl": 0.131591796875, "learning_rate": 7.659574468085106e-07, "loss": 0.0053, "reward": 1.5528809428215027, "reward_std": 0.09312764927744865, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5528809428215027, "step": 726 }, { "completion_length": 117.171875, "epoch": 0.4687298517085751, "grad_norm": 21.947412490844727, "kl": 0.117919921875, "learning_rate": 7.656350741457124e-07, "loss": 0.0047, "reward": 1.6618314385414124, "reward_std": 0.0795222856104374, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6618314981460571, "step": 727 }, { "completion_length": 110.140625, "epoch": 0.4693745970341715, "grad_norm": 10.362268447875977, "kl": 0.138671875, "learning_rate": 7.653127014829143e-07, "loss": 0.0056, "reward": 1.5495651364326477, "reward_std": 0.0900680385529995, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5495651364326477, "step": 728 }, { "completion_length": 122.140625, "epoch": 0.4700193423597679, "grad_norm": 9.733297348022461, "kl": 0.12158203125, "learning_rate": 7.649903288201161e-07, "loss": 0.0049, "reward": 1.6779837608337402, "reward_std": 0.06389831379055977, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6779836416244507, "step": 729 }, { "completion_length": 121.40625, "epoch": 0.4706640876853643, "grad_norm": 153.9310760498047, "kl": 0.1064453125, "learning_rate": 7.646679561573178e-07, "loss": 0.0043, "reward": 1.5564913749694824, "reward_std": 0.18158581107854843, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5564914643764496, "step": 730 }, { "completion_length": 118.234375, "epoch": 0.47130883301096066, "grad_norm": 9.935221672058105, "kl": 0.123779296875, "learning_rate": 7.643455834945196e-07, "loss": 0.0049, "reward": 1.5225478410720825, "reward_std": 0.09475431591272354, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5225478410720825, "step": 731 }, { "completion_length": 124.953125, "epoch": 0.47195357833655704, "grad_norm": 20.191246032714844, "kl": 0.1259765625, "learning_rate": 7.640232108317214e-07, "loss": 0.0051, "reward": 1.552919089794159, "reward_std": 0.07817603275179863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5529190897941589, "step": 732 }, { "completion_length": 110.5625, "epoch": 0.47259832366215343, "grad_norm": 18.04339027404785, "kl": 0.11865234375, "learning_rate": 7.637008381689233e-07, "loss": 0.0047, "reward": 1.5138545632362366, "reward_std": 0.13918767496943474, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5138545781373978, "step": 733 }, { "completion_length": 114.0, "epoch": 0.4732430689877498, "grad_norm": 10.329635620117188, "kl": 0.11767578125, "learning_rate": 7.63378465506125e-07, "loss": 0.0047, "reward": 1.6484586000442505, "reward_std": 0.11795339360833168, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6484585404396057, "step": 734 }, { "completion_length": 108.28125, "epoch": 0.4738878143133462, "grad_norm": 19.822538375854492, "kl": 0.128662109375, "learning_rate": 7.630560928433269e-07, "loss": 0.0051, "reward": 1.5399014949798584, "reward_std": 0.11883477121591568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.53990139067173, "step": 735 }, { "completion_length": 115.875, "epoch": 0.47453255963894264, "grad_norm": 31.11959457397461, "kl": 0.134765625, "learning_rate": 7.627337201805286e-07, "loss": 0.0054, "reward": 1.5331445336341858, "reward_std": 0.15693273022770882, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5487695634365082, "step": 736 }, { "completion_length": 106.453125, "epoch": 0.475177304964539, "grad_norm": 14.009469985961914, "kl": 0.1240234375, "learning_rate": 7.624113475177305e-07, "loss": 0.005, "reward": 1.4909018278121948, "reward_std": 0.11989344283938408, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4909018278121948, "step": 737 }, { "completion_length": 120.765625, "epoch": 0.4758220502901354, "grad_norm": 13.631786346435547, "kl": 0.1240234375, "learning_rate": 7.620889748549323e-07, "loss": 0.005, "reward": 1.5898830890655518, "reward_std": 0.08887490257620811, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.589883029460907, "step": 738 }, { "completion_length": 105.171875, "epoch": 0.4764667956157318, "grad_norm": 13.331558227539062, "kl": 0.12744140625, "learning_rate": 7.617666021921341e-07, "loss": 0.0051, "reward": 1.5238662362098694, "reward_std": 0.08402681723237038, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5238662660121918, "step": 739 }, { "completion_length": 107.03125, "epoch": 0.4771115409413282, "grad_norm": 25.664894104003906, "kl": 0.116943359375, "learning_rate": 7.614442295293358e-07, "loss": 0.0047, "reward": 1.5047930479049683, "reward_std": 0.11811169236898422, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5047930777072906, "step": 740 }, { "completion_length": 104.046875, "epoch": 0.47775628626692457, "grad_norm": 7.534611701965332, "kl": 0.154296875, "learning_rate": 7.611218568665378e-07, "loss": 0.0062, "reward": 1.4220759868621826, "reward_std": 0.11570665240287781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4220760092139244, "step": 741 }, { "completion_length": 104.78125, "epoch": 0.47840103159252095, "grad_norm": 96.45923614501953, "kl": 0.118408203125, "learning_rate": 7.607994842037395e-07, "loss": 0.0047, "reward": 1.4136383533477783, "reward_std": 0.09354563057422638, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41363826394081116, "step": 742 }, { "completion_length": 113.328125, "epoch": 0.47904577691811734, "grad_norm": 29.232112884521484, "kl": 0.14111328125, "learning_rate": 7.604771115409413e-07, "loss": 0.0056, "reward": 1.5308947563171387, "reward_std": 0.11985908448696136, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5308947861194611, "step": 743 }, { "completion_length": 99.03125, "epoch": 0.4796905222437137, "grad_norm": 22.255399703979492, "kl": 0.13720703125, "learning_rate": 7.601547388781431e-07, "loss": 0.0055, "reward": 1.6334994435310364, "reward_std": 0.10051496885716915, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6334995031356812, "step": 744 }, { "completion_length": 87.390625, "epoch": 0.4803352675693101, "grad_norm": 16.31736946105957, "kl": 0.2001953125, "learning_rate": 7.598323662153449e-07, "loss": 0.008, "reward": 1.5974202156066895, "reward_std": 0.09732247516512871, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5974202156066895, "step": 745 }, { "completion_length": 110.53125, "epoch": 0.4809800128949065, "grad_norm": 8.461565971374512, "kl": 0.13525390625, "learning_rate": 7.595099935525467e-07, "loss": 0.0054, "reward": 1.7255396246910095, "reward_std": 0.1103828065097332, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7255396246910095, "step": 746 }, { "completion_length": 96.4375, "epoch": 0.4816247582205029, "grad_norm": 225.49200439453125, "kl": 0.126953125, "learning_rate": 7.591876208897486e-07, "loss": 0.0051, "reward": 1.5281953811645508, "reward_std": 0.08646541833877563, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5281953811645508, "step": 747 }, { "completion_length": 92.984375, "epoch": 0.48226950354609927, "grad_norm": 89.93632507324219, "kl": 0.13916015625, "learning_rate": 7.588652482269503e-07, "loss": 0.0056, "reward": 1.652679681777954, "reward_std": 0.11553067341446877, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6526796817779541, "step": 748 }, { "completion_length": 104.0, "epoch": 0.4829142488716957, "grad_norm": 9.606204986572266, "kl": 0.136962890625, "learning_rate": 7.585428755641521e-07, "loss": 0.0055, "reward": 1.4608428478240967, "reward_std": 0.101207185536623, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4608428180217743, "step": 749 }, { "completion_length": 80.453125, "epoch": 0.4835589941972921, "grad_norm": 11.411137580871582, "kl": 0.14111328125, "learning_rate": 7.58220502901354e-07, "loss": 0.0056, "reward": 1.5915326476097107, "reward_std": 0.13187076896429062, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5915326774120331, "step": 750 }, { "completion_length": 98.859375, "epoch": 0.4842037395228885, "grad_norm": 43.71159362792969, "kl": 0.14501953125, "learning_rate": 7.578981302385558e-07, "loss": 0.0058, "reward": 1.5974067449569702, "reward_std": 0.13504815101623535, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5974067747592926, "step": 751 }, { "completion_length": 100.25, "epoch": 0.48484848484848486, "grad_norm": 10.606791496276855, "kl": 0.12939453125, "learning_rate": 7.575757575757575e-07, "loss": 0.0052, "reward": 1.5629411935806274, "reward_std": 0.05437769740819931, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5629411041736603, "step": 752 }, { "completion_length": 93.09375, "epoch": 0.48549323017408125, "grad_norm": 25.361251831054688, "kl": 0.1533203125, "learning_rate": 7.572533849129593e-07, "loss": 0.0061, "reward": 1.5369932055473328, "reward_std": 0.10840344429016113, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5369932353496552, "step": 753 }, { "completion_length": 96.796875, "epoch": 0.48613797549967763, "grad_norm": 35.501617431640625, "kl": 0.1416015625, "learning_rate": 7.569310122501612e-07, "loss": 0.0057, "reward": 1.5901548862457275, "reward_std": 0.122325100004673, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5901548862457275, "step": 754 }, { "completion_length": 98.59375, "epoch": 0.486782720825274, "grad_norm": 45.77256774902344, "kl": 0.220703125, "learning_rate": 7.56608639587363e-07, "loss": 0.0088, "reward": 1.449923574924469, "reward_std": 0.12096518278121948, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.449923574924469, "step": 755 }, { "completion_length": 100.0, "epoch": 0.4874274661508704, "grad_norm": 14.880589485168457, "kl": 0.1396484375, "learning_rate": 7.562862669245647e-07, "loss": 0.0056, "reward": 1.5938887000083923, "reward_std": 0.16511598601937294, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6095137000083923, "step": 756 }, { "completion_length": 111.859375, "epoch": 0.4880722114764668, "grad_norm": 10.331930160522461, "kl": 0.147705078125, "learning_rate": 7.559638942617666e-07, "loss": 0.0059, "reward": 1.5344634056091309, "reward_std": 0.1292192693799734, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5344634652137756, "step": 757 }, { "completion_length": 107.359375, "epoch": 0.4887169568020632, "grad_norm": 11.895607948303223, "kl": 0.1123046875, "learning_rate": 7.556415215989683e-07, "loss": 0.0045, "reward": 1.4465181231498718, "reward_std": 0.10399571061134338, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4621431678533554, "step": 758 }, { "completion_length": 102.390625, "epoch": 0.48936170212765956, "grad_norm": 28.140899658203125, "kl": 0.118408203125, "learning_rate": 7.553191489361702e-07, "loss": 0.0047, "reward": 1.4872334003448486, "reward_std": 0.1054934673011303, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48723337054252625, "step": 759 }, { "completion_length": 98.296875, "epoch": 0.49000644745325594, "grad_norm": 8.313096046447754, "kl": 0.137939453125, "learning_rate": 7.54996776273372e-07, "loss": 0.0055, "reward": 1.4911122918128967, "reward_std": 0.06940718367695808, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49111229181289673, "step": 760 }, { "completion_length": 100.96875, "epoch": 0.49065119277885233, "grad_norm": 25.631065368652344, "kl": 0.1279296875, "learning_rate": 7.546744036105738e-07, "loss": 0.0051, "reward": 1.6772446632385254, "reward_std": 0.11209296062588692, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.677244633436203, "step": 761 }, { "completion_length": 98.25, "epoch": 0.4912959381044487, "grad_norm": 15.898694038391113, "kl": 0.14208984375, "learning_rate": 7.543520309477755e-07, "loss": 0.0057, "reward": 1.5463205575942993, "reward_std": 0.10821164213120937, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5463205575942993, "step": 762 }, { "completion_length": 106.59375, "epoch": 0.49194068343004516, "grad_norm": 14.443410873413086, "kl": 0.11767578125, "learning_rate": 7.540296582849775e-07, "loss": 0.0047, "reward": 1.4314526915550232, "reward_std": 0.09445375949144363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4314526617527008, "step": 763 }, { "completion_length": 106.140625, "epoch": 0.49258542875564154, "grad_norm": 34.89936447143555, "kl": 0.11767578125, "learning_rate": 7.537072856221792e-07, "loss": 0.0047, "reward": 1.5694019794464111, "reward_std": 0.08024864830076694, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5694020092487335, "step": 764 }, { "completion_length": 103.78125, "epoch": 0.4932301740812379, "grad_norm": 31.551544189453125, "kl": 0.18115234375, "learning_rate": 7.53384912959381e-07, "loss": 0.0073, "reward": 1.4196154475212097, "reward_std": 0.10790861025452614, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4196154773235321, "step": 765 }, { "completion_length": 102.65625, "epoch": 0.4938749194068343, "grad_norm": 8.728348731994629, "kl": 0.1455078125, "learning_rate": 7.530625402965828e-07, "loss": 0.0058, "reward": 1.3880661725997925, "reward_std": 0.12530139833688736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3880661725997925, "step": 766 }, { "completion_length": 104.390625, "epoch": 0.4945196647324307, "grad_norm": 34.32732009887695, "kl": 0.1494140625, "learning_rate": 7.527401676337847e-07, "loss": 0.006, "reward": 1.3845961093902588, "reward_std": 0.1502724587917328, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4002210944890976, "step": 767 }, { "completion_length": 96.828125, "epoch": 0.4951644100580271, "grad_norm": 12.261150360107422, "kl": 0.105712890625, "learning_rate": 7.524177949709864e-07, "loss": 0.0042, "reward": 1.5911886096000671, "reward_std": 0.06629323586821556, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5911886692047119, "step": 768 }, { "completion_length": 107.125, "epoch": 0.49580915538362347, "grad_norm": 14.596806526184082, "kl": 0.1103515625, "learning_rate": 7.520954223081883e-07, "loss": 0.0044, "reward": 1.6019713878631592, "reward_std": 0.05874236673116684, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6019714027643204, "step": 769 }, { "completion_length": 105.0, "epoch": 0.49645390070921985, "grad_norm": 15.82909870147705, "kl": 0.1240234375, "learning_rate": 7.5177304964539e-07, "loss": 0.005, "reward": 1.3426456451416016, "reward_std": 0.06746992282569408, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.34264568984508514, "step": 770 }, { "completion_length": 103.828125, "epoch": 0.49709864603481624, "grad_norm": 24.44872283935547, "kl": 0.1689453125, "learning_rate": 7.514506769825918e-07, "loss": 0.0067, "reward": 1.5089261531829834, "reward_std": 0.101194828748703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5089261829853058, "step": 771 }, { "completion_length": 113.609375, "epoch": 0.4977433913604126, "grad_norm": 10.225950241088867, "kl": 0.12744140625, "learning_rate": 7.511283043197937e-07, "loss": 0.0051, "reward": 1.636949062347412, "reward_std": 0.09513459354639053, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6369490921497345, "step": 772 }, { "completion_length": 104.296875, "epoch": 0.498388136686009, "grad_norm": 21.62342643737793, "kl": 0.1416015625, "learning_rate": 7.508059316569955e-07, "loss": 0.0057, "reward": 1.6637132167816162, "reward_std": 0.11707671359181404, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6637131869792938, "step": 773 }, { "completion_length": 98.796875, "epoch": 0.4990328820116054, "grad_norm": 56.46780014038086, "kl": 0.112060546875, "learning_rate": 7.504835589941972e-07, "loss": 0.0045, "reward": 1.534183382987976, "reward_std": 0.09461552277207375, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5341833829879761, "step": 774 }, { "completion_length": 98.484375, "epoch": 0.4996776273372018, "grad_norm": 12.098061561584473, "kl": 0.11767578125, "learning_rate": 7.50161186331399e-07, "loss": 0.0047, "reward": 1.6135407090187073, "reward_std": 0.11286146193742752, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6135406941175461, "step": 775 }, { "completion_length": 114.984375, "epoch": 0.5003223726627982, "grad_norm": 20.275917053222656, "kl": 0.10302734375, "learning_rate": 7.498388136686009e-07, "loss": 0.0041, "reward": 1.5216720700263977, "reward_std": 0.06171071156859398, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5216720998287201, "step": 776 }, { "completion_length": 104.53125, "epoch": 0.5009671179883946, "grad_norm": 13.640120506286621, "kl": 0.1298828125, "learning_rate": 7.495164410058027e-07, "loss": 0.0052, "reward": 1.6737434267997742, "reward_std": 0.11507437378168106, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6737434566020966, "step": 777 }, { "completion_length": 113.203125, "epoch": 0.501611863313991, "grad_norm": 27.09701156616211, "kl": 0.121337890625, "learning_rate": 7.491940683430044e-07, "loss": 0.0048, "reward": 1.5331305265426636, "reward_std": 0.1381199024617672, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5331305861473083, "step": 778 }, { "completion_length": 100.65625, "epoch": 0.5022566086395873, "grad_norm": 29.07394027709961, "kl": 0.1337890625, "learning_rate": 7.488716956802063e-07, "loss": 0.0054, "reward": 1.5475330352783203, "reward_std": 0.26855621486902237, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.5944079458713531, "step": 779 }, { "completion_length": 105.0625, "epoch": 0.5029013539651838, "grad_norm": 9.877948760986328, "kl": 0.115966796875, "learning_rate": 7.485493230174081e-07, "loss": 0.0046, "reward": 1.673143744468689, "reward_std": 0.08836501836776733, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.673143744468689, "step": 780 }, { "completion_length": 111.3125, "epoch": 0.5035460992907801, "grad_norm": 22.850160598754883, "kl": 0.132568359375, "learning_rate": 7.482269503546099e-07, "loss": 0.0053, "reward": 1.674381136894226, "reward_std": 0.12078357115387917, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6743811964988708, "step": 781 }, { "completion_length": 109.046875, "epoch": 0.5041908446163765, "grad_norm": 31.050485610961914, "kl": 0.158203125, "learning_rate": 7.479045776918117e-07, "loss": 0.0063, "reward": 1.4201094508171082, "reward_std": 0.1670779399573803, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.43573447316884995, "step": 782 }, { "completion_length": 118.078125, "epoch": 0.504835589941973, "grad_norm": 16.994108200073242, "kl": 0.103759765625, "learning_rate": 7.475822050290135e-07, "loss": 0.0042, "reward": 1.5727297067642212, "reward_std": 0.13664092868566513, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5727297067642212, "step": 783 }, { "completion_length": 132.359375, "epoch": 0.5054803352675693, "grad_norm": 14.750675201416016, "kl": 0.12158203125, "learning_rate": 7.472598323662152e-07, "loss": 0.0049, "reward": 1.4640502333641052, "reward_std": 0.19203101471066475, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.47967518121004105, "step": 784 }, { "completion_length": 111.03125, "epoch": 0.5061250805931657, "grad_norm": 12.755722045898438, "kl": 0.123779296875, "learning_rate": 7.469374597034172e-07, "loss": 0.005, "reward": 1.59541916847229, "reward_std": 0.08936497010290623, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5954191088676453, "step": 785 }, { "completion_length": 114.09375, "epoch": 0.5067698259187621, "grad_norm": 10.042961120605469, "kl": 0.129150390625, "learning_rate": 7.466150870406189e-07, "loss": 0.0052, "reward": 1.5148058533668518, "reward_std": 0.11729194968938828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5148058235645294, "step": 786 }, { "completion_length": 113.40625, "epoch": 0.5074145712443585, "grad_norm": 20.501480102539062, "kl": 0.107177734375, "learning_rate": 7.462927143778207e-07, "loss": 0.0043, "reward": 1.5762805938720703, "reward_std": 0.13149290531873703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5762805938720703, "step": 787 }, { "completion_length": 117.046875, "epoch": 0.5080593165699548, "grad_norm": 16.187808990478516, "kl": 0.100341796875, "learning_rate": 7.459703417150225e-07, "loss": 0.004, "reward": 1.6261343955993652, "reward_std": 0.13067299500107765, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6261343359947205, "step": 788 }, { "completion_length": 111.359375, "epoch": 0.5087040618955513, "grad_norm": 14.373567581176758, "kl": 0.124267578125, "learning_rate": 7.456479690522244e-07, "loss": 0.005, "reward": 1.4204865097999573, "reward_std": 0.1222645752131939, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42048661410808563, "step": 789 }, { "completion_length": 121.5, "epoch": 0.5093488072211476, "grad_norm": 10.425048828125, "kl": 0.095947265625, "learning_rate": 7.453255963894261e-07, "loss": 0.0038, "reward": 1.6218205690383911, "reward_std": 0.13687458634376526, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6218205988407135, "step": 790 }, { "completion_length": 125.453125, "epoch": 0.5099935525467441, "grad_norm": 11.841174125671387, "kl": 0.14306640625, "learning_rate": 7.45003223726628e-07, "loss": 0.0057, "reward": 1.548657238483429, "reward_std": 0.12894482910633087, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5486572682857513, "step": 791 }, { "completion_length": 113.015625, "epoch": 0.5106382978723404, "grad_norm": 7.590851783752441, "kl": 0.113525390625, "learning_rate": 7.446808510638297e-07, "loss": 0.0045, "reward": 1.668638288974762, "reward_std": 0.11230403184890747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.668638288974762, "step": 792 }, { "completion_length": 122.578125, "epoch": 0.5112830431979368, "grad_norm": 9.391864776611328, "kl": 0.1201171875, "learning_rate": 7.443584784010316e-07, "loss": 0.0048, "reward": 1.6186297535896301, "reward_std": 0.09934529662132263, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6186297535896301, "step": 793 }, { "completion_length": 113.65625, "epoch": 0.5119277885235332, "grad_norm": 14.11962890625, "kl": 0.112060546875, "learning_rate": 7.440361057382334e-07, "loss": 0.0045, "reward": 1.5052703619003296, "reward_std": 0.09183266013860703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5052704364061356, "step": 794 }, { "completion_length": 119.515625, "epoch": 0.5125725338491296, "grad_norm": 10.853915214538574, "kl": 0.13818359375, "learning_rate": 7.437137330754352e-07, "loss": 0.0055, "reward": 1.680991291999817, "reward_std": 0.14312323927879333, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6809913218021393, "step": 795 }, { "completion_length": 119.28125, "epoch": 0.513217279174726, "grad_norm": 42.03529739379883, "kl": 0.13427734375, "learning_rate": 7.433913604126369e-07, "loss": 0.0054, "reward": 1.6375518441200256, "reward_std": 0.13579606264829636, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6375519037246704, "step": 796 }, { "completion_length": 126.59375, "epoch": 0.5138620245003224, "grad_norm": 9.108304977416992, "kl": 0.12060546875, "learning_rate": 7.430689877498387e-07, "loss": 0.0048, "reward": 1.638499677181244, "reward_std": 0.09739596769213676, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6384996771812439, "step": 797 }, { "completion_length": 122.890625, "epoch": 0.5145067698259188, "grad_norm": 10.199109077453613, "kl": 0.112060546875, "learning_rate": 7.427466150870406e-07, "loss": 0.0045, "reward": 1.5257828831672668, "reward_std": 0.17206069082021713, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5414078831672668, "step": 798 }, { "completion_length": 112.0, "epoch": 0.5151515151515151, "grad_norm": 12.440640449523926, "kl": 0.16015625, "learning_rate": 7.424242424242424e-07, "loss": 0.0064, "reward": 1.5461509823799133, "reward_std": 0.1321653015911579, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5617759674787521, "step": 799 }, { "completion_length": 117.46875, "epoch": 0.5157962604771116, "grad_norm": 10.767664909362793, "kl": 0.1298828125, "learning_rate": 7.421018697614441e-07, "loss": 0.0052, "reward": 1.5796707272529602, "reward_std": 0.1553436890244484, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5952958464622498, "step": 800 }, { "completion_length": 124.828125, "epoch": 0.5164410058027079, "grad_norm": 12.43638801574707, "kl": 0.138671875, "learning_rate": 7.41779497098646e-07, "loss": 0.0055, "reward": 1.5580316185951233, "reward_std": 0.1056671105325222, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5580316483974457, "step": 801 }, { "completion_length": 120.140625, "epoch": 0.5170857511283043, "grad_norm": 15.19301986694336, "kl": 0.119384765625, "learning_rate": 7.414571244358478e-07, "loss": 0.0048, "reward": 1.623548150062561, "reward_std": 0.17357219755649567, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.623548150062561, "step": 802 }, { "completion_length": 123.734375, "epoch": 0.5177304964539007, "grad_norm": 18.191144943237305, "kl": 0.10546875, "learning_rate": 7.411347517730496e-07, "loss": 0.0042, "reward": 1.7189044952392578, "reward_std": 0.10732979327440262, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7189044952392578, "step": 803 }, { "completion_length": 125.03125, "epoch": 0.5183752417794971, "grad_norm": 11.427577018737793, "kl": 0.103515625, "learning_rate": 7.408123791102514e-07, "loss": 0.0041, "reward": 1.6487030982971191, "reward_std": 0.08015275374054909, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6487032175064087, "step": 804 }, { "completion_length": 111.125, "epoch": 0.5190199871050934, "grad_norm": 9.316217422485352, "kl": 0.154296875, "learning_rate": 7.404900064474532e-07, "loss": 0.0062, "reward": 1.3404886722564697, "reward_std": 0.13357722014188766, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.35611365735530853, "step": 805 }, { "completion_length": 105.34375, "epoch": 0.5196647324306899, "grad_norm": 8.146766662597656, "kl": 0.130126953125, "learning_rate": 7.40167633784655e-07, "loss": 0.0052, "reward": 1.577560842037201, "reward_std": 0.08118342608213425, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5775608718395233, "step": 806 }, { "completion_length": 108.375, "epoch": 0.5203094777562862, "grad_norm": 13.145628929138184, "kl": 0.11572265625, "learning_rate": 7.398452611218569e-07, "loss": 0.0046, "reward": 1.569650411605835, "reward_std": 0.11123111471533775, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5696503669023514, "step": 807 }, { "completion_length": 123.515625, "epoch": 0.5209542230818827, "grad_norm": 17.439918518066406, "kl": 0.106201171875, "learning_rate": 7.395228884590586e-07, "loss": 0.0043, "reward": 1.6000871062278748, "reward_std": 0.16571377217769623, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6157121360301971, "step": 808 }, { "completion_length": 123.78125, "epoch": 0.521598968407479, "grad_norm": 23.56669044494629, "kl": 0.126953125, "learning_rate": 7.392005157962604e-07, "loss": 0.0051, "reward": 1.7089502811431885, "reward_std": 0.09575696289539337, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7089502811431885, "step": 809 }, { "completion_length": 123.140625, "epoch": 0.5222437137330754, "grad_norm": 35.4520149230957, "kl": 0.128662109375, "learning_rate": 7.388781431334622e-07, "loss": 0.0051, "reward": 1.6607640981674194, "reward_std": 0.19404253363609314, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6607640981674194, "step": 810 }, { "completion_length": 106.59375, "epoch": 0.5228884590586719, "grad_norm": 10.3298978805542, "kl": 0.096923828125, "learning_rate": 7.385557704706641e-07, "loss": 0.0039, "reward": 1.455498218536377, "reward_std": 0.1199144497513771, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45549826323986053, "step": 811 }, { "completion_length": 123.015625, "epoch": 0.5235332043842682, "grad_norm": 16.133468627929688, "kl": 0.1884765625, "learning_rate": 7.382333978078658e-07, "loss": 0.0075, "reward": 1.6502628922462463, "reward_std": 0.13473164662718773, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.650262862443924, "step": 812 }, { "completion_length": 132.265625, "epoch": 0.5241779497098646, "grad_norm": 9.389092445373535, "kl": 0.110595703125, "learning_rate": 7.379110251450677e-07, "loss": 0.0044, "reward": 1.6649059057235718, "reward_std": 0.104056216776371, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6649059057235718, "step": 813 }, { "completion_length": 117.0, "epoch": 0.524822695035461, "grad_norm": 31.27338218688965, "kl": 0.1220703125, "learning_rate": 7.375886524822694e-07, "loss": 0.0049, "reward": 1.5679078698158264, "reward_std": 0.15254487842321396, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5679078996181488, "step": 814 }, { "completion_length": 134.265625, "epoch": 0.5254674403610574, "grad_norm": 13.405317306518555, "kl": 0.11083984375, "learning_rate": 7.372662798194713e-07, "loss": 0.0044, "reward": 1.5193554759025574, "reward_std": 0.09003760293126106, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.519355446100235, "step": 815 }, { "completion_length": 129.1875, "epoch": 0.5261121856866537, "grad_norm": 15.608565330505371, "kl": 0.113037109375, "learning_rate": 7.369439071566731e-07, "loss": 0.0045, "reward": 1.6685249209403992, "reward_std": 0.09597417712211609, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6685249507427216, "step": 816 }, { "completion_length": 115.84375, "epoch": 0.5267569310122502, "grad_norm": 9.022123336791992, "kl": 0.1123046875, "learning_rate": 7.366215344938749e-07, "loss": 0.0045, "reward": 1.4091280698776245, "reward_std": 0.0891311950981617, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4091280996799469, "step": 817 }, { "completion_length": 132.15625, "epoch": 0.5274016763378465, "grad_norm": 16.895809173583984, "kl": 0.114990234375, "learning_rate": 7.362991618310766e-07, "loss": 0.0046, "reward": 1.518994688987732, "reward_std": 0.1304328851401806, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5189946293830872, "step": 818 }, { "completion_length": 114.328125, "epoch": 0.528046421663443, "grad_norm": 10.180063247680664, "kl": 0.12353515625, "learning_rate": 7.359767891682786e-07, "loss": 0.0049, "reward": 1.5339825749397278, "reward_std": 0.14155717194080353, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5339825749397278, "step": 819 }, { "completion_length": 132.6875, "epoch": 0.5286911669890393, "grad_norm": 15.447187423706055, "kl": 0.104736328125, "learning_rate": 7.356544165054803e-07, "loss": 0.0042, "reward": 1.4741751551628113, "reward_std": 0.09367522969841957, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47417517751455307, "step": 820 }, { "completion_length": 123.796875, "epoch": 0.5293359123146357, "grad_norm": 7.803520202636719, "kl": 0.119384765625, "learning_rate": 7.353320438426821e-07, "loss": 0.0048, "reward": 1.5355405807495117, "reward_std": 0.04676287993788719, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5355406105518341, "step": 821 }, { "completion_length": 127.78125, "epoch": 0.5299806576402321, "grad_norm": 17.904590606689453, "kl": 0.1162109375, "learning_rate": 7.350096711798838e-07, "loss": 0.0046, "reward": 1.5604600310325623, "reward_std": 0.07631072774529457, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5604600012302399, "step": 822 }, { "completion_length": 124.8125, "epoch": 0.5306254029658285, "grad_norm": 13.199056625366211, "kl": 0.105712890625, "learning_rate": 7.346872985170857e-07, "loss": 0.0042, "reward": 1.3325340151786804, "reward_std": 0.055142441764473915, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3325340151786804, "step": 823 }, { "completion_length": 125.21875, "epoch": 0.5312701482914249, "grad_norm": 12.559577941894531, "kl": 0.098388671875, "learning_rate": 7.343649258542875e-07, "loss": 0.0039, "reward": 1.5847796201705933, "reward_std": 0.08421425521373749, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5847796201705933, "step": 824 }, { "completion_length": 120.46875, "epoch": 0.5319148936170213, "grad_norm": 9.962425231933594, "kl": 0.13232421875, "learning_rate": 7.340425531914893e-07, "loss": 0.0053, "reward": 1.7481776475906372, "reward_std": 0.14498769491910934, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7481776475906372, "step": 825 }, { "completion_length": 117.0, "epoch": 0.5325596389426177, "grad_norm": 12.92812442779541, "kl": 0.114013671875, "learning_rate": 7.337201805286911e-07, "loss": 0.0046, "reward": 1.5807170271873474, "reward_std": 0.14974131435155869, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5807170867919922, "step": 826 }, { "completion_length": 116.578125, "epoch": 0.533204384268214, "grad_norm": 15.796828269958496, "kl": 0.113037109375, "learning_rate": 7.333978078658929e-07, "loss": 0.0045, "reward": 1.6642200946807861, "reward_std": 0.11426962912082672, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6642200648784637, "step": 827 }, { "completion_length": 110.4375, "epoch": 0.5338491295938105, "grad_norm": 38.065834045410156, "kl": 0.12548828125, "learning_rate": 7.330754352030947e-07, "loss": 0.005, "reward": 1.557353138923645, "reward_std": 0.0908035859465599, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5573531538248062, "step": 828 }, { "completion_length": 121.828125, "epoch": 0.5344938749194068, "grad_norm": 11.057354927062988, "kl": 0.1044921875, "learning_rate": 7.327530625402966e-07, "loss": 0.0042, "reward": 1.5736215710639954, "reward_std": 0.1010180115699768, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5736216008663177, "step": 829 }, { "completion_length": 120.046875, "epoch": 0.5351386202450032, "grad_norm": 24.34914207458496, "kl": 0.1357421875, "learning_rate": 7.324306898774983e-07, "loss": 0.0054, "reward": 1.3760216236114502, "reward_std": 0.05299963988363743, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.37602153420448303, "step": 830 }, { "completion_length": 118.734375, "epoch": 0.5357833655705996, "grad_norm": 10.806406021118164, "kl": 0.1083984375, "learning_rate": 7.321083172147001e-07, "loss": 0.0043, "reward": 1.3271194100379944, "reward_std": 0.04556073807179928, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3271194323897362, "step": 831 }, { "completion_length": 128.125, "epoch": 0.536428110896196, "grad_norm": 16.560606002807617, "kl": 0.12060546875, "learning_rate": 7.31785944551902e-07, "loss": 0.0048, "reward": 1.6746159791946411, "reward_std": 0.09150423854589462, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6746159195899963, "step": 832 }, { "completion_length": 129.953125, "epoch": 0.5370728562217923, "grad_norm": 14.166282653808594, "kl": 0.1298828125, "learning_rate": 7.314635718891038e-07, "loss": 0.0052, "reward": 1.7240188121795654, "reward_std": 0.11212188005447388, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7240188419818878, "step": 833 }, { "completion_length": 124.140625, "epoch": 0.5377176015473888, "grad_norm": 46.64698028564453, "kl": 0.11962890625, "learning_rate": 7.311411992263055e-07, "loss": 0.0048, "reward": 1.6080630421638489, "reward_std": 0.11205699667334557, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6080630421638489, "step": 834 }, { "completion_length": 119.90625, "epoch": 0.5383623468729851, "grad_norm": 8.266460418701172, "kl": 0.10791015625, "learning_rate": 7.308188265635074e-07, "loss": 0.0043, "reward": 1.5985249280929565, "reward_std": 0.11095097288489342, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5985249280929565, "step": 835 }, { "completion_length": 121.65625, "epoch": 0.5390070921985816, "grad_norm": 47.86563491821289, "kl": 0.117919921875, "learning_rate": 7.304964539007092e-07, "loss": 0.0047, "reward": 1.4885122179985046, "reward_std": 0.07481271028518677, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48851217329502106, "step": 836 }, { "completion_length": 125.828125, "epoch": 0.539651837524178, "grad_norm": 21.55581283569336, "kl": 0.12255859375, "learning_rate": 7.30174081237911e-07, "loss": 0.0049, "reward": 1.594322383403778, "reward_std": 0.14254087209701538, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5943224132061005, "step": 837 }, { "completion_length": 114.328125, "epoch": 0.5402965828497743, "grad_norm": 16.470298767089844, "kl": 0.14453125, "learning_rate": 7.298517085751128e-07, "loss": 0.0058, "reward": 1.6632260084152222, "reward_std": 0.12153082340955734, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6632260978221893, "step": 838 }, { "completion_length": 116.84375, "epoch": 0.5409413281753708, "grad_norm": 10.931767463684082, "kl": 0.108154296875, "learning_rate": 7.295293359123146e-07, "loss": 0.0043, "reward": 1.6424379348754883, "reward_std": 0.15779002383351326, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6580629050731659, "step": 839 }, { "completion_length": 111.703125, "epoch": 0.5415860735009671, "grad_norm": 13.211544036865234, "kl": 0.112060546875, "learning_rate": 7.292069632495163e-07, "loss": 0.0045, "reward": 1.555619239807129, "reward_std": 0.11191636323928833, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5712441802024841, "step": 840 }, { "completion_length": 110.90625, "epoch": 0.5422308188265635, "grad_norm": 9.324379920959473, "kl": 0.145263671875, "learning_rate": 7.288845905867183e-07, "loss": 0.0058, "reward": 1.5834694504737854, "reward_std": 0.09625018388032913, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5834693908691406, "step": 841 }, { "completion_length": 121.515625, "epoch": 0.5428755641521599, "grad_norm": 17.784393310546875, "kl": 0.15380859375, "learning_rate": 7.2856221792392e-07, "loss": 0.0062, "reward": 1.4552746415138245, "reward_std": 0.10180060565471649, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45527467131614685, "step": 842 }, { "completion_length": 113.875, "epoch": 0.5435203094777563, "grad_norm": 14.1525239944458, "kl": 0.1171875, "learning_rate": 7.282398452611218e-07, "loss": 0.0047, "reward": 1.5815837383270264, "reward_std": 0.06838434562087059, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5815837979316711, "step": 843 }, { "completion_length": 111.8125, "epoch": 0.5441650548033526, "grad_norm": 14.385869026184082, "kl": 0.134765625, "learning_rate": 7.279174725983235e-07, "loss": 0.0054, "reward": 1.5503296852111816, "reward_std": 0.08688557147979736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5503296554088593, "step": 844 }, { "completion_length": 108.9375, "epoch": 0.5448098001289491, "grad_norm": 13.819561958312988, "kl": 0.1298828125, "learning_rate": 7.275950999355255e-07, "loss": 0.0052, "reward": 1.3759878873825073, "reward_std": 0.07583236694335938, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3759878873825073, "step": 845 }, { "completion_length": 113.078125, "epoch": 0.5454545454545454, "grad_norm": 13.309516906738281, "kl": 0.11767578125, "learning_rate": 7.272727272727272e-07, "loss": 0.0047, "reward": 1.5513026714324951, "reward_std": 0.0639763344079256, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5513026863336563, "step": 846 }, { "completion_length": 125.8125, "epoch": 0.5460992907801419, "grad_norm": 21.36383056640625, "kl": 0.115966796875, "learning_rate": 7.26950354609929e-07, "loss": 0.0046, "reward": 1.4737462997436523, "reward_std": 0.11164987832307816, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47374628484249115, "step": 847 }, { "completion_length": 132.8125, "epoch": 0.5467440361057382, "grad_norm": 10.429849624633789, "kl": 0.107177734375, "learning_rate": 7.266279819471308e-07, "loss": 0.0043, "reward": 1.6067231893539429, "reward_std": 0.10046634450554848, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6067231893539429, "step": 848 }, { "completion_length": 118.578125, "epoch": 0.5473887814313346, "grad_norm": 20.214109420776367, "kl": 0.13818359375, "learning_rate": 7.263056092843327e-07, "loss": 0.0055, "reward": 1.4823689460754395, "reward_std": 0.10195580497384071, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48236894607543945, "step": 849 }, { "completion_length": 105.421875, "epoch": 0.5480335267569311, "grad_norm": 15.529704093933105, "kl": 0.146484375, "learning_rate": 7.259832366215344e-07, "loss": 0.0059, "reward": 1.6683238744735718, "reward_std": 0.13429173827171326, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.668323814868927, "step": 850 }, { "completion_length": 107.359375, "epoch": 0.5486782720825274, "grad_norm": 15.431048393249512, "kl": 0.109375, "learning_rate": 7.256608639587363e-07, "loss": 0.0044, "reward": 1.4428613781929016, "reward_std": 0.046914756298065186, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44286131858825684, "step": 851 }, { "completion_length": 104.890625, "epoch": 0.5493230174081238, "grad_norm": 20.874425888061523, "kl": 0.14306640625, "learning_rate": 7.25338491295938e-07, "loss": 0.0057, "reward": 1.5059730410575867, "reward_std": 0.10780579224228859, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5059730112552643, "step": 852 }, { "completion_length": 122.0625, "epoch": 0.5499677627337202, "grad_norm": 9.9263277053833, "kl": 0.122314453125, "learning_rate": 7.250161186331398e-07, "loss": 0.0049, "reward": 1.385474145412445, "reward_std": 0.06568328104913235, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3854740560054779, "step": 853 }, { "completion_length": 119.046875, "epoch": 0.5506125080593166, "grad_norm": 33.775062561035156, "kl": 0.152587890625, "learning_rate": 7.246937459703417e-07, "loss": 0.0061, "reward": 1.6085370779037476, "reward_std": 0.14623812586069107, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6085370779037476, "step": 854 }, { "completion_length": 126.625, "epoch": 0.5512572533849129, "grad_norm": 13.095032691955566, "kl": 0.11376953125, "learning_rate": 7.243713733075435e-07, "loss": 0.0046, "reward": 1.6513434052467346, "reward_std": 0.14508388936519623, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.666968435049057, "step": 855 }, { "completion_length": 113.0, "epoch": 0.5519019987105094, "grad_norm": 14.717634201049805, "kl": 0.113037109375, "learning_rate": 7.240490006447452e-07, "loss": 0.0045, "reward": 1.5023114681243896, "reward_std": 0.07279381342232227, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5023114830255508, "step": 856 }, { "completion_length": 125.484375, "epoch": 0.5525467440361057, "grad_norm": 12.679544448852539, "kl": 0.113525390625, "learning_rate": 7.237266279819471e-07, "loss": 0.0045, "reward": 1.5734071135520935, "reward_std": 0.052911147475242615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5734071433544159, "step": 857 }, { "completion_length": 112.21875, "epoch": 0.5531914893617021, "grad_norm": 22.410959243774414, "kl": 0.10791015625, "learning_rate": 7.23404255319149e-07, "loss": 0.0043, "reward": 1.518027424812317, "reward_std": 0.054683832451701164, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5180274546146393, "step": 858 }, { "completion_length": 117.359375, "epoch": 0.5538362346872985, "grad_norm": 8.975079536437988, "kl": 0.11083984375, "learning_rate": 7.230818826563507e-07, "loss": 0.0044, "reward": 1.531017243862152, "reward_std": 0.1474187821149826, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5466422140598297, "step": 859 }, { "completion_length": 120.359375, "epoch": 0.5544809800128949, "grad_norm": 30.06081199645996, "kl": 0.125, "learning_rate": 7.227595099935525e-07, "loss": 0.005, "reward": 1.472097635269165, "reward_std": 0.12772921100258827, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4720976650714874, "step": 860 }, { "completion_length": 121.296875, "epoch": 0.5551257253384912, "grad_norm": 7.828312397003174, "kl": 0.111328125, "learning_rate": 7.224371373307543e-07, "loss": 0.0045, "reward": 1.5114521980285645, "reward_std": 0.09453960880637169, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5114522278308868, "step": 861 }, { "completion_length": 120.09375, "epoch": 0.5557704706640877, "grad_norm": 42.72480773925781, "kl": 0.1083984375, "learning_rate": 7.221147646679562e-07, "loss": 0.0043, "reward": 1.5521915555000305, "reward_std": 0.07748100534081459, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5521914958953857, "step": 862 }, { "completion_length": 127.5625, "epoch": 0.556415215989684, "grad_norm": 14.040475845336914, "kl": 0.143310546875, "learning_rate": 7.21792392005158e-07, "loss": 0.0057, "reward": 1.5281696319580078, "reward_std": 0.10375509038567543, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.528169646859169, "step": 863 }, { "completion_length": 125.4375, "epoch": 0.5570599613152805, "grad_norm": 10.69442367553711, "kl": 0.123046875, "learning_rate": 7.214700193423598e-07, "loss": 0.0049, "reward": 1.612160563468933, "reward_std": 0.14196538925170898, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6277855336666107, "step": 864 }, { "completion_length": 113.03125, "epoch": 0.5577047066408769, "grad_norm": 12.548157691955566, "kl": 0.093017578125, "learning_rate": 7.211476466795615e-07, "loss": 0.0037, "reward": 1.5996488332748413, "reward_std": 0.08111307956278324, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5996488332748413, "step": 865 }, { "completion_length": 118.4375, "epoch": 0.5583494519664732, "grad_norm": 19.041934967041016, "kl": 0.10791015625, "learning_rate": 7.208252740167632e-07, "loss": 0.0043, "reward": 1.5888526439666748, "reward_std": 0.10432974994182587, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5888526439666748, "step": 866 }, { "completion_length": 104.59375, "epoch": 0.5589941972920697, "grad_norm": 44.237117767333984, "kl": 0.1376953125, "learning_rate": 7.205029013539652e-07, "loss": 0.0055, "reward": 1.6344898343086243, "reward_std": 0.08369472622871399, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6344898343086243, "step": 867 }, { "completion_length": 100.75, "epoch": 0.559638942617666, "grad_norm": 12.805983543395996, "kl": 0.146240234375, "learning_rate": 7.20180528691167e-07, "loss": 0.0058, "reward": 1.6388565301895142, "reward_std": 0.055887432768940926, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6388565599918365, "step": 868 }, { "completion_length": 111.90625, "epoch": 0.5602836879432624, "grad_norm": 14.531253814697266, "kl": 0.115478515625, "learning_rate": 7.198581560283687e-07, "loss": 0.0046, "reward": 1.6537020206451416, "reward_std": 0.10696634277701378, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.653702050447464, "step": 869 }, { "completion_length": 107.546875, "epoch": 0.5609284332688588, "grad_norm": 18.164827346801758, "kl": 0.13525390625, "learning_rate": 7.195357833655706e-07, "loss": 0.0054, "reward": 1.6144909858703613, "reward_std": 0.1230674758553505, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6144909858703613, "step": 870 }, { "completion_length": 103.453125, "epoch": 0.5615731785944552, "grad_norm": 14.088841438293457, "kl": 0.13232421875, "learning_rate": 7.192134107027724e-07, "loss": 0.0053, "reward": 1.6711689233779907, "reward_std": 0.10582815483212471, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.671168863773346, "step": 871 }, { "completion_length": 100.109375, "epoch": 0.5622179239200515, "grad_norm": 17.895931243896484, "kl": 0.11865234375, "learning_rate": 7.188910380399742e-07, "loss": 0.0048, "reward": 1.6541008949279785, "reward_std": 0.14635222405195236, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6697258949279785, "step": 872 }, { "completion_length": 101.234375, "epoch": 0.562862669245648, "grad_norm": 11.591320991516113, "kl": 0.138916015625, "learning_rate": 7.18568665377176e-07, "loss": 0.0055, "reward": 1.6059136390686035, "reward_std": 0.08753130957484245, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6059136092662811, "step": 873 }, { "completion_length": 103.953125, "epoch": 0.5635074145712443, "grad_norm": 10.312195777893066, "kl": 0.151123046875, "learning_rate": 7.182462927143778e-07, "loss": 0.006, "reward": 1.5342779755592346, "reward_std": 0.05523178353905678, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5342779606580734, "step": 874 }, { "completion_length": 106.34375, "epoch": 0.5641521598968408, "grad_norm": 240.9861602783203, "kl": 0.134521484375, "learning_rate": 7.179239200515796e-07, "loss": 0.0054, "reward": 1.5363209247589111, "reward_std": 0.12755867093801498, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5519458651542664, "step": 875 }, { "completion_length": 99.359375, "epoch": 0.5647969052224371, "grad_norm": 26.704421997070312, "kl": 0.14892578125, "learning_rate": 7.176015473887815e-07, "loss": 0.006, "reward": 1.6137511134147644, "reward_std": 0.0752146728336811, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6137511432170868, "step": 876 }, { "completion_length": 117.0, "epoch": 0.5654416505480335, "grad_norm": 15.467166900634766, "kl": 0.22119140625, "learning_rate": 7.172791747259832e-07, "loss": 0.0088, "reward": 1.5769726037979126, "reward_std": 0.12701868265867233, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5769726037979126, "step": 877 }, { "completion_length": 108.671875, "epoch": 0.56608639587363, "grad_norm": 18.265071868896484, "kl": 0.115478515625, "learning_rate": 7.16956802063185e-07, "loss": 0.0046, "reward": 1.6923354864120483, "reward_std": 0.07585858926177025, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6923355162143707, "step": 878 }, { "completion_length": 113.578125, "epoch": 0.5667311411992263, "grad_norm": 9.981937408447266, "kl": 0.120361328125, "learning_rate": 7.166344294003868e-07, "loss": 0.0048, "reward": 1.5243776440620422, "reward_std": 0.09604435786604881, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.524377629160881, "step": 879 }, { "completion_length": 114.0625, "epoch": 0.5673758865248227, "grad_norm": 24.6324520111084, "kl": 0.20849609375, "learning_rate": 7.163120567375887e-07, "loss": 0.0084, "reward": 1.6049633622169495, "reward_std": 0.08231936953961849, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6049633026123047, "step": 880 }, { "completion_length": 109.015625, "epoch": 0.5680206318504191, "grad_norm": 10.54612922668457, "kl": 0.1181640625, "learning_rate": 7.159896840747904e-07, "loss": 0.0047, "reward": 1.2566799521446228, "reward_std": 0.06597771868109703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2566799595952034, "step": 881 }, { "completion_length": 113.859375, "epoch": 0.5686653771760155, "grad_norm": 11.331385612487793, "kl": 0.14111328125, "learning_rate": 7.156673114119923e-07, "loss": 0.0056, "reward": 1.6598809957504272, "reward_std": 0.08520251885056496, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6598810255527496, "step": 882 }, { "completion_length": 115.703125, "epoch": 0.5693101225016118, "grad_norm": 16.860124588012695, "kl": 0.10009765625, "learning_rate": 7.15344938749194e-07, "loss": 0.004, "reward": 1.6775421500205994, "reward_std": 0.09860784560441971, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6775421500205994, "step": 883 }, { "completion_length": 103.21875, "epoch": 0.5699548678272083, "grad_norm": 33.44534683227539, "kl": 0.13330078125, "learning_rate": 7.150225660863959e-07, "loss": 0.0053, "reward": 1.5252437591552734, "reward_std": 0.06859505549073219, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5252437740564346, "step": 884 }, { "completion_length": 117.265625, "epoch": 0.5705996131528046, "grad_norm": 12.154504776000977, "kl": 0.137451171875, "learning_rate": 7.147001934235977e-07, "loss": 0.0055, "reward": 1.6008020639419556, "reward_std": 0.12660085037350655, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6008020639419556, "step": 885 }, { "completion_length": 121.328125, "epoch": 0.571244358478401, "grad_norm": 18.56639862060547, "kl": 0.11767578125, "learning_rate": 7.143778207607995e-07, "loss": 0.0047, "reward": 1.6193873286247253, "reward_std": 0.10898285359144211, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.619387298822403, "step": 886 }, { "completion_length": 130.421875, "epoch": 0.5718891038039974, "grad_norm": 174.48056030273438, "kl": 0.123291015625, "learning_rate": 7.140554480980012e-07, "loss": 0.0049, "reward": 1.5258971452713013, "reward_std": 0.05683533661067486, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5258970856666565, "step": 887 }, { "completion_length": 111.125, "epoch": 0.5725338491295938, "grad_norm": 7.736058712005615, "kl": 0.114501953125, "learning_rate": 7.137330754352032e-07, "loss": 0.0046, "reward": 1.6826195120811462, "reward_std": 0.13724345713853836, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6982445120811462, "step": 888 }, { "completion_length": 120.171875, "epoch": 0.5731785944551901, "grad_norm": 45.026058197021484, "kl": 0.13623046875, "learning_rate": 7.134107027724049e-07, "loss": 0.0055, "reward": 1.5901039242744446, "reward_std": 0.16770201176404953, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5901040136814117, "step": 889 }, { "completion_length": 113.640625, "epoch": 0.5738233397807866, "grad_norm": 9.868265151977539, "kl": 0.126708984375, "learning_rate": 7.130883301096067e-07, "loss": 0.0051, "reward": 1.4987885355949402, "reward_std": 0.14777783304452896, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4987884759902954, "step": 890 }, { "completion_length": 103.046875, "epoch": 0.574468085106383, "grad_norm": 16.197307586669922, "kl": 0.12451171875, "learning_rate": 7.127659574468084e-07, "loss": 0.005, "reward": 1.5383508205413818, "reward_std": 0.09310048446059227, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5383508205413818, "step": 891 }, { "completion_length": 115.203125, "epoch": 0.5751128304319794, "grad_norm": 7.911108016967773, "kl": 0.133056640625, "learning_rate": 7.124435847840103e-07, "loss": 0.0053, "reward": 1.6043503284454346, "reward_std": 0.12782978266477585, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6043503284454346, "step": 892 }, { "completion_length": 110.71875, "epoch": 0.5757575757575758, "grad_norm": 49.204200744628906, "kl": 0.135498046875, "learning_rate": 7.121212121212121e-07, "loss": 0.0054, "reward": 1.557990849018097, "reward_std": 0.14290058612823486, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5579908788204193, "step": 893 }, { "completion_length": 119.65625, "epoch": 0.5764023210831721, "grad_norm": 66.35208129882812, "kl": 0.10546875, "learning_rate": 7.117988394584139e-07, "loss": 0.0042, "reward": 1.5070759654045105, "reward_std": 0.1017339937388897, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5070759654045105, "step": 894 }, { "completion_length": 106.046875, "epoch": 0.5770470664087686, "grad_norm": 10.324350357055664, "kl": 0.12451171875, "learning_rate": 7.114764667956157e-07, "loss": 0.005, "reward": 1.5135289430618286, "reward_std": 0.18148383498191833, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5291539430618286, "step": 895 }, { "completion_length": 121.34375, "epoch": 0.5776918117343649, "grad_norm": 25.454736709594727, "kl": 0.124267578125, "learning_rate": 7.111540941328175e-07, "loss": 0.005, "reward": 1.5388038158416748, "reward_std": 0.09261330217123032, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5388037711381912, "step": 896 }, { "completion_length": 113.0625, "epoch": 0.5783365570599613, "grad_norm": 7.527803421020508, "kl": 0.12060546875, "learning_rate": 7.108317214700193e-07, "loss": 0.0048, "reward": 1.6463514566421509, "reward_std": 0.1266507338732481, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6463514566421509, "step": 897 }, { "completion_length": 115.125, "epoch": 0.5789813023855577, "grad_norm": 51.9161262512207, "kl": 0.1162109375, "learning_rate": 7.105093488072212e-07, "loss": 0.0046, "reward": 1.5849735736846924, "reward_std": 0.08285956270992756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5849734991788864, "step": 898 }, { "completion_length": 114.75, "epoch": 0.5796260477111541, "grad_norm": 14.701598167419434, "kl": 0.164306640625, "learning_rate": 7.101869761444229e-07, "loss": 0.0066, "reward": 1.4290552139282227, "reward_std": 0.11476770415902138, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.44468019902706146, "step": 899 }, { "completion_length": 118.921875, "epoch": 0.5802707930367504, "grad_norm": 7.526551246643066, "kl": 0.11865234375, "learning_rate": 7.098646034816247e-07, "loss": 0.0047, "reward": 1.523693561553955, "reward_std": 0.06077193468809128, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5236935317516327, "step": 900 }, { "completion_length": 116.796875, "epoch": 0.5809155383623469, "grad_norm": 12.488341331481934, "kl": 0.1728515625, "learning_rate": 7.095422308188266e-07, "loss": 0.0069, "reward": 1.5757187604904175, "reward_std": 0.11441710963845253, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5757187008857727, "step": 901 }, { "completion_length": 114.453125, "epoch": 0.5815602836879432, "grad_norm": 25.193092346191406, "kl": 0.125, "learning_rate": 7.092198581560284e-07, "loss": 0.005, "reward": 1.601715087890625, "reward_std": 0.08856208249926567, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6017151474952698, "step": 902 }, { "completion_length": 124.03125, "epoch": 0.5822050290135397, "grad_norm": 11.461077690124512, "kl": 0.12255859375, "learning_rate": 7.088974854932301e-07, "loss": 0.0049, "reward": 1.4974235892295837, "reward_std": 0.12268773466348648, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49742357432842255, "step": 903 }, { "completion_length": 116.640625, "epoch": 0.5828497743391361, "grad_norm": 17.099470138549805, "kl": 0.130615234375, "learning_rate": 7.08575112830432e-07, "loss": 0.0052, "reward": 1.499584436416626, "reward_std": 0.05868752859532833, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4995844215154648, "step": 904 }, { "completion_length": 131.15625, "epoch": 0.5834945196647324, "grad_norm": 19.47797966003418, "kl": 0.123291015625, "learning_rate": 7.082527401676337e-07, "loss": 0.0049, "reward": 1.4962482452392578, "reward_std": 0.08062171936035156, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4962482154369354, "step": 905 }, { "completion_length": 114.4375, "epoch": 0.5841392649903289, "grad_norm": 12.948217391967773, "kl": 0.1376953125, "learning_rate": 7.079303675048356e-07, "loss": 0.0055, "reward": 1.5855439901351929, "reward_std": 0.07610942982137203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5855439305305481, "step": 906 }, { "completion_length": 122.375, "epoch": 0.5847840103159252, "grad_norm": 11.65310287475586, "kl": 0.13427734375, "learning_rate": 7.076079948420374e-07, "loss": 0.0054, "reward": 1.4916418194770813, "reward_std": 0.06596004217863083, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4916418194770813, "step": 907 }, { "completion_length": 122.03125, "epoch": 0.5854287556415216, "grad_norm": 9.089468955993652, "kl": 0.13623046875, "learning_rate": 7.072856221792392e-07, "loss": 0.0054, "reward": 1.5869945883750916, "reward_std": 0.09673259779810905, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5869945287704468, "step": 908 }, { "completion_length": 125.390625, "epoch": 0.586073500967118, "grad_norm": 13.814423561096191, "kl": 0.125732421875, "learning_rate": 7.069632495164409e-07, "loss": 0.005, "reward": 1.4843732714653015, "reward_std": 0.10544757917523384, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4843732565641403, "step": 909 }, { "completion_length": 118.09375, "epoch": 0.5867182462927144, "grad_norm": 12.298931121826172, "kl": 0.122802734375, "learning_rate": 7.066408768536429e-07, "loss": 0.0049, "reward": 1.576155424118042, "reward_std": 0.10403759777545929, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5761553943157196, "step": 910 }, { "completion_length": 136.484375, "epoch": 0.5873629916183107, "grad_norm": 17.558902740478516, "kl": 0.1494140625, "learning_rate": 7.063185041908446e-07, "loss": 0.006, "reward": 1.5917853116989136, "reward_std": 0.1398775726556778, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5917853713035583, "step": 911 }, { "completion_length": 117.828125, "epoch": 0.5880077369439072, "grad_norm": 14.441729545593262, "kl": 0.116455078125, "learning_rate": 7.059961315280464e-07, "loss": 0.0047, "reward": 1.538801908493042, "reward_std": 0.08991247788071632, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.538801908493042, "step": 912 }, { "completion_length": 114.21875, "epoch": 0.5886524822695035, "grad_norm": 8.804062843322754, "kl": 0.1318359375, "learning_rate": 7.056737588652481e-07, "loss": 0.0053, "reward": 1.6029040813446045, "reward_std": 0.1059550866484642, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6029041111469269, "step": 913 }, { "completion_length": 133.09375, "epoch": 0.5892972275951, "grad_norm": 31.393301010131836, "kl": 0.14501953125, "learning_rate": 7.053513862024501e-07, "loss": 0.0058, "reward": 1.5376715660095215, "reward_std": 0.07115558534860611, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5376715958118439, "step": 914 }, { "completion_length": 120.140625, "epoch": 0.5899419729206963, "grad_norm": 11.773250579833984, "kl": 0.115234375, "learning_rate": 7.050290135396518e-07, "loss": 0.0046, "reward": 1.6650592684745789, "reward_std": 0.061884213238954544, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6650592386722565, "step": 915 }, { "completion_length": 116.359375, "epoch": 0.5905867182462927, "grad_norm": 11.13023567199707, "kl": 0.1357421875, "learning_rate": 7.047066408768536e-07, "loss": 0.0054, "reward": 1.3285646438598633, "reward_std": 0.15595447272062302, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.3441896289587021, "step": 916 }, { "completion_length": 109.796875, "epoch": 0.591231463571889, "grad_norm": 17.604372024536133, "kl": 0.123779296875, "learning_rate": 7.043842682140554e-07, "loss": 0.005, "reward": 1.4047325253486633, "reward_std": 0.07474876940250397, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.40473249554634094, "step": 917 }, { "completion_length": 107.1875, "epoch": 0.5918762088974855, "grad_norm": 17.5688533782959, "kl": 0.130859375, "learning_rate": 7.040618955512572e-07, "loss": 0.0052, "reward": 1.6198034286499023, "reward_std": 0.15777599066495895, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6198033392429352, "step": 918 }, { "completion_length": 102.328125, "epoch": 0.5925209542230819, "grad_norm": 17.94205665588379, "kl": 0.166259765625, "learning_rate": 7.03739522888459e-07, "loss": 0.0067, "reward": 1.4272202253341675, "reward_std": 0.05302144214510918, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42722028493881226, "step": 919 }, { "completion_length": 104.921875, "epoch": 0.5931656995486783, "grad_norm": 14.72970962524414, "kl": 0.1484375, "learning_rate": 7.034171502256609e-07, "loss": 0.0059, "reward": 1.7611448764801025, "reward_std": 0.08485203981399536, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7611448764801025, "step": 920 }, { "completion_length": 111.671875, "epoch": 0.5938104448742747, "grad_norm": 22.33138084411621, "kl": 0.14404296875, "learning_rate": 7.030947775628626e-07, "loss": 0.0058, "reward": 1.6409417390823364, "reward_std": 0.10936198756098747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6409417390823364, "step": 921 }, { "completion_length": 106.65625, "epoch": 0.594455190199871, "grad_norm": 14.650537490844727, "kl": 0.134765625, "learning_rate": 7.027724049000644e-07, "loss": 0.0054, "reward": 1.5477882027626038, "reward_std": 0.10123782977461815, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5477882623672485, "step": 922 }, { "completion_length": 113.265625, "epoch": 0.5950999355254675, "grad_norm": 13.721552848815918, "kl": 0.13720703125, "learning_rate": 7.024500322372663e-07, "loss": 0.0055, "reward": 1.5293274521827698, "reward_std": 0.09637310355901718, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5293275117874146, "step": 923 }, { "completion_length": 104.375, "epoch": 0.5957446808510638, "grad_norm": 14.026667594909668, "kl": 0.116455078125, "learning_rate": 7.021276595744681e-07, "loss": 0.0047, "reward": 1.4343553185462952, "reward_std": 0.057373858988285065, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43435533344745636, "step": 924 }, { "completion_length": 104.265625, "epoch": 0.5963894261766602, "grad_norm": 16.487895965576172, "kl": 0.1181640625, "learning_rate": 7.018052869116698e-07, "loss": 0.0047, "reward": 1.6196611523628235, "reward_std": 0.08072704821825027, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6196612119674683, "step": 925 }, { "completion_length": 102.171875, "epoch": 0.5970341715022566, "grad_norm": 23.069528579711914, "kl": 0.12158203125, "learning_rate": 7.014829142488717e-07, "loss": 0.0049, "reward": 1.632835030555725, "reward_std": 0.08614074625074863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6328349560499191, "step": 926 }, { "completion_length": 113.53125, "epoch": 0.597678916827853, "grad_norm": 27.1119327545166, "kl": 0.12548828125, "learning_rate": 7.011605415860735e-07, "loss": 0.005, "reward": 1.4445176720619202, "reward_std": 0.14256715029478073, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44451767206192017, "step": 927 }, { "completion_length": 97.171875, "epoch": 0.5983236621534493, "grad_norm": 19.736724853515625, "kl": 0.13134765625, "learning_rate": 7.008381689232753e-07, "loss": 0.0052, "reward": 1.601191520690918, "reward_std": 0.07426881790161133, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6011915057897568, "step": 928 }, { "completion_length": 95.65625, "epoch": 0.5989684074790458, "grad_norm": 14.408288955688477, "kl": 0.15185546875, "learning_rate": 7.005157962604771e-07, "loss": 0.0061, "reward": 1.5045355558395386, "reward_std": 0.05852143466472626, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5045354962348938, "step": 929 }, { "completion_length": 111.78125, "epoch": 0.5996131528046421, "grad_norm": 14.51344108581543, "kl": 0.13134765625, "learning_rate": 7.001934235976789e-07, "loss": 0.0053, "reward": 1.6496663093566895, "reward_std": 0.07880590856075287, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6496663391590118, "step": 930 }, { "completion_length": 96.875, "epoch": 0.6002578981302386, "grad_norm": 27.438138961791992, "kl": 0.1298828125, "learning_rate": 6.998710509348806e-07, "loss": 0.0052, "reward": 1.567421317100525, "reward_std": 0.058061445131897926, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5674213171005249, "step": 931 }, { "completion_length": 99.703125, "epoch": 0.600902643455835, "grad_norm": 12.056415557861328, "kl": 0.126708984375, "learning_rate": 6.995486782720826e-07, "loss": 0.0051, "reward": 1.5093879699707031, "reward_std": 0.09708208404481411, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.509387880563736, "step": 932 }, { "completion_length": 92.359375, "epoch": 0.6015473887814313, "grad_norm": 32.935951232910156, "kl": 0.15283203125, "learning_rate": 6.992263056092843e-07, "loss": 0.0061, "reward": 1.6477266550064087, "reward_std": 0.11407394707202911, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6477266252040863, "step": 933 }, { "completion_length": 96.328125, "epoch": 0.6021921341070278, "grad_norm": 7.755570888519287, "kl": 0.143798828125, "learning_rate": 6.989039329464861e-07, "loss": 0.0058, "reward": 1.8171811699867249, "reward_std": 0.12367031536996365, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.8171811997890472, "step": 934 }, { "completion_length": 93.75, "epoch": 0.6028368794326241, "grad_norm": 25.14842987060547, "kl": 0.124755859375, "learning_rate": 6.985815602836878e-07, "loss": 0.005, "reward": 1.349519431591034, "reward_std": 0.05427607707679272, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3495194613933563, "step": 935 }, { "completion_length": 93.921875, "epoch": 0.6034816247582205, "grad_norm": 56.53002166748047, "kl": 0.1259765625, "learning_rate": 6.982591876208898e-07, "loss": 0.005, "reward": 1.508469820022583, "reward_std": 0.04055575653910637, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5084697008132935, "step": 936 }, { "completion_length": 96.390625, "epoch": 0.6041263700838169, "grad_norm": 18.896730422973633, "kl": 0.14501953125, "learning_rate": 6.979368149580915e-07, "loss": 0.0058, "reward": 1.5613884329795837, "reward_std": 0.07465831749141216, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5613884329795837, "step": 937 }, { "completion_length": 94.53125, "epoch": 0.6047711154094133, "grad_norm": 28.830486297607422, "kl": 0.1220703125, "learning_rate": 6.976144422952933e-07, "loss": 0.0049, "reward": 1.5561841130256653, "reward_std": 0.05068258661776781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5561841726303101, "step": 938 }, { "completion_length": 89.59375, "epoch": 0.6054158607350096, "grad_norm": 20.040565490722656, "kl": 0.15966796875, "learning_rate": 6.972920696324951e-07, "loss": 0.0064, "reward": 1.5743331909179688, "reward_std": 0.07431630790233612, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5743332505226135, "step": 939 }, { "completion_length": 94.390625, "epoch": 0.6060606060606061, "grad_norm": 16.654029846191406, "kl": 0.16015625, "learning_rate": 6.96969696969697e-07, "loss": 0.0064, "reward": 1.7051098346710205, "reward_std": 0.1160728745162487, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7051098048686981, "step": 940 }, { "completion_length": 96.15625, "epoch": 0.6067053513862024, "grad_norm": 17.097835540771484, "kl": 0.167236328125, "learning_rate": 6.966473243068987e-07, "loss": 0.0067, "reward": 1.5818954706192017, "reward_std": 0.09519368968904018, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5818954259157181, "step": 941 }, { "completion_length": 90.3125, "epoch": 0.6073500967117988, "grad_norm": 17.13744354248047, "kl": 0.11962890625, "learning_rate": 6.963249516441006e-07, "loss": 0.0048, "reward": 1.6105692386627197, "reward_std": 0.11062491312623024, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6105693280696869, "step": 942 }, { "completion_length": 89.703125, "epoch": 0.6079948420373952, "grad_norm": 41.86802291870117, "kl": 0.1533203125, "learning_rate": 6.960025789813023e-07, "loss": 0.0061, "reward": 1.5191384553909302, "reward_std": 0.05962220951914787, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5191383957862854, "step": 943 }, { "completion_length": 99.4375, "epoch": 0.6086395873629916, "grad_norm": 53.935115814208984, "kl": 0.12451171875, "learning_rate": 6.956802063185042e-07, "loss": 0.005, "reward": 1.477087378501892, "reward_std": 0.07249218598008156, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4770873636007309, "step": 944 }, { "completion_length": 89.984375, "epoch": 0.6092843326885881, "grad_norm": 23.599111557006836, "kl": 0.13818359375, "learning_rate": 6.95357833655706e-07, "loss": 0.0055, "reward": 1.7460429668426514, "reward_std": 0.07097552344202995, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7460429072380066, "step": 945 }, { "completion_length": 92.765625, "epoch": 0.6099290780141844, "grad_norm": 18.278663635253906, "kl": 0.135498046875, "learning_rate": 6.950354609929078e-07, "loss": 0.0054, "reward": 1.7038219571113586, "reward_std": 0.07454410195350647, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.703821986913681, "step": 946 }, { "completion_length": 87.125, "epoch": 0.6105738233397808, "grad_norm": 25.546600341796875, "kl": 0.1328125, "learning_rate": 6.947130883301095e-07, "loss": 0.0053, "reward": 1.6144275069236755, "reward_std": 0.06358279846608639, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6144274771213531, "step": 947 }, { "completion_length": 89.40625, "epoch": 0.6112185686653772, "grad_norm": 23.264015197753906, "kl": 0.19189453125, "learning_rate": 6.943907156673114e-07, "loss": 0.0077, "reward": 1.5967767834663391, "reward_std": 0.14299481362104416, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5967767834663391, "step": 948 }, { "completion_length": 85.84375, "epoch": 0.6118633139909736, "grad_norm": 68.70216369628906, "kl": 0.12109375, "learning_rate": 6.940683430045132e-07, "loss": 0.0048, "reward": 1.452661395072937, "reward_std": 0.050826674327254295, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45266131311655045, "step": 949 }, { "completion_length": 91.140625, "epoch": 0.6125080593165699, "grad_norm": 20.137420654296875, "kl": 0.1220703125, "learning_rate": 6.93745970341715e-07, "loss": 0.0049, "reward": 1.5065082907676697, "reward_std": 0.10698516108095646, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5065082460641861, "step": 950 }, { "completion_length": 85.6875, "epoch": 0.6131528046421664, "grad_norm": 14.560029983520508, "kl": 0.1396484375, "learning_rate": 6.934235976789168e-07, "loss": 0.0056, "reward": 1.513734221458435, "reward_std": 0.06140206754207611, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5137341916561127, "step": 951 }, { "completion_length": 84.484375, "epoch": 0.6137975499677627, "grad_norm": 34.63451385498047, "kl": 0.1396484375, "learning_rate": 6.931012250161186e-07, "loss": 0.0056, "reward": 1.5841580629348755, "reward_std": 0.0958426408469677, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5841581225395203, "step": 952 }, { "completion_length": 88.15625, "epoch": 0.6144422952933591, "grad_norm": 17.28553009033203, "kl": 0.13916015625, "learning_rate": 6.927788523533204e-07, "loss": 0.0056, "reward": 1.4624040126800537, "reward_std": 0.06560158357024193, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46240395307540894, "step": 953 }, { "completion_length": 91.765625, "epoch": 0.6150870406189555, "grad_norm": 18.49038314819336, "kl": 0.14013671875, "learning_rate": 6.924564796905223e-07, "loss": 0.0056, "reward": 1.7362635135650635, "reward_std": 0.05572156980633736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7362635731697083, "step": 954 }, { "completion_length": 99.15625, "epoch": 0.6157317859445519, "grad_norm": 13.38481616973877, "kl": 0.116943359375, "learning_rate": 6.92134107027724e-07, "loss": 0.0047, "reward": 1.662828803062439, "reward_std": 0.09047294408082962, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6628288626670837, "step": 955 }, { "completion_length": 87.9375, "epoch": 0.6163765312701482, "grad_norm": 11.720268249511719, "kl": 0.13134765625, "learning_rate": 6.918117343649258e-07, "loss": 0.0053, "reward": 1.7224065065383911, "reward_std": 0.12152724154293537, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7380314767360687, "step": 956 }, { "completion_length": 89.796875, "epoch": 0.6170212765957447, "grad_norm": 51.41786193847656, "kl": 0.12744140625, "learning_rate": 6.914893617021277e-07, "loss": 0.0051, "reward": 1.600498080253601, "reward_std": 0.0731177031993866, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6004980802536011, "step": 957 }, { "completion_length": 80.34375, "epoch": 0.6176660219213411, "grad_norm": 21.15074920654297, "kl": 0.1494140625, "learning_rate": 6.911669890393295e-07, "loss": 0.006, "reward": 1.5226842164993286, "reward_std": 0.15971365571022034, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5226842164993286, "step": 958 }, { "completion_length": 88.546875, "epoch": 0.6183107672469375, "grad_norm": 20.88332176208496, "kl": 0.132080078125, "learning_rate": 6.908446163765312e-07, "loss": 0.0053, "reward": 1.5318822264671326, "reward_std": 0.08424959145486355, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5318822264671326, "step": 959 }, { "completion_length": 84.0, "epoch": 0.6189555125725339, "grad_norm": 12.1968994140625, "kl": 0.16162109375, "learning_rate": 6.90522243713733e-07, "loss": 0.0065, "reward": 1.609505832195282, "reward_std": 0.1656803898513317, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.625130832195282, "step": 960 }, { "completion_length": 88.703125, "epoch": 0.6196002578981302, "grad_norm": 37.65996551513672, "kl": 0.138916015625, "learning_rate": 6.901998710509348e-07, "loss": 0.0056, "reward": 1.6943501234054565, "reward_std": 0.10790320858359337, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6943500936031342, "step": 961 }, { "completion_length": 89.828125, "epoch": 0.6202450032237267, "grad_norm": 14.622736930847168, "kl": 0.130859375, "learning_rate": 6.898774983881367e-07, "loss": 0.0052, "reward": 1.6017565727233887, "reward_std": 0.09940549731254578, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6017565131187439, "step": 962 }, { "completion_length": 91.453125, "epoch": 0.620889748549323, "grad_norm": 21.332942962646484, "kl": 0.1259765625, "learning_rate": 6.895551257253384e-07, "loss": 0.005, "reward": 1.3970187306404114, "reward_std": 0.0625982228666544, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3970186710357666, "step": 963 }, { "completion_length": 81.546875, "epoch": 0.6215344938749194, "grad_norm": 25.18134117126465, "kl": 0.18994140625, "learning_rate": 6.892327530625403e-07, "loss": 0.0076, "reward": 1.5218143463134766, "reward_std": 0.19736109673976898, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5374393910169601, "step": 964 }, { "completion_length": 81.8125, "epoch": 0.6221792392005158, "grad_norm": 13.722188949584961, "kl": 0.15478515625, "learning_rate": 6.88910380399742e-07, "loss": 0.0062, "reward": 1.4993084073066711, "reward_std": 0.06540745310485363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49930842220783234, "step": 965 }, { "completion_length": 86.5, "epoch": 0.6228239845261122, "grad_norm": 11.098575592041016, "kl": 0.14697265625, "learning_rate": 6.885880077369439e-07, "loss": 0.0059, "reward": 1.5485892295837402, "reward_std": 0.07327968627214432, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5485892295837402, "step": 966 }, { "completion_length": 104.078125, "epoch": 0.6234687298517085, "grad_norm": 18.58123016357422, "kl": 0.130859375, "learning_rate": 6.882656350741457e-07, "loss": 0.0052, "reward": 1.6219686269760132, "reward_std": 0.07606706768274307, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6219685971736908, "step": 967 }, { "completion_length": 90.3125, "epoch": 0.624113475177305, "grad_norm": 16.83329200744629, "kl": 0.15576171875, "learning_rate": 6.879432624113475e-07, "loss": 0.0062, "reward": 1.4810823202133179, "reward_std": 0.06910162046551704, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4810822904109955, "step": 968 }, { "completion_length": 86.9375, "epoch": 0.6247582205029013, "grad_norm": 14.21639633178711, "kl": 0.111083984375, "learning_rate": 6.876208897485492e-07, "loss": 0.0044, "reward": 1.6480799913406372, "reward_std": 0.07823758944869041, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6480799615383148, "step": 969 }, { "completion_length": 86.96875, "epoch": 0.6254029658284977, "grad_norm": 27.79960823059082, "kl": 0.137451171875, "learning_rate": 6.872985170857512e-07, "loss": 0.0055, "reward": 1.760128676891327, "reward_std": 0.07539523765444756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7601286768913269, "step": 970 }, { "completion_length": 98.34375, "epoch": 0.6260477111540941, "grad_norm": 156.4920654296875, "kl": 0.1728515625, "learning_rate": 6.869761444229529e-07, "loss": 0.0069, "reward": 1.3526663184165955, "reward_std": 0.0894409529864788, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35266634821891785, "step": 971 }, { "completion_length": 90.359375, "epoch": 0.6266924564796905, "grad_norm": 23.380796432495117, "kl": 0.133544921875, "learning_rate": 6.866537717601547e-07, "loss": 0.0054, "reward": 1.6337125301361084, "reward_std": 0.13482321798801422, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6337125897407532, "step": 972 }, { "completion_length": 99.96875, "epoch": 0.627337201805287, "grad_norm": 147.32347106933594, "kl": 0.1103515625, "learning_rate": 6.863313990973565e-07, "loss": 0.0044, "reward": 1.5924115180969238, "reward_std": 0.10236813127994537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5924114882946014, "step": 973 }, { "completion_length": 97.453125, "epoch": 0.6279819471308833, "grad_norm": 15.938390731811523, "kl": 0.1357421875, "learning_rate": 6.860090264345583e-07, "loss": 0.0054, "reward": 1.6408860683441162, "reward_std": 0.13076971843838692, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6565110683441162, "step": 974 }, { "completion_length": 97.0625, "epoch": 0.6286266924564797, "grad_norm": 29.58421516418457, "kl": 0.104736328125, "learning_rate": 6.856866537717601e-07, "loss": 0.0042, "reward": 1.6835073232650757, "reward_std": 0.0805157907307148, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6835072636604309, "step": 975 }, { "completion_length": 101.703125, "epoch": 0.6292714377820761, "grad_norm": 11.460612297058105, "kl": 0.13037109375, "learning_rate": 6.85364281108962e-07, "loss": 0.0052, "reward": 1.4942415356636047, "reward_std": 0.0735623836517334, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49424150586128235, "step": 976 }, { "completion_length": 97.609375, "epoch": 0.6299161831076725, "grad_norm": 54.10033416748047, "kl": 0.1171875, "learning_rate": 6.850419084461637e-07, "loss": 0.0047, "reward": 1.5240637063980103, "reward_std": 0.06277331709861755, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5240637063980103, "step": 977 }, { "completion_length": 98.9375, "epoch": 0.6305609284332688, "grad_norm": 23.348764419555664, "kl": 0.109619140625, "learning_rate": 6.847195357833655e-07, "loss": 0.0044, "reward": 1.5499730706214905, "reward_std": 0.11529890447854996, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5499730408191681, "step": 978 }, { "completion_length": 101.265625, "epoch": 0.6312056737588653, "grad_norm": 36.193199157714844, "kl": 0.10595703125, "learning_rate": 6.843971631205674e-07, "loss": 0.0042, "reward": 1.7043808102607727, "reward_std": 0.065880686044693, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7043807804584503, "step": 979 }, { "completion_length": 103.21875, "epoch": 0.6318504190844616, "grad_norm": 25.998506546020508, "kl": 0.113037109375, "learning_rate": 6.840747904577692e-07, "loss": 0.0045, "reward": 1.5761331915855408, "reward_std": 0.07709749229252338, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5761331915855408, "step": 980 }, { "completion_length": 101.15625, "epoch": 0.632495164410058, "grad_norm": 25.730180740356445, "kl": 0.130615234375, "learning_rate": 6.837524177949709e-07, "loss": 0.0052, "reward": 1.3551477193832397, "reward_std": 0.13733485713601112, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35514768958091736, "step": 981 }, { "completion_length": 105.4375, "epoch": 0.6331399097356544, "grad_norm": 29.705520629882812, "kl": 0.12841796875, "learning_rate": 6.834300451321727e-07, "loss": 0.0051, "reward": 1.4724050164222717, "reward_std": 0.09875421039760113, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4724050760269165, "step": 982 }, { "completion_length": 109.0625, "epoch": 0.6337846550612508, "grad_norm": 21.819368362426758, "kl": 0.130859375, "learning_rate": 6.831076724693746e-07, "loss": 0.0052, "reward": 1.5190939903259277, "reward_std": 0.18469242006540298, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.534718930721283, "step": 983 }, { "completion_length": 91.359375, "epoch": 0.6344294003868471, "grad_norm": 15.404848098754883, "kl": 0.123779296875, "learning_rate": 6.827852998065764e-07, "loss": 0.0049, "reward": 1.410255491733551, "reward_std": 0.13094928860664368, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41025547683238983, "step": 984 }, { "completion_length": 102.015625, "epoch": 0.6350741457124436, "grad_norm": 25.152095794677734, "kl": 0.12158203125, "learning_rate": 6.824629271437781e-07, "loss": 0.0049, "reward": 1.477942705154419, "reward_std": 0.06228645518422127, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47794269025325775, "step": 985 }, { "completion_length": 103.4375, "epoch": 0.63571889103804, "grad_norm": 16.888469696044922, "kl": 0.123291015625, "learning_rate": 6.8214055448098e-07, "loss": 0.0049, "reward": 1.4599791765213013, "reward_std": 0.05538452975451946, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4599791467189789, "step": 986 }, { "completion_length": 108.40625, "epoch": 0.6363636363636364, "grad_norm": 11.82778549194336, "kl": 0.119140625, "learning_rate": 6.818181818181817e-07, "loss": 0.0048, "reward": 1.5686384439468384, "reward_std": 0.1496180146932602, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5686384737491608, "step": 987 }, { "completion_length": 101.546875, "epoch": 0.6370083816892328, "grad_norm": 7.283498287200928, "kl": 0.110107421875, "learning_rate": 6.814958091553836e-07, "loss": 0.0044, "reward": 1.6922195553779602, "reward_std": 0.1970614790916443, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7078445553779602, "step": 988 }, { "completion_length": 93.859375, "epoch": 0.6376531270148291, "grad_norm": 13.84801197052002, "kl": 0.14697265625, "learning_rate": 6.811734364925854e-07, "loss": 0.0059, "reward": 1.6163573861122131, "reward_std": 0.1375247687101364, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6163573861122131, "step": 989 }, { "completion_length": 107.140625, "epoch": 0.6382978723404256, "grad_norm": 26.76303482055664, "kl": 0.24169921875, "learning_rate": 6.808510638297872e-07, "loss": 0.0097, "reward": 1.4439324140548706, "reward_std": 0.09625233709812164, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4595574140548706, "step": 990 }, { "completion_length": 101.75, "epoch": 0.6389426176660219, "grad_norm": 23.95302391052246, "kl": 0.12158203125, "learning_rate": 6.805286911669889e-07, "loss": 0.0049, "reward": 1.6145684123039246, "reward_std": 0.0981244370341301, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6145684719085693, "step": 991 }, { "completion_length": 104.6875, "epoch": 0.6395873629916183, "grad_norm": 19.830524444580078, "kl": 0.107421875, "learning_rate": 6.802063185041909e-07, "loss": 0.0043, "reward": 1.4319545030593872, "reward_std": 0.06444331258535385, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4319545179605484, "step": 992 }, { "completion_length": 109.578125, "epoch": 0.6402321083172147, "grad_norm": 12.206172943115234, "kl": 0.13330078125, "learning_rate": 6.798839458413926e-07, "loss": 0.0053, "reward": 1.7099278569221497, "reward_std": 0.08155185729265213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7099278569221497, "step": 993 }, { "completion_length": 115.390625, "epoch": 0.6408768536428111, "grad_norm": 25.8281192779541, "kl": 0.12841796875, "learning_rate": 6.795615731785944e-07, "loss": 0.0051, "reward": 1.6406802535057068, "reward_std": 0.11553433537483215, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6406802237033844, "step": 994 }, { "completion_length": 106.21875, "epoch": 0.6415215989684074, "grad_norm": 13.230984687805176, "kl": 0.11474609375, "learning_rate": 6.792392005157962e-07, "loss": 0.0046, "reward": 1.4972352981567383, "reward_std": 0.08803327940404415, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4972353130578995, "step": 995 }, { "completion_length": 104.953125, "epoch": 0.6421663442940039, "grad_norm": 10.515935897827148, "kl": 0.107177734375, "learning_rate": 6.789168278529981e-07, "loss": 0.0043, "reward": 1.7526929378509521, "reward_std": 0.0644787959754467, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7526930272579193, "step": 996 }, { "completion_length": 105.40625, "epoch": 0.6428110896196002, "grad_norm": 18.340421676635742, "kl": 0.1064453125, "learning_rate": 6.785944551901998e-07, "loss": 0.0043, "reward": 1.5969550013542175, "reward_std": 0.11528073251247406, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5969549715518951, "step": 997 }, { "completion_length": 107.6875, "epoch": 0.6434558349451966, "grad_norm": 15.871825218200684, "kl": 0.113525390625, "learning_rate": 6.782720825274017e-07, "loss": 0.0045, "reward": 1.522782325744629, "reward_std": 0.11090462654829025, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5227823406457901, "step": 998 }, { "completion_length": 115.90625, "epoch": 0.6441005802707931, "grad_norm": 23.02735710144043, "kl": 0.182861328125, "learning_rate": 6.779497098646034e-07, "loss": 0.0073, "reward": 1.3356384634971619, "reward_std": 0.039747599977999926, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33563845232129097, "step": 999 }, { "completion_length": 112.265625, "epoch": 0.6447453255963894, "grad_norm": 29.998191833496094, "kl": 0.11181640625, "learning_rate": 6.776273372018052e-07, "loss": 0.0045, "reward": 1.5243074893951416, "reward_std": 0.07966102100908756, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.52430759370327, "step": 1000 }, { "completion_length": 101.875, "epoch": 0.6453900709219859, "grad_norm": 13.788435935974121, "kl": 0.11328125, "learning_rate": 6.77304964539007e-07, "loss": 0.0045, "reward": 1.5019648671150208, "reward_std": 0.064224723726511, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5019648671150208, "step": 1001 }, { "completion_length": 98.0625, "epoch": 0.6460348162475822, "grad_norm": 22.711002349853516, "kl": 0.13671875, "learning_rate": 6.769825918762089e-07, "loss": 0.0055, "reward": 1.465948462486267, "reward_std": 0.05411083064973354, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4659484773874283, "step": 1002 }, { "completion_length": 104.703125, "epoch": 0.6466795615731786, "grad_norm": 22.697656631469727, "kl": 0.13720703125, "learning_rate": 6.766602192134106e-07, "loss": 0.0055, "reward": 1.5236431956291199, "reward_std": 0.1108100637793541, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5236431360244751, "step": 1003 }, { "completion_length": 113.78125, "epoch": 0.647324306898775, "grad_norm": 13.408069610595703, "kl": 0.122314453125, "learning_rate": 6.763378465506124e-07, "loss": 0.0049, "reward": 1.5940637588500977, "reward_std": 0.09507790580391884, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5940638333559036, "step": 1004 }, { "completion_length": 96.828125, "epoch": 0.6479690522243714, "grad_norm": 34.19378662109375, "kl": 0.1181640625, "learning_rate": 6.760154738878143e-07, "loss": 0.0047, "reward": 1.6176080703735352, "reward_std": 0.13371532410383224, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6176081299781799, "step": 1005 }, { "completion_length": 107.453125, "epoch": 0.6486137975499677, "grad_norm": 27.635814666748047, "kl": 0.10400390625, "learning_rate": 6.756931012250161e-07, "loss": 0.0042, "reward": 1.4466070532798767, "reward_std": 0.1371947005391121, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4466071128845215, "step": 1006 }, { "completion_length": 108.6875, "epoch": 0.6492585428755642, "grad_norm": 9.812318801879883, "kl": 0.1337890625, "learning_rate": 6.753707285622178e-07, "loss": 0.0053, "reward": 1.5490465760231018, "reward_std": 0.08941208943724632, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5490465462207794, "step": 1007 }, { "completion_length": 113.84375, "epoch": 0.6499032882011605, "grad_norm": 15.088581085205078, "kl": 0.13232421875, "learning_rate": 6.750483558994197e-07, "loss": 0.0053, "reward": 1.6374335885047913, "reward_std": 0.13946302980184555, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6530585885047913, "step": 1008 }, { "completion_length": 100.46875, "epoch": 0.6505480335267569, "grad_norm": 18.111522674560547, "kl": 0.19873046875, "learning_rate": 6.747259832366215e-07, "loss": 0.008, "reward": 1.5997966527938843, "reward_std": 0.18083975464105606, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.615421712398529, "step": 1009 }, { "completion_length": 119.78125, "epoch": 0.6511927788523533, "grad_norm": 10.134917259216309, "kl": 0.1083984375, "learning_rate": 6.744036105738233e-07, "loss": 0.0043, "reward": 1.6330845355987549, "reward_std": 0.08692387118935585, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6330845654010773, "step": 1010 }, { "completion_length": 105.765625, "epoch": 0.6518375241779497, "grad_norm": 24.195541381835938, "kl": 0.11962890625, "learning_rate": 6.740812379110251e-07, "loss": 0.0048, "reward": 1.5731955170631409, "reward_std": 0.06743450090289116, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5731955468654633, "step": 1011 }, { "completion_length": 101.953125, "epoch": 0.6524822695035462, "grad_norm": 63.89750289916992, "kl": 0.120849609375, "learning_rate": 6.737588652482269e-07, "loss": 0.0048, "reward": 1.4986768960952759, "reward_std": 0.10133867338299751, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4986769109964371, "step": 1012 }, { "completion_length": 96.9375, "epoch": 0.6531270148291425, "grad_norm": 10.6175537109375, "kl": 0.1181640625, "learning_rate": 6.734364925854286e-07, "loss": 0.0047, "reward": 1.6250901818275452, "reward_std": 0.07033353298902512, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6250901520252228, "step": 1013 }, { "completion_length": 95.875, "epoch": 0.6537717601547389, "grad_norm": 113.03011322021484, "kl": 0.1142578125, "learning_rate": 6.731141199226306e-07, "loss": 0.0046, "reward": 1.587643027305603, "reward_std": 0.12430031597614288, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.587643027305603, "step": 1014 }, { "completion_length": 94.421875, "epoch": 0.6544165054803353, "grad_norm": 16.419910430908203, "kl": 0.110595703125, "learning_rate": 6.727917472598323e-07, "loss": 0.0044, "reward": 1.5937945246696472, "reward_std": 0.10029425844550133, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5937945246696472, "step": 1015 }, { "completion_length": 101.0625, "epoch": 0.6550612508059317, "grad_norm": 23.794921875, "kl": 0.134765625, "learning_rate": 6.724693745970341e-07, "loss": 0.0054, "reward": 1.6735421419143677, "reward_std": 0.10969025641679764, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6735422015190125, "step": 1016 }, { "completion_length": 110.609375, "epoch": 0.655705996131528, "grad_norm": 12.621672630310059, "kl": 0.11376953125, "learning_rate": 6.721470019342359e-07, "loss": 0.0045, "reward": 1.71983140707016, "reward_std": 0.06134466826915741, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7198314368724823, "step": 1017 }, { "completion_length": 98.25, "epoch": 0.6563507414571245, "grad_norm": 41.569644927978516, "kl": 0.117431640625, "learning_rate": 6.718246292714378e-07, "loss": 0.0047, "reward": 1.5260882377624512, "reward_std": 0.09492732584476471, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5260882377624512, "step": 1018 }, { "completion_length": 100.390625, "epoch": 0.6569954867827208, "grad_norm": 15.814488410949707, "kl": 0.169921875, "learning_rate": 6.715022566086395e-07, "loss": 0.0068, "reward": 1.5511394739151, "reward_std": 0.10130210220813751, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5511394739151001, "step": 1019 }, { "completion_length": 93.1875, "epoch": 0.6576402321083172, "grad_norm": 25.868892669677734, "kl": 0.1103515625, "learning_rate": 6.711798839458414e-07, "loss": 0.0044, "reward": 1.7657926678657532, "reward_std": 0.07120348140597343, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.765792727470398, "step": 1020 }, { "completion_length": 105.140625, "epoch": 0.6582849774339136, "grad_norm": 11.2671537399292, "kl": 0.1015625, "learning_rate": 6.708575112830431e-07, "loss": 0.0041, "reward": 1.5160332322120667, "reward_std": 0.06816467270255089, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5160332322120667, "step": 1021 }, { "completion_length": 103.75, "epoch": 0.65892972275951, "grad_norm": 11.783982276916504, "kl": 0.10400390625, "learning_rate": 6.70535138620245e-07, "loss": 0.0042, "reward": 1.634143590927124, "reward_std": 0.0808180458843708, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6341435015201569, "step": 1022 }, { "completion_length": 109.6875, "epoch": 0.6595744680851063, "grad_norm": 18.366363525390625, "kl": 0.1240234375, "learning_rate": 6.702127659574468e-07, "loss": 0.005, "reward": 1.7418678998947144, "reward_std": 0.09646868705749512, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7418678402900696, "step": 1023 }, { "completion_length": 104.390625, "epoch": 0.6602192134107028, "grad_norm": 44.52997589111328, "kl": 0.12939453125, "learning_rate": 6.698903932946486e-07, "loss": 0.0052, "reward": 1.4403961896896362, "reward_std": 0.13813015818595886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.440396249294281, "step": 1024 }, { "completion_length": 101.8125, "epoch": 0.6608639587362991, "grad_norm": 13.398497581481934, "kl": 0.12158203125, "learning_rate": 6.695680206318503e-07, "loss": 0.0049, "reward": 1.6842644214630127, "reward_std": 0.12250592187047005, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6842643916606903, "step": 1025 }, { "completion_length": 111.1875, "epoch": 0.6615087040618955, "grad_norm": 37.01210403442383, "kl": 0.12744140625, "learning_rate": 6.692456479690521e-07, "loss": 0.0051, "reward": 1.5435105562210083, "reward_std": 0.08950828574597836, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5435105115175247, "step": 1026 }, { "completion_length": 109.078125, "epoch": 0.662153449387492, "grad_norm": 38.497901916503906, "kl": 0.12353515625, "learning_rate": 6.68923275306254e-07, "loss": 0.0049, "reward": 1.4949541687965393, "reward_std": 0.10163058340549469, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4949541985988617, "step": 1027 }, { "completion_length": 102.640625, "epoch": 0.6627981947130883, "grad_norm": 18.939176559448242, "kl": 0.11474609375, "learning_rate": 6.686009026434558e-07, "loss": 0.0046, "reward": 1.5689424276351929, "reward_std": 0.0853864960372448, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5689424276351929, "step": 1028 }, { "completion_length": 111.734375, "epoch": 0.6634429400386848, "grad_norm": 8.950363159179688, "kl": 0.105224609375, "learning_rate": 6.682785299806575e-07, "loss": 0.0042, "reward": 1.5498056411743164, "reward_std": 0.05270315520465374, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5498056858778, "step": 1029 }, { "completion_length": 103.359375, "epoch": 0.6640876853642811, "grad_norm": 9.672654151916504, "kl": 0.1142578125, "learning_rate": 6.679561573178594e-07, "loss": 0.0046, "reward": 1.630305826663971, "reward_std": 0.04982496798038483, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6303058564662933, "step": 1030 }, { "completion_length": 103.3125, "epoch": 0.6647324306898775, "grad_norm": 37.13180923461914, "kl": 0.103515625, "learning_rate": 6.676337846550613e-07, "loss": 0.0041, "reward": 1.6453443765640259, "reward_std": 0.09188783913850784, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6453443467617035, "step": 1031 }, { "completion_length": 98.640625, "epoch": 0.6653771760154739, "grad_norm": 13.750219345092773, "kl": 0.128173828125, "learning_rate": 6.67311411992263e-07, "loss": 0.0051, "reward": 1.4959722757339478, "reward_std": 0.07575977221131325, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49597224593162537, "step": 1032 }, { "completion_length": 116.953125, "epoch": 0.6660219213410703, "grad_norm": 18.885990142822266, "kl": 0.1201171875, "learning_rate": 6.669890393294648e-07, "loss": 0.0048, "reward": 1.507498860359192, "reward_std": 0.09108571708202362, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5074988901615143, "step": 1033 }, { "completion_length": 106.109375, "epoch": 0.6666666666666666, "grad_norm": 13.014422416687012, "kl": 0.1064453125, "learning_rate": 6.666666666666666e-07, "loss": 0.0043, "reward": 1.7497589588165283, "reward_std": 0.04258188419044018, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7497589588165283, "step": 1034 }, { "completion_length": 88.640625, "epoch": 0.6673114119922631, "grad_norm": 12.072609901428223, "kl": 0.14990234375, "learning_rate": 6.663442940038685e-07, "loss": 0.006, "reward": 1.4128879308700562, "reward_std": 0.05543842911720276, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41288794577121735, "step": 1035 }, { "completion_length": 95.421875, "epoch": 0.6679561573178594, "grad_norm": 25.26651954650879, "kl": 0.1201171875, "learning_rate": 6.660219213410703e-07, "loss": 0.0048, "reward": 1.6416820287704468, "reward_std": 0.10006970167160034, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6416819393634796, "step": 1036 }, { "completion_length": 109.75, "epoch": 0.6686009026434558, "grad_norm": 35.28297424316406, "kl": 0.114501953125, "learning_rate": 6.656995486782721e-07, "loss": 0.0046, "reward": 1.48683100938797, "reward_std": 0.12712857872247696, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48683103919029236, "step": 1037 }, { "completion_length": 94.359375, "epoch": 0.6692456479690522, "grad_norm": 32.63760757446289, "kl": 0.127197265625, "learning_rate": 6.653771760154738e-07, "loss": 0.0051, "reward": 1.6056684255599976, "reward_std": 0.07967904582619667, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6056684255599976, "step": 1038 }, { "completion_length": 115.8125, "epoch": 0.6698903932946486, "grad_norm": 14.944701194763184, "kl": 0.11181640625, "learning_rate": 6.650548033526756e-07, "loss": 0.0045, "reward": 1.5069088339805603, "reward_std": 0.0734553337097168, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5069088041782379, "step": 1039 }, { "completion_length": 105.09375, "epoch": 0.670535138620245, "grad_norm": 8.643447875976562, "kl": 0.104736328125, "learning_rate": 6.647324306898775e-07, "loss": 0.0042, "reward": 1.4595961570739746, "reward_std": 0.105046346783638, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4752211719751358, "step": 1040 }, { "completion_length": 99.65625, "epoch": 0.6711798839458414, "grad_norm": 161.3411865234375, "kl": 0.119140625, "learning_rate": 6.644100580270793e-07, "loss": 0.0048, "reward": 1.4072444438934326, "reward_std": 0.12566466629505157, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4072444587945938, "step": 1041 }, { "completion_length": 101.953125, "epoch": 0.6718246292714378, "grad_norm": 12.601658821105957, "kl": 0.11767578125, "learning_rate": 6.640876853642811e-07, "loss": 0.0047, "reward": 1.7041231393814087, "reward_std": 0.08998357132077217, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7041231989860535, "step": 1042 }, { "completion_length": 108.03125, "epoch": 0.6724693745970342, "grad_norm": 30.03297233581543, "kl": 0.11767578125, "learning_rate": 6.637653127014829e-07, "loss": 0.0047, "reward": 1.4635764956474304, "reward_std": 0.1103704385459423, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4792014956474304, "step": 1043 }, { "completion_length": 109.25, "epoch": 0.6731141199226306, "grad_norm": 16.924476623535156, "kl": 0.193603515625, "learning_rate": 6.634429400386847e-07, "loss": 0.0077, "reward": 1.6905344724655151, "reward_std": 0.15296810120344162, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6905345618724823, "step": 1044 }, { "completion_length": 112.5625, "epoch": 0.6737588652482269, "grad_norm": 12.32383918762207, "kl": 0.11865234375, "learning_rate": 6.631205673758866e-07, "loss": 0.0048, "reward": 1.307143211364746, "reward_std": 0.08060317486524582, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3071431890130043, "step": 1045 }, { "completion_length": 116.546875, "epoch": 0.6744036105738234, "grad_norm": 18.390422821044922, "kl": 0.111083984375, "learning_rate": 6.627981947130883e-07, "loss": 0.0044, "reward": 1.497663140296936, "reward_std": 0.053974613547325134, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49766309559345245, "step": 1046 }, { "completion_length": 115.375, "epoch": 0.6750483558994197, "grad_norm": 13.646191596984863, "kl": 0.1611328125, "learning_rate": 6.624758220502901e-07, "loss": 0.0064, "reward": 1.7063047885894775, "reward_std": 0.16738375276327133, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7219298183917999, "step": 1047 }, { "completion_length": 117.125, "epoch": 0.6756931012250161, "grad_norm": 16.76261329650879, "kl": 0.129150390625, "learning_rate": 6.62153449387492e-07, "loss": 0.0052, "reward": 1.7379147410392761, "reward_std": 0.0969013050198555, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.737914651632309, "step": 1048 }, { "completion_length": 106.921875, "epoch": 0.6763378465506125, "grad_norm": 23.75704002380371, "kl": 0.109619140625, "learning_rate": 6.618310767246938e-07, "loss": 0.0044, "reward": 1.598206639289856, "reward_std": 0.12298348918557167, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5982065498828888, "step": 1049 }, { "completion_length": 122.203125, "epoch": 0.6769825918762089, "grad_norm": 12.665209770202637, "kl": 0.101318359375, "learning_rate": 6.615087040618955e-07, "loss": 0.0041, "reward": 1.7609497904777527, "reward_std": 0.085877925157547, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7609497904777527, "step": 1050 }, { "completion_length": 108.734375, "epoch": 0.6776273372018052, "grad_norm": 23.685190200805664, "kl": 0.112060546875, "learning_rate": 6.611863313990973e-07, "loss": 0.0045, "reward": 1.5682024359703064, "reward_std": 0.07944511994719505, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5682024210691452, "step": 1051 }, { "completion_length": 114.78125, "epoch": 0.6782720825274017, "grad_norm": 21.806703567504883, "kl": 0.107421875, "learning_rate": 6.608639587362991e-07, "loss": 0.0043, "reward": 1.6686413288116455, "reward_std": 0.07967701368033886, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6686412990093231, "step": 1052 }, { "completion_length": 117.140625, "epoch": 0.6789168278529981, "grad_norm": 19.198450088500977, "kl": 0.102783203125, "learning_rate": 6.60541586073501e-07, "loss": 0.0041, "reward": 1.403226375579834, "reward_std": 0.08735075406730175, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4032263904809952, "step": 1053 }, { "completion_length": 119.078125, "epoch": 0.6795615731785944, "grad_norm": 29.204330444335938, "kl": 0.111328125, "learning_rate": 6.602192134107027e-07, "loss": 0.0045, "reward": 1.7787442207336426, "reward_std": 0.054605141282081604, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7787441611289978, "step": 1054 }, { "completion_length": 112.46875, "epoch": 0.6802063185041909, "grad_norm": 11.750280380249023, "kl": 0.11669921875, "learning_rate": 6.598968407479046e-07, "loss": 0.0047, "reward": 1.4942984580993652, "reward_std": 0.08487318456172943, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49429842829704285, "step": 1055 }, { "completion_length": 118.5, "epoch": 0.6808510638297872, "grad_norm": 10.436887741088867, "kl": 0.109130859375, "learning_rate": 6.595744680851063e-07, "loss": 0.0044, "reward": 1.4903525114059448, "reward_std": 0.08176442608237267, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4903525114059448, "step": 1056 }, { "completion_length": 109.078125, "epoch": 0.6814958091553837, "grad_norm": 28.66140365600586, "kl": 0.1064453125, "learning_rate": 6.592520954223082e-07, "loss": 0.0043, "reward": 1.6072522401809692, "reward_std": 0.0758088231086731, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6072521805763245, "step": 1057 }, { "completion_length": 109.21875, "epoch": 0.68214055448098, "grad_norm": 26.009347915649414, "kl": 0.1083984375, "learning_rate": 6.5892972275951e-07, "loss": 0.0043, "reward": 1.437576413154602, "reward_std": 0.06020371802151203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4375763535499573, "step": 1058 }, { "completion_length": 123.9375, "epoch": 0.6827852998065764, "grad_norm": 44.55741500854492, "kl": 0.105224609375, "learning_rate": 6.586073500967118e-07, "loss": 0.0042, "reward": 1.5399547219276428, "reward_std": 0.1357093583792448, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.555579662322998, "step": 1059 }, { "completion_length": 118.828125, "epoch": 0.6834300451321728, "grad_norm": 83.01713562011719, "kl": 0.109130859375, "learning_rate": 6.582849774339135e-07, "loss": 0.0044, "reward": 1.545635998249054, "reward_std": 0.08544642850756645, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5456359833478928, "step": 1060 }, { "completion_length": 130.25, "epoch": 0.6840747904577692, "grad_norm": 13.197311401367188, "kl": 0.10693359375, "learning_rate": 6.579626047711155e-07, "loss": 0.0043, "reward": 1.3019261956214905, "reward_std": 0.06299575977027416, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3019261509180069, "step": 1061 }, { "completion_length": 109.578125, "epoch": 0.6847195357833655, "grad_norm": 23.34103012084961, "kl": 0.107177734375, "learning_rate": 6.576402321083172e-07, "loss": 0.0043, "reward": 1.4681254625320435, "reward_std": 0.10434725135564804, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4681254029273987, "step": 1062 }, { "completion_length": 124.90625, "epoch": 0.685364281108962, "grad_norm": 27.485233306884766, "kl": 0.11669921875, "learning_rate": 6.57317859445519e-07, "loss": 0.0047, "reward": 1.5611656308174133, "reward_std": 0.09634355828166008, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5611656308174133, "step": 1063 }, { "completion_length": 113.40625, "epoch": 0.6860090264345583, "grad_norm": 37.157684326171875, "kl": 0.130859375, "learning_rate": 6.569954867827208e-07, "loss": 0.0052, "reward": 1.490740954875946, "reward_std": 0.07894630171358585, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49074095487594604, "step": 1064 }, { "completion_length": 125.75, "epoch": 0.6866537717601547, "grad_norm": 9.309281349182129, "kl": 0.103759765625, "learning_rate": 6.566731141199227e-07, "loss": 0.0041, "reward": 1.5613402128219604, "reward_std": 0.10322123765945435, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5769652426242828, "step": 1065 }, { "completion_length": 130.375, "epoch": 0.6872985170857512, "grad_norm": 8.706680297851562, "kl": 0.163818359375, "learning_rate": 6.563507414571244e-07, "loss": 0.0066, "reward": 1.732495129108429, "reward_std": 0.0963818859308958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7324950993061066, "step": 1066 }, { "completion_length": 126.890625, "epoch": 0.6879432624113475, "grad_norm": 54.084346771240234, "kl": 0.125, "learning_rate": 6.560283687943263e-07, "loss": 0.005, "reward": 1.4755541682243347, "reward_std": 0.0856284536421299, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47555410861968994, "step": 1067 }, { "completion_length": 116.8125, "epoch": 0.688588007736944, "grad_norm": 19.93977165222168, "kl": 0.09814453125, "learning_rate": 6.55705996131528e-07, "loss": 0.0039, "reward": 1.724917709827423, "reward_std": 0.12488729506731033, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7405427396297455, "step": 1068 }, { "completion_length": 119.59375, "epoch": 0.6892327530625403, "grad_norm": 18.106996536254883, "kl": 0.106201171875, "learning_rate": 6.553836234687298e-07, "loss": 0.0042, "reward": 1.5572913885116577, "reward_std": 0.06482551246881485, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5572913736104965, "step": 1069 }, { "completion_length": 116.484375, "epoch": 0.6898774983881367, "grad_norm": 34.54378128051758, "kl": 0.146728515625, "learning_rate": 6.550612508059317e-07, "loss": 0.0059, "reward": 1.5824349522590637, "reward_std": 0.1754782274365425, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5980599820613861, "step": 1070 }, { "completion_length": 125.96875, "epoch": 0.690522243713733, "grad_norm": 14.766402244567871, "kl": 0.1083984375, "learning_rate": 6.547388781431335e-07, "loss": 0.0043, "reward": 1.4851022958755493, "reward_std": 0.07177170366048813, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4851022809743881, "step": 1071 }, { "completion_length": 108.359375, "epoch": 0.6911669890393295, "grad_norm": 17.350265502929688, "kl": 0.14404296875, "learning_rate": 6.544165054803352e-07, "loss": 0.0058, "reward": 1.5002387762069702, "reward_std": 0.15507403016090393, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5158638060092926, "step": 1072 }, { "completion_length": 111.28125, "epoch": 0.6918117343649258, "grad_norm": 26.06131935119629, "kl": 0.13671875, "learning_rate": 6.54094132817537e-07, "loss": 0.0054, "reward": 1.6340994238853455, "reward_std": 0.11746730655431747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6340994238853455, "step": 1073 }, { "completion_length": 114.84375, "epoch": 0.6924564796905223, "grad_norm": 20.31998062133789, "kl": 0.233642578125, "learning_rate": 6.537717601547389e-07, "loss": 0.0094, "reward": 1.6390128135681152, "reward_std": 0.08344031870365143, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6390127837657928, "step": 1074 }, { "completion_length": 117.546875, "epoch": 0.6931012250161186, "grad_norm": 12.814098358154297, "kl": 0.120361328125, "learning_rate": 6.534493874919407e-07, "loss": 0.0048, "reward": 1.5889630317687988, "reward_std": 0.08722979947924614, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5889630317687988, "step": 1075 }, { "completion_length": 108.6875, "epoch": 0.693745970341715, "grad_norm": 34.06943893432617, "kl": 0.118896484375, "learning_rate": 6.531270148291424e-07, "loss": 0.0048, "reward": 1.549480378627777, "reward_std": 0.08187295310199261, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5494804084300995, "step": 1076 }, { "completion_length": 115.671875, "epoch": 0.6943907156673114, "grad_norm": 17.84603500366211, "kl": 0.144775390625, "learning_rate": 6.528046421663443e-07, "loss": 0.0058, "reward": 1.702250897884369, "reward_std": 0.09670410864055157, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7022508978843689, "step": 1077 }, { "completion_length": 118.46875, "epoch": 0.6950354609929078, "grad_norm": 15.43232250213623, "kl": 0.12158203125, "learning_rate": 6.524822695035461e-07, "loss": 0.0049, "reward": 1.7633506059646606, "reward_std": 0.08454957231879234, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7633506059646606, "step": 1078 }, { "completion_length": 120.390625, "epoch": 0.6956802063185042, "grad_norm": 19.23419761657715, "kl": 0.1318359375, "learning_rate": 6.521598968407479e-07, "loss": 0.0053, "reward": 1.5779541730880737, "reward_std": 0.08321835845708847, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5779542624950409, "step": 1079 }, { "completion_length": 111.40625, "epoch": 0.6963249516441006, "grad_norm": 44.08869934082031, "kl": 0.134521484375, "learning_rate": 6.518375241779497e-07, "loss": 0.0054, "reward": 1.6062058210372925, "reward_std": 0.09365630894899368, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6062057018280029, "step": 1080 }, { "completion_length": 125.3125, "epoch": 0.696969696969697, "grad_norm": 9.416675567626953, "kl": 0.118408203125, "learning_rate": 6.515151515151515e-07, "loss": 0.0047, "reward": 1.6733117699623108, "reward_std": 0.09015357866883278, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.673311859369278, "step": 1081 }, { "completion_length": 111.15625, "epoch": 0.6976144422952933, "grad_norm": 5.396347522735596, "kl": 0.110595703125, "learning_rate": 6.511927788523532e-07, "loss": 0.0044, "reward": 1.5660415291786194, "reward_std": 0.13443563878536224, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5972915291786194, "step": 1082 }, { "completion_length": 124.765625, "epoch": 0.6982591876208898, "grad_norm": 19.220321655273438, "kl": 0.12890625, "learning_rate": 6.508704061895552e-07, "loss": 0.0051, "reward": 1.4645467400550842, "reward_std": 0.09206113591790199, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4645467698574066, "step": 1083 }, { "completion_length": 119.125, "epoch": 0.6989039329464861, "grad_norm": 9.741296768188477, "kl": 0.113525390625, "learning_rate": 6.505480335267569e-07, "loss": 0.0045, "reward": 1.7223733067512512, "reward_std": 0.14363746345043182, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7223733961582184, "step": 1084 }, { "completion_length": 115.359375, "epoch": 0.6995486782720826, "grad_norm": 10.412392616271973, "kl": 0.11474609375, "learning_rate": 6.502256608639587e-07, "loss": 0.0046, "reward": 1.634882390499115, "reward_std": 0.03937092795968056, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6348824203014374, "step": 1085 }, { "completion_length": 111.078125, "epoch": 0.7001934235976789, "grad_norm": 52.82218551635742, "kl": 0.17236328125, "learning_rate": 6.499032882011605e-07, "loss": 0.0069, "reward": 1.6078645586967468, "reward_std": 0.14649196714162827, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.607864499092102, "step": 1086 }, { "completion_length": 110.625, "epoch": 0.7008381689232753, "grad_norm": 10.968231201171875, "kl": 0.11279296875, "learning_rate": 6.495809155383624e-07, "loss": 0.0045, "reward": 1.4408124089241028, "reward_std": 0.05924747884273529, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4408123791217804, "step": 1087 }, { "completion_length": 106.765625, "epoch": 0.7014829142488717, "grad_norm": 123.17638397216797, "kl": 0.127685546875, "learning_rate": 6.492585428755641e-07, "loss": 0.0051, "reward": 1.5098558068275452, "reward_std": 0.08591609820723534, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.509855791926384, "step": 1088 }, { "completion_length": 101.53125, "epoch": 0.7021276595744681, "grad_norm": 17.40302085876465, "kl": 0.13330078125, "learning_rate": 6.48936170212766e-07, "loss": 0.0053, "reward": 1.7117857933044434, "reward_std": 0.10072560980916023, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7117857933044434, "step": 1089 }, { "completion_length": 111.296875, "epoch": 0.7027724049000644, "grad_norm": 13.590946197509766, "kl": 0.10986328125, "learning_rate": 6.486137975499677e-07, "loss": 0.0044, "reward": 1.6580272912979126, "reward_std": 0.06897935643792152, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6580273658037186, "step": 1090 }, { "completion_length": 96.625, "epoch": 0.7034171502256609, "grad_norm": 30.83803367614746, "kl": 0.100830078125, "learning_rate": 6.482914248871696e-07, "loss": 0.004, "reward": 1.6532940864562988, "reward_std": 0.15903402864933014, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6689190864562988, "step": 1091 }, { "completion_length": 105.78125, "epoch": 0.7040618955512572, "grad_norm": 25.37020492553711, "kl": 0.117431640625, "learning_rate": 6.479690522243714e-07, "loss": 0.0047, "reward": 1.497763454914093, "reward_std": 0.09194651618599892, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49776342511177063, "step": 1092 }, { "completion_length": 114.203125, "epoch": 0.7047066408768536, "grad_norm": 13.3228120803833, "kl": 0.1279296875, "learning_rate": 6.476466795615732e-07, "loss": 0.0051, "reward": 1.725711703300476, "reward_std": 0.1199619323015213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7257117033004761, "step": 1093 }, { "completion_length": 121.0625, "epoch": 0.7053513862024501, "grad_norm": 12.090010643005371, "kl": 0.1220703125, "learning_rate": 6.473243068987749e-07, "loss": 0.0049, "reward": 1.4209555387496948, "reward_std": 0.15032630413770676, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.43658050894737244, "step": 1094 }, { "completion_length": 118.359375, "epoch": 0.7059961315280464, "grad_norm": 21.097448348999023, "kl": 0.115966796875, "learning_rate": 6.470019342359767e-07, "loss": 0.0046, "reward": 1.6207126379013062, "reward_std": 0.11108803562819958, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6363376677036285, "step": 1095 }, { "completion_length": 108.453125, "epoch": 0.7066408768536429, "grad_norm": 11.777490615844727, "kl": 0.121337890625, "learning_rate": 6.466795615731786e-07, "loss": 0.0049, "reward": 1.5517085194587708, "reward_std": 0.11031496897339821, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5517085194587708, "step": 1096 }, { "completion_length": 109.59375, "epoch": 0.7072856221792392, "grad_norm": 7.2100419998168945, "kl": 0.11328125, "learning_rate": 6.463571889103804e-07, "loss": 0.0045, "reward": 1.5937931537628174, "reward_std": 0.06582435220479965, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5937931537628174, "step": 1097 }, { "completion_length": 105.90625, "epoch": 0.7079303675048356, "grad_norm": 65.8556137084961, "kl": 0.1201171875, "learning_rate": 6.460348162475821e-07, "loss": 0.0048, "reward": 1.4370640516281128, "reward_std": 0.16717902943491936, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.4683140963315964, "step": 1098 }, { "completion_length": 101.015625, "epoch": 0.708575112830432, "grad_norm": 17.487159729003906, "kl": 0.113037109375, "learning_rate": 6.45712443584784e-07, "loss": 0.0045, "reward": 1.500916838645935, "reward_std": 0.053837522864341736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5009167790412903, "step": 1099 }, { "completion_length": 115.875, "epoch": 0.7092198581560284, "grad_norm": 11.76062297821045, "kl": 0.11376953125, "learning_rate": 6.453900709219858e-07, "loss": 0.0045, "reward": 1.4913934469223022, "reward_std": 0.13147876039147377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49139344692230225, "step": 1100 }, { "completion_length": 109.390625, "epoch": 0.7098646034816247, "grad_norm": 8.682186126708984, "kl": 0.117919921875, "learning_rate": 6.450676982591876e-07, "loss": 0.0047, "reward": 1.6740834712982178, "reward_std": 0.14168205112218857, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6897085309028625, "step": 1101 }, { "completion_length": 97.09375, "epoch": 0.7105093488072212, "grad_norm": 15.599982261657715, "kl": 0.131103515625, "learning_rate": 6.447453255963894e-07, "loss": 0.0052, "reward": 1.4697246551513672, "reward_std": 0.05679570883512497, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4697246551513672, "step": 1102 }, { "completion_length": 103.953125, "epoch": 0.7111540941328175, "grad_norm": 12.54194450378418, "kl": 0.111083984375, "learning_rate": 6.444229529335912e-07, "loss": 0.0044, "reward": 1.5123698711395264, "reward_std": 0.13531113602221012, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5123699456453323, "step": 1103 }, { "completion_length": 97.921875, "epoch": 0.7117988394584139, "grad_norm": 18.03816795349121, "kl": 0.1318359375, "learning_rate": 6.44100580270793e-07, "loss": 0.0053, "reward": 1.6060336232185364, "reward_std": 0.08316031470894814, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6060336530208588, "step": 1104 }, { "completion_length": 105.875, "epoch": 0.7124435847840103, "grad_norm": 15.154292106628418, "kl": 0.11865234375, "learning_rate": 6.437782076079949e-07, "loss": 0.0048, "reward": 1.5886054635047913, "reward_std": 0.1678875982761383, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6042304337024689, "step": 1105 }, { "completion_length": 90.84375, "epoch": 0.7130883301096067, "grad_norm": 9.316852569580078, "kl": 0.126953125, "learning_rate": 6.434558349451966e-07, "loss": 0.0051, "reward": 1.2697542905807495, "reward_std": 0.0764620453119278, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.2697542533278465, "step": 1106 }, { "completion_length": 103.5625, "epoch": 0.7137330754352031, "grad_norm": 21.145282745361328, "kl": 0.11181640625, "learning_rate": 6.431334622823984e-07, "loss": 0.0045, "reward": 1.679430365562439, "reward_std": 0.06132471561431885, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6794304251670837, "step": 1107 }, { "completion_length": 109.71875, "epoch": 0.7143778207607995, "grad_norm": 13.92724323272705, "kl": 0.12255859375, "learning_rate": 6.428110896196002e-07, "loss": 0.0049, "reward": 1.6119479537010193, "reward_std": 0.12308447062969208, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6119480133056641, "step": 1108 }, { "completion_length": 105.0, "epoch": 0.7150225660863959, "grad_norm": 170.48899841308594, "kl": 0.122802734375, "learning_rate": 6.424887169568021e-07, "loss": 0.0049, "reward": 1.5171204805374146, "reward_std": 0.09489506855607033, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5171204507350922, "step": 1109 }, { "completion_length": 107.46875, "epoch": 0.7156673114119922, "grad_norm": 40.21750259399414, "kl": 0.11669921875, "learning_rate": 6.421663442940038e-07, "loss": 0.0047, "reward": 1.6050112843513489, "reward_std": 0.08443249017000198, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6050112396478653, "step": 1110 }, { "completion_length": 103.4375, "epoch": 0.7163120567375887, "grad_norm": 7.109347820281982, "kl": 0.114013671875, "learning_rate": 6.418439716312057e-07, "loss": 0.0046, "reward": 1.4758098721504211, "reward_std": 0.05706555396318436, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4758097976446152, "step": 1111 }, { "completion_length": 114.15625, "epoch": 0.716956802063185, "grad_norm": 8.961230278015137, "kl": 0.132080078125, "learning_rate": 6.415215989684074e-07, "loss": 0.0053, "reward": 1.5424306988716125, "reward_std": 0.10397448390722275, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5424306988716125, "step": 1112 }, { "completion_length": 107.0, "epoch": 0.7176015473887815, "grad_norm": 7.983897686004639, "kl": 0.1279296875, "learning_rate": 6.411992263056093e-07, "loss": 0.0051, "reward": 1.432960569858551, "reward_std": 0.05943846330046654, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.432960569858551, "step": 1113 }, { "completion_length": 105.1875, "epoch": 0.7182462927143778, "grad_norm": 9.793622016906738, "kl": 0.12646484375, "learning_rate": 6.40876853642811e-07, "loss": 0.0051, "reward": 1.5297775864601135, "reward_std": 0.09795258566737175, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5297775864601135, "step": 1114 }, { "completion_length": 100.65625, "epoch": 0.7188910380399742, "grad_norm": 21.03094482421875, "kl": 0.11083984375, "learning_rate": 6.405544809800129e-07, "loss": 0.0044, "reward": 1.5464897155761719, "reward_std": 0.12115862965583801, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5464896857738495, "step": 1115 }, { "completion_length": 106.25, "epoch": 0.7195357833655706, "grad_norm": 23.36917495727539, "kl": 0.123046875, "learning_rate": 6.402321083172146e-07, "loss": 0.0049, "reward": 1.4555120468139648, "reward_std": 0.09448462724685669, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.47113706171512604, "step": 1116 }, { "completion_length": 98.09375, "epoch": 0.720180528691167, "grad_norm": 10.004223823547363, "kl": 0.122314453125, "learning_rate": 6.399097356544166e-07, "loss": 0.0049, "reward": 1.3829104900360107, "reward_std": 0.06856898404657841, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.38291044533252716, "step": 1117 }, { "completion_length": 95.671875, "epoch": 0.7208252740167633, "grad_norm": 137.10011291503906, "kl": 0.13818359375, "learning_rate": 6.395873629916183e-07, "loss": 0.0055, "reward": 1.5769847631454468, "reward_std": 0.06603378802537918, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.576984703540802, "step": 1118 }, { "completion_length": 109.03125, "epoch": 0.7214700193423598, "grad_norm": 17.249679565429688, "kl": 0.1142578125, "learning_rate": 6.392649903288201e-07, "loss": 0.0046, "reward": 1.624714195728302, "reward_std": 0.1402108147740364, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6247142255306244, "step": 1119 }, { "completion_length": 106.84375, "epoch": 0.7221147646679562, "grad_norm": 9.263527870178223, "kl": 0.15380859375, "learning_rate": 6.389426176660218e-07, "loss": 0.0062, "reward": 1.6390187740325928, "reward_std": 0.12167927622795105, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6390187740325928, "step": 1120 }, { "completion_length": 114.359375, "epoch": 0.7227595099935525, "grad_norm": 26.278076171875, "kl": 0.1640625, "learning_rate": 6.386202450032237e-07, "loss": 0.0066, "reward": 1.6548975110054016, "reward_std": 0.16698498651385307, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6705225110054016, "step": 1121 }, { "completion_length": 106.515625, "epoch": 0.723404255319149, "grad_norm": 41.87917709350586, "kl": 0.1552734375, "learning_rate": 6.382978723404255e-07, "loss": 0.0062, "reward": 1.6121267080307007, "reward_std": 0.10054780542850494, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6121267527341843, "step": 1122 }, { "completion_length": 95.09375, "epoch": 0.7240490006447453, "grad_norm": 17.717201232910156, "kl": 0.1796875, "learning_rate": 6.379754996776273e-07, "loss": 0.0072, "reward": 1.5679856538772583, "reward_std": 0.08641625568270683, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5679856687784195, "step": 1123 }, { "completion_length": 103.8125, "epoch": 0.7246937459703418, "grad_norm": 10.81406021118164, "kl": 0.1337890625, "learning_rate": 6.376531270148291e-07, "loss": 0.0054, "reward": 1.4295477271080017, "reward_std": 0.08662409894168377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4295477420091629, "step": 1124 }, { "completion_length": 119.09375, "epoch": 0.7253384912959381, "grad_norm": 8.933006286621094, "kl": 0.11181640625, "learning_rate": 6.373307543520309e-07, "loss": 0.0045, "reward": 1.4276261925697327, "reward_std": 0.10313639417290688, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42762622237205505, "step": 1125 }, { "completion_length": 109.359375, "epoch": 0.7259832366215345, "grad_norm": 31.597089767456055, "kl": 0.1337890625, "learning_rate": 6.370083816892327e-07, "loss": 0.0053, "reward": 1.6690808534622192, "reward_std": 0.0815388485789299, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6690808236598969, "step": 1126 }, { "completion_length": 107.15625, "epoch": 0.7266279819471309, "grad_norm": 18.734527587890625, "kl": 0.1337890625, "learning_rate": 6.366860090264346e-07, "loss": 0.0054, "reward": 1.4915392994880676, "reward_std": 0.1256212517619133, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49153923988342285, "step": 1127 }, { "completion_length": 103.609375, "epoch": 0.7272727272727273, "grad_norm": 10.532417297363281, "kl": 0.138671875, "learning_rate": 6.363636363636363e-07, "loss": 0.0055, "reward": 1.6402064561843872, "reward_std": 0.08831041865050793, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6402064859867096, "step": 1128 }, { "completion_length": 116.015625, "epoch": 0.7279174725983236, "grad_norm": 70.3717041015625, "kl": 0.1162109375, "learning_rate": 6.360412637008381e-07, "loss": 0.0046, "reward": 1.5526406168937683, "reward_std": 0.09058434516191483, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5526406168937683, "step": 1129 }, { "completion_length": 99.296875, "epoch": 0.7285622179239201, "grad_norm": 22.534345626831055, "kl": 0.1533203125, "learning_rate": 6.3571889103804e-07, "loss": 0.0061, "reward": 1.6604415774345398, "reward_std": 0.11532147601246834, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6604415774345398, "step": 1130 }, { "completion_length": 104.046875, "epoch": 0.7292069632495164, "grad_norm": 11.422441482543945, "kl": 0.11767578125, "learning_rate": 6.353965183752418e-07, "loss": 0.0047, "reward": 1.6915850639343262, "reward_std": 0.11195739358663559, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6915850639343262, "step": 1131 }, { "completion_length": 112.078125, "epoch": 0.7298517085751128, "grad_norm": 19.152061462402344, "kl": 0.13525390625, "learning_rate": 6.350741457124435e-07, "loss": 0.0054, "reward": 1.5175195932388306, "reward_std": 0.060434307903051376, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5175195336341858, "step": 1132 }, { "completion_length": 111.328125, "epoch": 0.7304964539007093, "grad_norm": 14.260641098022461, "kl": 0.16552734375, "learning_rate": 6.347517730496454e-07, "loss": 0.0066, "reward": 1.5333157181739807, "reward_std": 0.10276435315608978, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5333157181739807, "step": 1133 }, { "completion_length": 100.328125, "epoch": 0.7311411992263056, "grad_norm": 24.769067764282227, "kl": 0.12939453125, "learning_rate": 6.344294003868471e-07, "loss": 0.0052, "reward": 1.8205824494361877, "reward_std": 0.0905773900449276, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.8205824196338654, "step": 1134 }, { "completion_length": 113.640625, "epoch": 0.731785944551902, "grad_norm": 25.011865615844727, "kl": 0.13134765625, "learning_rate": 6.34107027724049e-07, "loss": 0.0053, "reward": 1.6984012126922607, "reward_std": 0.06417006999254227, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6984012126922607, "step": 1135 }, { "completion_length": 106.515625, "epoch": 0.7324306898774984, "grad_norm": 21.090362548828125, "kl": 0.12353515625, "learning_rate": 6.337846550612508e-07, "loss": 0.0049, "reward": 1.5312876105308533, "reward_std": 0.07693489640951157, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5312876403331757, "step": 1136 }, { "completion_length": 121.703125, "epoch": 0.7330754352030948, "grad_norm": 11.780389785766602, "kl": 0.12646484375, "learning_rate": 6.334622823984526e-07, "loss": 0.005, "reward": 1.6064636707305908, "reward_std": 0.10624757781624794, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6064636707305908, "step": 1137 }, { "completion_length": 113.671875, "epoch": 0.7337201805286911, "grad_norm": 24.819656372070312, "kl": 0.113525390625, "learning_rate": 6.331399097356543e-07, "loss": 0.0045, "reward": 1.6082735657691956, "reward_std": 0.10192062892019749, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6238984763622284, "step": 1138 }, { "completion_length": 98.25, "epoch": 0.7343649258542876, "grad_norm": 11.147581100463867, "kl": 0.109130859375, "learning_rate": 6.328175370728563e-07, "loss": 0.0044, "reward": 1.5456410646438599, "reward_std": 0.07163537293672562, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5456410944461823, "step": 1139 }, { "completion_length": 108.734375, "epoch": 0.7350096711798839, "grad_norm": 16.347904205322266, "kl": 0.12158203125, "learning_rate": 6.32495164410058e-07, "loss": 0.0049, "reward": 1.7426636219024658, "reward_std": 0.08242497406899929, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7426636517047882, "step": 1140 }, { "completion_length": 115.0, "epoch": 0.7356544165054804, "grad_norm": 24.418312072753906, "kl": 0.1181640625, "learning_rate": 6.321727917472598e-07, "loss": 0.0047, "reward": 1.4525124430656433, "reward_std": 0.11574630439281464, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4525124728679657, "step": 1141 }, { "completion_length": 102.71875, "epoch": 0.7362991618310767, "grad_norm": 23.146039962768555, "kl": 0.12548828125, "learning_rate": 6.318504190844615e-07, "loss": 0.005, "reward": 1.4995726943016052, "reward_std": 0.09345082193613052, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4995726943016052, "step": 1142 }, { "completion_length": 99.359375, "epoch": 0.7369439071566731, "grad_norm": 11.854866981506348, "kl": 0.1611328125, "learning_rate": 6.315280464216635e-07, "loss": 0.0064, "reward": 1.5400469899177551, "reward_std": 0.152745820581913, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5556719899177551, "step": 1143 }, { "completion_length": 110.265625, "epoch": 0.7375886524822695, "grad_norm": 23.145206451416016, "kl": 0.12939453125, "learning_rate": 6.312056737588652e-07, "loss": 0.0052, "reward": 1.7510847449302673, "reward_std": 0.0897570513188839, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7510847449302673, "step": 1144 }, { "completion_length": 115.1875, "epoch": 0.7382333978078659, "grad_norm": 7.87054967880249, "kl": 0.133056640625, "learning_rate": 6.30883301096067e-07, "loss": 0.0053, "reward": 1.4778541326522827, "reward_std": 0.09418310970067978, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4778541922569275, "step": 1145 }, { "completion_length": 109.515625, "epoch": 0.7388781431334622, "grad_norm": 73.98107147216797, "kl": 0.7998046875, "learning_rate": 6.305609284332688e-07, "loss": 0.032, "reward": 1.671739637851715, "reward_std": 0.08632522821426392, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6717396378517151, "step": 1146 }, { "completion_length": 122.53125, "epoch": 0.7395228884590587, "grad_norm": 31.42171287536621, "kl": 0.128173828125, "learning_rate": 6.302385557704706e-07, "loss": 0.0051, "reward": 1.554942011833191, "reward_std": 0.14347686991095543, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5549419820308685, "step": 1147 }, { "completion_length": 109.09375, "epoch": 0.7401676337846551, "grad_norm": 15.380295753479004, "kl": 0.111083984375, "learning_rate": 6.299161831076724e-07, "loss": 0.0045, "reward": 1.4521582126617432, "reward_std": 0.05634276382625103, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45215825736522675, "step": 1148 }, { "completion_length": 108.84375, "epoch": 0.7408123791102514, "grad_norm": 5.173756122589111, "kl": 0.1240234375, "learning_rate": 6.295938104448743e-07, "loss": 0.005, "reward": 1.5091955661773682, "reward_std": 0.08003686740994453, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5091955214738846, "step": 1149 }, { "completion_length": 110.515625, "epoch": 0.7414571244358479, "grad_norm": 13.267820358276367, "kl": 0.1337890625, "learning_rate": 6.29271437782076e-07, "loss": 0.0053, "reward": 1.4815285801887512, "reward_std": 0.06610534526407719, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4815286546945572, "step": 1150 }, { "completion_length": 103.546875, "epoch": 0.7421018697614442, "grad_norm": 7.666851043701172, "kl": 0.12548828125, "learning_rate": 6.289490651192778e-07, "loss": 0.005, "reward": 1.4223946332931519, "reward_std": 0.09108829498291016, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42239460349082947, "step": 1151 }, { "completion_length": 112.078125, "epoch": 0.7427466150870407, "grad_norm": 46.41838836669922, "kl": 0.117431640625, "learning_rate": 6.286266924564797e-07, "loss": 0.0047, "reward": 1.3338908553123474, "reward_std": 0.11570664495229721, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3338908553123474, "step": 1152 }, { "completion_length": 105.8125, "epoch": 0.743391360412637, "grad_norm": 39.60142517089844, "kl": 0.121826171875, "learning_rate": 6.283043197936815e-07, "loss": 0.0049, "reward": 1.4156802892684937, "reward_std": 0.05137963965535164, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41568028926849365, "step": 1153 }, { "completion_length": 130.8125, "epoch": 0.7440361057382334, "grad_norm": 8.230225563049316, "kl": 0.111328125, "learning_rate": 6.279819471308832e-07, "loss": 0.0045, "reward": 1.5346974730491638, "reward_std": 0.13676878809928894, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5346974283456802, "step": 1154 }, { "completion_length": 102.15625, "epoch": 0.7446808510638298, "grad_norm": 16.30229377746582, "kl": 0.1484375, "learning_rate": 6.276595744680851e-07, "loss": 0.0059, "reward": 1.6616498827934265, "reward_std": 0.09401846304535866, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6616499423980713, "step": 1155 }, { "completion_length": 124.65625, "epoch": 0.7453255963894262, "grad_norm": 9.194766998291016, "kl": 0.1552734375, "learning_rate": 6.273372018052869e-07, "loss": 0.0062, "reward": 1.4719805121421814, "reward_std": 0.09085122868418694, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4876054525375366, "step": 1156 }, { "completion_length": 112.515625, "epoch": 0.7459703417150225, "grad_norm": 34.08546829223633, "kl": 0.1337890625, "learning_rate": 6.270148291424887e-07, "loss": 0.0053, "reward": 1.4912188053131104, "reward_std": 0.15397004038095474, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4912187159061432, "step": 1157 }, { "completion_length": 122.4375, "epoch": 0.746615087040619, "grad_norm": 7.057889461517334, "kl": 0.13232421875, "learning_rate": 6.266924564796905e-07, "loss": 0.0053, "reward": 1.6029225587844849, "reward_std": 0.06702636182308197, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6029225587844849, "step": 1158 }, { "completion_length": 108.75, "epoch": 0.7472598323662153, "grad_norm": 20.546005249023438, "kl": 0.128662109375, "learning_rate": 6.263700838168923e-07, "loss": 0.0052, "reward": 1.7207480072975159, "reward_std": 0.09248170256614685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7207480072975159, "step": 1159 }, { "completion_length": 121.984375, "epoch": 0.7479045776918117, "grad_norm": 10.88604736328125, "kl": 0.13818359375, "learning_rate": 6.26047711154094e-07, "loss": 0.0055, "reward": 1.6104546785354614, "reward_std": 0.14067476242780685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6104547083377838, "step": 1160 }, { "completion_length": 112.3125, "epoch": 0.7485493230174082, "grad_norm": 17.844881057739258, "kl": 0.1328125, "learning_rate": 6.25725338491296e-07, "loss": 0.0053, "reward": 1.4679070711135864, "reward_std": 0.0854264535009861, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46790711581707, "step": 1161 }, { "completion_length": 109.359375, "epoch": 0.7491940683430045, "grad_norm": 11.832077026367188, "kl": 0.1396484375, "learning_rate": 6.254029658284977e-07, "loss": 0.0056, "reward": 1.5666400790214539, "reward_std": 0.07326122745871544, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5666401386260986, "step": 1162 }, { "completion_length": 109.5, "epoch": 0.749838813668601, "grad_norm": 12.351283073425293, "kl": 0.1318359375, "learning_rate": 6.250805931656995e-07, "loss": 0.0053, "reward": 1.4992797374725342, "reward_std": 0.08922664262354374, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4992796927690506, "step": 1163 }, { "completion_length": 108.328125, "epoch": 0.7504835589941973, "grad_norm": 14.127129554748535, "kl": 0.128662109375, "learning_rate": 6.247582205029012e-07, "loss": 0.0051, "reward": 1.725237250328064, "reward_std": 0.05798189900815487, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.725237250328064, "step": 1164 }, { "completion_length": 109.875, "epoch": 0.7511283043197937, "grad_norm": 10.182910919189453, "kl": 0.16796875, "learning_rate": 6.244358478401032e-07, "loss": 0.0067, "reward": 1.5962581038475037, "reward_std": 0.11094292625784874, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.596258133649826, "step": 1165 }, { "completion_length": 114.96875, "epoch": 0.75177304964539, "grad_norm": 33.12942886352539, "kl": 0.1328125, "learning_rate": 6.241134751773049e-07, "loss": 0.0053, "reward": 1.474349319934845, "reward_std": 0.13836726546287537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47434939444065094, "step": 1166 }, { "completion_length": 116.828125, "epoch": 0.7524177949709865, "grad_norm": 20.074556350708008, "kl": 0.12646484375, "learning_rate": 6.237911025145067e-07, "loss": 0.0051, "reward": 1.6606274843215942, "reward_std": 0.09943772666156292, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.660627543926239, "step": 1167 }, { "completion_length": 117.546875, "epoch": 0.7530625402965828, "grad_norm": 9.653539657592773, "kl": 0.1259765625, "learning_rate": 6.234687298517085e-07, "loss": 0.0051, "reward": 1.4474668502807617, "reward_std": 0.1122102215886116, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4630918353796005, "step": 1168 }, { "completion_length": 106.90625, "epoch": 0.7537072856221793, "grad_norm": 20.613239288330078, "kl": 0.12744140625, "learning_rate": 6.231463571889104e-07, "loss": 0.0051, "reward": 1.421180248260498, "reward_std": 0.12799222767353058, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42118029296398163, "step": 1169 }, { "completion_length": 116.25, "epoch": 0.7543520309477756, "grad_norm": 10.835249900817871, "kl": 0.110107421875, "learning_rate": 6.228239845261121e-07, "loss": 0.0044, "reward": 1.6783432960510254, "reward_std": 0.07521459832787514, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6783432960510254, "step": 1170 }, { "completion_length": 113.671875, "epoch": 0.754996776273372, "grad_norm": 36.19251251220703, "kl": 0.146484375, "learning_rate": 6.22501611863314e-07, "loss": 0.0059, "reward": 1.4784401059150696, "reward_std": 0.13166068121790886, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4940650761127472, "step": 1171 }, { "completion_length": 103.25, "epoch": 0.7556415215989684, "grad_norm": 13.30659008026123, "kl": 0.12744140625, "learning_rate": 6.221792392005157e-07, "loss": 0.0051, "reward": 1.6257249116897583, "reward_std": 0.15445075929164886, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6413498520851135, "step": 1172 }, { "completion_length": 123.484375, "epoch": 0.7562862669245648, "grad_norm": 11.93229866027832, "kl": 0.193359375, "learning_rate": 6.218568665377176e-07, "loss": 0.0077, "reward": 1.6381423473358154, "reward_std": 0.09956448897719383, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6381423473358154, "step": 1173 }, { "completion_length": 111.078125, "epoch": 0.7569310122501612, "grad_norm": 26.301389694213867, "kl": 0.141845703125, "learning_rate": 6.215344938749194e-07, "loss": 0.0057, "reward": 1.527126967906952, "reward_std": 0.11088033393025398, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5271269381046295, "step": 1174 }, { "completion_length": 107.875, "epoch": 0.7575757575757576, "grad_norm": 20.331317901611328, "kl": 0.12841796875, "learning_rate": 6.212121212121212e-07, "loss": 0.0051, "reward": 1.5884113907814026, "reward_std": 0.12505877017974854, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5884113907814026, "step": 1175 }, { "completion_length": 110.65625, "epoch": 0.758220502901354, "grad_norm": 16.177221298217773, "kl": 0.1337890625, "learning_rate": 6.208897485493229e-07, "loss": 0.0053, "reward": 1.5988757014274597, "reward_std": 0.1253979578614235, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5988757908344269, "step": 1176 }, { "completion_length": 108.15625, "epoch": 0.7588652482269503, "grad_norm": 100.30497741699219, "kl": 0.12841796875, "learning_rate": 6.205673758865248e-07, "loss": 0.0051, "reward": 1.4070366621017456, "reward_std": 0.11220012977719307, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4070366621017456, "step": 1177 }, { "completion_length": 107.125, "epoch": 0.7595099935525468, "grad_norm": 9.922935485839844, "kl": 0.135498046875, "learning_rate": 6.202450032237266e-07, "loss": 0.0054, "reward": 1.6834790706634521, "reward_std": 0.14150448888540268, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6834791004657745, "step": 1178 }, { "completion_length": 108.234375, "epoch": 0.7601547388781431, "grad_norm": 16.865028381347656, "kl": 0.117431640625, "learning_rate": 6.199226305609284e-07, "loss": 0.0047, "reward": 1.5311815738677979, "reward_std": 0.058224376291036606, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.531181588768959, "step": 1179 }, { "completion_length": 114.421875, "epoch": 0.7607994842037396, "grad_norm": 11.300497055053711, "kl": 0.1240234375, "learning_rate": 6.196002578981302e-07, "loss": 0.005, "reward": 1.549267292022705, "reward_std": 0.11853723600506783, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5492672622203827, "step": 1180 }, { "completion_length": 111.671875, "epoch": 0.7614442295293359, "grad_norm": 14.773999214172363, "kl": 0.11474609375, "learning_rate": 6.19277885235332e-07, "loss": 0.0046, "reward": 1.4692705869674683, "reward_std": 0.11159190721809864, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46927061676979065, "step": 1181 }, { "completion_length": 106.671875, "epoch": 0.7620889748549323, "grad_norm": 12.688148498535156, "kl": 0.13671875, "learning_rate": 6.189555125725338e-07, "loss": 0.0055, "reward": 1.6706724166870117, "reward_std": 0.1215021051466465, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6706724166870117, "step": 1182 }, { "completion_length": 110.125, "epoch": 0.7627337201805287, "grad_norm": 24.101341247558594, "kl": 0.155029296875, "learning_rate": 6.186331399097357e-07, "loss": 0.0062, "reward": 1.5696587562561035, "reward_std": 0.2225308120250702, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5852837711572647, "step": 1183 }, { "completion_length": 107.28125, "epoch": 0.7633784655061251, "grad_norm": 16.765844345092773, "kl": 0.122314453125, "learning_rate": 6.183107672469374e-07, "loss": 0.0049, "reward": 1.7224302291870117, "reward_std": 0.08688852190971375, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7224301993846893, "step": 1184 }, { "completion_length": 110.859375, "epoch": 0.7640232108317214, "grad_norm": 15.215022087097168, "kl": 0.154296875, "learning_rate": 6.179883945841392e-07, "loss": 0.0062, "reward": 1.6652826070785522, "reward_std": 0.15134353935718536, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6809075474739075, "step": 1185 }, { "completion_length": 109.515625, "epoch": 0.7646679561573179, "grad_norm": 14.763962745666504, "kl": 0.1337890625, "learning_rate": 6.17666021921341e-07, "loss": 0.0053, "reward": 1.4886223673820496, "reward_std": 0.14177030697464943, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4886223375797272, "step": 1186 }, { "completion_length": 120.53125, "epoch": 0.7653127014829143, "grad_norm": 20.856586456298828, "kl": 0.11962890625, "learning_rate": 6.173436492585429e-07, "loss": 0.0048, "reward": 1.5948373079299927, "reward_std": 0.12407495453953743, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5948372483253479, "step": 1187 }, { "completion_length": 106.53125, "epoch": 0.7659574468085106, "grad_norm": 7.932638168334961, "kl": 0.14404296875, "learning_rate": 6.170212765957446e-07, "loss": 0.0058, "reward": 1.6733270287513733, "reward_std": 0.14593126624822617, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6733270585536957, "step": 1188 }, { "completion_length": 122.390625, "epoch": 0.7666021921341071, "grad_norm": 6.648770332336426, "kl": 0.14501953125, "learning_rate": 6.166989039329464e-07, "loss": 0.0058, "reward": 1.4227712154388428, "reward_std": 0.11841831728816032, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.42277124524116516, "step": 1189 }, { "completion_length": 109.3125, "epoch": 0.7672469374597034, "grad_norm": 9.889556884765625, "kl": 0.11328125, "learning_rate": 6.163765312701482e-07, "loss": 0.0045, "reward": 1.6376824975013733, "reward_std": 0.06098635122179985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6376824975013733, "step": 1190 }, { "completion_length": 114.984375, "epoch": 0.7678916827852998, "grad_norm": 14.385912895202637, "kl": 0.1591796875, "learning_rate": 6.160541586073501e-07, "loss": 0.0063, "reward": 1.550232172012329, "reward_std": 0.0959642305970192, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5502321124076843, "step": 1191 }, { "completion_length": 104.5, "epoch": 0.7685364281108962, "grad_norm": 23.22954559326172, "kl": 0.124267578125, "learning_rate": 6.157317859445518e-07, "loss": 0.005, "reward": 1.396329402923584, "reward_std": 0.11338374391198158, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.411954402923584, "step": 1192 }, { "completion_length": 111.671875, "epoch": 0.7691811734364926, "grad_norm": 16.116065979003906, "kl": 0.12353515625, "learning_rate": 6.154094132817537e-07, "loss": 0.0049, "reward": 1.7101741433143616, "reward_std": 0.083702202886343, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7101741433143616, "step": 1193 }, { "completion_length": 126.359375, "epoch": 0.769825918762089, "grad_norm": 12.175841331481934, "kl": 0.114501953125, "learning_rate": 6.150870406189554e-07, "loss": 0.0046, "reward": 1.7419912219047546, "reward_std": 0.08732573315501213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7419912219047546, "step": 1194 }, { "completion_length": 110.546875, "epoch": 0.7704706640876854, "grad_norm": 13.75151252746582, "kl": 0.114013671875, "learning_rate": 6.147646679561573e-07, "loss": 0.0046, "reward": 1.6501139402389526, "reward_std": 0.11225607246160507, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6501139998435974, "step": 1195 }, { "completion_length": 106.46875, "epoch": 0.7711154094132817, "grad_norm": 11.834531784057617, "kl": 0.13818359375, "learning_rate": 6.144422952933591e-07, "loss": 0.0055, "reward": 1.676537036895752, "reward_std": 0.09139727428555489, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6765370666980743, "step": 1196 }, { "completion_length": 109.140625, "epoch": 0.7717601547388782, "grad_norm": 29.20878791809082, "kl": 0.12548828125, "learning_rate": 6.141199226305609e-07, "loss": 0.005, "reward": 1.5550090074539185, "reward_std": 0.13649238646030426, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5706340670585632, "step": 1197 }, { "completion_length": 112.53125, "epoch": 0.7724049000644745, "grad_norm": 13.043038368225098, "kl": 0.140869140625, "learning_rate": 6.137975499677626e-07, "loss": 0.0056, "reward": 1.5729409456253052, "reward_std": 0.16874703764915466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5729409903287888, "step": 1198 }, { "completion_length": 106.4375, "epoch": 0.7730496453900709, "grad_norm": 14.998727798461914, "kl": 0.14599609375, "learning_rate": 6.134751773049646e-07, "loss": 0.0058, "reward": 1.5888980627059937, "reward_std": 0.12762803584337234, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5888980329036713, "step": 1199 }, { "completion_length": 111.9375, "epoch": 0.7736943907156673, "grad_norm": 107.7376937866211, "kl": 0.13671875, "learning_rate": 6.131528046421663e-07, "loss": 0.0055, "reward": 1.6070385575294495, "reward_std": 0.10263358429074287, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6070385873317719, "step": 1200 }, { "completion_length": 113.765625, "epoch": 0.7743391360412637, "grad_norm": 7.839345455169678, "kl": 0.13916015625, "learning_rate": 6.128304319793681e-07, "loss": 0.0056, "reward": 1.6572014093399048, "reward_std": 0.11518439650535583, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6572013795375824, "step": 1201 }, { "completion_length": 98.234375, "epoch": 0.7749838813668601, "grad_norm": 14.737028121948242, "kl": 0.1513671875, "learning_rate": 6.125080593165699e-07, "loss": 0.006, "reward": 1.52261883020401, "reward_std": 0.15272536873817444, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5382438600063324, "step": 1202 }, { "completion_length": 113.875, "epoch": 0.7756286266924565, "grad_norm": 8.930920600891113, "kl": 0.134033203125, "learning_rate": 6.121856866537717e-07, "loss": 0.0054, "reward": 1.5145543813705444, "reward_std": 0.14755171723663807, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.530179426074028, "step": 1203 }, { "completion_length": 109.40625, "epoch": 0.7762733720180529, "grad_norm": 8.757515907287598, "kl": 0.130859375, "learning_rate": 6.118633139909736e-07, "loss": 0.0052, "reward": 1.53077232837677, "reward_std": 0.10953427106142044, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5307723581790924, "step": 1204 }, { "completion_length": 113.125, "epoch": 0.7769181173436492, "grad_norm": 203.08412170410156, "kl": 2.7158203125, "learning_rate": 6.115409413281754e-07, "loss": 0.1085, "reward": 1.5865591764450073, "reward_std": 0.08946330845355988, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5865591168403625, "step": 1205 }, { "completion_length": 108.046875, "epoch": 0.7775628626692457, "grad_norm": 11.308333396911621, "kl": 0.14013671875, "learning_rate": 6.112185686653771e-07, "loss": 0.0056, "reward": 1.4254971742630005, "reward_std": 0.2110433205962181, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4411221593618393, "step": 1206 }, { "completion_length": 106.875, "epoch": 0.778207607994842, "grad_norm": 13.741293907165527, "kl": 0.12646484375, "learning_rate": 6.108961960025789e-07, "loss": 0.0051, "reward": 1.6106538772583008, "reward_std": 0.13345135748386383, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6106538474559784, "step": 1207 }, { "completion_length": 101.46875, "epoch": 0.7788523533204385, "grad_norm": 8.687922477722168, "kl": 0.14111328125, "learning_rate": 6.105738233397809e-07, "loss": 0.0057, "reward": 1.4746322631835938, "reward_std": 0.19175250083208084, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5058822631835938, "step": 1208 }, { "completion_length": 99.484375, "epoch": 0.7794970986460348, "grad_norm": 12.14449691772461, "kl": 0.15185546875, "learning_rate": 6.102514506769826e-07, "loss": 0.0061, "reward": 1.7666597962379456, "reward_std": 0.18449196964502335, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7822847962379456, "step": 1209 }, { "completion_length": 106.03125, "epoch": 0.7801418439716312, "grad_norm": 22.527015686035156, "kl": 0.13916015625, "learning_rate": 6.099290780141844e-07, "loss": 0.0056, "reward": 1.5839348435401917, "reward_std": 0.112147755920887, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5839348137378693, "step": 1210 }, { "completion_length": 102.15625, "epoch": 0.7807865892972276, "grad_norm": 46.15434265136719, "kl": 0.1787109375, "learning_rate": 6.096067053513861e-07, "loss": 0.0072, "reward": 1.5750333070755005, "reward_std": 0.0655747577548027, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5750332772731781, "step": 1211 }, { "completion_length": 112.359375, "epoch": 0.781431334622824, "grad_norm": 14.038028717041016, "kl": 0.1357421875, "learning_rate": 6.092843326885881e-07, "loss": 0.0054, "reward": 1.416702389717102, "reward_std": 0.16470851749181747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41670234501361847, "step": 1212 }, { "completion_length": 93.0, "epoch": 0.7820760799484203, "grad_norm": 25.67938804626465, "kl": 0.15087890625, "learning_rate": 6.089619600257898e-07, "loss": 0.006, "reward": 1.6413110494613647, "reward_std": 0.07309877313673496, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.64131098985672, "step": 1213 }, { "completion_length": 105.5625, "epoch": 0.7827208252740168, "grad_norm": 14.893291473388672, "kl": 0.15966796875, "learning_rate": 6.086395873629916e-07, "loss": 0.0064, "reward": 1.4017677307128906, "reward_std": 0.2676720768213272, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.4330177456140518, "step": 1214 }, { "completion_length": 93.484375, "epoch": 0.7833655705996132, "grad_norm": 9.510297775268555, "kl": 0.126708984375, "learning_rate": 6.083172147001934e-07, "loss": 0.0051, "reward": 1.4641136527061462, "reward_std": 0.11368128657341003, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.464113712310791, "step": 1215 }, { "completion_length": 101.515625, "epoch": 0.7840103159252095, "grad_norm": 23.989641189575195, "kl": 0.1552734375, "learning_rate": 6.079948420373952e-07, "loss": 0.0062, "reward": 1.7286843061447144, "reward_std": 0.10229164734482765, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7286842465400696, "step": 1216 }, { "completion_length": 100.609375, "epoch": 0.784655061250806, "grad_norm": 11.944487571716309, "kl": 0.14306640625, "learning_rate": 6.07672469374597e-07, "loss": 0.0057, "reward": 1.5962103009223938, "reward_std": 0.11994421109557152, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5962103307247162, "step": 1217 }, { "completion_length": 100.453125, "epoch": 0.7852998065764023, "grad_norm": 13.68811321258545, "kl": 0.141845703125, "learning_rate": 6.073500967117989e-07, "loss": 0.0057, "reward": 1.5690941214561462, "reward_std": 0.10337880253791809, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5690941512584686, "step": 1218 }, { "completion_length": 96.4375, "epoch": 0.7859445519019987, "grad_norm": 22.196964263916016, "kl": 0.14306640625, "learning_rate": 6.070277240490006e-07, "loss": 0.0057, "reward": 1.3420570492744446, "reward_std": 0.11058967746794224, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.342057004570961, "step": 1219 }, { "completion_length": 97.875, "epoch": 0.7865892972275951, "grad_norm": 35.983314514160156, "kl": 0.1220703125, "learning_rate": 6.067053513862024e-07, "loss": 0.0049, "reward": 1.577008068561554, "reward_std": 0.10896754078567028, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5770081877708435, "step": 1220 }, { "completion_length": 104.453125, "epoch": 0.7872340425531915, "grad_norm": 20.094215393066406, "kl": 0.12890625, "learning_rate": 6.063829787234043e-07, "loss": 0.0051, "reward": 1.6798162460327148, "reward_std": 0.18578680604696274, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6954412758350372, "step": 1221 }, { "completion_length": 105.859375, "epoch": 0.7878787878787878, "grad_norm": 21.012859344482422, "kl": 0.15576171875, "learning_rate": 6.060606060606061e-07, "loss": 0.0062, "reward": 1.4645494222640991, "reward_std": 0.08789615705609322, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4645494073629379, "step": 1222 }, { "completion_length": 97.828125, "epoch": 0.7885235332043843, "grad_norm": 21.104331970214844, "kl": 0.1318359375, "learning_rate": 6.057382333978078e-07, "loss": 0.0053, "reward": 1.5897061228752136, "reward_std": 0.1244133822619915, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.589706152677536, "step": 1223 }, { "completion_length": 95.78125, "epoch": 0.7891682785299806, "grad_norm": 21.754030227661133, "kl": 0.122314453125, "learning_rate": 6.054158607350097e-07, "loss": 0.0049, "reward": 1.6213258504867554, "reward_std": 0.12604708224534988, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6369507908821106, "step": 1224 }, { "completion_length": 116.484375, "epoch": 0.7898130238555771, "grad_norm": 10.232507705688477, "kl": 0.12841796875, "learning_rate": 6.050934880722115e-07, "loss": 0.0051, "reward": 1.5714877843856812, "reward_std": 0.10408592224121094, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5714877992868423, "step": 1225 }, { "completion_length": 105.84375, "epoch": 0.7904577691811734, "grad_norm": 12.34595775604248, "kl": 0.121337890625, "learning_rate": 6.047711154094133e-07, "loss": 0.0049, "reward": 1.571517288684845, "reward_std": 0.13167385384440422, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.571517288684845, "step": 1226 }, { "completion_length": 116.328125, "epoch": 0.7911025145067698, "grad_norm": 22.388225555419922, "kl": 0.21240234375, "learning_rate": 6.04448742746615e-07, "loss": 0.0085, "reward": 1.595013439655304, "reward_std": 0.1744321957230568, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.610638439655304, "step": 1227 }, { "completion_length": 106.953125, "epoch": 0.7917472598323663, "grad_norm": 10.770123481750488, "kl": 0.138671875, "learning_rate": 6.041263700838169e-07, "loss": 0.0056, "reward": 1.6746158003807068, "reward_std": 0.16554439067840576, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6902407705783844, "step": 1228 }, { "completion_length": 98.1875, "epoch": 0.7923920051579626, "grad_norm": 7.27217960357666, "kl": 0.12890625, "learning_rate": 6.038039974210186e-07, "loss": 0.0052, "reward": 1.4847107529640198, "reward_std": 0.08111886493861675, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4847107529640198, "step": 1229 }, { "completion_length": 105.53125, "epoch": 0.793036750483559, "grad_norm": 33.54627990722656, "kl": 0.12841796875, "learning_rate": 6.034816247582206e-07, "loss": 0.0051, "reward": 1.6125401258468628, "reward_std": 0.18335703760385513, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6125401556491852, "step": 1230 }, { "completion_length": 99.484375, "epoch": 0.7936814958091554, "grad_norm": 10.74837589263916, "kl": 0.2080078125, "learning_rate": 6.031592520954223e-07, "loss": 0.0083, "reward": 1.5733410716056824, "reward_std": 0.2276994064450264, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5889661312103271, "step": 1231 }, { "completion_length": 103.171875, "epoch": 0.7943262411347518, "grad_norm": 14.955659866333008, "kl": 0.12353515625, "learning_rate": 6.028368794326241e-07, "loss": 0.0049, "reward": 1.462202548980713, "reward_std": 0.20029695332050323, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4778275787830353, "step": 1232 }, { "completion_length": 107.609375, "epoch": 0.7949709864603481, "grad_norm": 10.545095443725586, "kl": 0.13134765625, "learning_rate": 6.025145067698258e-07, "loss": 0.0052, "reward": 1.440338671207428, "reward_std": 0.1069004088640213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4403386861085892, "step": 1233 }, { "completion_length": 108.578125, "epoch": 0.7956157317859446, "grad_norm": 16.158153533935547, "kl": 0.135009765625, "learning_rate": 6.021921341070278e-07, "loss": 0.0054, "reward": 1.503720223903656, "reward_std": 0.18556904792785645, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.503720223903656, "step": 1234 }, { "completion_length": 95.71875, "epoch": 0.7962604771115409, "grad_norm": 10.688615798950195, "kl": 0.1416015625, "learning_rate": 6.018697614442295e-07, "loss": 0.0057, "reward": 1.4603247046470642, "reward_std": 0.11439431831240654, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.460324689745903, "step": 1235 }, { "completion_length": 99.140625, "epoch": 0.7969052224371374, "grad_norm": 100.63502502441406, "kl": 0.129638671875, "learning_rate": 6.015473887814313e-07, "loss": 0.0052, "reward": 1.771046221256256, "reward_std": 0.07787198200821877, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7710461616516113, "step": 1236 }, { "completion_length": 106.015625, "epoch": 0.7975499677627337, "grad_norm": 8.697854995727539, "kl": 0.1162109375, "learning_rate": 6.012250161186331e-07, "loss": 0.0046, "reward": 1.5569599866867065, "reward_std": 0.1632199063897133, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5569599568843842, "step": 1237 }, { "completion_length": 104.859375, "epoch": 0.7981947130883301, "grad_norm": 14.559203147888184, "kl": 0.14697265625, "learning_rate": 6.00902643455835e-07, "loss": 0.0059, "reward": 1.6990448832511902, "reward_std": 0.177838034927845, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6990448534488678, "step": 1238 }, { "completion_length": 95.234375, "epoch": 0.7988394584139265, "grad_norm": 47.494850158691406, "kl": 0.16357421875, "learning_rate": 6.005802707930367e-07, "loss": 0.0065, "reward": 1.5274017453193665, "reward_std": 0.11369790881872177, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5274017453193665, "step": 1239 }, { "completion_length": 100.96875, "epoch": 0.7994842037395229, "grad_norm": 20.095169067382812, "kl": 0.113037109375, "learning_rate": 6.002578981302386e-07, "loss": 0.0045, "reward": 1.5768287777900696, "reward_std": 0.15517547726631165, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5768287479877472, "step": 1240 }, { "completion_length": 97.265625, "epoch": 0.8001289490651193, "grad_norm": 16.03243064880371, "kl": 0.1318359375, "learning_rate": 5.999355254674403e-07, "loss": 0.0053, "reward": 1.51595139503479, "reward_std": 0.14299090951681137, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5159513652324677, "step": 1241 }, { "completion_length": 103.953125, "epoch": 0.8007736943907157, "grad_norm": 10.533329010009766, "kl": 0.115478515625, "learning_rate": 5.996131528046421e-07, "loss": 0.0046, "reward": 1.4868841767311096, "reward_std": 0.18622149527072906, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48688413202762604, "step": 1242 }, { "completion_length": 94.5625, "epoch": 0.8014184397163121, "grad_norm": 8.86988353729248, "kl": 0.15185546875, "learning_rate": 5.99290780141844e-07, "loss": 0.0061, "reward": 1.5329524874687195, "reward_std": 0.216628547757864, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5485774874687195, "step": 1243 }, { "completion_length": 95.078125, "epoch": 0.8020631850419084, "grad_norm": 9.795210838317871, "kl": 0.12841796875, "learning_rate": 5.989684074790458e-07, "loss": 0.0051, "reward": 1.6223825812339783, "reward_std": 0.1345507875084877, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6223825961351395, "step": 1244 }, { "completion_length": 100.859375, "epoch": 0.8027079303675049, "grad_norm": 32.753726959228516, "kl": 0.125732421875, "learning_rate": 5.986460348162475e-07, "loss": 0.005, "reward": 1.5496293306350708, "reward_std": 0.09532084688544273, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.549629271030426, "step": 1245 }, { "completion_length": 98.875, "epoch": 0.8033526756931012, "grad_norm": 21.810379028320312, "kl": 0.1298828125, "learning_rate": 5.983236621534494e-07, "loss": 0.0052, "reward": 1.3395966291427612, "reward_std": 0.09464257955551147, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33959658443927765, "step": 1246 }, { "completion_length": 85.375, "epoch": 0.8039974210186976, "grad_norm": 21.297460556030273, "kl": 0.14599609375, "learning_rate": 5.980012894906512e-07, "loss": 0.0058, "reward": 1.7041836380958557, "reward_std": 0.10666557401418686, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7041836380958557, "step": 1247 }, { "completion_length": 91.0625, "epoch": 0.804642166344294, "grad_norm": 13.005825996398926, "kl": 0.14453125, "learning_rate": 5.97678916827853e-07, "loss": 0.0058, "reward": 1.7234504222869873, "reward_std": 0.1442631632089615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7234503924846649, "step": 1248 }, { "completion_length": 103.765625, "epoch": 0.8052869116698904, "grad_norm": 41.9098014831543, "kl": 0.1171875, "learning_rate": 5.973565441650548e-07, "loss": 0.0047, "reward": 1.4872705340385437, "reward_std": 0.17103341966867447, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4872705489397049, "step": 1249 }, { "completion_length": 95.15625, "epoch": 0.8059316569954867, "grad_norm": 34.45671081542969, "kl": 0.13330078125, "learning_rate": 5.970341715022566e-07, "loss": 0.0053, "reward": 1.6393902897834778, "reward_std": 0.14729318767786026, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6393902599811554, "step": 1250 }, { "completion_length": 94.25, "epoch": 0.8065764023210832, "grad_norm": 14.370905876159668, "kl": 0.13720703125, "learning_rate": 5.967117988394584e-07, "loss": 0.0055, "reward": 1.5178990960121155, "reward_std": 0.07636865600943565, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5178991109132767, "step": 1251 }, { "completion_length": 88.359375, "epoch": 0.8072211476466795, "grad_norm": 47.83586120605469, "kl": 0.14111328125, "learning_rate": 5.963894261766603e-07, "loss": 0.0057, "reward": 1.5101807117462158, "reward_std": 0.1659940369427204, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5101807117462158, "step": 1252 }, { "completion_length": 92.328125, "epoch": 0.807865892972276, "grad_norm": 17.251609802246094, "kl": 0.15087890625, "learning_rate": 5.96067053513862e-07, "loss": 0.006, "reward": 1.641158401966095, "reward_std": 0.17022105678915977, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.656783401966095, "step": 1253 }, { "completion_length": 85.4375, "epoch": 0.8085106382978723, "grad_norm": 9.6865873336792, "kl": 0.173828125, "learning_rate": 5.957446808510638e-07, "loss": 0.0069, "reward": 1.5700511932373047, "reward_std": 0.11514360085129738, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5700511932373047, "step": 1254 }, { "completion_length": 94.03125, "epoch": 0.8091553836234687, "grad_norm": 21.811676025390625, "kl": 0.122802734375, "learning_rate": 5.954223081882655e-07, "loss": 0.0049, "reward": 1.520093321800232, "reward_std": 0.16418229416012764, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5200933665037155, "step": 1255 }, { "completion_length": 85.9375, "epoch": 0.8098001289490652, "grad_norm": 59.537200927734375, "kl": 0.1484375, "learning_rate": 5.950999355254675e-07, "loss": 0.0059, "reward": 1.5090277194976807, "reward_std": 0.1114044338464737, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5090276747941971, "step": 1256 }, { "completion_length": 92.640625, "epoch": 0.8104448742746615, "grad_norm": 50.82180404663086, "kl": 0.15283203125, "learning_rate": 5.947775628626692e-07, "loss": 0.0061, "reward": 1.3142651319503784, "reward_std": 0.037534992210567, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.31426508724689484, "step": 1257 }, { "completion_length": 83.796875, "epoch": 0.8110896196002579, "grad_norm": 20.525754928588867, "kl": 0.1259765625, "learning_rate": 5.94455190199871e-07, "loss": 0.005, "reward": 1.6182879209518433, "reward_std": 0.1074063628911972, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6182878911495209, "step": 1258 }, { "completion_length": 87.0, "epoch": 0.8117343649258543, "grad_norm": 9.627203941345215, "kl": 0.11865234375, "learning_rate": 5.941328175370728e-07, "loss": 0.0047, "reward": 1.5465694665908813, "reward_std": 0.09133465215563774, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.546569436788559, "step": 1259 }, { "completion_length": 99.09375, "epoch": 0.8123791102514507, "grad_norm": 22.241708755493164, "kl": 0.13037109375, "learning_rate": 5.938104448742747e-07, "loss": 0.0052, "reward": 1.5899001359939575, "reward_std": 0.15345070511102676, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6055251061916351, "step": 1260 }, { "completion_length": 96.40625, "epoch": 0.813023855577047, "grad_norm": 9.629485130310059, "kl": 0.1455078125, "learning_rate": 5.934880722114764e-07, "loss": 0.0058, "reward": 1.6390262842178345, "reward_std": 0.16355416178703308, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6390263140201569, "step": 1261 }, { "completion_length": 91.40625, "epoch": 0.8136686009026435, "grad_norm": 39.97684097290039, "kl": 0.189453125, "learning_rate": 5.931656995486783e-07, "loss": 0.0076, "reward": 1.5222207903862, "reward_std": 0.10405867174267769, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5222208201885223, "step": 1262 }, { "completion_length": 86.65625, "epoch": 0.8143133462282398, "grad_norm": 13.295222282409668, "kl": 0.114990234375, "learning_rate": 5.9284332688588e-07, "loss": 0.0046, "reward": 1.6988143920898438, "reward_std": 0.12119985744357109, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6988143026828766, "step": 1263 }, { "completion_length": 99.734375, "epoch": 0.8149580915538363, "grad_norm": 25.42751693725586, "kl": 0.17138671875, "learning_rate": 5.925209542230819e-07, "loss": 0.0069, "reward": 1.5959572792053223, "reward_std": 0.10522185638546944, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5959572196006775, "step": 1264 }, { "completion_length": 101.1875, "epoch": 0.8156028368794326, "grad_norm": 12.357390403747559, "kl": 0.19189453125, "learning_rate": 5.921985815602837e-07, "loss": 0.0077, "reward": 1.6559144854545593, "reward_std": 0.1504291370511055, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6559144854545593, "step": 1265 }, { "completion_length": 98.828125, "epoch": 0.816247582205029, "grad_norm": 22.642732620239258, "kl": 0.12890625, "learning_rate": 5.918762088974855e-07, "loss": 0.0052, "reward": 1.5548232197761536, "reward_std": 0.10006397217512131, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5548232197761536, "step": 1266 }, { "completion_length": 97.125, "epoch": 0.8168923275306254, "grad_norm": 11.454401969909668, "kl": 0.14501953125, "learning_rate": 5.915538362346872e-07, "loss": 0.0058, "reward": 1.6567357182502747, "reward_std": 0.09761639684438705, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6567357182502747, "step": 1267 }, { "completion_length": 100.59375, "epoch": 0.8175370728562218, "grad_norm": 11.95074462890625, "kl": 0.1357421875, "learning_rate": 5.912314635718891e-07, "loss": 0.0054, "reward": 1.6781417727470398, "reward_std": 0.0793631412088871, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6781417727470398, "step": 1268 }, { "completion_length": 94.03125, "epoch": 0.8181818181818182, "grad_norm": 23.272687911987305, "kl": 0.1201171875, "learning_rate": 5.909090909090909e-07, "loss": 0.0048, "reward": 1.6391507387161255, "reward_std": 0.1269705779850483, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6547757685184479, "step": 1269 }, { "completion_length": 95.546875, "epoch": 0.8188265635074146, "grad_norm": 14.534205436706543, "kl": 0.109130859375, "learning_rate": 5.905867182462927e-07, "loss": 0.0044, "reward": 1.5957148671150208, "reward_std": 0.09973860159516335, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5957148522138596, "step": 1270 }, { "completion_length": 94.78125, "epoch": 0.819471308833011, "grad_norm": 14.054845809936523, "kl": 0.128173828125, "learning_rate": 5.902643455834945e-07, "loss": 0.0051, "reward": 1.5978807210922241, "reward_std": 0.09598659723997116, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5978806614875793, "step": 1271 }, { "completion_length": 94.296875, "epoch": 0.8201160541586073, "grad_norm": 20.644716262817383, "kl": 0.1259765625, "learning_rate": 5.899419729206963e-07, "loss": 0.0051, "reward": 1.4058359265327454, "reward_std": 0.11267614737153053, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4058358669281006, "step": 1272 }, { "completion_length": 91.78125, "epoch": 0.8207607994842038, "grad_norm": 25.96257209777832, "kl": 0.11767578125, "learning_rate": 5.896196002578981e-07, "loss": 0.0047, "reward": 1.6652281284332275, "reward_std": 0.08674262091517448, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6652281284332275, "step": 1273 }, { "completion_length": 99.078125, "epoch": 0.8214055448098001, "grad_norm": 12.55455207824707, "kl": 0.1318359375, "learning_rate": 5.892972275951e-07, "loss": 0.0053, "reward": 1.500665009021759, "reward_std": 0.14083610475063324, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.500665009021759, "step": 1274 }, { "completion_length": 103.890625, "epoch": 0.8220502901353965, "grad_norm": 21.38951301574707, "kl": 0.1318359375, "learning_rate": 5.889748549323017e-07, "loss": 0.0053, "reward": 1.6267691254615784, "reward_std": 0.17314469069242477, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6423941254615784, "step": 1275 }, { "completion_length": 88.046875, "epoch": 0.8226950354609929, "grad_norm": 20.793231964111328, "kl": 0.1357421875, "learning_rate": 5.886524822695035e-07, "loss": 0.0054, "reward": 1.661752164363861, "reward_std": 0.11195382103323936, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6617521345615387, "step": 1276 }, { "completion_length": 98.78125, "epoch": 0.8233397807865893, "grad_norm": 13.758505821228027, "kl": 0.12353515625, "learning_rate": 5.883301096067053e-07, "loss": 0.0049, "reward": 1.5527395009994507, "reward_std": 0.05753506347537041, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5527395904064178, "step": 1277 }, { "completion_length": 106.171875, "epoch": 0.8239845261121856, "grad_norm": 11.539607048034668, "kl": 0.16064453125, "learning_rate": 5.880077369439072e-07, "loss": 0.0064, "reward": 1.5900872349739075, "reward_std": 0.10428940504789352, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5900872051715851, "step": 1278 }, { "completion_length": 94.890625, "epoch": 0.8246292714377821, "grad_norm": 11.546834945678711, "kl": 0.111083984375, "learning_rate": 5.876853642811089e-07, "loss": 0.0044, "reward": 1.566346824169159, "reward_std": 0.05556929484009743, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5663468241691589, "step": 1279 }, { "completion_length": 104.25, "epoch": 0.8252740167633784, "grad_norm": 13.243741035461426, "kl": 0.13623046875, "learning_rate": 5.873629916183107e-07, "loss": 0.0054, "reward": 1.7236492037773132, "reward_std": 0.07918451726436615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7236492037773132, "step": 1280 }, { "completion_length": 100.078125, "epoch": 0.8259187620889749, "grad_norm": 23.4504451751709, "kl": 0.1435546875, "learning_rate": 5.870406189555126e-07, "loss": 0.0057, "reward": 1.6817771196365356, "reward_std": 0.10275669768452644, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6817770898342133, "step": 1281 }, { "completion_length": 101.328125, "epoch": 0.8265635074145713, "grad_norm": 10.985300064086914, "kl": 0.13134765625, "learning_rate": 5.867182462927144e-07, "loss": 0.0053, "reward": 1.5396443605422974, "reward_std": 0.08468784764409065, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5396443605422974, "step": 1282 }, { "completion_length": 100.78125, "epoch": 0.8272082527401676, "grad_norm": 14.259724617004395, "kl": 0.145263671875, "learning_rate": 5.863958736299161e-07, "loss": 0.0058, "reward": 1.3875265717506409, "reward_std": 0.10882111266255379, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4031515270471573, "step": 1283 }, { "completion_length": 96.875, "epoch": 0.8278529980657641, "grad_norm": 13.990731239318848, "kl": 0.13916015625, "learning_rate": 5.86073500967118e-07, "loss": 0.0056, "reward": 1.6068326830863953, "reward_std": 0.1714675948023796, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6224577128887177, "step": 1284 }, { "completion_length": 108.359375, "epoch": 0.8284977433913604, "grad_norm": 11.542860984802246, "kl": 0.150390625, "learning_rate": 5.857511283043197e-07, "loss": 0.006, "reward": 1.5005378723144531, "reward_std": 0.07108896225690842, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5005379021167755, "step": 1285 }, { "completion_length": 100.796875, "epoch": 0.8291424887169568, "grad_norm": 12.178953170776367, "kl": 0.16552734375, "learning_rate": 5.854287556415216e-07, "loss": 0.0066, "reward": 1.4452755451202393, "reward_std": 0.2179737687110901, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.46090058982372284, "step": 1286 }, { "completion_length": 97.609375, "epoch": 0.8297872340425532, "grad_norm": 43.12259292602539, "kl": 0.123291015625, "learning_rate": 5.851063829787234e-07, "loss": 0.0049, "reward": 1.4171838164329529, "reward_std": 0.15592122077941895, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4171838164329529, "step": 1287 }, { "completion_length": 102.0, "epoch": 0.8304319793681496, "grad_norm": 17.38705062866211, "kl": 0.1328125, "learning_rate": 5.847840103159252e-07, "loss": 0.0053, "reward": 1.6413039565086365, "reward_std": 0.09573720023036003, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6413039267063141, "step": 1288 }, { "completion_length": 99.71875, "epoch": 0.8310767246937459, "grad_norm": 12.098993301391602, "kl": 0.140625, "learning_rate": 5.844616376531269e-07, "loss": 0.0056, "reward": 1.4964683651924133, "reward_std": 0.0922405980527401, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49646835029125214, "step": 1289 }, { "completion_length": 95.625, "epoch": 0.8317214700193424, "grad_norm": 24.151933670043945, "kl": 0.125, "learning_rate": 5.841392649903289e-07, "loss": 0.005, "reward": 1.5137308239936829, "reward_std": 0.13668807595968246, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5137307941913605, "step": 1290 }, { "completion_length": 103.234375, "epoch": 0.8323662153449387, "grad_norm": 17.465185165405273, "kl": 0.116943359375, "learning_rate": 5.838168923275306e-07, "loss": 0.0047, "reward": 1.5166469812393188, "reward_std": 0.08108877390623093, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5166469812393188, "step": 1291 }, { "completion_length": 107.65625, "epoch": 0.8330109606705351, "grad_norm": 19.077016830444336, "kl": 0.14892578125, "learning_rate": 5.834945196647324e-07, "loss": 0.006, "reward": 1.5799004435539246, "reward_std": 0.10533102601766586, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5799004435539246, "step": 1292 }, { "completion_length": 113.484375, "epoch": 0.8336557059961315, "grad_norm": 8.121297836303711, "kl": 0.1162109375, "learning_rate": 5.831721470019342e-07, "loss": 0.0047, "reward": 1.626124620437622, "reward_std": 0.05737309157848358, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6261245012283325, "step": 1293 }, { "completion_length": 113.484375, "epoch": 0.8343004513217279, "grad_norm": 15.856319427490234, "kl": 0.119140625, "learning_rate": 5.828497743391361e-07, "loss": 0.0048, "reward": 1.5811055302619934, "reward_std": 0.09756194055080414, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5811055451631546, "step": 1294 }, { "completion_length": 118.125, "epoch": 0.8349451966473244, "grad_norm": 28.771743774414062, "kl": 0.112060546875, "learning_rate": 5.825274016763378e-07, "loss": 0.0045, "reward": 1.590721607208252, "reward_std": 0.08623911440372467, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5907216668128967, "step": 1295 }, { "completion_length": 99.09375, "epoch": 0.8355899419729207, "grad_norm": 9.904467582702637, "kl": 0.146484375, "learning_rate": 5.822050290135397e-07, "loss": 0.0059, "reward": 1.6357603073120117, "reward_std": 0.10336859151721, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6357603967189789, "step": 1296 }, { "completion_length": 107.90625, "epoch": 0.8362346872985171, "grad_norm": 9.767264366149902, "kl": 0.11328125, "learning_rate": 5.818826563507414e-07, "loss": 0.0045, "reward": 1.4161508083343506, "reward_std": 0.09276717901229858, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4161507785320282, "step": 1297 }, { "completion_length": 106.53125, "epoch": 0.8368794326241135, "grad_norm": 10.562418937683105, "kl": 0.143798828125, "learning_rate": 5.815602836879432e-07, "loss": 0.0058, "reward": 1.7008376717567444, "reward_std": 0.1603548526763916, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7008376121520996, "step": 1298 }, { "completion_length": 115.5625, "epoch": 0.8375241779497099, "grad_norm": 11.865731239318848, "kl": 0.138671875, "learning_rate": 5.81237911025145e-07, "loss": 0.0055, "reward": 1.5670779943466187, "reward_std": 0.12002968043088913, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.567078024148941, "step": 1299 }, { "completion_length": 106.875, "epoch": 0.8381689232753062, "grad_norm": 7.02225923538208, "kl": 0.1357421875, "learning_rate": 5.809155383623469e-07, "loss": 0.0054, "reward": 1.5539129376411438, "reward_std": 0.06455387361347675, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5539128929376602, "step": 1300 }, { "completion_length": 114.25, "epoch": 0.8388136686009027, "grad_norm": 15.595966339111328, "kl": 0.1181640625, "learning_rate": 5.805931656995486e-07, "loss": 0.0047, "reward": 1.5586952567100525, "reward_std": 0.10112509876489639, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5586952567100525, "step": 1301 }, { "completion_length": 108.640625, "epoch": 0.839458413926499, "grad_norm": 8.791936874389648, "kl": 0.115234375, "learning_rate": 5.802707930367504e-07, "loss": 0.0046, "reward": 1.6700953841209412, "reward_std": 0.06693680211901665, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6700953841209412, "step": 1302 }, { "completion_length": 102.78125, "epoch": 0.8401031592520954, "grad_norm": 15.279678344726562, "kl": 0.11328125, "learning_rate": 5.799484203739523e-07, "loss": 0.0045, "reward": 1.668242335319519, "reward_std": 0.061969924718141556, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6682423055171967, "step": 1303 }, { "completion_length": 110.9375, "epoch": 0.8407479045776918, "grad_norm": 34.09355163574219, "kl": 0.15673828125, "learning_rate": 5.796260477111541e-07, "loss": 0.0063, "reward": 1.6093423962593079, "reward_std": 0.10130022093653679, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6093423962593079, "step": 1304 }, { "completion_length": 100.296875, "epoch": 0.8413926499032882, "grad_norm": 16.117502212524414, "kl": 0.1552734375, "learning_rate": 5.793036750483558e-07, "loss": 0.0062, "reward": 1.7314233183860779, "reward_std": 0.09649763628840446, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7314233183860779, "step": 1305 }, { "completion_length": 100.125, "epoch": 0.8420373952288845, "grad_norm": 10.007387161254883, "kl": 0.13623046875, "learning_rate": 5.789813023855577e-07, "loss": 0.0054, "reward": 1.7936328649520874, "reward_std": 0.07578852772712708, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7936329245567322, "step": 1306 }, { "completion_length": 98.40625, "epoch": 0.842682140554481, "grad_norm": 32.72197723388672, "kl": 0.125732421875, "learning_rate": 5.786589297227595e-07, "loss": 0.005, "reward": 1.3983741998672485, "reward_std": 0.09278368949890137, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3983742743730545, "step": 1307 }, { "completion_length": 106.078125, "epoch": 0.8433268858800773, "grad_norm": 10.910943031311035, "kl": 0.18505859375, "learning_rate": 5.783365570599613e-07, "loss": 0.0074, "reward": 1.645573914051056, "reward_std": 0.14889878779649734, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6611989140510559, "step": 1308 }, { "completion_length": 91.109375, "epoch": 0.8439716312056738, "grad_norm": 10.358012199401855, "kl": 0.1220703125, "learning_rate": 5.780141843971631e-07, "loss": 0.0049, "reward": 1.4291784167289734, "reward_std": 0.10380928218364716, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4291783422231674, "step": 1309 }, { "completion_length": 98.453125, "epoch": 0.8446163765312702, "grad_norm": 21.05621910095215, "kl": 0.11865234375, "learning_rate": 5.776918117343649e-07, "loss": 0.0047, "reward": 1.5606682300567627, "reward_std": 0.09428859874606133, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5606682300567627, "step": 1310 }, { "completion_length": 112.734375, "epoch": 0.8452611218568665, "grad_norm": 14.857038497924805, "kl": 0.12060546875, "learning_rate": 5.773694390715666e-07, "loss": 0.0048, "reward": 1.736677885055542, "reward_std": 0.07061555609107018, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7366778552532196, "step": 1311 }, { "completion_length": 105.25, "epoch": 0.845905867182463, "grad_norm": 16.779647827148438, "kl": 0.1328125, "learning_rate": 5.770470664087686e-07, "loss": 0.0053, "reward": 1.5583295822143555, "reward_std": 0.08000925555825233, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5583296418190002, "step": 1312 }, { "completion_length": 101.453125, "epoch": 0.8465506125080593, "grad_norm": 26.074615478515625, "kl": 0.14404296875, "learning_rate": 5.767246937459703e-07, "loss": 0.0058, "reward": 1.5779609084129333, "reward_std": 0.09761778265237808, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.577960878610611, "step": 1313 }, { "completion_length": 101.40625, "epoch": 0.8471953578336557, "grad_norm": 14.144932746887207, "kl": 0.1474609375, "learning_rate": 5.764023210831721e-07, "loss": 0.0059, "reward": 1.6106492280960083, "reward_std": 0.09668288752436638, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6106491684913635, "step": 1314 }, { "completion_length": 95.40625, "epoch": 0.8478401031592521, "grad_norm": 24.63298988342285, "kl": 0.10791015625, "learning_rate": 5.760799484203739e-07, "loss": 0.0043, "reward": 1.626054286956787, "reward_std": 0.1346743032336235, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6260542571544647, "step": 1315 }, { "completion_length": 97.015625, "epoch": 0.8484848484848485, "grad_norm": 13.633806228637695, "kl": 0.13232421875, "learning_rate": 5.757575757575758e-07, "loss": 0.0053, "reward": 1.6472105979919434, "reward_std": 0.10808993130922318, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6472106575965881, "step": 1316 }, { "completion_length": 100.046875, "epoch": 0.8491295938104448, "grad_norm": 31.76630210876465, "kl": 0.169921875, "learning_rate": 5.754352030947775e-07, "loss": 0.0068, "reward": 1.5727328062057495, "reward_std": 0.1476679891347885, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5883578211069107, "step": 1317 }, { "completion_length": 110.453125, "epoch": 0.8497743391360413, "grad_norm": 12.650569915771484, "kl": 0.1220703125, "learning_rate": 5.751128304319794e-07, "loss": 0.0049, "reward": 1.6151057481765747, "reward_std": 0.07021662034094334, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6151057481765747, "step": 1318 }, { "completion_length": 107.203125, "epoch": 0.8504190844616376, "grad_norm": 26.324243545532227, "kl": 0.11181640625, "learning_rate": 5.747904577691811e-07, "loss": 0.0045, "reward": 1.6326099038124084, "reward_std": 0.08545324578881264, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6326098740100861, "step": 1319 }, { "completion_length": 98.984375, "epoch": 0.851063829787234, "grad_norm": 23.667394638061523, "kl": 0.10888671875, "learning_rate": 5.74468085106383e-07, "loss": 0.0044, "reward": 1.568135380744934, "reward_std": 0.15582210570573807, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5681354403495789, "step": 1320 }, { "completion_length": 102.15625, "epoch": 0.8517085751128304, "grad_norm": 31.68404197692871, "kl": 0.124755859375, "learning_rate": 5.741457124435847e-07, "loss": 0.005, "reward": 1.7288174033164978, "reward_std": 0.07873563468456268, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.728817343711853, "step": 1321 }, { "completion_length": 99.9375, "epoch": 0.8523533204384268, "grad_norm": 78.26886749267578, "kl": 0.17138671875, "learning_rate": 5.738233397807866e-07, "loss": 0.0068, "reward": 1.5360566973686218, "reward_std": 0.19280319660902023, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5516817271709442, "step": 1322 }, { "completion_length": 98.46875, "epoch": 0.8529980657640233, "grad_norm": 26.776473999023438, "kl": 0.134765625, "learning_rate": 5.735009671179883e-07, "loss": 0.0054, "reward": 1.6314437985420227, "reward_std": 0.15900789201259613, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6470687091350555, "step": 1323 }, { "completion_length": 95.078125, "epoch": 0.8536428110896196, "grad_norm": 29.28158950805664, "kl": 0.111572265625, "learning_rate": 5.731785944551901e-07, "loss": 0.0045, "reward": 1.4915387034416199, "reward_std": 0.062356844544410706, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49153879284858704, "step": 1324 }, { "completion_length": 108.453125, "epoch": 0.854287556415216, "grad_norm": 14.453795433044434, "kl": 0.12255859375, "learning_rate": 5.72856221792392e-07, "loss": 0.0049, "reward": 1.4318147897720337, "reward_std": 0.11395280063152313, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4318147897720337, "step": 1325 }, { "completion_length": 94.96875, "epoch": 0.8549323017408124, "grad_norm": 23.957849502563477, "kl": 0.121826171875, "learning_rate": 5.725338491295938e-07, "loss": 0.0049, "reward": 1.5138567686080933, "reward_std": 0.23028697073459625, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5451066493988037, "step": 1326 }, { "completion_length": 103.640625, "epoch": 0.8555770470664088, "grad_norm": 17.610496520996094, "kl": 0.1201171875, "learning_rate": 5.722114764667955e-07, "loss": 0.0048, "reward": 1.4641339778900146, "reward_std": 0.10243158787488937, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46413394808769226, "step": 1327 }, { "completion_length": 96.515625, "epoch": 0.8562217923920051, "grad_norm": 9.11087417602539, "kl": 0.1357421875, "learning_rate": 5.718891038039974e-07, "loss": 0.0054, "reward": 1.6933382749557495, "reward_std": 0.10557930544018745, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6933382451534271, "step": 1328 }, { "completion_length": 107.765625, "epoch": 0.8568665377176016, "grad_norm": 27.55734634399414, "kl": 0.140625, "learning_rate": 5.715667311411992e-07, "loss": 0.0056, "reward": 1.480468213558197, "reward_std": 0.11969191953539848, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.480468213558197, "step": 1329 }, { "completion_length": 101.8125, "epoch": 0.8575112830431979, "grad_norm": 10.899490356445312, "kl": 0.1455078125, "learning_rate": 5.71244358478401e-07, "loss": 0.0058, "reward": 1.4227421879768372, "reward_std": 0.1466749757528305, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.43836718797683716, "step": 1330 }, { "completion_length": 107.53125, "epoch": 0.8581560283687943, "grad_norm": 17.06650733947754, "kl": 0.11669921875, "learning_rate": 5.709219858156028e-07, "loss": 0.0047, "reward": 1.5130248665809631, "reward_std": 0.13780354335904121, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5286498665809631, "step": 1331 }, { "completion_length": 103.296875, "epoch": 0.8588007736943907, "grad_norm": 44.2659797668457, "kl": 0.119140625, "learning_rate": 5.705996131528046e-07, "loss": 0.0048, "reward": 1.6124387383460999, "reward_std": 0.11327242106199265, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6124387979507446, "step": 1332 }, { "completion_length": 93.625, "epoch": 0.8594455190199871, "grad_norm": 119.70891571044922, "kl": 0.13330078125, "learning_rate": 5.702772404900064e-07, "loss": 0.0053, "reward": 1.6062958240509033, "reward_std": 0.04072526656091213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6062957644462585, "step": 1333 }, { "completion_length": 105.734375, "epoch": 0.8600902643455834, "grad_norm": 35.184932708740234, "kl": 0.119384765625, "learning_rate": 5.699548678272083e-07, "loss": 0.0048, "reward": 1.557755708694458, "reward_std": 0.07671729475259781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5577556788921356, "step": 1334 }, { "completion_length": 100.15625, "epoch": 0.8607350096711799, "grad_norm": 19.142066955566406, "kl": 0.12548828125, "learning_rate": 5.6963249516441e-07, "loss": 0.005, "reward": 1.6992515325546265, "reward_std": 0.059500470757484436, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6992515027523041, "step": 1335 }, { "completion_length": 100.671875, "epoch": 0.8613797549967763, "grad_norm": 9.580693244934082, "kl": 0.1220703125, "learning_rate": 5.693101225016118e-07, "loss": 0.0049, "reward": 1.485615611076355, "reward_std": 0.08089196868240833, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48561564087867737, "step": 1336 }, { "completion_length": 100.78125, "epoch": 0.8620245003223727, "grad_norm": 23.950700759887695, "kl": 0.12158203125, "learning_rate": 5.689877498388136e-07, "loss": 0.0049, "reward": 1.5655575394630432, "reward_std": 0.10747053101658821, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5655575394630432, "step": 1337 }, { "completion_length": 103.96875, "epoch": 0.8626692456479691, "grad_norm": 21.37566566467285, "kl": 0.11181640625, "learning_rate": 5.686653771760155e-07, "loss": 0.0045, "reward": 1.5508422255516052, "reward_std": 0.12930458039045334, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5664671659469604, "step": 1338 }, { "completion_length": 102.921875, "epoch": 0.8633139909735654, "grad_norm": 31.438480377197266, "kl": 0.1279296875, "learning_rate": 5.683430045132172e-07, "loss": 0.0051, "reward": 1.6030420064926147, "reward_std": 0.10726083442568779, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6030420064926147, "step": 1339 }, { "completion_length": 105.125, "epoch": 0.8639587362991619, "grad_norm": 29.107524871826172, "kl": 0.12353515625, "learning_rate": 5.680206318504191e-07, "loss": 0.005, "reward": 1.6503074169158936, "reward_std": 0.08372955396771431, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6503073573112488, "step": 1340 }, { "completion_length": 96.734375, "epoch": 0.8646034816247582, "grad_norm": 7.437849044799805, "kl": 0.130859375, "learning_rate": 5.676982591876208e-07, "loss": 0.0052, "reward": 1.7455703020095825, "reward_std": 0.10204130783677101, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7455704212188721, "step": 1341 }, { "completion_length": 105.328125, "epoch": 0.8652482269503546, "grad_norm": 16.067913055419922, "kl": 0.11181640625, "learning_rate": 5.673758865248227e-07, "loss": 0.0045, "reward": 1.4903101921081543, "reward_std": 0.14681512117385864, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49031007289886475, "step": 1342 }, { "completion_length": 93.828125, "epoch": 0.865892972275951, "grad_norm": 23.24372100830078, "kl": 0.1328125, "learning_rate": 5.670535138620244e-07, "loss": 0.0053, "reward": 1.554599404335022, "reward_std": 0.18923930823802948, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5702243745326996, "step": 1343 }, { "completion_length": 109.75, "epoch": 0.8665377176015474, "grad_norm": 16.106502532958984, "kl": 0.110107421875, "learning_rate": 5.667311411992263e-07, "loss": 0.0044, "reward": 1.7196393609046936, "reward_std": 0.12335233762860298, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7196394205093384, "step": 1344 }, { "completion_length": 93.890625, "epoch": 0.8671824629271437, "grad_norm": 20.337154388427734, "kl": 0.32861328125, "learning_rate": 5.66408768536428e-07, "loss": 0.0131, "reward": 1.4278218150138855, "reward_std": 0.09321388974785805, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4278218150138855, "step": 1345 }, { "completion_length": 93.171875, "epoch": 0.8678272082527402, "grad_norm": 9.359206199645996, "kl": 0.15576171875, "learning_rate": 5.6608639587363e-07, "loss": 0.0062, "reward": 1.7311944365501404, "reward_std": 0.1090300865471363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.731194406747818, "step": 1346 }, { "completion_length": 102.703125, "epoch": 0.8684719535783365, "grad_norm": 14.245612144470215, "kl": 0.12841796875, "learning_rate": 5.657640232108317e-07, "loss": 0.0051, "reward": 1.5649824142456055, "reward_std": 0.12245436012744904, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5649823844432831, "step": 1347 }, { "completion_length": 92.203125, "epoch": 0.869116698903933, "grad_norm": 38.64447784423828, "kl": 0.1328125, "learning_rate": 5.654416505480335e-07, "loss": 0.0053, "reward": 1.5066606998443604, "reward_std": 0.14087754115462303, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5222857445478439, "step": 1348 }, { "completion_length": 102.828125, "epoch": 0.8697614442295294, "grad_norm": 15.142080307006836, "kl": 0.154052734375, "learning_rate": 5.651192778852352e-07, "loss": 0.0062, "reward": 1.391581654548645, "reward_std": 0.1948801502585411, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4072067067027092, "step": 1349 }, { "completion_length": 101.875, "epoch": 0.8704061895551257, "grad_norm": 8.289034843444824, "kl": 0.119140625, "learning_rate": 5.647969052224371e-07, "loss": 0.0048, "reward": 1.6482349634170532, "reward_std": 0.11315581947565079, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6482349336147308, "step": 1350 }, { "completion_length": 104.28125, "epoch": 0.8710509348807222, "grad_norm": 15.9578857421875, "kl": 0.11279296875, "learning_rate": 5.644745325596389e-07, "loss": 0.0045, "reward": 1.6309210658073425, "reward_std": 0.1061040423810482, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6309210360050201, "step": 1351 }, { "completion_length": 109.65625, "epoch": 0.8716956802063185, "grad_norm": 13.198500633239746, "kl": 0.1337890625, "learning_rate": 5.641521598968407e-07, "loss": 0.0054, "reward": 1.7125477194786072, "reward_std": 0.11557654291391373, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7125477194786072, "step": 1352 }, { "completion_length": 92.40625, "epoch": 0.8723404255319149, "grad_norm": 14.851712226867676, "kl": 0.12646484375, "learning_rate": 5.638297872340425e-07, "loss": 0.005, "reward": 1.6003835201263428, "reward_std": 0.12276577949523926, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6160085499286652, "step": 1353 }, { "completion_length": 116.03125, "epoch": 0.8729851708575113, "grad_norm": 15.63801383972168, "kl": 0.115478515625, "learning_rate": 5.635074145712443e-07, "loss": 0.0046, "reward": 1.5839186310768127, "reward_std": 0.13456114381551743, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5995436608791351, "step": 1354 }, { "completion_length": 98.453125, "epoch": 0.8736299161831077, "grad_norm": 19.315853118896484, "kl": 0.125732421875, "learning_rate": 5.631850419084461e-07, "loss": 0.005, "reward": 1.6114773154258728, "reward_std": 0.09730841219425201, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6114773452281952, "step": 1355 }, { "completion_length": 98.171875, "epoch": 0.874274661508704, "grad_norm": 14.855340003967285, "kl": 0.127197265625, "learning_rate": 5.62862669245648e-07, "loss": 0.0051, "reward": 1.5679059624671936, "reward_std": 0.0623544380068779, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5679059624671936, "step": 1356 }, { "completion_length": 101.5, "epoch": 0.8749194068343005, "grad_norm": 30.180572509765625, "kl": 0.11962890625, "learning_rate": 5.625402965828497e-07, "loss": 0.0048, "reward": 1.5438405275344849, "reward_std": 0.08588853105902672, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5438405424356461, "step": 1357 }, { "completion_length": 95.125, "epoch": 0.8755641521598968, "grad_norm": 20.203781127929688, "kl": 0.17138671875, "learning_rate": 5.622179239200515e-07, "loss": 0.0069, "reward": 1.580243706703186, "reward_std": 0.19309964030981064, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.595868706703186, "step": 1358 }, { "completion_length": 98.0625, "epoch": 0.8762088974854932, "grad_norm": 7.41533899307251, "kl": 0.150390625, "learning_rate": 5.618955512572534e-07, "loss": 0.006, "reward": 1.5056850910186768, "reward_std": 0.054740700870752335, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5056850016117096, "step": 1359 }, { "completion_length": 105.328125, "epoch": 0.8768536428110896, "grad_norm": 24.1795597076416, "kl": 0.129638671875, "learning_rate": 5.615731785944552e-07, "loss": 0.0052, "reward": 1.5859038829803467, "reward_std": 0.08588533103466034, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5859038531780243, "step": 1360 }, { "completion_length": 95.453125, "epoch": 0.877498388136686, "grad_norm": 13.868168830871582, "kl": 0.1279296875, "learning_rate": 5.612508059316569e-07, "loss": 0.0051, "reward": 1.419808030128479, "reward_std": 0.06623131036758423, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4198080450296402, "step": 1361 }, { "completion_length": 91.734375, "epoch": 0.8781431334622823, "grad_norm": 5.384434700012207, "kl": 0.129150390625, "learning_rate": 5.609284332688588e-07, "loss": 0.0052, "reward": 1.4930008053779602, "reward_std": 0.12473198212683201, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49300073087215424, "step": 1362 }, { "completion_length": 107.765625, "epoch": 0.8787878787878788, "grad_norm": 19.84931182861328, "kl": 0.19189453125, "learning_rate": 5.606060606060605e-07, "loss": 0.0077, "reward": 1.5273191928863525, "reward_std": 0.15590080618858337, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5273192077875137, "step": 1363 }, { "completion_length": 92.78125, "epoch": 0.8794326241134752, "grad_norm": 22.012100219726562, "kl": 0.109130859375, "learning_rate": 5.602836879432624e-07, "loss": 0.0044, "reward": 1.6968834400177002, "reward_std": 0.11367316171526909, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6968834698200226, "step": 1364 }, { "completion_length": 95.640625, "epoch": 0.8800773694390716, "grad_norm": 26.02556800842285, "kl": 0.117431640625, "learning_rate": 5.599613152804641e-07, "loss": 0.0047, "reward": 1.464781105518341, "reward_std": 0.08185772225260735, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46478110551834106, "step": 1365 }, { "completion_length": 105.15625, "epoch": 0.880722114764668, "grad_norm": 9.117992401123047, "kl": 0.11669921875, "learning_rate": 5.59638942617666e-07, "loss": 0.0047, "reward": 1.732893705368042, "reward_std": 0.11800439655780792, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7328937351703644, "step": 1366 }, { "completion_length": 104.15625, "epoch": 0.8813668600902643, "grad_norm": 17.9821720123291, "kl": 0.11962890625, "learning_rate": 5.593165699548677e-07, "loss": 0.0048, "reward": 1.4764457941055298, "reward_std": 0.13345470279455185, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47644585371017456, "step": 1367 }, { "completion_length": 99.046875, "epoch": 0.8820116054158608, "grad_norm": 35.48114013671875, "kl": 0.124755859375, "learning_rate": 5.589941972920696e-07, "loss": 0.005, "reward": 1.714375615119934, "reward_std": 0.1126624196767807, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7143756151199341, "step": 1368 }, { "completion_length": 92.46875, "epoch": 0.8826563507414571, "grad_norm": 19.23027992248535, "kl": 0.12744140625, "learning_rate": 5.586718246292714e-07, "loss": 0.0051, "reward": 1.53688383102417, "reward_std": 0.16378530487418175, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5525089502334595, "step": 1369 }, { "completion_length": 107.203125, "epoch": 0.8833010960670535, "grad_norm": 18.839021682739258, "kl": 0.11083984375, "learning_rate": 5.583494519664732e-07, "loss": 0.0044, "reward": 1.618294894695282, "reward_std": 0.10168775171041489, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6182949244976044, "step": 1370 }, { "completion_length": 109.796875, "epoch": 0.8839458413926499, "grad_norm": 14.45029354095459, "kl": 0.119140625, "learning_rate": 5.580270793036749e-07, "loss": 0.0048, "reward": 1.5975992679595947, "reward_std": 0.16226022690534592, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5975992381572723, "step": 1371 }, { "completion_length": 98.84375, "epoch": 0.8845905867182463, "grad_norm": 14.76009750366211, "kl": 0.100341796875, "learning_rate": 5.577047066408769e-07, "loss": 0.004, "reward": 1.5008023381233215, "reward_std": 0.12654516100883484, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5008023977279663, "step": 1372 }, { "completion_length": 101.03125, "epoch": 0.8852353320438426, "grad_norm": 40.02851867675781, "kl": 0.1328125, "learning_rate": 5.573823339780786e-07, "loss": 0.0053, "reward": 1.5849175453186035, "reward_std": 0.13229620456695557, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5849175751209259, "step": 1373 }, { "completion_length": 108.234375, "epoch": 0.8858800773694391, "grad_norm": 12.737630844116211, "kl": 0.119384765625, "learning_rate": 5.570599613152804e-07, "loss": 0.0048, "reward": 1.6538455486297607, "reward_std": 0.09892918542027473, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6538455188274384, "step": 1374 }, { "completion_length": 106.4375, "epoch": 0.8865248226950354, "grad_norm": 31.22701644897461, "kl": 0.125, "learning_rate": 5.567375886524822e-07, "loss": 0.005, "reward": 1.4634947776794434, "reward_std": 0.09996931068599224, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46349479258060455, "step": 1375 }, { "completion_length": 93.71875, "epoch": 0.8871695680206318, "grad_norm": 47.132286071777344, "kl": 0.125244140625, "learning_rate": 5.56415215989684e-07, "loss": 0.005, "reward": 1.585554599761963, "reward_std": 0.06121959537267685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5855547189712524, "step": 1376 }, { "completion_length": 90.203125, "epoch": 0.8878143133462283, "grad_norm": 16.654361724853516, "kl": 0.123046875, "learning_rate": 5.560928433268859e-07, "loss": 0.0049, "reward": 1.5993537306785583, "reward_std": 0.07232223823666573, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5993536710739136, "step": 1377 }, { "completion_length": 100.9375, "epoch": 0.8884590586718246, "grad_norm": 16.512577056884766, "kl": 0.12158203125, "learning_rate": 5.557704706640877e-07, "loss": 0.0049, "reward": 1.614881694316864, "reward_std": 0.12154518440365791, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.614881694316864, "step": 1378 }, { "completion_length": 102.265625, "epoch": 0.8891038039974211, "grad_norm": 9.475802421569824, "kl": 0.113525390625, "learning_rate": 5.554480980012894e-07, "loss": 0.0045, "reward": 1.6055333614349365, "reward_std": 0.1013088971376419, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6055334508419037, "step": 1379 }, { "completion_length": 91.640625, "epoch": 0.8897485493230174, "grad_norm": 10.056422233581543, "kl": 0.115966796875, "learning_rate": 5.551257253384912e-07, "loss": 0.0046, "reward": 1.6401957869529724, "reward_std": 0.07066305167973042, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.64019575715065, "step": 1380 }, { "completion_length": 98.59375, "epoch": 0.8903932946486138, "grad_norm": 21.696550369262695, "kl": 0.120849609375, "learning_rate": 5.548033526756932e-07, "loss": 0.0048, "reward": 1.554986298084259, "reward_std": 0.14988679066300392, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5549863278865814, "step": 1381 }, { "completion_length": 95.75, "epoch": 0.8910380399742102, "grad_norm": 37.21528244018555, "kl": 0.13671875, "learning_rate": 5.544809800128949e-07, "loss": 0.0055, "reward": 1.6293746829032898, "reward_std": 0.21692323684692383, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.6606246829032898, "step": 1382 }, { "completion_length": 108.703125, "epoch": 0.8916827852998066, "grad_norm": 172.50978088378906, "kl": 0.134765625, "learning_rate": 5.541586073500967e-07, "loss": 0.0054, "reward": 1.4939188361167908, "reward_std": 0.14137040078639984, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49391883611679077, "step": 1383 }, { "completion_length": 104.546875, "epoch": 0.8923275306254029, "grad_norm": 110.67353057861328, "kl": 0.1123046875, "learning_rate": 5.538362346872985e-07, "loss": 0.0045, "reward": 1.5028228163719177, "reward_std": 0.18492045998573303, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.502822756767273, "step": 1384 }, { "completion_length": 97.4375, "epoch": 0.8929722759509994, "grad_norm": 128.3675079345703, "kl": 0.129150390625, "learning_rate": 5.535138620245004e-07, "loss": 0.0052, "reward": 1.7198100686073303, "reward_std": 0.13068809360265732, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7198100984096527, "step": 1385 }, { "completion_length": 92.015625, "epoch": 0.8936170212765957, "grad_norm": 22.55219268798828, "kl": 0.13623046875, "learning_rate": 5.531914893617021e-07, "loss": 0.0055, "reward": 1.5806416273117065, "reward_std": 0.07453634962439537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5806415677070618, "step": 1386 }, { "completion_length": 98.21875, "epoch": 0.8942617666021921, "grad_norm": 25.168912887573242, "kl": 0.109619140625, "learning_rate": 5.52869116698904e-07, "loss": 0.0044, "reward": 1.4894877076148987, "reward_std": 0.11500835418701172, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48948773741722107, "step": 1387 }, { "completion_length": 111.9375, "epoch": 0.8949065119277885, "grad_norm": 8.519505500793457, "kl": 0.12353515625, "learning_rate": 5.525467440361057e-07, "loss": 0.0049, "reward": 1.4606132507324219, "reward_std": 0.10788729041814804, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.46061332523822784, "step": 1388 }, { "completion_length": 102.515625, "epoch": 0.8955512572533849, "grad_norm": 25.281314849853516, "kl": 0.13330078125, "learning_rate": 5.522243713733076e-07, "loss": 0.0053, "reward": 1.59192955493927, "reward_std": 0.14972123503684998, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5919295251369476, "step": 1389 }, { "completion_length": 92.4375, "epoch": 0.8961960025789814, "grad_norm": 18.566326141357422, "kl": 0.22119140625, "learning_rate": 5.519019987105093e-07, "loss": 0.0088, "reward": 1.6087493300437927, "reward_std": 0.16570261120796204, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6243743300437927, "step": 1390 }, { "completion_length": 101.21875, "epoch": 0.8968407479045777, "grad_norm": 37.18821716308594, "kl": 0.12841796875, "learning_rate": 5.515796260477112e-07, "loss": 0.0051, "reward": 1.4420801401138306, "reward_std": 0.12759844586253166, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44208016991615295, "step": 1391 }, { "completion_length": 96.515625, "epoch": 0.8974854932301741, "grad_norm": 57.861751556396484, "kl": 0.12353515625, "learning_rate": 5.512572533849129e-07, "loss": 0.0049, "reward": 1.4791641235351562, "reward_std": 0.05819516442716122, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4791640341281891, "step": 1392 }, { "completion_length": 99.4375, "epoch": 0.8981302385557705, "grad_norm": 11.22120189666748, "kl": 0.13427734375, "learning_rate": 5.509348807221147e-07, "loss": 0.0054, "reward": 1.5360617637634277, "reward_std": 0.0876328144222498, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5360617637634277, "step": 1393 }, { "completion_length": 99.46875, "epoch": 0.8987749838813669, "grad_norm": 9.769732475280762, "kl": 0.14111328125, "learning_rate": 5.506125080593166e-07, "loss": 0.0056, "reward": 1.6172817945480347, "reward_std": 0.13143368437886238, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6172817945480347, "step": 1394 }, { "completion_length": 96.046875, "epoch": 0.8994197292069632, "grad_norm": 21.860910415649414, "kl": 0.110107421875, "learning_rate": 5.502901353965184e-07, "loss": 0.0044, "reward": 1.7212462425231934, "reward_std": 0.09481841512024403, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7212462425231934, "step": 1395 }, { "completion_length": 96.359375, "epoch": 0.9000644745325597, "grad_norm": 15.907546043395996, "kl": 0.121826171875, "learning_rate": 5.499677627337201e-07, "loss": 0.0049, "reward": 1.694203495979309, "reward_std": 0.09414593875408173, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6942034661769867, "step": 1396 }, { "completion_length": 107.34375, "epoch": 0.900709219858156, "grad_norm": 12.596553802490234, "kl": 0.119140625, "learning_rate": 5.49645390070922e-07, "loss": 0.0048, "reward": 1.4774943590164185, "reward_std": 0.08623380959033966, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4774942696094513, "step": 1397 }, { "completion_length": 89.625, "epoch": 0.9013539651837524, "grad_norm": 16.927295684814453, "kl": 0.13037109375, "learning_rate": 5.493230174081238e-07, "loss": 0.0052, "reward": 1.6467631459236145, "reward_std": 0.09450025856494904, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6467630863189697, "step": 1398 }, { "completion_length": 101.609375, "epoch": 0.9019987105093488, "grad_norm": 14.867881774902344, "kl": 0.125, "learning_rate": 5.490006447453256e-07, "loss": 0.005, "reward": 1.5352025032043457, "reward_std": 0.08666136488318443, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5352024734020233, "step": 1399 }, { "completion_length": 97.96875, "epoch": 0.9026434558349452, "grad_norm": 79.15862274169922, "kl": 0.13330078125, "learning_rate": 5.486782720825274e-07, "loss": 0.0053, "reward": 1.525882363319397, "reward_std": 0.08245071768760681, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5258822739124298, "step": 1400 }, { "completion_length": 90.578125, "epoch": 0.9032882011605415, "grad_norm": 53.31522750854492, "kl": 0.1435546875, "learning_rate": 5.483558994197292e-07, "loss": 0.0057, "reward": 1.5624061822891235, "reward_std": 0.13468413054943085, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5624062120914459, "step": 1401 }, { "completion_length": 95.75, "epoch": 0.903932946486138, "grad_norm": 11.083121299743652, "kl": 0.11669921875, "learning_rate": 5.48033526756931e-07, "loss": 0.0047, "reward": 1.6812357902526855, "reward_std": 0.13949067145586014, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.681235671043396, "step": 1402 }, { "completion_length": 86.953125, "epoch": 0.9045776918117344, "grad_norm": 16.63957405090332, "kl": 0.1337890625, "learning_rate": 5.477111540941329e-07, "loss": 0.0053, "reward": 1.7723530530929565, "reward_std": 0.06749296560883522, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7723531126976013, "step": 1403 }, { "completion_length": 95.046875, "epoch": 0.9052224371373307, "grad_norm": 23.644563674926758, "kl": 0.134521484375, "learning_rate": 5.473887814313346e-07, "loss": 0.0054, "reward": 1.4659785032272339, "reward_std": 0.1834467351436615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4659785330295563, "step": 1404 }, { "completion_length": 97.796875, "epoch": 0.9058671824629272, "grad_norm": 9.021123886108398, "kl": 0.11865234375, "learning_rate": 5.470664087685364e-07, "loss": 0.0048, "reward": 1.6176230907440186, "reward_std": 0.13125622272491455, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6176231205463409, "step": 1405 }, { "completion_length": 103.875, "epoch": 0.9065119277885235, "grad_norm": 22.73553466796875, "kl": 0.12451171875, "learning_rate": 5.467440361057382e-07, "loss": 0.005, "reward": 1.605991005897522, "reward_std": 0.09940499439835548, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.605991005897522, "step": 1406 }, { "completion_length": 85.140625, "epoch": 0.90715667311412, "grad_norm": 11.063023567199707, "kl": 0.135009765625, "learning_rate": 5.464216634429401e-07, "loss": 0.0054, "reward": 1.6347458958625793, "reward_std": 0.08830403909087181, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6347458362579346, "step": 1407 }, { "completion_length": 97.765625, "epoch": 0.9078014184397163, "grad_norm": 22.741058349609375, "kl": 0.15283203125, "learning_rate": 5.460992907801418e-07, "loss": 0.0061, "reward": 1.6160573959350586, "reward_std": 0.10062050446867943, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6160573959350586, "step": 1408 }, { "completion_length": 98.625, "epoch": 0.9084461637653127, "grad_norm": 26.559572219848633, "kl": 0.1376953125, "learning_rate": 5.457769181173437e-07, "loss": 0.0055, "reward": 1.7845659255981445, "reward_std": 0.07418366987258196, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7845659852027893, "step": 1409 }, { "completion_length": 97.9375, "epoch": 0.9090909090909091, "grad_norm": 9.306669235229492, "kl": 0.1171875, "learning_rate": 5.454545454545454e-07, "loss": 0.0047, "reward": 1.5508063435554504, "reward_std": 0.09839482605457306, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5508063286542892, "step": 1410 }, { "completion_length": 96.078125, "epoch": 0.9097356544165055, "grad_norm": 17.91299057006836, "kl": 0.177734375, "learning_rate": 5.451321727917473e-07, "loss": 0.0071, "reward": 1.5460429787635803, "reward_std": 0.1461724489927292, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5460430383682251, "step": 1411 }, { "completion_length": 99.3125, "epoch": 0.9103803997421018, "grad_norm": 11.548925399780273, "kl": 0.12109375, "learning_rate": 5.44809800128949e-07, "loss": 0.0048, "reward": 1.5693344473838806, "reward_std": 0.1188022643327713, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5849594324827194, "step": 1412 }, { "completion_length": 108.453125, "epoch": 0.9110251450676983, "grad_norm": 11.278193473815918, "kl": 0.11572265625, "learning_rate": 5.444874274661509e-07, "loss": 0.0046, "reward": 1.5335027575492859, "reward_std": 0.0699017271399498, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5335027575492859, "step": 1413 }, { "completion_length": 105.0625, "epoch": 0.9116698903932946, "grad_norm": 11.437976837158203, "kl": 0.131591796875, "learning_rate": 5.441650548033526e-07, "loss": 0.0053, "reward": 1.6035997867584229, "reward_std": 0.07448092475533485, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6035997867584229, "step": 1414 }, { "completion_length": 106.171875, "epoch": 0.912314635718891, "grad_norm": 21.975309371948242, "kl": 0.121826171875, "learning_rate": 5.438426821405545e-07, "loss": 0.0049, "reward": 1.6646922826766968, "reward_std": 0.11215503513813019, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6646923124790192, "step": 1415 }, { "completion_length": 104.234375, "epoch": 0.9129593810444874, "grad_norm": 11.85417366027832, "kl": 0.1279296875, "learning_rate": 5.435203094777563e-07, "loss": 0.0051, "reward": 1.6083498001098633, "reward_std": 0.06887305155396461, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6083498597145081, "step": 1416 }, { "completion_length": 106.953125, "epoch": 0.9136041263700838, "grad_norm": 59.94190979003906, "kl": 0.12646484375, "learning_rate": 5.431979368149581e-07, "loss": 0.0051, "reward": 1.5003491640090942, "reward_std": 0.15336010605096817, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5003491938114166, "step": 1417 }, { "completion_length": 108.390625, "epoch": 0.9142488716956803, "grad_norm": 10.740958213806152, "kl": 0.11181640625, "learning_rate": 5.428755641521598e-07, "loss": 0.0045, "reward": 1.6392948627471924, "reward_std": 0.12374603003263474, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.65491983294487, "step": 1418 }, { "completion_length": 96.546875, "epoch": 0.9148936170212766, "grad_norm": 10.774832725524902, "kl": 0.124267578125, "learning_rate": 5.425531914893617e-07, "loss": 0.005, "reward": 1.487897515296936, "reward_std": 0.1082611046731472, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48789748549461365, "step": 1419 }, { "completion_length": 94.03125, "epoch": 0.915538362346873, "grad_norm": 17.441375732421875, "kl": 0.24755859375, "learning_rate": 5.422308188265635e-07, "loss": 0.0099, "reward": 1.5370892882347107, "reward_std": 0.13201788812875748, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5370892584323883, "step": 1420 }, { "completion_length": 102.203125, "epoch": 0.9161831076724694, "grad_norm": 23.82035255432129, "kl": 0.12451171875, "learning_rate": 5.419084461637653e-07, "loss": 0.005, "reward": 1.5023330450057983, "reward_std": 0.08557785674929619, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5023330599069595, "step": 1421 }, { "completion_length": 104.8125, "epoch": 0.9168278529980658, "grad_norm": 11.59601879119873, "kl": 0.122802734375, "learning_rate": 5.415860735009671e-07, "loss": 0.0049, "reward": 1.7010485529899597, "reward_std": 0.07506170868873596, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7010485231876373, "step": 1422 }, { "completion_length": 98.140625, "epoch": 0.9174725983236621, "grad_norm": 21.762170791625977, "kl": 0.11767578125, "learning_rate": 5.412637008381689e-07, "loss": 0.0047, "reward": 1.479046642780304, "reward_std": 0.09455282241106033, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47904662787914276, "step": 1423 }, { "completion_length": 103.890625, "epoch": 0.9181173436492586, "grad_norm": 12.237387657165527, "kl": 0.1318359375, "learning_rate": 5.409413281753707e-07, "loss": 0.0053, "reward": 1.6916875839233398, "reward_std": 0.0679397452622652, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6916875839233398, "step": 1424 }, { "completion_length": 98.34375, "epoch": 0.9187620889748549, "grad_norm": 13.88647174835205, "kl": 0.1220703125, "learning_rate": 5.406189555125726e-07, "loss": 0.0049, "reward": 1.5719730257987976, "reward_std": 0.11926516145467758, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.57197305560112, "step": 1425 }, { "completion_length": 93.390625, "epoch": 0.9194068343004513, "grad_norm": 15.400588989257812, "kl": 0.13232421875, "learning_rate": 5.402965828497743e-07, "loss": 0.0053, "reward": 1.651523232460022, "reward_std": 0.12631524726748466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.651523232460022, "step": 1426 }, { "completion_length": 90.34375, "epoch": 0.9200515796260477, "grad_norm": 32.75628662109375, "kl": 0.15234375, "learning_rate": 5.399742101869761e-07, "loss": 0.0061, "reward": 1.6175696849822998, "reward_std": 0.13943760097026825, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6331946849822998, "step": 1427 }, { "completion_length": 104.9375, "epoch": 0.9206963249516441, "grad_norm": 19.23307228088379, "kl": 0.132080078125, "learning_rate": 5.39651837524178e-07, "loss": 0.0053, "reward": 1.5814802646636963, "reward_std": 0.12079915404319763, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5814802348613739, "step": 1428 }, { "completion_length": 101.484375, "epoch": 0.9213410702772404, "grad_norm": 13.339950561523438, "kl": 0.146484375, "learning_rate": 5.393294648613798e-07, "loss": 0.0058, "reward": 1.6199922561645508, "reward_std": 0.10084755718708038, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.619992196559906, "step": 1429 }, { "completion_length": 86.46875, "epoch": 0.9219858156028369, "grad_norm": 9.657578468322754, "kl": 0.13720703125, "learning_rate": 5.390070921985815e-07, "loss": 0.0055, "reward": 1.5761032104492188, "reward_std": 0.11388009041547775, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5761031806468964, "step": 1430 }, { "completion_length": 102.34375, "epoch": 0.9226305609284333, "grad_norm": 77.72435760498047, "kl": 0.142822265625, "learning_rate": 5.386847195357834e-07, "loss": 0.0057, "reward": 1.303051471710205, "reward_std": 0.08642319589853287, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.30305153876543045, "step": 1431 }, { "completion_length": 106.234375, "epoch": 0.9232753062540296, "grad_norm": 20.863059997558594, "kl": 0.124267578125, "learning_rate": 5.383623468729851e-07, "loss": 0.005, "reward": 1.5418643951416016, "reward_std": 0.11117121577262878, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5418643653392792, "step": 1432 }, { "completion_length": 92.125, "epoch": 0.9239200515796261, "grad_norm": 17.035001754760742, "kl": 0.1591796875, "learning_rate": 5.38039974210187e-07, "loss": 0.0064, "reward": 1.6256510615348816, "reward_std": 0.15372055768966675, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6256510317325592, "step": 1433 }, { "completion_length": 103.953125, "epoch": 0.9245647969052224, "grad_norm": 10.168933868408203, "kl": 0.1123046875, "learning_rate": 5.377176015473887e-07, "loss": 0.0045, "reward": 1.5418055057525635, "reward_std": 0.09734742343425751, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5418055355548859, "step": 1434 }, { "completion_length": 99.3125, "epoch": 0.9252095422308189, "grad_norm": 10.835103988647461, "kl": 0.1318359375, "learning_rate": 5.373952288845906e-07, "loss": 0.0053, "reward": 1.4514469504356384, "reward_std": 0.1669703871011734, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45144692063331604, "step": 1435 }, { "completion_length": 99.109375, "epoch": 0.9258542875564152, "grad_norm": 21.041080474853516, "kl": 0.145751953125, "learning_rate": 5.370728562217923e-07, "loss": 0.0058, "reward": 1.4748035073280334, "reward_std": 0.06715789437294006, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47480346262454987, "step": 1436 }, { "completion_length": 105.203125, "epoch": 0.9264990328820116, "grad_norm": 7.949612617492676, "kl": 0.1064453125, "learning_rate": 5.367504835589942e-07, "loss": 0.0043, "reward": 1.6560617685317993, "reward_std": 0.06005428917706013, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6560617983341217, "step": 1437 }, { "completion_length": 93.6875, "epoch": 0.927143778207608, "grad_norm": 16.48297882080078, "kl": 0.127685546875, "learning_rate": 5.36428110896196e-07, "loss": 0.0051, "reward": 1.73074871301651, "reward_std": 0.12964390963315964, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7307488024234772, "step": 1438 }, { "completion_length": 100.25, "epoch": 0.9277885235332044, "grad_norm": 12.600117683410645, "kl": 0.120849609375, "learning_rate": 5.361057382333978e-07, "loss": 0.0048, "reward": 1.488064706325531, "reward_std": 0.09356208145618439, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4880646616220474, "step": 1439 }, { "completion_length": 90.109375, "epoch": 0.9284332688588007, "grad_norm": 30.321741104125977, "kl": 0.14501953125, "learning_rate": 5.357833655705995e-07, "loss": 0.0058, "reward": 1.4567986726760864, "reward_std": 0.13803285732865334, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45679864287376404, "step": 1440 }, { "completion_length": 106.015625, "epoch": 0.9290780141843972, "grad_norm": 26.666709899902344, "kl": 0.117431640625, "learning_rate": 5.354609929078015e-07, "loss": 0.0047, "reward": 1.671416163444519, "reward_std": 0.08873732388019562, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6714162230491638, "step": 1441 }, { "completion_length": 95.609375, "epoch": 0.9297227595099935, "grad_norm": 8.158126831054688, "kl": 0.109619140625, "learning_rate": 5.351386202450032e-07, "loss": 0.0044, "reward": 1.531356930732727, "reward_std": 0.12410330772399902, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5313569456338882, "step": 1442 }, { "completion_length": 92.03125, "epoch": 0.9303675048355899, "grad_norm": 13.258638381958008, "kl": 0.14111328125, "learning_rate": 5.34816247582205e-07, "loss": 0.0057, "reward": 1.5752546787261963, "reward_std": 0.11212371289730072, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5752545893192291, "step": 1443 }, { "completion_length": 95.796875, "epoch": 0.9310122501611864, "grad_norm": 9.991436004638672, "kl": 0.120849609375, "learning_rate": 5.344938749194068e-07, "loss": 0.0048, "reward": 1.6488659381866455, "reward_std": 0.13059785589575768, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6644909381866455, "step": 1444 }, { "completion_length": 109.4375, "epoch": 0.9316569954867827, "grad_norm": 19.7529239654541, "kl": 0.159423828125, "learning_rate": 5.341715022566086e-07, "loss": 0.0064, "reward": 1.4868982434272766, "reward_std": 0.17621616274118423, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.486898273229599, "step": 1445 }, { "completion_length": 106.6875, "epoch": 0.9323017408123792, "grad_norm": 9.851184844970703, "kl": 0.118408203125, "learning_rate": 5.338491295938104e-07, "loss": 0.0047, "reward": 1.5718659162521362, "reward_std": 0.12131549417972565, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.571865975856781, "step": 1446 }, { "completion_length": 94.875, "epoch": 0.9329464861379755, "grad_norm": 15.667919158935547, "kl": 0.1357421875, "learning_rate": 5.335267569310123e-07, "loss": 0.0054, "reward": 1.6483543515205383, "reward_std": 0.13322065770626068, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6639794409275055, "step": 1447 }, { "completion_length": 102.671875, "epoch": 0.9335912314635719, "grad_norm": 16.409921646118164, "kl": 0.1494140625, "learning_rate": 5.33204384268214e-07, "loss": 0.006, "reward": 1.6805023550987244, "reward_std": 0.13565493375062943, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.680502325296402, "step": 1448 }, { "completion_length": 103.71875, "epoch": 0.9342359767891683, "grad_norm": 21.049869537353516, "kl": 0.114501953125, "learning_rate": 5.328820116054158e-07, "loss": 0.0046, "reward": 1.5279292464256287, "reward_std": 0.11103372648358345, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5279292613267899, "step": 1449 }, { "completion_length": 98.59375, "epoch": 0.9348807221147647, "grad_norm": 12.341633796691895, "kl": 0.1298828125, "learning_rate": 5.325596389426177e-07, "loss": 0.0052, "reward": 1.686635673046112, "reward_std": 0.09568401798605919, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6866356432437897, "step": 1450 }, { "completion_length": 111.109375, "epoch": 0.935525467440361, "grad_norm": 14.195385932922363, "kl": 0.10986328125, "learning_rate": 5.322372662798195e-07, "loss": 0.0044, "reward": 1.6304091215133667, "reward_std": 0.07046160846948624, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6304091215133667, "step": 1451 }, { "completion_length": 94.78125, "epoch": 0.9361702127659575, "grad_norm": 10.250699996948242, "kl": 0.13037109375, "learning_rate": 5.319148936170212e-07, "loss": 0.0052, "reward": 1.6717593669891357, "reward_std": 0.10181083902716637, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6717594265937805, "step": 1452 }, { "completion_length": 119.796875, "epoch": 0.9368149580915538, "grad_norm": 13.502083778381348, "kl": 0.108154296875, "learning_rate": 5.315925209542231e-07, "loss": 0.0043, "reward": 1.5886726379394531, "reward_std": 0.09807108342647552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5886726975440979, "step": 1453 }, { "completion_length": 97.546875, "epoch": 0.9374597034171502, "grad_norm": 103.66673278808594, "kl": 0.140625, "learning_rate": 5.312701482914249e-07, "loss": 0.0056, "reward": 1.6207711100578308, "reward_std": 0.1276988536119461, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.620771050453186, "step": 1454 }, { "completion_length": 104.0, "epoch": 0.9381044487427466, "grad_norm": 33.64444351196289, "kl": 0.140869140625, "learning_rate": 5.309477756286267e-07, "loss": 0.0056, "reward": 1.3918054699897766, "reward_std": 0.149025097489357, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3918054550886154, "step": 1455 }, { "completion_length": 100.65625, "epoch": 0.938749194068343, "grad_norm": 168.25440979003906, "kl": 0.1552734375, "learning_rate": 5.306254029658284e-07, "loss": 0.0062, "reward": 1.6715111136436462, "reward_std": 0.10104092955589294, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6715111136436462, "step": 1456 }, { "completion_length": 98.59375, "epoch": 0.9393939393939394, "grad_norm": 21.297224044799805, "kl": 0.123046875, "learning_rate": 5.303030303030303e-07, "loss": 0.0049, "reward": 1.5171695351600647, "reward_std": 0.09040552377700806, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5171695351600647, "step": 1457 }, { "completion_length": 106.8125, "epoch": 0.9400386847195358, "grad_norm": 21.35308837890625, "kl": 0.13671875, "learning_rate": 5.29980657640232e-07, "loss": 0.0055, "reward": 1.6648077964782715, "reward_std": 0.16886910796165466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6648077964782715, "step": 1458 }, { "completion_length": 113.234375, "epoch": 0.9406834300451322, "grad_norm": 9.700336456298828, "kl": 0.107666015625, "learning_rate": 5.29658284977434e-07, "loss": 0.0043, "reward": 1.5858876705169678, "reward_std": 0.08786206692457199, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5858877599239349, "step": 1459 }, { "completion_length": 112.28125, "epoch": 0.9413281753707285, "grad_norm": 16.349515914916992, "kl": 0.131591796875, "learning_rate": 5.293359123146357e-07, "loss": 0.0053, "reward": 1.5591378808021545, "reward_std": 0.1534385234117508, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5747629702091217, "step": 1460 }, { "completion_length": 99.0625, "epoch": 0.941972920696325, "grad_norm": 15.572367668151855, "kl": 0.1181640625, "learning_rate": 5.290135396518375e-07, "loss": 0.0047, "reward": 1.561755359172821, "reward_std": 0.04094194434583187, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5617553442716599, "step": 1461 }, { "completion_length": 109.59375, "epoch": 0.9426176660219213, "grad_norm": 6.7918524742126465, "kl": 0.121826171875, "learning_rate": 5.286911669890392e-07, "loss": 0.0049, "reward": 1.5657913088798523, "reward_std": 0.059127068147063255, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5657913386821747, "step": 1462 }, { "completion_length": 121.484375, "epoch": 0.9432624113475178, "grad_norm": 12.832409858703613, "kl": 0.13818359375, "learning_rate": 5.283687943262412e-07, "loss": 0.0055, "reward": 1.4953020811080933, "reward_std": 0.10412278026342392, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49530209600925446, "step": 1463 }, { "completion_length": 101.125, "epoch": 0.9439071566731141, "grad_norm": 19.80808448791504, "kl": 0.1328125, "learning_rate": 5.280464216634429e-07, "loss": 0.0053, "reward": 1.6220327019691467, "reward_std": 0.16693954914808273, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6376576721668243, "step": 1464 }, { "completion_length": 99.515625, "epoch": 0.9445519019987105, "grad_norm": 26.701688766479492, "kl": 0.122314453125, "learning_rate": 5.277240490006447e-07, "loss": 0.0049, "reward": 1.6026424169540405, "reward_std": 0.12471824884414673, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6026423424482346, "step": 1465 }, { "completion_length": 96.65625, "epoch": 0.9451966473243069, "grad_norm": 17.756526947021484, "kl": 0.142578125, "learning_rate": 5.274016763378465e-07, "loss": 0.0057, "reward": 1.597671627998352, "reward_std": 0.1306580062955618, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5976717472076416, "step": 1466 }, { "completion_length": 97.84375, "epoch": 0.9458413926499033, "grad_norm": 35.332550048828125, "kl": 0.1396484375, "learning_rate": 5.270793036750484e-07, "loss": 0.0056, "reward": 1.5732786655426025, "reward_std": 0.08914842084050179, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.573278620839119, "step": 1467 }, { "completion_length": 106.5625, "epoch": 0.9464861379754996, "grad_norm": 10.186761856079102, "kl": 0.11083984375, "learning_rate": 5.267569310122501e-07, "loss": 0.0044, "reward": 1.7234991788864136, "reward_std": 0.08161475323140621, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7234991490840912, "step": 1468 }, { "completion_length": 104.578125, "epoch": 0.9471308833010961, "grad_norm": 11.844374656677246, "kl": 0.1640625, "learning_rate": 5.26434558349452e-07, "loss": 0.0066, "reward": 1.7355561256408691, "reward_std": 0.07350215315818787, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7355562448501587, "step": 1469 }, { "completion_length": 100.3125, "epoch": 0.9477756286266924, "grad_norm": 14.433293342590332, "kl": 0.12646484375, "learning_rate": 5.261121856866537e-07, "loss": 0.0051, "reward": 1.7054158449172974, "reward_std": 0.09924307838082314, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7054157853126526, "step": 1470 }, { "completion_length": 104.078125, "epoch": 0.9484203739522888, "grad_norm": 13.400076866149902, "kl": 0.1201171875, "learning_rate": 5.257898130238555e-07, "loss": 0.0048, "reward": 1.6034408211708069, "reward_std": 0.09964192658662796, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6034408509731293, "step": 1471 }, { "completion_length": 105.53125, "epoch": 0.9490651192778853, "grad_norm": 11.762677192687988, "kl": 0.127197265625, "learning_rate": 5.254674403610574e-07, "loss": 0.0051, "reward": 1.5714890956878662, "reward_std": 0.1405608281493187, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5871140956878662, "step": 1472 }, { "completion_length": 113.65625, "epoch": 0.9497098646034816, "grad_norm": 8.321244239807129, "kl": 0.11962890625, "learning_rate": 5.251450676982592e-07, "loss": 0.0048, "reward": 1.5024237632751465, "reward_std": 0.10825404524803162, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5024236738681793, "step": 1473 }, { "completion_length": 109.03125, "epoch": 0.950354609929078, "grad_norm": 21.794754028320312, "kl": 0.1279296875, "learning_rate": 5.248226950354609e-07, "loss": 0.0051, "reward": 1.5815457105636597, "reward_std": 0.16428036987781525, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5815458297729492, "step": 1474 }, { "completion_length": 102.28125, "epoch": 0.9509993552546744, "grad_norm": 15.166093826293945, "kl": 0.137939453125, "learning_rate": 5.245003223726628e-07, "loss": 0.0055, "reward": 1.696491301059723, "reward_std": 0.08702771738171577, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6964913010597229, "step": 1475 }, { "completion_length": 106.84375, "epoch": 0.9516441005802708, "grad_norm": 35.8002815246582, "kl": 0.12548828125, "learning_rate": 5.241779497098646e-07, "loss": 0.005, "reward": 1.6845852136611938, "reward_std": 0.1240113228559494, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6845852732658386, "step": 1476 }, { "completion_length": 107.390625, "epoch": 0.9522888459058672, "grad_norm": 12.195111274719238, "kl": 0.114013671875, "learning_rate": 5.238555770470664e-07, "loss": 0.0046, "reward": 1.5479717254638672, "reward_std": 0.06802619993686676, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5479717254638672, "step": 1477 }, { "completion_length": 109.234375, "epoch": 0.9529335912314636, "grad_norm": 7.733105182647705, "kl": 0.158203125, "learning_rate": 5.235332043842681e-07, "loss": 0.0063, "reward": 1.670875072479248, "reward_std": 0.08894642814993858, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6708749830722809, "step": 1478 }, { "completion_length": 118.703125, "epoch": 0.9535783365570599, "grad_norm": 9.842519760131836, "kl": 0.11865234375, "learning_rate": 5.2321083172147e-07, "loss": 0.0047, "reward": 1.5258830785751343, "reward_std": 0.11915917694568634, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5258831083774567, "step": 1479 }, { "completion_length": 110.640625, "epoch": 0.9542230818826564, "grad_norm": 15.017685890197754, "kl": 0.131103515625, "learning_rate": 5.228884590586718e-07, "loss": 0.0052, "reward": 1.4207090139389038, "reward_std": 0.08460971340537071, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.420709028840065, "step": 1480 }, { "completion_length": 101.015625, "epoch": 0.9548678272082527, "grad_norm": 9.356352806091309, "kl": 0.12255859375, "learning_rate": 5.225660863958736e-07, "loss": 0.0049, "reward": 1.4379160404205322, "reward_std": 0.055083997547626495, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4379161298274994, "step": 1481 }, { "completion_length": 112.875, "epoch": 0.9555125725338491, "grad_norm": 21.275924682617188, "kl": 0.1875, "learning_rate": 5.222437137330754e-07, "loss": 0.0075, "reward": 1.198258638381958, "reward_std": 0.11420409381389618, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.19825860857963562, "step": 1482 }, { "completion_length": 123.0, "epoch": 0.9561573178594455, "grad_norm": 9.146618843078613, "kl": 0.11328125, "learning_rate": 5.219213410702772e-07, "loss": 0.0045, "reward": 1.4889427423477173, "reward_std": 0.08368909172713757, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4889426678419113, "step": 1483 }, { "completion_length": 126.734375, "epoch": 0.9568020631850419, "grad_norm": 14.507353782653809, "kl": 0.114013671875, "learning_rate": 5.215989684074789e-07, "loss": 0.0046, "reward": 1.525374710559845, "reward_std": 0.08460210636258125, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5253747850656509, "step": 1484 }, { "completion_length": 113.0625, "epoch": 0.9574468085106383, "grad_norm": 44.216976165771484, "kl": 0.15185546875, "learning_rate": 5.212765957446809e-07, "loss": 0.0061, "reward": 1.5072972178459167, "reward_std": 0.06448614783585072, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5072971731424332, "step": 1485 }, { "completion_length": 103.25, "epoch": 0.9580915538362347, "grad_norm": 14.714810371398926, "kl": 0.17919921875, "learning_rate": 5.209542230818826e-07, "loss": 0.0072, "reward": 1.5354995727539062, "reward_std": 0.16414112597703934, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5511245578527451, "step": 1486 }, { "completion_length": 117.234375, "epoch": 0.9587362991618311, "grad_norm": 10.766145706176758, "kl": 0.15283203125, "learning_rate": 5.206318504190844e-07, "loss": 0.0061, "reward": 1.5310158729553223, "reward_std": 0.16506896913051605, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5466408729553223, "step": 1487 }, { "completion_length": 120.0, "epoch": 0.9593810444874274, "grad_norm": 11.279646873474121, "kl": 0.127685546875, "learning_rate": 5.203094777562862e-07, "loss": 0.0051, "reward": 1.7490030527114868, "reward_std": 0.0665210410952568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7490030825138092, "step": 1488 }, { "completion_length": 107.875, "epoch": 0.9600257898130239, "grad_norm": 9.737847328186035, "kl": 0.11962890625, "learning_rate": 5.199871050934881e-07, "loss": 0.0048, "reward": 1.6569666862487793, "reward_std": 0.04395459406077862, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6569667458534241, "step": 1489 }, { "completion_length": 115.234375, "epoch": 0.9606705351386202, "grad_norm": 16.854692459106445, "kl": 0.110107421875, "learning_rate": 5.196647324306898e-07, "loss": 0.0044, "reward": 1.7448121309280396, "reward_std": 0.0847182609140873, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7448122203350067, "step": 1490 }, { "completion_length": 105.59375, "epoch": 0.9613152804642167, "grad_norm": 7.6493449211120605, "kl": 0.15185546875, "learning_rate": 5.193423597678917e-07, "loss": 0.0061, "reward": 1.5667211413383484, "reward_std": 0.17857857793569565, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.582346111536026, "step": 1491 }, { "completion_length": 108.21875, "epoch": 0.961960025789813, "grad_norm": 47.15330123901367, "kl": 0.128662109375, "learning_rate": 5.190199871050934e-07, "loss": 0.0052, "reward": 1.6914491057395935, "reward_std": 0.1899641454219818, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7070741355419159, "step": 1492 }, { "completion_length": 128.96875, "epoch": 0.9626047711154094, "grad_norm": 29.482093811035156, "kl": 0.1552734375, "learning_rate": 5.186976144422953e-07, "loss": 0.0062, "reward": 1.5123082399368286, "reward_std": 0.08841293677687645, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5123083293437958, "step": 1493 }, { "completion_length": 114.671875, "epoch": 0.9632495164410058, "grad_norm": 14.688892364501953, "kl": 0.118896484375, "learning_rate": 5.183752417794971e-07, "loss": 0.0048, "reward": 1.7382131814956665, "reward_std": 0.1165858581662178, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7382131516933441, "step": 1494 }, { "completion_length": 128.5, "epoch": 0.9638942617666022, "grad_norm": 9.904383659362793, "kl": 0.122802734375, "learning_rate": 5.180528691166989e-07, "loss": 0.0049, "reward": 1.5775507688522339, "reward_std": 0.09859605878591537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5775507986545563, "step": 1495 }, { "completion_length": 114.125, "epoch": 0.9645390070921985, "grad_norm": 14.388285636901855, "kl": 0.14111328125, "learning_rate": 5.177304964539006e-07, "loss": 0.0056, "reward": 1.7104634642601013, "reward_std": 0.12022716552019119, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7104635238647461, "step": 1496 }, { "completion_length": 111.015625, "epoch": 0.965183752417795, "grad_norm": 21.702617645263672, "kl": 0.12939453125, "learning_rate": 5.174081237911026e-07, "loss": 0.0052, "reward": 1.6979969143867493, "reward_std": 0.152745820581913, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6979968249797821, "step": 1497 }, { "completion_length": 119.84375, "epoch": 0.9658284977433914, "grad_norm": 7.377781391143799, "kl": 0.11865234375, "learning_rate": 5.170857511283043e-07, "loss": 0.0047, "reward": 1.6159717440605164, "reward_std": 0.0942818857729435, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6159717440605164, "step": 1498 }, { "completion_length": 119.109375, "epoch": 0.9664732430689877, "grad_norm": 19.44676399230957, "kl": 0.133056640625, "learning_rate": 5.167633784655061e-07, "loss": 0.0053, "reward": 1.70160710811615, "reward_std": 0.1373962201178074, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7172320783138275, "step": 1499 }, { "completion_length": 106.296875, "epoch": 0.9671179883945842, "grad_norm": 13.977012634277344, "kl": 0.14306640625, "learning_rate": 5.164410058027078e-07, "loss": 0.0057, "reward": 1.6375981569290161, "reward_std": 0.13657869026064873, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6375981867313385, "step": 1500 }, { "completion_length": 117.03125, "epoch": 0.9677627337201805, "grad_norm": 13.271944046020508, "kl": 0.153564453125, "learning_rate": 5.161186331399097e-07, "loss": 0.0061, "reward": 1.5755575895309448, "reward_std": 0.11266742460429668, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5755576193332672, "step": 1501 }, { "completion_length": 116.421875, "epoch": 0.968407479045777, "grad_norm": 19.823734283447266, "kl": 0.135498046875, "learning_rate": 5.157962604771115e-07, "loss": 0.0054, "reward": 1.6668625473976135, "reward_std": 0.09966631233692169, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6668625175952911, "step": 1502 }, { "completion_length": 140.203125, "epoch": 0.9690522243713733, "grad_norm": 16.144290924072266, "kl": 0.119384765625, "learning_rate": 5.154738878143133e-07, "loss": 0.0048, "reward": 1.7107030749320984, "reward_std": 0.08905570209026337, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.710703045129776, "step": 1503 }, { "completion_length": 124.796875, "epoch": 0.9696969696969697, "grad_norm": 12.73034381866455, "kl": 0.11865234375, "learning_rate": 5.151515151515151e-07, "loss": 0.0047, "reward": 1.4738882780075073, "reward_std": 0.09788031131029129, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4738882780075073, "step": 1504 }, { "completion_length": 124.515625, "epoch": 0.970341715022566, "grad_norm": 13.489100456237793, "kl": 0.1337890625, "learning_rate": 5.148291424887169e-07, "loss": 0.0053, "reward": 1.5239415764808655, "reward_std": 0.07494577765464783, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5239415764808655, "step": 1505 }, { "completion_length": 127.6875, "epoch": 0.9709864603481625, "grad_norm": 19.454742431640625, "kl": 0.11865234375, "learning_rate": 5.145067698259187e-07, "loss": 0.0048, "reward": 1.414416253566742, "reward_std": 0.1128045842051506, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41441625356674194, "step": 1506 }, { "completion_length": 124.640625, "epoch": 0.9716312056737588, "grad_norm": 9.694129943847656, "kl": 0.11767578125, "learning_rate": 5.141843971631206e-07, "loss": 0.0047, "reward": 1.482945740222931, "reward_std": 0.1253538653254509, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4985707104206085, "step": 1507 }, { "completion_length": 139.828125, "epoch": 0.9722759509993553, "grad_norm": 52.3492546081543, "kl": 0.12548828125, "learning_rate": 5.138620245003223e-07, "loss": 0.005, "reward": 1.6624096035957336, "reward_std": 0.13141599297523499, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6624096035957336, "step": 1508 }, { "completion_length": 132.140625, "epoch": 0.9729206963249516, "grad_norm": 19.669851303100586, "kl": 0.10986328125, "learning_rate": 5.135396518375241e-07, "loss": 0.0044, "reward": 1.52760249376297, "reward_std": 0.14120056480169296, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5276025533676147, "step": 1509 }, { "completion_length": 133.453125, "epoch": 0.973565441650548, "grad_norm": 11.811445236206055, "kl": 0.101806640625, "learning_rate": 5.13217279174726e-07, "loss": 0.0041, "reward": 1.542509913444519, "reward_std": 0.09525493159890175, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5425098836421967, "step": 1510 }, { "completion_length": 130.0625, "epoch": 0.9742101869761445, "grad_norm": 14.556682586669922, "kl": 0.107666015625, "learning_rate": 5.128949065119278e-07, "loss": 0.0043, "reward": 1.622962474822998, "reward_std": 0.1552848145365715, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6385875046253204, "step": 1511 }, { "completion_length": 114.84375, "epoch": 0.9748549323017408, "grad_norm": 9.852642059326172, "kl": 0.12158203125, "learning_rate": 5.125725338491295e-07, "loss": 0.0049, "reward": 1.5891509652137756, "reward_std": 0.1219281405210495, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6047759354114532, "step": 1512 }, { "completion_length": 141.40625, "epoch": 0.9754996776273372, "grad_norm": 11.842811584472656, "kl": 0.109375, "learning_rate": 5.122501611863314e-07, "loss": 0.0044, "reward": 1.6655017733573914, "reward_std": 0.11256514489650726, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6655017733573914, "step": 1513 }, { "completion_length": 118.015625, "epoch": 0.9761444229529336, "grad_norm": 27.52327537536621, "kl": 0.114501953125, "learning_rate": 5.119277885235331e-07, "loss": 0.0046, "reward": 1.5326758027076721, "reward_std": 0.1902296021580696, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5326757878065109, "step": 1514 }, { "completion_length": 116.375, "epoch": 0.97678916827853, "grad_norm": 16.492992401123047, "kl": 0.1318359375, "learning_rate": 5.11605415860735e-07, "loss": 0.0053, "reward": 1.4814838767051697, "reward_std": 0.12518265470862389, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4814838469028473, "step": 1515 }, { "completion_length": 119.140625, "epoch": 0.9774339136041263, "grad_norm": 12.996352195739746, "kl": 0.1376953125, "learning_rate": 5.112830431979368e-07, "loss": 0.0055, "reward": 1.533171832561493, "reward_std": 0.13848894834518433, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5331718623638153, "step": 1516 }, { "completion_length": 122.671875, "epoch": 0.9780786589297228, "grad_norm": 9.61275863647461, "kl": 0.114990234375, "learning_rate": 5.109606705351386e-07, "loss": 0.0046, "reward": 1.5685065388679504, "reward_std": 0.08499204367399216, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5685064792633057, "step": 1517 }, { "completion_length": 128.046875, "epoch": 0.9787234042553191, "grad_norm": 16.09702491760254, "kl": 0.12451171875, "learning_rate": 5.106382978723403e-07, "loss": 0.005, "reward": 1.4926301836967468, "reward_std": 0.07699324190616608, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4926301836967468, "step": 1518 }, { "completion_length": 127.15625, "epoch": 0.9793681495809156, "grad_norm": 17.23007583618164, "kl": 0.111083984375, "learning_rate": 5.103159252095423e-07, "loss": 0.0044, "reward": 1.5509237051010132, "reward_std": 0.12189146503806114, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.550923690199852, "step": 1519 }, { "completion_length": 117.90625, "epoch": 0.9800128949065119, "grad_norm": 20.01177406311035, "kl": 0.12646484375, "learning_rate": 5.09993552546744e-07, "loss": 0.0051, "reward": 1.551382064819336, "reward_std": 0.10219447687268257, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5513819754123688, "step": 1520 }, { "completion_length": 116.5, "epoch": 0.9806576402321083, "grad_norm": 33.8641357421875, "kl": 0.13916015625, "learning_rate": 5.096711798839458e-07, "loss": 0.0056, "reward": 1.624679148197174, "reward_std": 0.1435355320572853, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6246791183948517, "step": 1521 }, { "completion_length": 112.9375, "epoch": 0.9813023855577047, "grad_norm": 10.071717262268066, "kl": 0.123779296875, "learning_rate": 5.093488072211475e-07, "loss": 0.0049, "reward": 1.633798360824585, "reward_std": 0.0972403883934021, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6337983310222626, "step": 1522 }, { "completion_length": 128.484375, "epoch": 0.9819471308833011, "grad_norm": 9.838187217712402, "kl": 0.2041015625, "learning_rate": 5.090264345583495e-07, "loss": 0.0082, "reward": 1.6361613273620605, "reward_std": 0.18265075236558914, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6517862677574158, "step": 1523 }, { "completion_length": 108.71875, "epoch": 0.9825918762088974, "grad_norm": 15.268589973449707, "kl": 0.121337890625, "learning_rate": 5.087040618955512e-07, "loss": 0.0049, "reward": 1.5366319417953491, "reward_std": 0.13059881143271923, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5522569417953491, "step": 1524 }, { "completion_length": 120.765625, "epoch": 0.9832366215344939, "grad_norm": 13.799566268920898, "kl": 0.12744140625, "learning_rate": 5.08381689232753e-07, "loss": 0.0051, "reward": 1.5936528444290161, "reward_std": 0.16167183220386505, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5936528742313385, "step": 1525 }, { "completion_length": 123.09375, "epoch": 0.9838813668600903, "grad_norm": 9.85830020904541, "kl": 0.169921875, "learning_rate": 5.080593165699548e-07, "loss": 0.0068, "reward": 1.6045432686805725, "reward_std": 0.1380324363708496, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6201682984828949, "step": 1526 }, { "completion_length": 119.78125, "epoch": 0.9845261121856866, "grad_norm": 41.4371452331543, "kl": 0.14453125, "learning_rate": 5.077369439071566e-07, "loss": 0.0058, "reward": 1.6698803901672363, "reward_std": 0.10253677517175674, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6698803603649139, "step": 1527 }, { "completion_length": 133.8125, "epoch": 0.9851708575112831, "grad_norm": 8.188080787658691, "kl": 0.179443359375, "learning_rate": 5.074145712443584e-07, "loss": 0.0072, "reward": 1.559647560119629, "reward_std": 0.09752954542636871, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5596475452184677, "step": 1528 }, { "completion_length": 118.21875, "epoch": 0.9858156028368794, "grad_norm": 15.319279670715332, "kl": 0.115478515625, "learning_rate": 5.070921985815603e-07, "loss": 0.0046, "reward": 1.651024043560028, "reward_std": 0.0649954117834568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6510239839553833, "step": 1529 }, { "completion_length": 116.1875, "epoch": 0.9864603481624759, "grad_norm": 14.552484512329102, "kl": 0.1435546875, "learning_rate": 5.06769825918762e-07, "loss": 0.0057, "reward": 1.630611777305603, "reward_std": 0.1458933800458908, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6306118071079254, "step": 1530 }, { "completion_length": 109.734375, "epoch": 0.9871050934880722, "grad_norm": 18.779993057250977, "kl": 0.104248046875, "learning_rate": 5.064474532559638e-07, "loss": 0.0042, "reward": 1.6451267004013062, "reward_std": 0.06406429223716259, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6451267302036285, "step": 1531 }, { "completion_length": 114.484375, "epoch": 0.9877498388136686, "grad_norm": 12.041047096252441, "kl": 0.11669921875, "learning_rate": 5.061250805931657e-07, "loss": 0.0047, "reward": 1.5772784948349, "reward_std": 0.07114759460091591, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5772784948348999, "step": 1532 }, { "completion_length": 99.15625, "epoch": 0.988394584139265, "grad_norm": 11.344866752624512, "kl": 0.1328125, "learning_rate": 5.058027079303675e-07, "loss": 0.0053, "reward": 1.7052232027053833, "reward_std": 0.12659567594528198, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7052232325077057, "step": 1533 }, { "completion_length": 128.125, "epoch": 0.9890393294648614, "grad_norm": 7.520096302032471, "kl": 0.111328125, "learning_rate": 5.054803352675692e-07, "loss": 0.0045, "reward": 1.6211850047111511, "reward_std": 0.07714429125189781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6211849749088287, "step": 1534 }, { "completion_length": 114.390625, "epoch": 0.9896840747904577, "grad_norm": 14.49476432800293, "kl": 0.10791015625, "learning_rate": 5.051579626047711e-07, "loss": 0.0043, "reward": 1.516048014163971, "reward_std": 0.07544976845383644, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5160480439662933, "step": 1535 }, { "completion_length": 99.28125, "epoch": 0.9903288201160542, "grad_norm": 16.49233627319336, "kl": 0.1298828125, "learning_rate": 5.048355899419729e-07, "loss": 0.0052, "reward": 1.582204818725586, "reward_std": 0.08421331271529198, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5822047889232635, "step": 1536 }, { "completion_length": 111.703125, "epoch": 0.9909735654416505, "grad_norm": 16.815492630004883, "kl": 0.1298828125, "learning_rate": 5.045132172791747e-07, "loss": 0.0052, "reward": 1.440306544303894, "reward_std": 0.17948133870959282, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.45593155920505524, "step": 1537 }, { "completion_length": 100.984375, "epoch": 0.9916183107672469, "grad_norm": 22.465801239013672, "kl": 0.11767578125, "learning_rate": 5.041908446163765e-07, "loss": 0.0047, "reward": 1.5949344038963318, "reward_std": 0.09736866503953934, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5949344038963318, "step": 1538 }, { "completion_length": 108.1875, "epoch": 0.9922630560928434, "grad_norm": 21.445663452148438, "kl": 0.117919921875, "learning_rate": 5.038684719535783e-07, "loss": 0.0047, "reward": 1.6617515087127686, "reward_std": 0.1481238529086113, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6617515981197357, "step": 1539 }, { "completion_length": 122.078125, "epoch": 0.9929078014184397, "grad_norm": 11.333296775817871, "kl": 0.105712890625, "learning_rate": 5.0354609929078e-07, "loss": 0.0042, "reward": 1.57760488986969, "reward_std": 0.08707677200436592, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5776048898696899, "step": 1540 }, { "completion_length": 99.1875, "epoch": 0.9935525467440361, "grad_norm": 19.648611068725586, "kl": 0.135986328125, "learning_rate": 5.03223726627982e-07, "loss": 0.0054, "reward": 1.6852267384529114, "reward_std": 0.14017397910356522, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7008517682552338, "step": 1541 }, { "completion_length": 94.828125, "epoch": 0.9941972920696325, "grad_norm": 29.060779571533203, "kl": 0.115966796875, "learning_rate": 5.029013539651837e-07, "loss": 0.0046, "reward": 1.6200896501541138, "reward_std": 0.0910437311977148, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6200897097587585, "step": 1542 }, { "completion_length": 113.28125, "epoch": 0.9948420373952289, "grad_norm": 10.805424690246582, "kl": 0.112060546875, "learning_rate": 5.025789813023855e-07, "loss": 0.0045, "reward": 1.5506914258003235, "reward_std": 0.083372563123703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5506914258003235, "step": 1543 }, { "completion_length": 105.953125, "epoch": 0.9954867827208252, "grad_norm": 46.60286331176758, "kl": 0.10888671875, "learning_rate": 5.022566086395872e-07, "loss": 0.0044, "reward": 1.71124666929245, "reward_std": 0.07887070626020432, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7112466394901276, "step": 1544 }, { "completion_length": 99.28125, "epoch": 0.9961315280464217, "grad_norm": 15.590272903442383, "kl": 0.13134765625, "learning_rate": 5.019342359767892e-07, "loss": 0.0052, "reward": 1.6452555656433105, "reward_std": 0.08676682412624359, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6452555060386658, "step": 1545 }, { "completion_length": 97.859375, "epoch": 0.996776273372018, "grad_norm": 25.825660705566406, "kl": 0.13134765625, "learning_rate": 5.016118633139909e-07, "loss": 0.0053, "reward": 1.582628071308136, "reward_std": 0.11850284785032272, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5982531011104584, "step": 1546 }, { "completion_length": 100.21875, "epoch": 0.9974210186976145, "grad_norm": 12.175593376159668, "kl": 0.12060546875, "learning_rate": 5.012894906511927e-07, "loss": 0.0048, "reward": 1.489000380039215, "reward_std": 0.10180428251624107, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4890003502368927, "step": 1547 }, { "completion_length": 113.71875, "epoch": 0.9980657640232108, "grad_norm": 10.186779975891113, "kl": 0.12109375, "learning_rate": 5.009671179883945e-07, "loss": 0.0048, "reward": 1.687839686870575, "reward_std": 0.0874037854373455, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.687839686870575, "step": 1548 }, { "completion_length": 105.109375, "epoch": 0.9987105093488072, "grad_norm": 15.987135887145996, "kl": 0.12060546875, "learning_rate": 5.006447453255964e-07, "loss": 0.0048, "reward": 1.6305145025253296, "reward_std": 0.05935047194361687, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.630514532327652, "step": 1549 }, { "completion_length": 113.21875, "epoch": 0.9993552546744036, "grad_norm": 18.414926528930664, "kl": 0.117919921875, "learning_rate": 5.003223726627982e-07, "loss": 0.0047, "reward": 1.5300384759902954, "reward_std": 0.06420031562447548, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5300384759902954, "step": 1550 }, { "completion_length": 98.15625, "epoch": 1.0, "grad_norm": 21.30752944946289, "kl": 0.118408203125, "learning_rate": 5e-07, "loss": 0.0047, "reward": 1.5693151354789734, "reward_std": 0.13229862600564957, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5693151652812958, "step": 1551 }, { "completion_length": 125.671875, "epoch": 1.0006447453255964, "grad_norm": 14.886545181274414, "kl": 0.1435546875, "learning_rate": 4.996776273372017e-07, "loss": 0.0058, "reward": 1.620706021785736, "reward_std": 0.07777786627411842, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6207059621810913, "step": 1552 }, { "completion_length": 105.84375, "epoch": 1.0012894906511929, "grad_norm": 31.582460403442383, "kl": 0.107421875, "learning_rate": 4.993552546744036e-07, "loss": 0.0043, "reward": 1.5983824729919434, "reward_std": 0.10737652331590652, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.598382443189621, "step": 1553 }, { "completion_length": 109.546875, "epoch": 1.001934235976789, "grad_norm": 19.191818237304688, "kl": 0.12451171875, "learning_rate": 4.990328820116054e-07, "loss": 0.005, "reward": 1.5432923436164856, "reward_std": 0.14936043322086334, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.543292373418808, "step": 1554 }, { "completion_length": 110.75, "epoch": 1.0025789813023855, "grad_norm": 8.744702339172363, "kl": 0.110595703125, "learning_rate": 4.987105093488072e-07, "loss": 0.0044, "reward": 1.377943515777588, "reward_std": 0.046197013929486275, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3779434710741043, "step": 1555 }, { "completion_length": 119.171875, "epoch": 1.003223726627982, "grad_norm": 18.153520584106445, "kl": 0.123046875, "learning_rate": 4.98388136686009e-07, "loss": 0.0049, "reward": 1.532510221004486, "reward_std": 0.13577953726053238, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5325101613998413, "step": 1556 }, { "completion_length": 118.078125, "epoch": 1.0038684719535784, "grad_norm": 5.668532848358154, "kl": 0.1162109375, "learning_rate": 4.980657640232109e-07, "loss": 0.0046, "reward": 1.4012870788574219, "reward_std": 0.05575377307832241, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4012870639562607, "step": 1557 }, { "completion_length": 118.96875, "epoch": 1.0045132172791746, "grad_norm": 21.61020851135254, "kl": 0.12060546875, "learning_rate": 4.977433913604126e-07, "loss": 0.0048, "reward": 1.5471839904785156, "reward_std": 0.12884113937616348, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5471839904785156, "step": 1558 }, { "completion_length": 119.75, "epoch": 1.005157962604771, "grad_norm": 11.653385162353516, "kl": 0.24658203125, "learning_rate": 4.974210186976144e-07, "loss": 0.0099, "reward": 1.6526493430137634, "reward_std": 0.05117896664887667, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.652649313211441, "step": 1559 }, { "completion_length": 115.625, "epoch": 1.0058027079303675, "grad_norm": 14.82691478729248, "kl": 0.1259765625, "learning_rate": 4.970986460348163e-07, "loss": 0.005, "reward": 1.6142604351043701, "reward_std": 0.10050322860479355, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6142604947090149, "step": 1560 }, { "completion_length": 111.09375, "epoch": 1.006447453255964, "grad_norm": 8.875225067138672, "kl": 0.11376953125, "learning_rate": 4.967762733720181e-07, "loss": 0.0046, "reward": 1.7486295104026794, "reward_std": 0.08543649315834045, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7486294507980347, "step": 1561 }, { "completion_length": 123.1875, "epoch": 1.0070921985815602, "grad_norm": 15.650257110595703, "kl": 0.12060546875, "learning_rate": 4.964539007092198e-07, "loss": 0.0048, "reward": 1.564130961894989, "reward_std": 0.09400678798556328, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.564130961894989, "step": 1562 }, { "completion_length": 112.328125, "epoch": 1.0077369439071566, "grad_norm": 17.488706588745117, "kl": 0.117431640625, "learning_rate": 4.961315280464217e-07, "loss": 0.0047, "reward": 1.5162153840065002, "reward_std": 0.1196032203733921, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5162154138088226, "step": 1563 }, { "completion_length": 110.8125, "epoch": 1.008381689232753, "grad_norm": 9.31165599822998, "kl": 0.11279296875, "learning_rate": 4.958091553836235e-07, "loss": 0.0045, "reward": 1.625089704990387, "reward_std": 0.08436309918761253, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6250897943973541, "step": 1564 }, { "completion_length": 118.46875, "epoch": 1.0090264345583495, "grad_norm": 44.249385833740234, "kl": 0.093505859375, "learning_rate": 4.954867827208252e-07, "loss": 0.0037, "reward": 1.625906765460968, "reward_std": 0.07646162062883377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6259067952632904, "step": 1565 }, { "completion_length": 110.71875, "epoch": 1.009671179883946, "grad_norm": 17.5499267578125, "kl": 0.134765625, "learning_rate": 4.951644100580271e-07, "loss": 0.0054, "reward": 1.7371680736541748, "reward_std": 0.11888208240270615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7371681034564972, "step": 1566 }, { "completion_length": 107.765625, "epoch": 1.0103159252095422, "grad_norm": 20.713336944580078, "kl": 0.13818359375, "learning_rate": 4.948420373952289e-07, "loss": 0.0055, "reward": 1.4837321639060974, "reward_std": 0.0846199318766594, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4837321788072586, "step": 1567 }, { "completion_length": 107.328125, "epoch": 1.0109606705351386, "grad_norm": 23.845182418823242, "kl": 0.12060546875, "learning_rate": 4.945196647324307e-07, "loss": 0.0048, "reward": 1.6508268117904663, "reward_std": 0.06122574955224991, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6508268117904663, "step": 1568 }, { "completion_length": 125.0625, "epoch": 1.011605415860735, "grad_norm": 9.649888038635254, "kl": 0.10986328125, "learning_rate": 4.941972920696324e-07, "loss": 0.0044, "reward": 1.5454849004745483, "reward_std": 0.15033403038978577, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5611098408699036, "step": 1569 }, { "completion_length": 117.015625, "epoch": 1.0122501611863315, "grad_norm": 14.543222427368164, "kl": 0.126220703125, "learning_rate": 4.938749194068343e-07, "loss": 0.0051, "reward": 1.6417489647865295, "reward_std": 0.06025063619017601, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6417489647865295, "step": 1570 }, { "completion_length": 104.578125, "epoch": 1.0128949065119277, "grad_norm": 20.035690307617188, "kl": 0.17822265625, "learning_rate": 4.935525467440361e-07, "loss": 0.0071, "reward": 1.7167410850524902, "reward_std": 0.13318730890750885, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7167410850524902, "step": 1571 }, { "completion_length": 116.296875, "epoch": 1.0135396518375241, "grad_norm": 7.308071136474609, "kl": 0.13427734375, "learning_rate": 4.93230174081238e-07, "loss": 0.0054, "reward": 1.4862470030784607, "reward_std": 0.11065686121582985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4862470328807831, "step": 1572 }, { "completion_length": 107.84375, "epoch": 1.0141843971631206, "grad_norm": 9.348837852478027, "kl": 0.1064453125, "learning_rate": 4.929078014184397e-07, "loss": 0.0043, "reward": 1.4404972791671753, "reward_std": 0.08681789599359035, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4404972344636917, "step": 1573 }, { "completion_length": 108.046875, "epoch": 1.014829142488717, "grad_norm": 40.28298568725586, "kl": 0.13916015625, "learning_rate": 4.925854287556415e-07, "loss": 0.0056, "reward": 1.7261015176773071, "reward_std": 0.11216892302036285, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7261015474796295, "step": 1574 }, { "completion_length": 111.609375, "epoch": 1.0154738878143132, "grad_norm": 30.62251853942871, "kl": 0.11474609375, "learning_rate": 4.922630560928433e-07, "loss": 0.0046, "reward": 1.5981187224388123, "reward_std": 0.1484297215938568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5981186926364899, "step": 1575 }, { "completion_length": 107.96875, "epoch": 1.0161186331399097, "grad_norm": 14.246084213256836, "kl": 0.128662109375, "learning_rate": 4.919406834300452e-07, "loss": 0.0051, "reward": 1.5964959263801575, "reward_std": 0.09668882563710213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5964959263801575, "step": 1576 }, { "completion_length": 110.015625, "epoch": 1.0167633784655061, "grad_norm": 45.870033264160156, "kl": 0.10986328125, "learning_rate": 4.916183107672469e-07, "loss": 0.0044, "reward": 1.6973443031311035, "reward_std": 0.06511173956096172, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6973443031311035, "step": 1577 }, { "completion_length": 115.78125, "epoch": 1.0174081237911026, "grad_norm": 23.571115493774414, "kl": 0.132568359375, "learning_rate": 4.912959381044487e-07, "loss": 0.0053, "reward": 1.6813059449195862, "reward_std": 0.1136469803750515, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6813060343265533, "step": 1578 }, { "completion_length": 99.984375, "epoch": 1.018052869116699, "grad_norm": 42.18523025512695, "kl": 0.12353515625, "learning_rate": 4.909735654416506e-07, "loss": 0.0049, "reward": 1.5764471292495728, "reward_std": 0.10761763527989388, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5764471590518951, "step": 1579 }, { "completion_length": 113.234375, "epoch": 1.0186976144422952, "grad_norm": 37.51557159423828, "kl": 0.106201171875, "learning_rate": 4.906511927788523e-07, "loss": 0.0042, "reward": 1.6667637825012207, "reward_std": 0.12272565066814423, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6667637825012207, "step": 1580 }, { "completion_length": 104.4375, "epoch": 1.0193423597678917, "grad_norm": 10.304484367370605, "kl": 0.12060546875, "learning_rate": 4.903288201160541e-07, "loss": 0.0048, "reward": 1.5135563611984253, "reward_std": 0.08310433477163315, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5135563760995865, "step": 1581 }, { "completion_length": 106.28125, "epoch": 1.0199871050934881, "grad_norm": 17.609773635864258, "kl": 0.1142578125, "learning_rate": 4.90006447453256e-07, "loss": 0.0046, "reward": 1.6618885397911072, "reward_std": 0.07358003780245781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6618885397911072, "step": 1582 }, { "completion_length": 95.5, "epoch": 1.0206318504190846, "grad_norm": 39.74180221557617, "kl": 0.4404296875, "learning_rate": 4.896840747904578e-07, "loss": 0.0176, "reward": 1.6940548419952393, "reward_std": 0.17156479507684708, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6940547823905945, "step": 1583 }, { "completion_length": 103.96875, "epoch": 1.0212765957446808, "grad_norm": 20.01495933532715, "kl": 0.120361328125, "learning_rate": 4.893617021276595e-07, "loss": 0.0048, "reward": 1.537524700164795, "reward_std": 0.08335036598145962, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5375246703624725, "step": 1584 }, { "completion_length": 101.890625, "epoch": 1.0219213410702772, "grad_norm": 14.507052421569824, "kl": 0.099609375, "learning_rate": 4.890393294648614e-07, "loss": 0.004, "reward": 1.5363896489143372, "reward_std": 0.060725823044776917, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5363896787166595, "step": 1585 }, { "completion_length": 114.59375, "epoch": 1.0225660863958737, "grad_norm": 22.21466827392578, "kl": 0.0986328125, "learning_rate": 4.887169568020632e-07, "loss": 0.0039, "reward": 1.6946098804473877, "reward_std": 0.12630461901426315, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6946098208427429, "step": 1586 }, { "completion_length": 102.25, "epoch": 1.02321083172147, "grad_norm": 11.273469924926758, "kl": 0.1396484375, "learning_rate": 4.88394584139265e-07, "loss": 0.0056, "reward": 1.5050395131111145, "reward_std": 0.1273115649819374, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.505039632320404, "step": 1587 }, { "completion_length": 101.15625, "epoch": 1.0238555770470663, "grad_norm": 9.453971862792969, "kl": 0.1181640625, "learning_rate": 4.880722114764668e-07, "loss": 0.0047, "reward": 1.6377010941505432, "reward_std": 0.17548616230487823, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6533261239528656, "step": 1588 }, { "completion_length": 109.53125, "epoch": 1.0245003223726628, "grad_norm": 32.01575469970703, "kl": 0.1201171875, "learning_rate": 4.877498388136686e-07, "loss": 0.0048, "reward": 1.6303631067276, "reward_std": 0.1175847053527832, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6303631365299225, "step": 1589 }, { "completion_length": 104.15625, "epoch": 1.0251450676982592, "grad_norm": 10.830766677856445, "kl": 0.130126953125, "learning_rate": 4.874274661508704e-07, "loss": 0.0052, "reward": 1.4934070110321045, "reward_std": 0.13720114901661873, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4934070557355881, "step": 1590 }, { "completion_length": 98.375, "epoch": 1.0257898130238556, "grad_norm": 47.61631393432617, "kl": 0.154296875, "learning_rate": 4.871050934880723e-07, "loss": 0.0062, "reward": 1.6362435221672058, "reward_std": 0.13050130754709244, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6362435519695282, "step": 1591 }, { "completion_length": 104.609375, "epoch": 1.026434558349452, "grad_norm": 25.48038101196289, "kl": 0.12060546875, "learning_rate": 4.86782720825274e-07, "loss": 0.0048, "reward": 1.6710693836212158, "reward_std": 0.08275764994323254, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6710693538188934, "step": 1592 }, { "completion_length": 97.640625, "epoch": 1.0270793036750483, "grad_norm": 148.87600708007812, "kl": 0.11767578125, "learning_rate": 4.864603481624758e-07, "loss": 0.0047, "reward": 1.4782089591026306, "reward_std": 0.17569049820303917, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.478208988904953, "step": 1593 }, { "completion_length": 104.65625, "epoch": 1.0277240490006447, "grad_norm": 82.6065444946289, "kl": 0.110595703125, "learning_rate": 4.861379754996777e-07, "loss": 0.0044, "reward": 1.5708330869674683, "reward_std": 0.09587904810905457, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5708329975605011, "step": 1594 }, { "completion_length": 105.703125, "epoch": 1.0283687943262412, "grad_norm": 12.176362991333008, "kl": 0.108642578125, "learning_rate": 4.858156028368794e-07, "loss": 0.0043, "reward": 1.6616841554641724, "reward_std": 0.09205985069274902, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6616840362548828, "step": 1595 }, { "completion_length": 101.5, "epoch": 1.0290135396518376, "grad_norm": 26.574724197387695, "kl": 0.1240234375, "learning_rate": 4.854932301740812e-07, "loss": 0.005, "reward": 1.5319926142692566, "reward_std": 0.07157671824097633, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5319925844669342, "step": 1596 }, { "completion_length": 104.03125, "epoch": 1.0296582849774338, "grad_norm": 27.543840408325195, "kl": 0.125, "learning_rate": 4.85170857511283e-07, "loss": 0.005, "reward": 1.3573514819145203, "reward_std": 0.10726149939000607, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3573514372110367, "step": 1597 }, { "completion_length": 105.25, "epoch": 1.0303030303030303, "grad_norm": 52.3618049621582, "kl": 0.1220703125, "learning_rate": 4.848484848484849e-07, "loss": 0.0049, "reward": 1.6784070134162903, "reward_std": 0.06625668704509735, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6784070134162903, "step": 1598 }, { "completion_length": 101.203125, "epoch": 1.0309477756286267, "grad_norm": 24.95046043395996, "kl": 0.1103515625, "learning_rate": 4.845261121856866e-07, "loss": 0.0044, "reward": 1.5376904606819153, "reward_std": 0.07565680518746376, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5376904159784317, "step": 1599 }, { "completion_length": 95.953125, "epoch": 1.0315925209542232, "grad_norm": 24.625473022460938, "kl": 0.123779296875, "learning_rate": 4.842037395228884e-07, "loss": 0.0049, "reward": 1.6368153095245361, "reward_std": 0.17632479965686798, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6524402499198914, "step": 1600 }, { "completion_length": 113.0, "epoch": 1.0322372662798194, "grad_norm": 23.780305862426758, "kl": 0.120361328125, "learning_rate": 4.838813668600903e-07, "loss": 0.0048, "reward": 1.611727237701416, "reward_std": 0.09991315007209778, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6117272675037384, "step": 1601 }, { "completion_length": 100.484375, "epoch": 1.0328820116054158, "grad_norm": 20.35822296142578, "kl": 0.14794921875, "learning_rate": 4.835589941972921e-07, "loss": 0.0059, "reward": 1.5440884828567505, "reward_std": 0.11685510165989399, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5440884828567505, "step": 1602 }, { "completion_length": 102.609375, "epoch": 1.0335267569310123, "grad_norm": 13.551732063293457, "kl": 0.142333984375, "learning_rate": 4.832366215344938e-07, "loss": 0.0057, "reward": 1.6955845952033997, "reward_std": 0.09659884497523308, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6955845654010773, "step": 1603 }, { "completion_length": 102.703125, "epoch": 1.0341715022566087, "grad_norm": 14.002148628234863, "kl": 0.1396484375, "learning_rate": 4.829142488716957e-07, "loss": 0.0056, "reward": 1.683996319770813, "reward_std": 0.17433501780033112, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6839962899684906, "step": 1604 }, { "completion_length": 110.578125, "epoch": 1.0348162475822051, "grad_norm": 15.583176612854004, "kl": 0.126220703125, "learning_rate": 4.825918762088975e-07, "loss": 0.005, "reward": 1.5995303988456726, "reward_std": 0.052622975781559944, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.599530428647995, "step": 1605 }, { "completion_length": 99.984375, "epoch": 1.0354609929078014, "grad_norm": 15.612828254699707, "kl": 0.1435546875, "learning_rate": 4.822695035460992e-07, "loss": 0.0057, "reward": 1.6585041880607605, "reward_std": 0.11094475910067558, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6585041582584381, "step": 1606 }, { "completion_length": 101.21875, "epoch": 1.0361057382333978, "grad_norm": 24.793502807617188, "kl": 0.13134765625, "learning_rate": 4.819471308833011e-07, "loss": 0.0052, "reward": 1.536777913570404, "reward_std": 0.11932965368032455, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5367779284715652, "step": 1607 }, { "completion_length": 107.5625, "epoch": 1.0367504835589942, "grad_norm": 11.683782577514648, "kl": 0.1220703125, "learning_rate": 4.816247582205029e-07, "loss": 0.0049, "reward": 1.6076741814613342, "reward_std": 0.17864874005317688, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.607674166560173, "step": 1608 }, { "completion_length": 95.421875, "epoch": 1.0373952288845907, "grad_norm": 15.090388298034668, "kl": 0.133056640625, "learning_rate": 4.813023855577047e-07, "loss": 0.0053, "reward": 1.5302485227584839, "reward_std": 0.14502151682972908, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5302484333515167, "step": 1609 }, { "completion_length": 96.53125, "epoch": 1.038039974210187, "grad_norm": 12.423053741455078, "kl": 0.12158203125, "learning_rate": 4.809800128949065e-07, "loss": 0.0049, "reward": 1.6846227645874023, "reward_std": 0.10945509001612663, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6846228241920471, "step": 1610 }, { "completion_length": 105.046875, "epoch": 1.0386847195357833, "grad_norm": 10.02845573425293, "kl": 0.121826171875, "learning_rate": 4.806576402321083e-07, "loss": 0.0049, "reward": 1.727286458015442, "reward_std": 0.054582754150033, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7272864580154419, "step": 1611 }, { "completion_length": 100.0625, "epoch": 1.0393294648613798, "grad_norm": 7.871588230133057, "kl": 0.1083984375, "learning_rate": 4.803352675693101e-07, "loss": 0.0043, "reward": 1.6807933449745178, "reward_std": 0.03430874086916447, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6807933747768402, "step": 1612 }, { "completion_length": 111.375, "epoch": 1.0399742101869762, "grad_norm": 10.092997550964355, "kl": 0.11083984375, "learning_rate": 4.80012894906512e-07, "loss": 0.0044, "reward": 1.5945587754249573, "reward_std": 0.08610989525914192, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6101837754249573, "step": 1613 }, { "completion_length": 104.1875, "epoch": 1.0406189555125724, "grad_norm": 13.337946891784668, "kl": 0.1337890625, "learning_rate": 4.796905222437137e-07, "loss": 0.0054, "reward": 1.5582968592643738, "reward_std": 0.09042531624436378, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5582968294620514, "step": 1614 }, { "completion_length": 101.421875, "epoch": 1.0412637008381689, "grad_norm": 18.421772003173828, "kl": 0.13671875, "learning_rate": 4.793681495809155e-07, "loss": 0.0055, "reward": 1.5865644216537476, "reward_std": 0.07357398793101311, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5865644216537476, "step": 1615 }, { "completion_length": 99.328125, "epoch": 1.0419084461637653, "grad_norm": 8.236613273620605, "kl": 0.140625, "learning_rate": 4.790457769181174e-07, "loss": 0.0056, "reward": 1.5641028881072998, "reward_std": 0.12269283831119537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.564102828502655, "step": 1616 }, { "completion_length": 99.609375, "epoch": 1.0425531914893618, "grad_norm": 12.832371711730957, "kl": 0.1455078125, "learning_rate": 4.787234042553192e-07, "loss": 0.0058, "reward": 1.5059439539909363, "reward_std": 0.07279989495873451, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5059439837932587, "step": 1617 }, { "completion_length": 115.75, "epoch": 1.0431979368149582, "grad_norm": 9.09863567352295, "kl": 0.134765625, "learning_rate": 4.784010315925209e-07, "loss": 0.0054, "reward": 1.5284171104431152, "reward_std": 0.05645672604441643, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5284171551465988, "step": 1618 }, { "completion_length": 99.515625, "epoch": 1.0438426821405544, "grad_norm": 14.850272178649902, "kl": 0.1474609375, "learning_rate": 4.780786589297227e-07, "loss": 0.0059, "reward": 1.6368718147277832, "reward_std": 0.14861361682415009, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6368718147277832, "step": 1619 }, { "completion_length": 100.203125, "epoch": 1.0444874274661509, "grad_norm": 20.048105239868164, "kl": 0.13671875, "learning_rate": 4.777562862669246e-07, "loss": 0.0055, "reward": 1.761348843574524, "reward_std": 0.1145159900188446, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7613488435745239, "step": 1620 }, { "completion_length": 102.875, "epoch": 1.0451321727917473, "grad_norm": 10.379176139831543, "kl": 0.124755859375, "learning_rate": 4.774339136041263e-07, "loss": 0.005, "reward": 1.5658888816833496, "reward_std": 0.08592114225029945, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5658888667821884, "step": 1621 }, { "completion_length": 105.125, "epoch": 1.0457769181173437, "grad_norm": 15.628901481628418, "kl": 0.12841796875, "learning_rate": 4.771115409413281e-07, "loss": 0.0051, "reward": 1.6197215914726257, "reward_std": 0.06863688118755817, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6197216510772705, "step": 1622 }, { "completion_length": 107.625, "epoch": 1.04642166344294, "grad_norm": 11.287335395812988, "kl": 0.11474609375, "learning_rate": 4.767891682785299e-07, "loss": 0.0046, "reward": 1.5502777695655823, "reward_std": 0.13397856429219246, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5502777099609375, "step": 1623 }, { "completion_length": 105.296875, "epoch": 1.0470664087685364, "grad_norm": 52.87205505371094, "kl": 0.13427734375, "learning_rate": 4.7646679561573175e-07, "loss": 0.0054, "reward": 1.6524277925491333, "reward_std": 0.089669119566679, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6524278521537781, "step": 1624 }, { "completion_length": 105.71875, "epoch": 1.0477111540941328, "grad_norm": 8.991185188293457, "kl": 0.11767578125, "learning_rate": 4.7614442295293353e-07, "loss": 0.0047, "reward": 1.8288054466247559, "reward_std": 0.08612105995416641, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.8288054168224335, "step": 1625 }, { "completion_length": 98.5, "epoch": 1.0483558994197293, "grad_norm": 265.6639709472656, "kl": 0.14794921875, "learning_rate": 4.7582205029013537e-07, "loss": 0.0059, "reward": 1.721388816833496, "reward_std": 0.11115417629480362, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7213887572288513, "step": 1626 }, { "completion_length": 94.34375, "epoch": 1.0490006447453255, "grad_norm": 55.32572555541992, "kl": 0.150390625, "learning_rate": 4.7549967762733715e-07, "loss": 0.006, "reward": 1.7054846286773682, "reward_std": 0.06372135505080223, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7054846584796906, "step": 1627 }, { "completion_length": 107.578125, "epoch": 1.049645390070922, "grad_norm": 20.151453018188477, "kl": 0.133544921875, "learning_rate": 4.75177304964539e-07, "loss": 0.0053, "reward": 1.766117811203003, "reward_std": 0.08813095837831497, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7661179304122925, "step": 1628 }, { "completion_length": 106.671875, "epoch": 1.0502901353965184, "grad_norm": 25.65150260925293, "kl": 0.12939453125, "learning_rate": 4.7485493230174076e-07, "loss": 0.0052, "reward": 1.480661690235138, "reward_std": 0.07892773300409317, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48066161572933197, "step": 1629 }, { "completion_length": 102.328125, "epoch": 1.0509348807221148, "grad_norm": 42.69692611694336, "kl": 0.13232421875, "learning_rate": 4.745325596389426e-07, "loss": 0.0053, "reward": 1.357925534248352, "reward_std": 0.07967722043395042, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.35792553424835205, "step": 1630 }, { "completion_length": 91.40625, "epoch": 1.051579626047711, "grad_norm": 24.92087173461914, "kl": 0.130859375, "learning_rate": 4.742101869761444e-07, "loss": 0.0052, "reward": 1.6058239936828613, "reward_std": 0.12468841299414635, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6058240830898285, "step": 1631 }, { "completion_length": 108.375, "epoch": 1.0522243713733075, "grad_norm": 18.37355613708496, "kl": 0.1162109375, "learning_rate": 4.7388781431334616e-07, "loss": 0.0047, "reward": 1.6409690976142883, "reward_std": 0.07419593259692192, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6409690976142883, "step": 1632 }, { "completion_length": 102.71875, "epoch": 1.052869116698904, "grad_norm": 13.083841323852539, "kl": 0.1337890625, "learning_rate": 4.73565441650548e-07, "loss": 0.0054, "reward": 1.5053048133850098, "reward_std": 0.099861279129982, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.520929828286171, "step": 1633 }, { "completion_length": 102.90625, "epoch": 1.0535138620245004, "grad_norm": 11.976901054382324, "kl": 0.141845703125, "learning_rate": 4.7324306898774977e-07, "loss": 0.0057, "reward": 1.6774804592132568, "reward_std": 0.11440124735236168, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6774804592132568, "step": 1634 }, { "completion_length": 114.65625, "epoch": 1.0541586073500968, "grad_norm": 13.442267417907715, "kl": 0.142578125, "learning_rate": 4.729206963249516e-07, "loss": 0.0057, "reward": 1.5921526551246643, "reward_std": 0.07790956273674965, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5921527147293091, "step": 1635 }, { "completion_length": 114.875, "epoch": 1.054803352675693, "grad_norm": 15.106107711791992, "kl": 0.13720703125, "learning_rate": 4.725983236621534e-07, "loss": 0.0055, "reward": 1.6467521786689758, "reward_std": 0.09242963790893555, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6467522084712982, "step": 1636 }, { "completion_length": 104.703125, "epoch": 1.0554480980012895, "grad_norm": 14.148865699768066, "kl": 0.114013671875, "learning_rate": 4.7227595099935527e-07, "loss": 0.0046, "reward": 1.7142847776412964, "reward_std": 0.08839049190282822, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7142847776412964, "step": 1637 }, { "completion_length": 93.859375, "epoch": 1.056092843326886, "grad_norm": 9.402307510375977, "kl": 0.15234375, "learning_rate": 4.71953578336557e-07, "loss": 0.0061, "reward": 1.6899645328521729, "reward_std": 0.09697774797677994, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6899645030498505, "step": 1638 }, { "completion_length": 110.109375, "epoch": 1.0567375886524824, "grad_norm": 25.695877075195312, "kl": 0.1630859375, "learning_rate": 4.716312056737589e-07, "loss": 0.0065, "reward": 1.5348122119903564, "reward_std": 0.09680547192692757, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5348122417926788, "step": 1639 }, { "completion_length": 116.484375, "epoch": 1.0573823339780786, "grad_norm": 26.15424346923828, "kl": 0.12939453125, "learning_rate": 4.7130883301096066e-07, "loss": 0.0052, "reward": 1.542705476284027, "reward_std": 0.09862326830625534, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5427054464817047, "step": 1640 }, { "completion_length": 111.40625, "epoch": 1.058027079303675, "grad_norm": 14.094022750854492, "kl": 0.12744140625, "learning_rate": 4.709864603481625e-07, "loss": 0.0051, "reward": 1.4890643954277039, "reward_std": 0.06961235404014587, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48906436562538147, "step": 1641 }, { "completion_length": 115.34375, "epoch": 1.0586718246292715, "grad_norm": 18.5950984954834, "kl": 0.122802734375, "learning_rate": 4.706640876853643e-07, "loss": 0.0049, "reward": 1.5355899930000305, "reward_std": 0.0737343579530716, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5355899631977081, "step": 1642 }, { "completion_length": 119.015625, "epoch": 1.059316569954868, "grad_norm": 10.859661102294922, "kl": 0.154296875, "learning_rate": 4.703417150225661e-07, "loss": 0.0062, "reward": 1.5704774856567383, "reward_std": 0.12773460149765015, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5704774707555771, "step": 1643 }, { "completion_length": 103.328125, "epoch": 1.0599613152804641, "grad_norm": 11.223685264587402, "kl": 0.130615234375, "learning_rate": 4.700193423597679e-07, "loss": 0.0052, "reward": 1.6081100702285767, "reward_std": 0.08056379854679108, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.608110100030899, "step": 1644 }, { "completion_length": 106.625, "epoch": 1.0606060606060606, "grad_norm": 49.92131805419922, "kl": 0.130126953125, "learning_rate": 4.696969696969697e-07, "loss": 0.0052, "reward": 1.4988184571266174, "reward_std": 0.09734079986810684, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49881842732429504, "step": 1645 }, { "completion_length": 115.0, "epoch": 1.061250805931657, "grad_norm": 30.047075271606445, "kl": 0.136474609375, "learning_rate": 4.693745970341715e-07, "loss": 0.0055, "reward": 1.6277170777320862, "reward_std": 0.04251527041196823, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6277171224355698, "step": 1646 }, { "completion_length": 110.390625, "epoch": 1.0618955512572534, "grad_norm": 8.765604972839355, "kl": 0.1376953125, "learning_rate": 4.690522243713733e-07, "loss": 0.0055, "reward": 1.5230787992477417, "reward_std": 0.09076399356126785, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5230787694454193, "step": 1647 }, { "completion_length": 117.78125, "epoch": 1.0625402965828499, "grad_norm": 18.16217613220215, "kl": 0.14111328125, "learning_rate": 4.687298517085751e-07, "loss": 0.0057, "reward": 1.4952101111412048, "reward_std": 0.1261536367237568, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49521011114120483, "step": 1648 }, { "completion_length": 102.578125, "epoch": 1.063185041908446, "grad_norm": 9.8458251953125, "kl": 0.13525390625, "learning_rate": 4.684074790457769e-07, "loss": 0.0054, "reward": 1.5231833457946777, "reward_std": 0.10076124221086502, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5388083457946777, "step": 1649 }, { "completion_length": 103.609375, "epoch": 1.0638297872340425, "grad_norm": 54.941654205322266, "kl": 0.12646484375, "learning_rate": 4.6808510638297873e-07, "loss": 0.0051, "reward": 1.5805718898773193, "reward_std": 0.11327047273516655, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.580571860074997, "step": 1650 }, { "completion_length": 121.703125, "epoch": 1.064474532559639, "grad_norm": 12.24212646484375, "kl": 0.14794921875, "learning_rate": 4.677627337201805e-07, "loss": 0.0059, "reward": 1.5812976360321045, "reward_std": 0.07798319682478905, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5812976062297821, "step": 1651 }, { "completion_length": 103.625, "epoch": 1.0651192778852354, "grad_norm": 7.415297985076904, "kl": 0.127685546875, "learning_rate": 4.6744036105738235e-07, "loss": 0.0051, "reward": 1.4523885846138, "reward_std": 0.12561476044356823, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45238858461380005, "step": 1652 }, { "completion_length": 115.21875, "epoch": 1.0657640232108316, "grad_norm": 12.325562477111816, "kl": 0.1337890625, "learning_rate": 4.6711798839458413e-07, "loss": 0.0054, "reward": 1.5610628724098206, "reward_std": 0.07157477736473083, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5610628575086594, "step": 1653 }, { "completion_length": 104.09375, "epoch": 1.066408768536428, "grad_norm": 20.869457244873047, "kl": 0.13818359375, "learning_rate": 4.6679561573178596e-07, "loss": 0.0055, "reward": 1.5744282603263855, "reward_std": 0.09144582599401474, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5744282603263855, "step": 1654 }, { "completion_length": 104.421875, "epoch": 1.0670535138620245, "grad_norm": 7.075346946716309, "kl": 0.14599609375, "learning_rate": 4.6647324306898774e-07, "loss": 0.0059, "reward": 1.6107639074325562, "reward_std": 0.13972818106412888, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6263889372348785, "step": 1655 }, { "completion_length": 100.640625, "epoch": 1.067698259187621, "grad_norm": 6.115519046783447, "kl": 0.122314453125, "learning_rate": 4.661508704061896e-07, "loss": 0.0049, "reward": 1.5545591115951538, "reward_std": 0.053600821644067764, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.554559051990509, "step": 1656 }, { "completion_length": 108.640625, "epoch": 1.0683430045132172, "grad_norm": 12.812996864318848, "kl": 0.134765625, "learning_rate": 4.6582849774339136e-07, "loss": 0.0054, "reward": 1.6389289498329163, "reward_std": 0.07384452596306801, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6389289498329163, "step": 1657 }, { "completion_length": 124.5625, "epoch": 1.0689877498388136, "grad_norm": 10.919026374816895, "kl": 0.09814453125, "learning_rate": 4.655061250805932e-07, "loss": 0.0039, "reward": 1.5925003290176392, "reward_std": 0.057900771498680115, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5925003588199615, "step": 1658 }, { "completion_length": 114.0, "epoch": 1.06963249516441, "grad_norm": 33.34097671508789, "kl": 0.1142578125, "learning_rate": 4.6518375241779497e-07, "loss": 0.0046, "reward": 1.700951337814331, "reward_std": 0.11340216547250748, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7009514570236206, "step": 1659 }, { "completion_length": 107.953125, "epoch": 1.0702772404900065, "grad_norm": 30.96587371826172, "kl": 0.120849609375, "learning_rate": 4.6486137975499675e-07, "loss": 0.0048, "reward": 1.5911786556243896, "reward_std": 0.21220052242279053, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6068036258220673, "step": 1660 }, { "completion_length": 97.828125, "epoch": 1.070921985815603, "grad_norm": 13.614809036254883, "kl": 0.1171875, "learning_rate": 4.645390070921986e-07, "loss": 0.0047, "reward": 1.5786557793617249, "reward_std": 0.10805037245154381, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5786558091640472, "step": 1661 }, { "completion_length": 105.53125, "epoch": 1.0715667311411992, "grad_norm": 13.102092742919922, "kl": 0.11474609375, "learning_rate": 4.6421663442940036e-07, "loss": 0.0046, "reward": 1.7086985111236572, "reward_std": 0.05099846050143242, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7086984813213348, "step": 1662 }, { "completion_length": 95.953125, "epoch": 1.0722114764667956, "grad_norm": 12.318425178527832, "kl": 0.1298828125, "learning_rate": 4.638942617666022e-07, "loss": 0.0052, "reward": 1.563442349433899, "reward_std": 0.04831070266664028, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5634424388408661, "step": 1663 }, { "completion_length": 113.578125, "epoch": 1.072856221792392, "grad_norm": 17.619489669799805, "kl": 0.101806640625, "learning_rate": 4.63571889103804e-07, "loss": 0.0041, "reward": 1.6703379154205322, "reward_std": 0.08459489978849888, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6703379154205322, "step": 1664 }, { "completion_length": 110.671875, "epoch": 1.0735009671179885, "grad_norm": 26.056339263916016, "kl": 0.13623046875, "learning_rate": 4.632495164410058e-07, "loss": 0.0054, "reward": 1.7076152563095093, "reward_std": 0.09282954782247543, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.707615315914154, "step": 1665 }, { "completion_length": 107.46875, "epoch": 1.0741457124435847, "grad_norm": 9.738430976867676, "kl": 0.10888671875, "learning_rate": 4.629271437782076e-07, "loss": 0.0044, "reward": 1.595727562904358, "reward_std": 0.12758340686559677, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5957275629043579, "step": 1666 }, { "completion_length": 102.265625, "epoch": 1.0747904577691811, "grad_norm": 17.98639488220215, "kl": 0.114501953125, "learning_rate": 4.626047711154094e-07, "loss": 0.0046, "reward": 1.5077466368675232, "reward_std": 0.04883422330021858, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5077466517686844, "step": 1667 }, { "completion_length": 109.546875, "epoch": 1.0754352030947776, "grad_norm": 9.506091117858887, "kl": 0.10986328125, "learning_rate": 4.622823984526112e-07, "loss": 0.0044, "reward": 1.5453261137008667, "reward_std": 0.07154025323688984, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5453260838985443, "step": 1668 }, { "completion_length": 102.984375, "epoch": 1.076079948420374, "grad_norm": 20.088478088378906, "kl": 0.15771484375, "learning_rate": 4.6196002578981304e-07, "loss": 0.0063, "reward": 1.5990652441978455, "reward_std": 0.08672228083014488, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5990653038024902, "step": 1669 }, { "completion_length": 108.390625, "epoch": 1.0767246937459702, "grad_norm": 9.232168197631836, "kl": 0.1298828125, "learning_rate": 4.616376531270148e-07, "loss": 0.0052, "reward": 1.639800250530243, "reward_std": 0.1497473306953907, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6398002207279205, "step": 1670 }, { "completion_length": 119.8125, "epoch": 1.0773694390715667, "grad_norm": 14.667919158935547, "kl": 0.140625, "learning_rate": 4.6131528046421665e-07, "loss": 0.0056, "reward": 1.4386180639266968, "reward_std": 0.10323400981724262, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43861812353134155, "step": 1671 }, { "completion_length": 110.765625, "epoch": 1.0780141843971631, "grad_norm": 26.46577262878418, "kl": 0.14306640625, "learning_rate": 4.6099290780141843e-07, "loss": 0.0057, "reward": 1.4531315565109253, "reward_std": 0.1466679349541664, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4687565565109253, "step": 1672 }, { "completion_length": 103.078125, "epoch": 1.0786589297227596, "grad_norm": 72.6494369506836, "kl": 0.115478515625, "learning_rate": 4.606705351386202e-07, "loss": 0.0046, "reward": 1.620385468006134, "reward_std": 0.09875315055251122, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6203854531049728, "step": 1673 }, { "completion_length": 109.53125, "epoch": 1.0793036750483558, "grad_norm": 105.22774505615234, "kl": 0.13525390625, "learning_rate": 4.6034816247582205e-07, "loss": 0.0054, "reward": 1.5872658491134644, "reward_std": 0.07589317858219147, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5872658789157867, "step": 1674 }, { "completion_length": 107.203125, "epoch": 1.0799484203739522, "grad_norm": 16.956668853759766, "kl": 0.113525390625, "learning_rate": 4.6002578981302383e-07, "loss": 0.0045, "reward": 1.683825135231018, "reward_std": 0.15254256501793861, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6994501352310181, "step": 1675 }, { "completion_length": 102.78125, "epoch": 1.0805931656995487, "grad_norm": 33.67800521850586, "kl": 0.11572265625, "learning_rate": 4.5970341715022566e-07, "loss": 0.0046, "reward": 1.730326533317566, "reward_std": 0.1375247836112976, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7303265333175659, "step": 1676 }, { "completion_length": 99.21875, "epoch": 1.081237911025145, "grad_norm": 12.1163969039917, "kl": 0.119140625, "learning_rate": 4.5938104448742744e-07, "loss": 0.0048, "reward": 1.5533208847045898, "reward_std": 0.09048768505454063, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5533208549022675, "step": 1677 }, { "completion_length": 99.03125, "epoch": 1.0818826563507415, "grad_norm": 21.379323959350586, "kl": 0.14306640625, "learning_rate": 4.590586718246293e-07, "loss": 0.0057, "reward": 1.547052264213562, "reward_std": 0.09882994741201401, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.547052264213562, "step": 1678 }, { "completion_length": 100.734375, "epoch": 1.0825274016763378, "grad_norm": 16.001218795776367, "kl": 0.21875, "learning_rate": 4.5873629916183106e-07, "loss": 0.0088, "reward": 1.509481430053711, "reward_std": 0.20628438889980316, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5251064896583557, "step": 1679 }, { "completion_length": 94.640625, "epoch": 1.0831721470019342, "grad_norm": 19.575855255126953, "kl": 0.13134765625, "learning_rate": 4.584139264990329e-07, "loss": 0.0052, "reward": 1.5105572938919067, "reward_std": 0.06101445481181145, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5105573087930679, "step": 1680 }, { "completion_length": 106.015625, "epoch": 1.0838168923275306, "grad_norm": 15.92323112487793, "kl": 0.1083984375, "learning_rate": 4.5809155383623467e-07, "loss": 0.0043, "reward": 1.755763292312622, "reward_std": 0.0871877558529377, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7557632923126221, "step": 1681 }, { "completion_length": 95.5625, "epoch": 1.084461637653127, "grad_norm": 13.914878845214844, "kl": 0.12890625, "learning_rate": 4.577691811734365e-07, "loss": 0.0052, "reward": 1.485410749912262, "reward_std": 0.12811970710754395, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4854106605052948, "step": 1682 }, { "completion_length": 111.90625, "epoch": 1.0851063829787233, "grad_norm": 10.895196914672852, "kl": 0.1318359375, "learning_rate": 4.574468085106383e-07, "loss": 0.0053, "reward": 1.5880500078201294, "reward_std": 0.10918740928173065, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5880500674247742, "step": 1683 }, { "completion_length": 97.40625, "epoch": 1.0857511283043197, "grad_norm": 21.795761108398438, "kl": 0.120849609375, "learning_rate": 4.571244358478401e-07, "loss": 0.0048, "reward": 1.4776260256767273, "reward_std": 0.06880192831158638, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47762608528137207, "step": 1684 }, { "completion_length": 102.421875, "epoch": 1.0863958736299162, "grad_norm": 14.754842758178711, "kl": 0.1103515625, "learning_rate": 4.568020631850419e-07, "loss": 0.0044, "reward": 1.710328757762909, "reward_std": 0.06619776226580143, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7103287577629089, "step": 1685 }, { "completion_length": 105.65625, "epoch": 1.0870406189555126, "grad_norm": 9.40377426147461, "kl": 0.4111328125, "learning_rate": 4.564796905222437e-07, "loss": 0.0164, "reward": 1.6777933239936829, "reward_std": 0.2508092671632767, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.7090433239936829, "step": 1686 }, { "completion_length": 104.53125, "epoch": 1.0876853642811088, "grad_norm": 175.24464416503906, "kl": 0.12646484375, "learning_rate": 4.561573178594455e-07, "loss": 0.0051, "reward": 1.484681248664856, "reward_std": 0.07167026400566101, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48468117415905, "step": 1687 }, { "completion_length": 97.96875, "epoch": 1.0883301096067053, "grad_norm": 12.59119987487793, "kl": 0.119873046875, "learning_rate": 4.558349451966473e-07, "loss": 0.0048, "reward": 1.7193995118141174, "reward_std": 0.058623988181352615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.719399482011795, "step": 1688 }, { "completion_length": 103.1875, "epoch": 1.0889748549323017, "grad_norm": 12.258233070373535, "kl": 0.15087890625, "learning_rate": 4.555125725338491e-07, "loss": 0.006, "reward": 1.6564828157424927, "reward_std": 0.08070876821875572, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6564827561378479, "step": 1689 }, { "completion_length": 114.921875, "epoch": 1.0896196002578982, "grad_norm": 21.755619049072266, "kl": 0.112060546875, "learning_rate": 4.551901998710509e-07, "loss": 0.0045, "reward": 1.6646082401275635, "reward_std": 0.09645628184080124, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6646082699298859, "step": 1690 }, { "completion_length": 104.421875, "epoch": 1.0902643455834946, "grad_norm": 10.000032424926758, "kl": 0.14501953125, "learning_rate": 4.5486782720825274e-07, "loss": 0.0058, "reward": 1.708804428577423, "reward_std": 0.17115085572004318, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7244293987751007, "step": 1691 }, { "completion_length": 107.71875, "epoch": 1.0909090909090908, "grad_norm": 26.18584632873535, "kl": 0.1240234375, "learning_rate": 4.545454545454545e-07, "loss": 0.005, "reward": 1.7140477299690247, "reward_std": 0.10914264246821404, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7140477001667023, "step": 1692 }, { "completion_length": 100.3125, "epoch": 1.0915538362346873, "grad_norm": 12.087810516357422, "kl": 0.119384765625, "learning_rate": 4.5422308188265635e-07, "loss": 0.0048, "reward": 1.6223152875900269, "reward_std": 0.07956447452306747, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6223153173923492, "step": 1693 }, { "completion_length": 105.265625, "epoch": 1.0921985815602837, "grad_norm": 11.739949226379395, "kl": 0.14208984375, "learning_rate": 4.5390070921985813e-07, "loss": 0.0057, "reward": 1.5097407698631287, "reward_std": 0.09296748787164688, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5097408145666122, "step": 1694 }, { "completion_length": 104.625, "epoch": 1.0928433268858802, "grad_norm": 11.357398986816406, "kl": 0.12060546875, "learning_rate": 4.5357833655705997e-07, "loss": 0.0048, "reward": 1.7072781324386597, "reward_std": 0.15004675462841988, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7229031324386597, "step": 1695 }, { "completion_length": 114.4375, "epoch": 1.0934880722114764, "grad_norm": 9.150431632995605, "kl": 0.110107421875, "learning_rate": 4.5325596389426175e-07, "loss": 0.0044, "reward": 1.6482697129249573, "reward_std": 0.06808177940547466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6482696235179901, "step": 1696 }, { "completion_length": 108.765625, "epoch": 1.0941328175370728, "grad_norm": 26.6113224029541, "kl": 0.143310546875, "learning_rate": 4.529335912314636e-07, "loss": 0.0057, "reward": 1.5538044571876526, "reward_std": 0.105574119836092, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5694293677806854, "step": 1697 }, { "completion_length": 112.109375, "epoch": 1.0947775628626693, "grad_norm": 15.188982963562012, "kl": 0.1513671875, "learning_rate": 4.5261121856866536e-07, "loss": 0.006, "reward": 1.4535160660743713, "reward_std": 0.14586328715085983, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4535161405801773, "step": 1698 }, { "completion_length": 101.59375, "epoch": 1.0954223081882657, "grad_norm": 9.596237182617188, "kl": 0.150390625, "learning_rate": 4.522888459058672e-07, "loss": 0.006, "reward": 1.618313193321228, "reward_std": 0.20893344283103943, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.6495631337165833, "step": 1699 }, { "completion_length": 99.671875, "epoch": 1.096067053513862, "grad_norm": 14.38936996459961, "kl": 0.13916015625, "learning_rate": 4.51966473243069e-07, "loss": 0.0056, "reward": 1.6061973571777344, "reward_std": 0.10794469714164734, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.606197327375412, "step": 1700 }, { "completion_length": 96.375, "epoch": 1.0967117988394584, "grad_norm": 81.89236450195312, "kl": 0.1357421875, "learning_rate": 4.5164410058027076e-07, "loss": 0.0054, "reward": 1.7045114636421204, "reward_std": 0.19201814383268356, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7201364934444427, "step": 1701 }, { "completion_length": 113.984375, "epoch": 1.0973565441650548, "grad_norm": 19.061376571655273, "kl": 0.10986328125, "learning_rate": 4.513217279174726e-07, "loss": 0.0044, "reward": 1.6339808106422424, "reward_std": 0.05904205143451691, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6339808702468872, "step": 1702 }, { "completion_length": 103.984375, "epoch": 1.0980012894906512, "grad_norm": 8.031238555908203, "kl": 0.110595703125, "learning_rate": 4.5099935525467437e-07, "loss": 0.0044, "reward": 1.6934545040130615, "reward_std": 0.11994657665491104, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6934545040130615, "step": 1703 }, { "completion_length": 107.28125, "epoch": 1.0986460348162477, "grad_norm": 73.1088638305664, "kl": 0.116943359375, "learning_rate": 4.506769825918762e-07, "loss": 0.0047, "reward": 1.6883281469345093, "reward_std": 0.10029840841889381, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6883280873298645, "step": 1704 }, { "completion_length": 106.125, "epoch": 1.099290780141844, "grad_norm": 18.33809471130371, "kl": 0.13134765625, "learning_rate": 4.50354609929078e-07, "loss": 0.0053, "reward": 1.5655602812767029, "reward_std": 0.15298619121313095, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5811853408813477, "step": 1705 }, { "completion_length": 111.8125, "epoch": 1.0999355254674403, "grad_norm": 13.750594139099121, "kl": 0.12158203125, "learning_rate": 4.500322372662798e-07, "loss": 0.0049, "reward": 1.5528867840766907, "reward_std": 0.14944328367710114, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5685118287801743, "step": 1706 }, { "completion_length": 107.0625, "epoch": 1.1005802707930368, "grad_norm": 15.120196342468262, "kl": 0.129150390625, "learning_rate": 4.497098646034816e-07, "loss": 0.0052, "reward": 1.7107568383216858, "reward_std": 0.06326938793063164, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7107568383216858, "step": 1707 }, { "completion_length": 97.53125, "epoch": 1.1012250161186332, "grad_norm": 37.98473358154297, "kl": 0.173828125, "learning_rate": 4.4938749194068343e-07, "loss": 0.007, "reward": 1.6928101778030396, "reward_std": 0.1691095195710659, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7084352076053619, "step": 1708 }, { "completion_length": 102.015625, "epoch": 1.1018697614442294, "grad_norm": 8.216543197631836, "kl": 0.13427734375, "learning_rate": 4.490651192778852e-07, "loss": 0.0054, "reward": 1.6665999293327332, "reward_std": 0.042136115953326225, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6665998697280884, "step": 1709 }, { "completion_length": 99.578125, "epoch": 1.1025145067698259, "grad_norm": 9.077375411987305, "kl": 0.13427734375, "learning_rate": 4.4874274661508705e-07, "loss": 0.0054, "reward": 1.7417653799057007, "reward_std": 0.12950639054179192, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7573904097080231, "step": 1710 }, { "completion_length": 98.71875, "epoch": 1.1031592520954223, "grad_norm": 15.816248893737793, "kl": 0.1611328125, "learning_rate": 4.484203739522888e-07, "loss": 0.0064, "reward": 1.2563967108726501, "reward_std": 0.24838687479496002, "rewards/format_reward": 0.953125, "rewards/iou_timestamp_reward": 0.30327168107032776, "step": 1711 }, { "completion_length": 94.796875, "epoch": 1.1038039974210188, "grad_norm": 13.886948585510254, "kl": 0.1337890625, "learning_rate": 4.4809800128949066e-07, "loss": 0.0053, "reward": 1.7339037656784058, "reward_std": 0.0918034091591835, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7339037656784058, "step": 1712 }, { "completion_length": 101.265625, "epoch": 1.104448742746615, "grad_norm": 30.30232048034668, "kl": 0.141357421875, "learning_rate": 4.4777562862669244e-07, "loss": 0.0056, "reward": 1.5611588954925537, "reward_std": 0.12069892883300781, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5767838507890701, "step": 1713 }, { "completion_length": 102.4375, "epoch": 1.1050934880722114, "grad_norm": 16.694547653198242, "kl": 0.130859375, "learning_rate": 4.474532559638942e-07, "loss": 0.0052, "reward": 1.698163628578186, "reward_std": 0.20370149239897728, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.7294135689735413, "step": 1714 }, { "completion_length": 96.671875, "epoch": 1.1057382333978079, "grad_norm": 12.910335540771484, "kl": 0.1259765625, "learning_rate": 4.4713088330109605e-07, "loss": 0.005, "reward": 1.4640353918075562, "reward_std": 0.03386647813022137, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4640353322029114, "step": 1715 }, { "completion_length": 99.234375, "epoch": 1.1063829787234043, "grad_norm": 20.440353393554688, "kl": 0.134765625, "learning_rate": 4.4680851063829783e-07, "loss": 0.0054, "reward": 1.6791711449623108, "reward_std": 0.07557164318859577, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.679171234369278, "step": 1716 }, { "completion_length": 98.0625, "epoch": 1.1070277240490007, "grad_norm": 12.584556579589844, "kl": 0.150390625, "learning_rate": 4.4648613797549967e-07, "loss": 0.006, "reward": 1.495797336101532, "reward_std": 0.15810192748904228, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5114222764968872, "step": 1717 }, { "completion_length": 88.4375, "epoch": 1.107672469374597, "grad_norm": 23.811677932739258, "kl": 0.1435546875, "learning_rate": 4.4616376531270145e-07, "loss": 0.0058, "reward": 1.4199917912483215, "reward_std": 0.1605324186384678, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.4512418508529663, "step": 1718 }, { "completion_length": 102.265625, "epoch": 1.1083172147001934, "grad_norm": 8.892284393310547, "kl": 0.128662109375, "learning_rate": 4.458413926499033e-07, "loss": 0.0051, "reward": 1.5546639561653137, "reward_std": 0.2123618647456169, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5859139859676361, "step": 1719 }, { "completion_length": 101.90625, "epoch": 1.1089619600257898, "grad_norm": 9.575830459594727, "kl": 0.1396484375, "learning_rate": 4.4551901998710506e-07, "loss": 0.0056, "reward": 1.5857290029525757, "reward_std": 0.07718907855451107, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5857289731502533, "step": 1720 }, { "completion_length": 106.078125, "epoch": 1.1096067053513863, "grad_norm": 273.09173583984375, "kl": 0.1279296875, "learning_rate": 4.451966473243069e-07, "loss": 0.0051, "reward": 1.488102912902832, "reward_std": 0.17496028542518616, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5037279427051544, "step": 1721 }, { "completion_length": 111.328125, "epoch": 1.1102514506769825, "grad_norm": 15.99120044708252, "kl": 0.12451171875, "learning_rate": 4.448742746615087e-07, "loss": 0.005, "reward": 1.6247381567955017, "reward_std": 0.1117971558123827, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6247380971908569, "step": 1722 }, { "completion_length": 107.953125, "epoch": 1.110896196002579, "grad_norm": 9.210831642150879, "kl": 0.107666015625, "learning_rate": 4.445519019987105e-07, "loss": 0.0043, "reward": 1.5621591806411743, "reward_std": 0.18757861852645874, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5934091806411743, "step": 1723 }, { "completion_length": 106.078125, "epoch": 1.1115409413281754, "grad_norm": 39.334266662597656, "kl": 0.1162109375, "learning_rate": 4.442295293359123e-07, "loss": 0.0046, "reward": 1.404995858669281, "reward_std": 0.05613952688872814, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.40499579906463623, "step": 1724 }, { "completion_length": 114.71875, "epoch": 1.1121856866537718, "grad_norm": 15.110870361328125, "kl": 0.114501953125, "learning_rate": 4.439071566731141e-07, "loss": 0.0046, "reward": 1.5173662900924683, "reward_std": 0.05804556608200073, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5173663049936295, "step": 1725 }, { "completion_length": 102.265625, "epoch": 1.112830431979368, "grad_norm": 16.11209487915039, "kl": 0.140625, "learning_rate": 4.435847840103159e-07, "loss": 0.0056, "reward": 1.5029131174087524, "reward_std": 0.13497604429721832, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5029131919145584, "step": 1726 }, { "completion_length": 110.375, "epoch": 1.1134751773049645, "grad_norm": 19.925506591796875, "kl": 0.121826171875, "learning_rate": 4.432624113475177e-07, "loss": 0.0049, "reward": 1.668049395084381, "reward_std": 0.06225564517080784, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6680494248867035, "step": 1727 }, { "completion_length": 99.625, "epoch": 1.114119922630561, "grad_norm": 12.299700736999512, "kl": 0.1357421875, "learning_rate": 4.429400386847195e-07, "loss": 0.0054, "reward": 1.632355809211731, "reward_std": 0.1639389768242836, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.6636057496070862, "step": 1728 }, { "completion_length": 109.71875, "epoch": 1.1147646679561574, "grad_norm": 24.155738830566406, "kl": 0.1142578125, "learning_rate": 4.426176660219213e-07, "loss": 0.0046, "reward": 1.5948566198349, "reward_std": 0.08682410418987274, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5948565602302551, "step": 1729 }, { "completion_length": 120.21875, "epoch": 1.1154094132817538, "grad_norm": 6.264504909515381, "kl": 0.1767578125, "learning_rate": 4.4229529335912313e-07, "loss": 0.0071, "reward": 1.5787187814712524, "reward_std": 0.08336924016475677, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.578718900680542, "step": 1730 }, { "completion_length": 115.609375, "epoch": 1.11605415860735, "grad_norm": 17.270288467407227, "kl": 0.1083984375, "learning_rate": 4.419729206963249e-07, "loss": 0.0043, "reward": 1.6184866428375244, "reward_std": 0.17556436359882355, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.6497366428375244, "step": 1731 }, { "completion_length": 116.796875, "epoch": 1.1166989039329465, "grad_norm": 15.003190994262695, "kl": 0.12255859375, "learning_rate": 4.4165054803352675e-07, "loss": 0.0049, "reward": 1.5859253406524658, "reward_std": 0.11098824068903923, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6015503257513046, "step": 1732 }, { "completion_length": 119.15625, "epoch": 1.117343649258543, "grad_norm": 11.927645683288574, "kl": 0.11328125, "learning_rate": 4.413281753707285e-07, "loss": 0.0045, "reward": 1.707281231880188, "reward_std": 0.09659011662006378, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.707281231880188, "step": 1733 }, { "completion_length": 108.671875, "epoch": 1.1179883945841393, "grad_norm": 11.922825813293457, "kl": 0.118408203125, "learning_rate": 4.4100580270793036e-07, "loss": 0.0047, "reward": 1.5895445942878723, "reward_std": 0.053144171833992004, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5895446836948395, "step": 1734 }, { "completion_length": 106.84375, "epoch": 1.1186331399097356, "grad_norm": 14.034700393676758, "kl": 0.11572265625, "learning_rate": 4.4068343004513214e-07, "loss": 0.0046, "reward": 1.6887595057487488, "reward_std": 0.10110663995146751, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6887595057487488, "step": 1735 }, { "completion_length": 119.546875, "epoch": 1.119277885235332, "grad_norm": 21.64225959777832, "kl": 0.12548828125, "learning_rate": 4.4036105738233397e-07, "loss": 0.005, "reward": 1.6815902590751648, "reward_std": 0.10987572371959686, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6815903186798096, "step": 1736 }, { "completion_length": 108.0625, "epoch": 1.1199226305609284, "grad_norm": 9.544087409973145, "kl": 0.1376953125, "learning_rate": 4.4003868471953575e-07, "loss": 0.0055, "reward": 1.6549758315086365, "reward_std": 0.08593714982271194, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6549758017063141, "step": 1737 }, { "completion_length": 121.21875, "epoch": 1.1205673758865249, "grad_norm": 39.510986328125, "kl": 0.10791015625, "learning_rate": 4.397163120567376e-07, "loss": 0.0043, "reward": 1.6081666350364685, "reward_std": 0.09677704423666, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6081666350364685, "step": 1738 }, { "completion_length": 116.953125, "epoch": 1.121212121212121, "grad_norm": 9.956169128417969, "kl": 0.134765625, "learning_rate": 4.3939393939393937e-07, "loss": 0.0054, "reward": 1.4132041931152344, "reward_std": 0.09558882936835289, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.428829163312912, "step": 1739 }, { "completion_length": 117.0, "epoch": 1.1218568665377175, "grad_norm": 19.143388748168945, "kl": 0.102294921875, "learning_rate": 4.3907156673114115e-07, "loss": 0.0041, "reward": 1.5230024456977844, "reward_std": 0.07811861485242844, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5230023860931396, "step": 1740 }, { "completion_length": 110.5625, "epoch": 1.122501611863314, "grad_norm": 17.286823272705078, "kl": 0.115234375, "learning_rate": 4.38749194068343e-07, "loss": 0.0046, "reward": 1.6897668838500977, "reward_std": 0.09538877755403519, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6897669434547424, "step": 1741 }, { "completion_length": 123.3125, "epoch": 1.1231463571889104, "grad_norm": 15.611011505126953, "kl": 0.1240234375, "learning_rate": 4.3842682140554476e-07, "loss": 0.005, "reward": 1.457338809967041, "reward_std": 0.1063726358115673, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4573388397693634, "step": 1742 }, { "completion_length": 116.84375, "epoch": 1.1237911025145069, "grad_norm": 14.022031784057617, "kl": 0.115966796875, "learning_rate": 4.381044487427466e-07, "loss": 0.0046, "reward": 1.3342976570129395, "reward_std": 0.08087656646966934, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.33429761230945587, "step": 1743 }, { "completion_length": 113.390625, "epoch": 1.124435847840103, "grad_norm": 9.711297988891602, "kl": 0.102294921875, "learning_rate": 4.377820760799484e-07, "loss": 0.0041, "reward": 1.548888087272644, "reward_std": 0.09965697675943375, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5488881766796112, "step": 1744 }, { "completion_length": 116.15625, "epoch": 1.1250805931656995, "grad_norm": 24.13825225830078, "kl": 0.130859375, "learning_rate": 4.374597034171502e-07, "loss": 0.0052, "reward": 1.7484040260314941, "reward_std": 0.12463311851024628, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7484039068222046, "step": 1745 }, { "completion_length": 118.625, "epoch": 1.125725338491296, "grad_norm": 15.037027359008789, "kl": 0.10888671875, "learning_rate": 4.37137330754352e-07, "loss": 0.0044, "reward": 1.6446164846420288, "reward_std": 0.12209159508347511, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6602414548397064, "step": 1746 }, { "completion_length": 115.640625, "epoch": 1.1263700838168924, "grad_norm": 19.545330047607422, "kl": 0.119140625, "learning_rate": 4.368149580915538e-07, "loss": 0.0048, "reward": 1.4536727666854858, "reward_std": 0.17526058107614517, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4692978411912918, "step": 1747 }, { "completion_length": 106.40625, "epoch": 1.1270148291424886, "grad_norm": 18.563114166259766, "kl": 0.119140625, "learning_rate": 4.364925854287556e-07, "loss": 0.0048, "reward": 1.6942253708839417, "reward_std": 0.07673277705907822, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6942253112792969, "step": 1748 }, { "completion_length": 121.90625, "epoch": 1.127659574468085, "grad_norm": 12.64437198638916, "kl": 0.107421875, "learning_rate": 4.3617021276595744e-07, "loss": 0.0043, "reward": 1.4421557188034058, "reward_std": 0.11760647222399712, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4421556890010834, "step": 1749 }, { "completion_length": 111.46875, "epoch": 1.1283043197936815, "grad_norm": 25.638565063476562, "kl": 0.1650390625, "learning_rate": 4.358478401031592e-07, "loss": 0.0066, "reward": 1.6255099177360535, "reward_std": 0.1136264055967331, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6255099177360535, "step": 1750 }, { "completion_length": 105.421875, "epoch": 1.128949065119278, "grad_norm": 12.374931335449219, "kl": 0.127685546875, "learning_rate": 4.3552546744036105e-07, "loss": 0.0051, "reward": 1.6772376894950867, "reward_std": 0.1765252035111189, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.7084877490997314, "step": 1751 }, { "completion_length": 113.078125, "epoch": 1.1295938104448742, "grad_norm": 14.301130294799805, "kl": 0.12744140625, "learning_rate": 4.3520309477756283e-07, "loss": 0.0051, "reward": 1.494019091129303, "reward_std": 0.11037448793649673, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5096440762281418, "step": 1752 }, { "completion_length": 116.546875, "epoch": 1.1302385557704706, "grad_norm": 13.462944030761719, "kl": 0.205078125, "learning_rate": 4.3488072211476467e-07, "loss": 0.0082, "reward": 1.5543888211250305, "reward_std": 0.08765115961432457, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5543888360261917, "step": 1753 }, { "completion_length": 103.859375, "epoch": 1.130883301096067, "grad_norm": 58.70383071899414, "kl": 0.13525390625, "learning_rate": 4.3455834945196645e-07, "loss": 0.0054, "reward": 1.5313851833343506, "reward_std": 0.14955976232886314, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5313852429389954, "step": 1754 }, { "completion_length": 103.203125, "epoch": 1.1315280464216635, "grad_norm": 10.312337875366211, "kl": 0.11767578125, "learning_rate": 4.342359767891682e-07, "loss": 0.0047, "reward": 1.536129117012024, "reward_std": 0.11497850716114044, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5517541468143463, "step": 1755 }, { "completion_length": 99.140625, "epoch": 1.13217279174726, "grad_norm": 10.849982261657715, "kl": 0.11181640625, "learning_rate": 4.3391360412637006e-07, "loss": 0.0045, "reward": 1.7079610228538513, "reward_std": 0.10665994510054588, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7079610228538513, "step": 1756 }, { "completion_length": 106.9375, "epoch": 1.1328175370728562, "grad_norm": 10.794480323791504, "kl": 0.108154296875, "learning_rate": 4.3359123146357184e-07, "loss": 0.0043, "reward": 1.5266578197479248, "reward_std": 0.10716821625828743, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5422827899456024, "step": 1757 }, { "completion_length": 98.09375, "epoch": 1.1334622823984526, "grad_norm": 31.109466552734375, "kl": 0.1396484375, "learning_rate": 4.3326885880077367e-07, "loss": 0.0056, "reward": 1.6419140696525574, "reward_std": 0.11855699494481087, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6419140696525574, "step": 1758 }, { "completion_length": 109.0, "epoch": 1.134107027724049, "grad_norm": 11.9324312210083, "kl": 0.12060546875, "learning_rate": 4.3294648613797545e-07, "loss": 0.0048, "reward": 1.6172628998756409, "reward_std": 0.09370382130146027, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6172628998756409, "step": 1759 }, { "completion_length": 107.703125, "epoch": 1.1347517730496455, "grad_norm": 8.980454444885254, "kl": 0.107666015625, "learning_rate": 4.326241134751773e-07, "loss": 0.0043, "reward": 1.5437541007995605, "reward_std": 0.09230579063296318, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5437541604042053, "step": 1760 }, { "completion_length": 103.984375, "epoch": 1.1353965183752417, "grad_norm": 16.976593017578125, "kl": 0.119384765625, "learning_rate": 4.3230174081237907e-07, "loss": 0.0048, "reward": 1.6876962184906006, "reward_std": 0.1564544513821602, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6876962184906006, "step": 1761 }, { "completion_length": 96.140625, "epoch": 1.1360412637008381, "grad_norm": 16.373489379882812, "kl": 0.12060546875, "learning_rate": 4.319793681495809e-07, "loss": 0.0048, "reward": 1.7533943057060242, "reward_std": 0.11407231912016869, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7533943355083466, "step": 1762 }, { "completion_length": 104.96875, "epoch": 1.1366860090264346, "grad_norm": 12.83205509185791, "kl": 0.12841796875, "learning_rate": 4.316569954867827e-07, "loss": 0.0051, "reward": 1.7652117609977722, "reward_std": 0.11743732914328575, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7652117908000946, "step": 1763 }, { "completion_length": 106.28125, "epoch": 1.137330754352031, "grad_norm": 15.829553604125977, "kl": 0.130126953125, "learning_rate": 4.313346228239845e-07, "loss": 0.0052, "reward": 1.6745178699493408, "reward_std": 0.11732401326298714, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6745178401470184, "step": 1764 }, { "completion_length": 104.984375, "epoch": 1.1379754996776272, "grad_norm": 31.47109031677246, "kl": 0.13427734375, "learning_rate": 4.310122501611863e-07, "loss": 0.0054, "reward": 1.5751590132713318, "reward_std": 0.17913639172911644, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5907839834690094, "step": 1765 }, { "completion_length": 104.890625, "epoch": 1.1386202450032237, "grad_norm": 14.089750289916992, "kl": 0.124755859375, "learning_rate": 4.3068987749838813e-07, "loss": 0.005, "reward": 1.617997169494629, "reward_std": 0.06984603218734264, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6179972290992737, "step": 1766 }, { "completion_length": 98.046875, "epoch": 1.1392649903288201, "grad_norm": 52.08578872680664, "kl": 0.14697265625, "learning_rate": 4.303675048355899e-07, "loss": 0.0059, "reward": 1.4860327243804932, "reward_std": 0.05698714032769203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48603275418281555, "step": 1767 }, { "completion_length": 97.234375, "epoch": 1.1399097356544166, "grad_norm": 13.764451026916504, "kl": 0.1318359375, "learning_rate": 4.300451321727917e-07, "loss": 0.0053, "reward": 1.4116371273994446, "reward_std": 0.15613991767168045, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4272621124982834, "step": 1768 }, { "completion_length": 93.375, "epoch": 1.140554480980013, "grad_norm": 21.74166488647461, "kl": 0.114990234375, "learning_rate": 4.297227595099935e-07, "loss": 0.0046, "reward": 1.5942260026931763, "reward_std": 0.13864929229021072, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5942259430885315, "step": 1769 }, { "completion_length": 97.78125, "epoch": 1.1411992263056092, "grad_norm": 17.24546241760254, "kl": 0.11474609375, "learning_rate": 4.294003868471953e-07, "loss": 0.0046, "reward": 1.6961501836776733, "reward_std": 0.10894617810845375, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7117751836776733, "step": 1770 }, { "completion_length": 97.21875, "epoch": 1.1418439716312057, "grad_norm": 23.654821395874023, "kl": 0.143310546875, "learning_rate": 4.2907801418439714e-07, "loss": 0.0057, "reward": 1.6986296772956848, "reward_std": 0.10340171307325363, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6986297369003296, "step": 1771 }, { "completion_length": 97.625, "epoch": 1.142488716956802, "grad_norm": 16.723711013793945, "kl": 0.1376953125, "learning_rate": 4.287556415215989e-07, "loss": 0.0055, "reward": 1.6945127248764038, "reward_std": 0.10590310022234917, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.694512665271759, "step": 1772 }, { "completion_length": 102.0, "epoch": 1.1431334622823985, "grad_norm": 11.695935249328613, "kl": 0.1123046875, "learning_rate": 4.2843326885880075e-07, "loss": 0.0045, "reward": 1.6573434472084045, "reward_std": 0.12627606093883514, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6573435068130493, "step": 1773 }, { "completion_length": 101.734375, "epoch": 1.1437782076079948, "grad_norm": 11.529833793640137, "kl": 0.150390625, "learning_rate": 4.2811089619600253e-07, "loss": 0.006, "reward": 1.5360539555549622, "reward_std": 0.1177862137556076, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5360539853572845, "step": 1774 }, { "completion_length": 103.546875, "epoch": 1.1444229529335912, "grad_norm": 15.755107879638672, "kl": 0.19140625, "learning_rate": 4.2778852353320437e-07, "loss": 0.0076, "reward": 1.6712934970855713, "reward_std": 0.21360860764980316, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6869185268878937, "step": 1775 }, { "completion_length": 115.65625, "epoch": 1.1450676982591876, "grad_norm": 12.252145767211914, "kl": 0.12109375, "learning_rate": 4.2746615087040615e-07, "loss": 0.0048, "reward": 1.4710392951965332, "reward_std": 0.0702495351433754, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4710392951965332, "step": 1776 }, { "completion_length": 108.609375, "epoch": 1.145712443584784, "grad_norm": 11.388358116149902, "kl": 0.117431640625, "learning_rate": 4.27143778207608e-07, "loss": 0.0047, "reward": 1.7197344303131104, "reward_std": 0.09033240005373955, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7197344303131104, "step": 1777 }, { "completion_length": 115.328125, "epoch": 1.1463571889103803, "grad_norm": 10.445749282836914, "kl": 0.1025390625, "learning_rate": 4.2682140554480976e-07, "loss": 0.0041, "reward": 1.6068493723869324, "reward_std": 0.11387959867715836, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6068494319915771, "step": 1778 }, { "completion_length": 107.09375, "epoch": 1.1470019342359767, "grad_norm": 8.58862018585205, "kl": 0.125, "learning_rate": 4.264990328820116e-07, "loss": 0.005, "reward": 1.57830011844635, "reward_std": 0.145315021276474, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5783001780509949, "step": 1779 }, { "completion_length": 110.890625, "epoch": 1.1476466795615732, "grad_norm": 12.297728538513184, "kl": 0.11572265625, "learning_rate": 4.2617666021921337e-07, "loss": 0.0046, "reward": 1.5999037027359009, "reward_std": 0.10857437551021576, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5999037325382233, "step": 1780 }, { "completion_length": 102.65625, "epoch": 1.1482914248871696, "grad_norm": 32.0905647277832, "kl": 0.12451171875, "learning_rate": 4.2585428755641515e-07, "loss": 0.005, "reward": 1.7533397674560547, "reward_std": 0.06611281260848045, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7533398270606995, "step": 1781 }, { "completion_length": 98.78125, "epoch": 1.148936170212766, "grad_norm": 59.476600646972656, "kl": 0.14013671875, "learning_rate": 4.25531914893617e-07, "loss": 0.0056, "reward": 1.6604995727539062, "reward_std": 0.06654183939099312, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6604995727539062, "step": 1782 }, { "completion_length": 108.09375, "epoch": 1.1495809155383623, "grad_norm": 8.271899223327637, "kl": 0.120849609375, "learning_rate": 4.2520954223081877e-07, "loss": 0.0048, "reward": 1.637656807899475, "reward_std": 0.0554551612585783, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6376567780971527, "step": 1783 }, { "completion_length": 111.640625, "epoch": 1.1502256608639587, "grad_norm": 16.318239212036133, "kl": 0.1220703125, "learning_rate": 4.248871695680206e-07, "loss": 0.0049, "reward": 1.692155122756958, "reward_std": 0.16502898186445236, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.692155122756958, "step": 1784 }, { "completion_length": 106.0, "epoch": 1.1508704061895552, "grad_norm": 18.496143341064453, "kl": 0.110595703125, "learning_rate": 4.245647969052224e-07, "loss": 0.0044, "reward": 1.6564997434616089, "reward_std": 0.08594655059278011, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6564997732639313, "step": 1785 }, { "completion_length": 114.234375, "epoch": 1.1515151515151516, "grad_norm": 19.95158576965332, "kl": 0.1357421875, "learning_rate": 4.242424242424242e-07, "loss": 0.0054, "reward": 1.605454921722412, "reward_std": 0.08248751610517502, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6054548621177673, "step": 1786 }, { "completion_length": 103.53125, "epoch": 1.1521598968407478, "grad_norm": 29.761093139648438, "kl": 0.141357421875, "learning_rate": 4.23920051579626e-07, "loss": 0.0057, "reward": 1.6491164565086365, "reward_std": 0.15113864839076996, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6491163969039917, "step": 1787 }, { "completion_length": 95.421875, "epoch": 1.1528046421663443, "grad_norm": 27.836938858032227, "kl": 0.149658203125, "learning_rate": 4.2359767891682783e-07, "loss": 0.006, "reward": 1.5169153213500977, "reward_std": 0.12604781612753868, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5325403362512589, "step": 1788 }, { "completion_length": 110.484375, "epoch": 1.1534493874919407, "grad_norm": 20.547147750854492, "kl": 0.13330078125, "learning_rate": 4.232753062540296e-07, "loss": 0.0053, "reward": 1.5799216628074646, "reward_std": 0.07361920922994614, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.579921692609787, "step": 1789 }, { "completion_length": 106.234375, "epoch": 1.1540941328175371, "grad_norm": 34.002098083496094, "kl": 0.130859375, "learning_rate": 4.2295293359123144e-07, "loss": 0.0052, "reward": 1.6779032349586487, "reward_std": 0.11163284257054329, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6779031455516815, "step": 1790 }, { "completion_length": 109.5, "epoch": 1.1547388781431334, "grad_norm": 17.270278930664062, "kl": 0.123291015625, "learning_rate": 4.226305609284332e-07, "loss": 0.0049, "reward": 1.6175901889801025, "reward_std": 0.11164752580225468, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6175901889801025, "step": 1791 }, { "completion_length": 102.640625, "epoch": 1.1553836234687298, "grad_norm": 16.843664169311523, "kl": 0.1591796875, "learning_rate": 4.2230818826563506e-07, "loss": 0.0064, "reward": 1.7334028482437134, "reward_std": 0.07044504210352898, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.733402818441391, "step": 1792 }, { "completion_length": 113.375, "epoch": 1.1560283687943262, "grad_norm": 11.798425674438477, "kl": 0.130859375, "learning_rate": 4.2198581560283684e-07, "loss": 0.0052, "reward": 1.7188551425933838, "reward_std": 0.0985119491815567, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7188552021980286, "step": 1793 }, { "completion_length": 106.25, "epoch": 1.1566731141199227, "grad_norm": 15.71810531616211, "kl": 0.146484375, "learning_rate": 4.216634429400386e-07, "loss": 0.0059, "reward": 1.4141835570335388, "reward_std": 0.1422448419034481, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.41418349742889404, "step": 1794 }, { "completion_length": 104.65625, "epoch": 1.1573178594455191, "grad_norm": 28.754180908203125, "kl": 0.1455078125, "learning_rate": 4.2134107027724045e-07, "loss": 0.0058, "reward": 1.5761706233024597, "reward_std": 0.08441785722970963, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5761705487966537, "step": 1795 }, { "completion_length": 116.609375, "epoch": 1.1579626047711153, "grad_norm": 8.733136177062988, "kl": 0.12841796875, "learning_rate": 4.2101869761444223e-07, "loss": 0.0051, "reward": 1.5780669450759888, "reward_std": 0.08834883198142052, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5780669152736664, "step": 1796 }, { "completion_length": 124.0, "epoch": 1.1586073500967118, "grad_norm": 15.45083236694336, "kl": 0.13037109375, "learning_rate": 4.2069632495164407e-07, "loss": 0.0052, "reward": 1.6721486449241638, "reward_std": 0.23771092295646667, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.7033986151218414, "step": 1797 }, { "completion_length": 118.15625, "epoch": 1.1592520954223082, "grad_norm": 10.324441909790039, "kl": 0.1259765625, "learning_rate": 4.2037395228884585e-07, "loss": 0.005, "reward": 1.5779080986976624, "reward_std": 0.18352527171373367, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5935330986976624, "step": 1798 }, { "completion_length": 116.0625, "epoch": 1.1598968407479047, "grad_norm": 36.42701721191406, "kl": 0.112060546875, "learning_rate": 4.200515796260477e-07, "loss": 0.0045, "reward": 1.5790598392486572, "reward_std": 0.056727977469563484, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5790598094463348, "step": 1799 }, { "completion_length": 121.5625, "epoch": 1.1605415860735009, "grad_norm": 35.930091857910156, "kl": 0.111572265625, "learning_rate": 4.1972920696324946e-07, "loss": 0.0045, "reward": 1.608137845993042, "reward_std": 0.14337752014398575, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6081377863883972, "step": 1800 }, { "completion_length": 118.8125, "epoch": 1.1611863313990973, "grad_norm": 6.747325420379639, "kl": 0.134765625, "learning_rate": 4.194068343004513e-07, "loss": 0.0054, "reward": 1.5423181653022766, "reward_std": 0.08650622889399529, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.542318195104599, "step": 1801 }, { "completion_length": 110.59375, "epoch": 1.1618310767246938, "grad_norm": 11.091395378112793, "kl": 0.16552734375, "learning_rate": 4.190844616376531e-07, "loss": 0.0066, "reward": 1.6449681520462036, "reward_std": 0.10411114990711212, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.644968181848526, "step": 1802 }, { "completion_length": 111.71875, "epoch": 1.1624758220502902, "grad_norm": 14.368660926818848, "kl": 0.13525390625, "learning_rate": 4.187620889748549e-07, "loss": 0.0054, "reward": 1.7189124822616577, "reward_std": 0.09125639125704765, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7189124822616577, "step": 1803 }, { "completion_length": 124.25, "epoch": 1.1631205673758864, "grad_norm": 9.280939102172852, "kl": 0.12548828125, "learning_rate": 4.184397163120567e-07, "loss": 0.005, "reward": 1.5595988035202026, "reward_std": 0.09420382976531982, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5595987737178802, "step": 1804 }, { "completion_length": 103.65625, "epoch": 1.1637653127014829, "grad_norm": 27.450349807739258, "kl": 0.12939453125, "learning_rate": 4.181173436492585e-07, "loss": 0.0052, "reward": 1.6111841797828674, "reward_std": 0.1274169199168682, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.626809149980545, "step": 1805 }, { "completion_length": 107.65625, "epoch": 1.1644100580270793, "grad_norm": 17.96258544921875, "kl": 0.208984375, "learning_rate": 4.177949709864603e-07, "loss": 0.0084, "reward": 1.539582073688507, "reward_std": 0.10280460864305496, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5552070736885071, "step": 1806 }, { "completion_length": 108.265625, "epoch": 1.1650548033526757, "grad_norm": 21.98755645751953, "kl": 0.15234375, "learning_rate": 4.174725983236622e-07, "loss": 0.0061, "reward": 1.6323410272598267, "reward_std": 0.1224982887506485, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6323409676551819, "step": 1807 }, { "completion_length": 104.3125, "epoch": 1.1656995486782722, "grad_norm": 19.422637939453125, "kl": 0.12890625, "learning_rate": 4.171502256608639e-07, "loss": 0.0051, "reward": 1.560622751712799, "reward_std": 0.06334619224071503, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5606227219104767, "step": 1808 }, { "completion_length": 96.25, "epoch": 1.1663442940038684, "grad_norm": 10.938532829284668, "kl": 0.12841796875, "learning_rate": 4.168278529980657e-07, "loss": 0.0051, "reward": 1.5526504516601562, "reward_std": 0.09076980128884315, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5526504814624786, "step": 1809 }, { "completion_length": 104.625, "epoch": 1.1669890393294649, "grad_norm": 21.19317054748535, "kl": 0.146484375, "learning_rate": 4.165054803352676e-07, "loss": 0.0058, "reward": 1.4290502071380615, "reward_std": 0.1222308985888958, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4290502667427063, "step": 1810 }, { "completion_length": 108.125, "epoch": 1.1676337846550613, "grad_norm": 10.554473876953125, "kl": 0.13671875, "learning_rate": 4.161831076724693e-07, "loss": 0.0055, "reward": 1.443765640258789, "reward_std": 0.053497713059186935, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44376565515995026, "step": 1811 }, { "completion_length": 118.703125, "epoch": 1.1682785299806577, "grad_norm": 23.218820571899414, "kl": 0.13037109375, "learning_rate": 4.158607350096712e-07, "loss": 0.0052, "reward": 1.5981783866882324, "reward_std": 0.05331315100193024, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5981782972812653, "step": 1812 }, { "completion_length": 126.234375, "epoch": 1.168923275306254, "grad_norm": 6.66879415512085, "kl": 0.130615234375, "learning_rate": 4.15538362346873e-07, "loss": 0.0052, "reward": 1.564515233039856, "reward_std": 0.09983974322676659, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.564515233039856, "step": 1813 }, { "completion_length": 112.984375, "epoch": 1.1695680206318504, "grad_norm": 9.480602264404297, "kl": 0.14208984375, "learning_rate": 4.152159896840748e-07, "loss": 0.0057, "reward": 1.7875486016273499, "reward_std": 0.09949357807636261, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7875485420227051, "step": 1814 }, { "completion_length": 104.921875, "epoch": 1.1702127659574468, "grad_norm": 6.718891620635986, "kl": 0.11669921875, "learning_rate": 4.148936170212766e-07, "loss": 0.0047, "reward": 1.604787290096283, "reward_std": 0.09039689414203167, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6047872602939606, "step": 1815 }, { "completion_length": 113.359375, "epoch": 1.1708575112830433, "grad_norm": 10.286612510681152, "kl": 0.154052734375, "learning_rate": 4.145712443584784e-07, "loss": 0.0062, "reward": 1.5428114533424377, "reward_std": 0.06623676046729088, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5428114533424377, "step": 1816 }, { "completion_length": 107.125, "epoch": 1.1715022566086395, "grad_norm": 18.38083267211914, "kl": 0.13232421875, "learning_rate": 4.142488716956802e-07, "loss": 0.0053, "reward": 1.596979796886444, "reward_std": 0.1776793971657753, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6126048564910889, "step": 1817 }, { "completion_length": 128.296875, "epoch": 1.172147001934236, "grad_norm": 12.867851257324219, "kl": 0.11962890625, "learning_rate": 4.1392649903288204e-07, "loss": 0.0048, "reward": 1.6555660367012024, "reward_std": 0.08787687122821808, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6555660367012024, "step": 1818 }, { "completion_length": 109.84375, "epoch": 1.1727917472598324, "grad_norm": 77.79759216308594, "kl": 0.111328125, "learning_rate": 4.136041263700838e-07, "loss": 0.0045, "reward": 1.6702940464019775, "reward_std": 0.08833744376897812, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6702939569950104, "step": 1819 }, { "completion_length": 112.0625, "epoch": 1.1734364925854288, "grad_norm": 24.407093048095703, "kl": 0.125732421875, "learning_rate": 4.1328175370728565e-07, "loss": 0.005, "reward": 1.694697380065918, "reward_std": 0.059025075286626816, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.694697380065918, "step": 1820 }, { "completion_length": 119.015625, "epoch": 1.1740812379110253, "grad_norm": 10.360937118530273, "kl": 0.13623046875, "learning_rate": 4.1295938104448743e-07, "loss": 0.0054, "reward": 1.6886966824531555, "reward_std": 0.11815246567130089, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6886966526508331, "step": 1821 }, { "completion_length": 121.21875, "epoch": 1.1747259832366215, "grad_norm": 24.1602783203125, "kl": 0.10791015625, "learning_rate": 4.126370083816892e-07, "loss": 0.0043, "reward": 1.549957811832428, "reward_std": 0.1373315379023552, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5812077820301056, "step": 1822 }, { "completion_length": 113.140625, "epoch": 1.175370728562218, "grad_norm": 16.046770095825195, "kl": 0.114990234375, "learning_rate": 4.1231463571889105e-07, "loss": 0.0046, "reward": 1.5942580699920654, "reward_std": 0.09112752042710781, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5942580997943878, "step": 1823 }, { "completion_length": 114.390625, "epoch": 1.1760154738878144, "grad_norm": 13.49719524383545, "kl": 0.138671875, "learning_rate": 4.1199226305609283e-07, "loss": 0.0056, "reward": 1.601550281047821, "reward_std": 0.0931050106883049, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.601550281047821, "step": 1824 }, { "completion_length": 108.75, "epoch": 1.1766602192134108, "grad_norm": 12.032121658325195, "kl": 0.11376953125, "learning_rate": 4.1166989039329466e-07, "loss": 0.0046, "reward": 1.6884644031524658, "reward_std": 0.14345430955290794, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7040894031524658, "step": 1825 }, { "completion_length": 110.671875, "epoch": 1.177304964539007, "grad_norm": 34.89378356933594, "kl": 0.11279296875, "learning_rate": 4.1134751773049644e-07, "loss": 0.0045, "reward": 1.675636649131775, "reward_std": 0.19111549109220505, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6912615597248077, "step": 1826 }, { "completion_length": 116.640625, "epoch": 1.1779497098646035, "grad_norm": 14.025679588317871, "kl": 0.124267578125, "learning_rate": 4.110251450676983e-07, "loss": 0.005, "reward": 1.477051854133606, "reward_std": 0.09772965125739574, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.47705191373825073, "step": 1827 }, { "completion_length": 111.4375, "epoch": 1.1785944551902, "grad_norm": 12.973898887634277, "kl": 0.19384765625, "learning_rate": 4.1070277240490005e-07, "loss": 0.0077, "reward": 1.724359393119812, "reward_std": 0.13874923437833786, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7243594825267792, "step": 1828 }, { "completion_length": 129.140625, "epoch": 1.1792392005157963, "grad_norm": 14.122653007507324, "kl": 0.121337890625, "learning_rate": 4.103803997421019e-07, "loss": 0.0048, "reward": 1.600561499595642, "reward_std": 0.10613074153661728, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6005614995956421, "step": 1829 }, { "completion_length": 108.09375, "epoch": 1.1798839458413926, "grad_norm": 18.514957427978516, "kl": 0.153564453125, "learning_rate": 4.1005802707930367e-07, "loss": 0.0061, "reward": 1.6020503640174866, "reward_std": 0.08536173682659864, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.602050393819809, "step": 1830 }, { "completion_length": 106.703125, "epoch": 1.180528691166989, "grad_norm": 12.282980918884277, "kl": 0.11474609375, "learning_rate": 4.097356544165055e-07, "loss": 0.0046, "reward": 1.552680253982544, "reward_std": 0.05671044811606407, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5526802390813828, "step": 1831 }, { "completion_length": 118.015625, "epoch": 1.1811734364925854, "grad_norm": 12.24289321899414, "kl": 0.1318359375, "learning_rate": 4.094132817537073e-07, "loss": 0.0053, "reward": 1.5175416469573975, "reward_std": 0.07208650559186935, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5175416022539139, "step": 1832 }, { "completion_length": 107.796875, "epoch": 1.1818181818181819, "grad_norm": 10.43206787109375, "kl": 0.142578125, "learning_rate": 4.090909090909091e-07, "loss": 0.0057, "reward": 1.4518251419067383, "reward_std": 0.07507746666669846, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.45182517170906067, "step": 1833 }, { "completion_length": 113.875, "epoch": 1.1824629271437783, "grad_norm": 30.288999557495117, "kl": 0.1083984375, "learning_rate": 4.087685364281109e-07, "loss": 0.0043, "reward": 1.7571146488189697, "reward_std": 0.07010109722614288, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7571147084236145, "step": 1834 }, { "completion_length": 112.453125, "epoch": 1.1831076724693745, "grad_norm": 12.52745532989502, "kl": 0.215087890625, "learning_rate": 4.084461637653127e-07, "loss": 0.0086, "reward": 1.6558008193969727, "reward_std": 0.10477161407470703, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6558008193969727, "step": 1835 }, { "completion_length": 109.578125, "epoch": 1.183752417794971, "grad_norm": 13.707616806030273, "kl": 0.116455078125, "learning_rate": 4.081237911025145e-07, "loss": 0.0047, "reward": 1.6503602862358093, "reward_std": 0.08982540853321552, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6503602713346481, "step": 1836 }, { "completion_length": 105.71875, "epoch": 1.1843971631205674, "grad_norm": 11.990213394165039, "kl": 0.11669921875, "learning_rate": 4.078014184397163e-07, "loss": 0.0047, "reward": 1.503011703491211, "reward_std": 0.22921422868967056, "rewards/format_reward": 0.96875, "rewards/iou_timestamp_reward": 0.5342617332935333, "step": 1837 }, { "completion_length": 108.46875, "epoch": 1.1850419084461639, "grad_norm": 20.66301727294922, "kl": 0.12109375, "learning_rate": 4.074790457769181e-07, "loss": 0.0048, "reward": 1.5639979243278503, "reward_std": 0.11984455585479736, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5639979243278503, "step": 1838 }, { "completion_length": 126.203125, "epoch": 1.18568665377176, "grad_norm": 63.44125747680664, "kl": 0.101806640625, "learning_rate": 4.071566731141199e-07, "loss": 0.0041, "reward": 1.6595558524131775, "reward_std": 0.11242682486772537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6595558822154999, "step": 1839 }, { "completion_length": 110.40625, "epoch": 1.1863313990973565, "grad_norm": 19.61737632751465, "kl": 0.1181640625, "learning_rate": 4.0683430045132174e-07, "loss": 0.0047, "reward": 1.5931558012962341, "reward_std": 0.14882973581552505, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5931558012962341, "step": 1840 }, { "completion_length": 114.671875, "epoch": 1.186976144422953, "grad_norm": 27.75486183166504, "kl": 0.1435546875, "learning_rate": 4.065119277885235e-07, "loss": 0.0057, "reward": 1.7199392914772034, "reward_std": 0.12808328121900558, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.719939261674881, "step": 1841 }, { "completion_length": 109.953125, "epoch": 1.1876208897485494, "grad_norm": 13.68565845489502, "kl": 0.10498046875, "learning_rate": 4.0618955512572535e-07, "loss": 0.0042, "reward": 1.5208486318588257, "reward_std": 0.14144017547369003, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5364736467599869, "step": 1842 }, { "completion_length": 113.890625, "epoch": 1.1882656350741456, "grad_norm": 15.589136123657227, "kl": 0.11279296875, "learning_rate": 4.0586718246292713e-07, "loss": 0.0045, "reward": 1.7398034930229187, "reward_std": 0.04764362797141075, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7398034334182739, "step": 1843 }, { "completion_length": 105.359375, "epoch": 1.188910380399742, "grad_norm": 14.390912055969238, "kl": 0.13037109375, "learning_rate": 4.0554480980012897e-07, "loss": 0.0052, "reward": 1.3299148678779602, "reward_std": 0.09040863066911697, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3299148678779602, "step": 1844 }, { "completion_length": 107.78125, "epoch": 1.1895551257253385, "grad_norm": 28.451305389404297, "kl": 0.135009765625, "learning_rate": 4.0522243713733075e-07, "loss": 0.0054, "reward": 1.469842255115509, "reward_std": 0.11970901861786842, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4698422998189926, "step": 1845 }, { "completion_length": 107.296875, "epoch": 1.190199871050935, "grad_norm": 13.97714900970459, "kl": 0.117919921875, "learning_rate": 4.049000644745326e-07, "loss": 0.0047, "reward": 1.7399972081184387, "reward_std": 0.11769672483205795, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7399971783161163, "step": 1846 }, { "completion_length": 113.75, "epoch": 1.1908446163765314, "grad_norm": 8.553421020507812, "kl": 0.119140625, "learning_rate": 4.0457769181173436e-07, "loss": 0.0048, "reward": 1.6283788084983826, "reward_std": 0.11559978127479553, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6283787786960602, "step": 1847 }, { "completion_length": 104.359375, "epoch": 1.1914893617021276, "grad_norm": 8.622238159179688, "kl": 0.14990234375, "learning_rate": 4.0425531914893614e-07, "loss": 0.006, "reward": 1.5790454745292664, "reward_std": 0.09102013148367405, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5790454745292664, "step": 1848 }, { "completion_length": 103.78125, "epoch": 1.192134107027724, "grad_norm": 19.665542602539062, "kl": 0.1298828125, "learning_rate": 4.03932946486138e-07, "loss": 0.0052, "reward": 1.6341629028320312, "reward_std": 0.07100488245487213, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6341629028320312, "step": 1849 }, { "completion_length": 109.203125, "epoch": 1.1927788523533205, "grad_norm": 23.64558219909668, "kl": 0.109619140625, "learning_rate": 4.0361057382333975e-07, "loss": 0.0044, "reward": 1.6545790433883667, "reward_std": 0.0724497102200985, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6545790731906891, "step": 1850 }, { "completion_length": 109.15625, "epoch": 1.1934235976789167, "grad_norm": 10.852958679199219, "kl": 0.12451171875, "learning_rate": 4.032882011605416e-07, "loss": 0.005, "reward": 1.6551001071929932, "reward_std": 0.10190777480602264, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6551000475883484, "step": 1851 }, { "completion_length": 107.171875, "epoch": 1.1940683430045131, "grad_norm": 7.953454494476318, "kl": 0.130859375, "learning_rate": 4.0296582849774337e-07, "loss": 0.0053, "reward": 1.5733691453933716, "reward_std": 0.09967935457825661, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.573369175195694, "step": 1852 }, { "completion_length": 114.265625, "epoch": 1.1947130883301096, "grad_norm": 11.328710556030273, "kl": 0.14306640625, "learning_rate": 4.026434558349452e-07, "loss": 0.0057, "reward": 1.7428075671195984, "reward_std": 0.10012370720505714, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7428075969219208, "step": 1853 }, { "completion_length": 123.390625, "epoch": 1.195357833655706, "grad_norm": 20.28865623474121, "kl": 0.103759765625, "learning_rate": 4.02321083172147e-07, "loss": 0.0041, "reward": 1.5555278062820435, "reward_std": 0.09157108515501022, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5555277466773987, "step": 1854 }, { "completion_length": 122.921875, "epoch": 1.1960025789813025, "grad_norm": 8.482815742492676, "kl": 0.11962890625, "learning_rate": 4.019987105093488e-07, "loss": 0.0048, "reward": 1.5178706645965576, "reward_std": 0.08336883038282394, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5178706347942352, "step": 1855 }, { "completion_length": 118.125, "epoch": 1.1966473243068987, "grad_norm": 9.631011962890625, "kl": 0.101318359375, "learning_rate": 4.016763378465506e-07, "loss": 0.0041, "reward": 1.659381926059723, "reward_std": 0.06883524730801582, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6593818664550781, "step": 1856 }, { "completion_length": 119.8125, "epoch": 1.1972920696324951, "grad_norm": 8.929444313049316, "kl": 0.103515625, "learning_rate": 4.0135396518375243e-07, "loss": 0.0041, "reward": 1.5685328841209412, "reward_std": 0.10261768102645874, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5685328543186188, "step": 1857 }, { "completion_length": 105.3125, "epoch": 1.1979368149580916, "grad_norm": 32.578330993652344, "kl": 0.1083984375, "learning_rate": 4.010315925209542e-07, "loss": 0.0043, "reward": 1.5939223766326904, "reward_std": 0.10584653168916702, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5939223766326904, "step": 1858 }, { "completion_length": 119.296875, "epoch": 1.198581560283688, "grad_norm": 9.094733238220215, "kl": 0.10302734375, "learning_rate": 4.0070921985815604e-07, "loss": 0.0041, "reward": 1.7315629124641418, "reward_std": 0.10077636688947678, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7315629422664642, "step": 1859 }, { "completion_length": 110.34375, "epoch": 1.1992263056092844, "grad_norm": 13.449752807617188, "kl": 0.118408203125, "learning_rate": 4.003868471953578e-07, "loss": 0.0047, "reward": 1.6524174809455872, "reward_std": 0.13661283254623413, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6524175405502319, "step": 1860 }, { "completion_length": 110.25, "epoch": 1.1998710509348807, "grad_norm": 11.15349292755127, "kl": 0.117431640625, "learning_rate": 4.0006447453255966e-07, "loss": 0.0047, "reward": 1.5107752680778503, "reward_std": 0.21871937066316605, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5264002084732056, "step": 1861 }, { "completion_length": 126.234375, "epoch": 1.200515796260477, "grad_norm": 8.474930763244629, "kl": 0.1123046875, "learning_rate": 3.9974210186976144e-07, "loss": 0.0045, "reward": 1.515737771987915, "reward_std": 0.11956487596035004, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.515737771987915, "step": 1862 }, { "completion_length": 118.875, "epoch": 1.2011605415860735, "grad_norm": 14.834139823913574, "kl": 0.12109375, "learning_rate": 3.994197292069632e-07, "loss": 0.0048, "reward": 1.6979880928993225, "reward_std": 0.17543792724609375, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.7136130630970001, "step": 1863 }, { "completion_length": 106.984375, "epoch": 1.2018052869116698, "grad_norm": 18.74297332763672, "kl": 0.104736328125, "learning_rate": 3.9909735654416505e-07, "loss": 0.0042, "reward": 1.5749844312667847, "reward_std": 0.13023612275719643, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5906094610691071, "step": 1864 }, { "completion_length": 109.015625, "epoch": 1.2024500322372662, "grad_norm": 10.109322547912598, "kl": 0.14599609375, "learning_rate": 3.9877498388136683e-07, "loss": 0.0058, "reward": 1.6240103244781494, "reward_std": 0.18006663024425507, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6240103244781494, "step": 1865 }, { "completion_length": 115.53125, "epoch": 1.2030947775628626, "grad_norm": 98.04276275634766, "kl": 0.119384765625, "learning_rate": 3.9845261121856867e-07, "loss": 0.0048, "reward": 1.6898424625396729, "reward_std": 0.08776196092367172, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6898424625396729, "step": 1866 }, { "completion_length": 111.0625, "epoch": 1.203739522888459, "grad_norm": 16.498126983642578, "kl": 0.114990234375, "learning_rate": 3.9813023855577045e-07, "loss": 0.0046, "reward": 1.4743621349334717, "reward_std": 0.20478830486536026, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.4899871349334717, "step": 1867 }, { "completion_length": 119.71875, "epoch": 1.2043842682140555, "grad_norm": 15.135804176330566, "kl": 0.11572265625, "learning_rate": 3.978078658929723e-07, "loss": 0.0046, "reward": 1.4399540424346924, "reward_std": 0.11498159915208817, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.43995407223701477, "step": 1868 }, { "completion_length": 111.0, "epoch": 1.2050290135396517, "grad_norm": 10.168272972106934, "kl": 0.118408203125, "learning_rate": 3.9748549323017406e-07, "loss": 0.0047, "reward": 1.5744973421096802, "reward_std": 0.05219046585261822, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5744973719120026, "step": 1869 }, { "completion_length": 117.65625, "epoch": 1.2056737588652482, "grad_norm": 7.834216117858887, "kl": 0.12060546875, "learning_rate": 3.971631205673759e-07, "loss": 0.0048, "reward": 1.6258432269096375, "reward_std": 0.07550669834017754, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6258432269096375, "step": 1870 }, { "completion_length": 98.6875, "epoch": 1.2063185041908446, "grad_norm": 13.367279052734375, "kl": 0.12451171875, "learning_rate": 3.968407479045777e-07, "loss": 0.005, "reward": 1.5441078543663025, "reward_std": 0.11310616880655289, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5441079139709473, "step": 1871 }, { "completion_length": 114.359375, "epoch": 1.206963249516441, "grad_norm": 11.95669937133789, "kl": 0.113037109375, "learning_rate": 3.965183752417795e-07, "loss": 0.0045, "reward": 1.6491506695747375, "reward_std": 0.15457647293806076, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6647757291793823, "step": 1872 }, { "completion_length": 123.53125, "epoch": 1.2076079948420375, "grad_norm": 23.579849243164062, "kl": 0.103515625, "learning_rate": 3.961960025789813e-07, "loss": 0.0041, "reward": 1.4970921874046326, "reward_std": 0.1336561180651188, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.497092142701149, "step": 1873 }, { "completion_length": 106.109375, "epoch": 1.2082527401676337, "grad_norm": 12.425701141357422, "kl": 0.1162109375, "learning_rate": 3.958736299161831e-07, "loss": 0.0046, "reward": 1.50269216299057, "reward_std": 0.0642969198524952, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5026921331882477, "step": 1874 }, { "completion_length": 100.109375, "epoch": 1.2088974854932302, "grad_norm": 9.212443351745605, "kl": 0.1416015625, "learning_rate": 3.955512572533849e-07, "loss": 0.0057, "reward": 1.757175624370575, "reward_std": 0.17486996948719025, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.772800624370575, "step": 1875 }, { "completion_length": 112.890625, "epoch": 1.2095422308188266, "grad_norm": 12.235011100769043, "kl": 0.1220703125, "learning_rate": 3.952288845905867e-07, "loss": 0.0049, "reward": 1.5499593615531921, "reward_std": 0.12618184834718704, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5499593615531921, "step": 1876 }, { "completion_length": 113.53125, "epoch": 1.2101869761444228, "grad_norm": 41.290550231933594, "kl": 0.1318359375, "learning_rate": 3.949065119277885e-07, "loss": 0.0053, "reward": 1.5221721529960632, "reward_std": 0.10043122246861458, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.522172138094902, "step": 1877 }, { "completion_length": 116.4375, "epoch": 1.2108317214700193, "grad_norm": 10.30317497253418, "kl": 0.13720703125, "learning_rate": 3.945841392649903e-07, "loss": 0.0055, "reward": 1.652651309967041, "reward_std": 0.14301005005836487, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6682763397693634, "step": 1878 }, { "completion_length": 120.375, "epoch": 1.2114764667956157, "grad_norm": 7.7513580322265625, "kl": 0.140625, "learning_rate": 3.9426176660219213e-07, "loss": 0.0056, "reward": 1.447944164276123, "reward_std": 0.09541654214262962, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.44794413447380066, "step": 1879 }, { "completion_length": 119.65625, "epoch": 1.2121212121212122, "grad_norm": 16.127681732177734, "kl": 0.120361328125, "learning_rate": 3.939393939393939e-07, "loss": 0.0048, "reward": 1.594392716884613, "reward_std": 0.08314290270209312, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5943926572799683, "step": 1880 }, { "completion_length": 128.78125, "epoch": 1.2127659574468086, "grad_norm": 56.94392013549805, "kl": 0.111328125, "learning_rate": 3.9361702127659574e-07, "loss": 0.0045, "reward": 1.7492846846580505, "reward_std": 0.07751237228512764, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7492846846580505, "step": 1881 }, { "completion_length": 115.03125, "epoch": 1.2134107027724048, "grad_norm": 22.434030532836914, "kl": 0.15625, "learning_rate": 3.932946486137975e-07, "loss": 0.0063, "reward": 1.5479236245155334, "reward_std": 0.08889007940888405, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5479235649108887, "step": 1882 }, { "completion_length": 108.109375, "epoch": 1.2140554480980013, "grad_norm": 7.91255521774292, "kl": 0.115234375, "learning_rate": 3.9297227595099936e-07, "loss": 0.0046, "reward": 1.674900233745575, "reward_std": 0.08419017121195793, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6749002635478973, "step": 1883 }, { "completion_length": 120.5625, "epoch": 1.2147001934235977, "grad_norm": 42.26613998413086, "kl": 0.12255859375, "learning_rate": 3.9264990328820114e-07, "loss": 0.0049, "reward": 1.6492977738380432, "reward_std": 0.14469093456864357, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6492977738380432, "step": 1884 }, { "completion_length": 124.046875, "epoch": 1.2153449387491941, "grad_norm": 35.12214279174805, "kl": 0.1220703125, "learning_rate": 3.9232753062540297e-07, "loss": 0.0049, "reward": 1.5660077929496765, "reward_std": 0.08853030204772949, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5660077631473541, "step": 1885 }, { "completion_length": 119.0, "epoch": 1.2159896840747906, "grad_norm": 10.311363220214844, "kl": 0.111083984375, "learning_rate": 3.9200515796260475e-07, "loss": 0.0044, "reward": 1.7127981781959534, "reward_std": 0.1427604779601097, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.712798148393631, "step": 1886 }, { "completion_length": 117.796875, "epoch": 1.2166344294003868, "grad_norm": 10.61994457244873, "kl": 0.1171875, "learning_rate": 3.916827852998066e-07, "loss": 0.0047, "reward": 1.6093450784683228, "reward_std": 0.1268884353339672, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6093450784683228, "step": 1887 }, { "completion_length": 125.671875, "epoch": 1.2172791747259832, "grad_norm": 10.94974136352539, "kl": 0.108642578125, "learning_rate": 3.9136041263700837e-07, "loss": 0.0043, "reward": 1.7339577674865723, "reward_std": 0.06080514006316662, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7339577078819275, "step": 1888 }, { "completion_length": 130.171875, "epoch": 1.2179239200515797, "grad_norm": 29.69418716430664, "kl": 0.11669921875, "learning_rate": 3.9103803997421015e-07, "loss": 0.0047, "reward": 1.546896517276764, "reward_std": 0.10660860687494278, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5468964874744415, "step": 1889 }, { "completion_length": 109.1875, "epoch": 1.218568665377176, "grad_norm": 10.2553071975708, "kl": 0.129638671875, "learning_rate": 3.90715667311412e-07, "loss": 0.0052, "reward": 1.7039321660995483, "reward_std": 0.1126185953617096, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7039321660995483, "step": 1890 }, { "completion_length": 127.015625, "epoch": 1.2192134107027723, "grad_norm": 15.430004119873047, "kl": 0.09765625, "learning_rate": 3.9039329464861376e-07, "loss": 0.0039, "reward": 1.644881248474121, "reward_std": 0.0988176241517067, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6448812484741211, "step": 1891 }, { "completion_length": 120.84375, "epoch": 1.2198581560283688, "grad_norm": 52.64883041381836, "kl": 0.114013671875, "learning_rate": 3.900709219858156e-07, "loss": 0.0046, "reward": 1.7061986923217773, "reward_std": 0.0526033453643322, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7061986327171326, "step": 1892 }, { "completion_length": 136.9375, "epoch": 1.2205029013539652, "grad_norm": 11.081215858459473, "kl": 0.13623046875, "learning_rate": 3.897485493230174e-07, "loss": 0.0054, "reward": 1.4815837144851685, "reward_std": 0.07057693414390087, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48158374428749084, "step": 1893 }, { "completion_length": 113.53125, "epoch": 1.2211476466795617, "grad_norm": 9.108400344848633, "kl": 0.13623046875, "learning_rate": 3.894261766602192e-07, "loss": 0.0054, "reward": 1.6832290291786194, "reward_std": 0.08856482990086079, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6988540291786194, "step": 1894 }, { "completion_length": 116.5, "epoch": 1.2217923920051579, "grad_norm": 33.61598587036133, "kl": 0.107666015625, "learning_rate": 3.89103803997421e-07, "loss": 0.0043, "reward": 1.6324860453605652, "reward_std": 0.09398233145475388, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6324860155582428, "step": 1895 }, { "completion_length": 117.90625, "epoch": 1.2224371373307543, "grad_norm": 14.763751029968262, "kl": 0.127685546875, "learning_rate": 3.887814313346228e-07, "loss": 0.0051, "reward": 1.5660215020179749, "reward_std": 0.09480106830596924, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5660214722156525, "step": 1896 }, { "completion_length": 119.46875, "epoch": 1.2230818826563508, "grad_norm": 23.6367244720459, "kl": 0.1298828125, "learning_rate": 3.884590586718246e-07, "loss": 0.0052, "reward": 1.598044991493225, "reward_std": 0.121733158826828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5980449616909027, "step": 1897 }, { "completion_length": 109.390625, "epoch": 1.2237266279819472, "grad_norm": 12.922637939453125, "kl": 0.1123046875, "learning_rate": 3.8813668600902644e-07, "loss": 0.0045, "reward": 1.6700684428215027, "reward_std": 0.07409593462944031, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6700685024261475, "step": 1898 }, { "completion_length": 119.078125, "epoch": 1.2243713733075436, "grad_norm": 42.0638542175293, "kl": 0.11474609375, "learning_rate": 3.878143133462282e-07, "loss": 0.0046, "reward": 1.6279435753822327, "reward_std": 0.11568474024534225, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6279435753822327, "step": 1899 }, { "completion_length": 120.109375, "epoch": 1.2250161186331399, "grad_norm": 7.401854038238525, "kl": 0.1142578125, "learning_rate": 3.8749194068343005e-07, "loss": 0.0046, "reward": 1.5463709831237793, "reward_std": 0.12193901836872101, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5463710427284241, "step": 1900 }, { "completion_length": 119.0625, "epoch": 1.2256608639587363, "grad_norm": 10.23751163482666, "kl": 0.107421875, "learning_rate": 3.8716956802063183e-07, "loss": 0.0043, "reward": 1.59816575050354, "reward_std": 0.08526627346873283, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5981658101081848, "step": 1901 }, { "completion_length": 128.453125, "epoch": 1.2263056092843327, "grad_norm": 8.265456199645996, "kl": 0.098876953125, "learning_rate": 3.868471953578336e-07, "loss": 0.004, "reward": 1.820991039276123, "reward_std": 0.05691416189074516, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.8209910690784454, "step": 1902 }, { "completion_length": 116.3125, "epoch": 1.226950354609929, "grad_norm": 10.897770881652832, "kl": 0.104736328125, "learning_rate": 3.8652482269503544e-07, "loss": 0.0042, "reward": 1.617471992969513, "reward_std": 0.08763501048088074, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6174720227718353, "step": 1903 }, { "completion_length": 117.328125, "epoch": 1.2275950999355254, "grad_norm": 11.371556282043457, "kl": 0.129150390625, "learning_rate": 3.862024500322372e-07, "loss": 0.0052, "reward": 1.4868090152740479, "reward_std": 0.06387560069561005, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48680898547172546, "step": 1904 }, { "completion_length": 127.53125, "epoch": 1.2282398452611218, "grad_norm": 7.77946138381958, "kl": 0.12060546875, "learning_rate": 3.8588007736943906e-07, "loss": 0.0048, "reward": 1.6102416515350342, "reward_std": 0.15693595260381699, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6102416068315506, "step": 1905 }, { "completion_length": 111.6875, "epoch": 1.2288845905867183, "grad_norm": 16.03278160095215, "kl": 0.107666015625, "learning_rate": 3.8555770470664084e-07, "loss": 0.0043, "reward": 1.5970102548599243, "reward_std": 0.06430243700742722, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5970102250576019, "step": 1906 }, { "completion_length": 119.453125, "epoch": 1.2295293359123147, "grad_norm": 10.253130912780762, "kl": 0.14599609375, "learning_rate": 3.8523533204384267e-07, "loss": 0.0058, "reward": 1.6699411273002625, "reward_std": 0.09906578063964844, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6699410974979401, "step": 1907 }, { "completion_length": 120.21875, "epoch": 1.230174081237911, "grad_norm": 10.90766429901123, "kl": 0.116943359375, "learning_rate": 3.8491295938104445e-07, "loss": 0.0047, "reward": 1.7290301322937012, "reward_std": 0.12131063267588615, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7290300726890564, "step": 1908 }, { "completion_length": 129.828125, "epoch": 1.2308188265635074, "grad_norm": 9.48066234588623, "kl": 0.1123046875, "learning_rate": 3.845905867182463e-07, "loss": 0.0045, "reward": 1.6760023832321167, "reward_std": 0.14010852575302124, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6916274130344391, "step": 1909 }, { "completion_length": 120.15625, "epoch": 1.2314635718891038, "grad_norm": 13.372663497924805, "kl": 0.106201171875, "learning_rate": 3.8426821405544807e-07, "loss": 0.0043, "reward": 1.489621639251709, "reward_std": 0.0755012072622776, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48962169885635376, "step": 1910 }, { "completion_length": 136.375, "epoch": 1.2321083172147003, "grad_norm": 32.22721481323242, "kl": 0.126708984375, "learning_rate": 3.839458413926499e-07, "loss": 0.0051, "reward": 1.5128714442253113, "reward_std": 0.15325216948986053, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5284964740276337, "step": 1911 }, { "completion_length": 114.5625, "epoch": 1.2327530625402967, "grad_norm": 15.361464500427246, "kl": 0.11865234375, "learning_rate": 3.836234687298517e-07, "loss": 0.0047, "reward": 1.4920198321342468, "reward_std": 0.10974390432238579, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4920198917388916, "step": 1912 }, { "completion_length": 126.1875, "epoch": 1.233397807865893, "grad_norm": 18.84747886657715, "kl": 0.112548828125, "learning_rate": 3.833010960670535e-07, "loss": 0.0045, "reward": 1.4142240285873413, "reward_std": 0.04326391592621803, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4142240881919861, "step": 1913 }, { "completion_length": 126.609375, "epoch": 1.2340425531914894, "grad_norm": 25.70592498779297, "kl": 0.120361328125, "learning_rate": 3.829787234042553e-07, "loss": 0.0048, "reward": 1.6135888695716858, "reward_std": 0.16180770099163055, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.613588809967041, "step": 1914 }, { "completion_length": 125.203125, "epoch": 1.2346872985170858, "grad_norm": 30.81764793395996, "kl": 0.122802734375, "learning_rate": 3.8265635074145713e-07, "loss": 0.0049, "reward": 1.570852518081665, "reward_std": 0.12249016016721725, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5708525478839874, "step": 1915 }, { "completion_length": 126.09375, "epoch": 1.235332043842682, "grad_norm": 22.804471969604492, "kl": 0.116455078125, "learning_rate": 3.823339780786589e-07, "loss": 0.0047, "reward": 1.7204737663269043, "reward_std": 0.04909878969192505, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7204737663269043, "step": 1916 }, { "completion_length": 106.25, "epoch": 1.2359767891682785, "grad_norm": 22.267070770263672, "kl": 0.126220703125, "learning_rate": 3.820116054158607e-07, "loss": 0.0051, "reward": 1.7154393792152405, "reward_std": 0.1028110608458519, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7154394090175629, "step": 1917 }, { "completion_length": 124.78125, "epoch": 1.236621534493875, "grad_norm": 29.534210205078125, "kl": 0.108642578125, "learning_rate": 3.816892327530625e-07, "loss": 0.0043, "reward": 1.46629136800766, "reward_std": 0.09430722147226334, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4662914276123047, "step": 1918 }, { "completion_length": 121.921875, "epoch": 1.2372662798194713, "grad_norm": 15.665167808532715, "kl": 0.10986328125, "learning_rate": 3.813668600902643e-07, "loss": 0.0044, "reward": 1.5596240758895874, "reward_std": 0.10030604526400566, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5596240758895874, "step": 1919 }, { "completion_length": 114.5, "epoch": 1.2379110251450678, "grad_norm": 8.512285232543945, "kl": 0.115234375, "learning_rate": 3.8104448742746614e-07, "loss": 0.0046, "reward": 1.6296950578689575, "reward_std": 0.05596102774143219, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6296949684619904, "step": 1920 }, { "completion_length": 115.8125, "epoch": 1.238555770470664, "grad_norm": 79.31735229492188, "kl": 0.107666015625, "learning_rate": 3.807221147646679e-07, "loss": 0.0043, "reward": 1.5508543848991394, "reward_std": 0.09145446866750717, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.550854355096817, "step": 1921 }, { "completion_length": 119.015625, "epoch": 1.2392005157962604, "grad_norm": 11.683690071105957, "kl": 0.131103515625, "learning_rate": 3.8039974210186975e-07, "loss": 0.0052, "reward": 1.7141978740692139, "reward_std": 0.11888259276747704, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7141977846622467, "step": 1922 }, { "completion_length": 117.203125, "epoch": 1.239845261121857, "grad_norm": 24.96306800842285, "kl": 0.123779296875, "learning_rate": 3.8007736943907153e-07, "loss": 0.0049, "reward": 1.646182358264923, "reward_std": 0.09359883144497871, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6461824774742126, "step": 1923 }, { "completion_length": 119.828125, "epoch": 1.2404900064474533, "grad_norm": 13.091527938842773, "kl": 0.125, "learning_rate": 3.7975499677627336e-07, "loss": 0.005, "reward": 1.4956902861595154, "reward_std": 0.10586478561162949, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4956902712583542, "step": 1924 }, { "completion_length": 116.171875, "epoch": 1.2411347517730495, "grad_norm": 9.332618713378906, "kl": 0.100341796875, "learning_rate": 3.7943262411347514e-07, "loss": 0.004, "reward": 1.715123176574707, "reward_std": 0.056105077266693115, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7151232659816742, "step": 1925 }, { "completion_length": 124.453125, "epoch": 1.241779497098646, "grad_norm": 6.001155853271484, "kl": 0.1201171875, "learning_rate": 3.79110251450677e-07, "loss": 0.0048, "reward": 1.5464325547218323, "reward_std": 0.09315594844520092, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.546432614326477, "step": 1926 }, { "completion_length": 117.265625, "epoch": 1.2424242424242424, "grad_norm": 10.31252384185791, "kl": 0.105224609375, "learning_rate": 3.7878787878787876e-07, "loss": 0.0042, "reward": 1.5988069772720337, "reward_std": 0.0450415788218379, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5988070219755173, "step": 1927 }, { "completion_length": 122.15625, "epoch": 1.2430689877498389, "grad_norm": 10.713313102722168, "kl": 0.1318359375, "learning_rate": 3.784655061250806e-07, "loss": 0.0053, "reward": 1.5424224138259888, "reward_std": 0.0438772514462471, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5424223840236664, "step": 1928 }, { "completion_length": 110.421875, "epoch": 1.243713733075435, "grad_norm": 16.321693420410156, "kl": 0.108642578125, "learning_rate": 3.7814313346228237e-07, "loss": 0.0043, "reward": 1.7239617109298706, "reward_std": 0.07088684663176537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.723961740732193, "step": 1929 }, { "completion_length": 116.0, "epoch": 1.2443584784010315, "grad_norm": 12.18964672088623, "kl": 0.130859375, "learning_rate": 3.7782076079948415e-07, "loss": 0.0052, "reward": 1.5831393599510193, "reward_std": 0.1350330039858818, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5831393897533417, "step": 1930 }, { "completion_length": 107.234375, "epoch": 1.245003223726628, "grad_norm": 26.728723526000977, "kl": 0.1142578125, "learning_rate": 3.77498388136686e-07, "loss": 0.0046, "reward": 1.5077446699142456, "reward_std": 0.1127016618847847, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5077447444200516, "step": 1931 }, { "completion_length": 114.515625, "epoch": 1.2456479690522244, "grad_norm": 14.777571678161621, "kl": 0.11279296875, "learning_rate": 3.7717601547388777e-07, "loss": 0.0045, "reward": 1.5410062074661255, "reward_std": 0.09689564630389214, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5410062372684479, "step": 1932 }, { "completion_length": 109.03125, "epoch": 1.2462927143778209, "grad_norm": 5.873917102813721, "kl": 0.140380859375, "learning_rate": 3.768536428110896e-07, "loss": 0.0056, "reward": 1.4805980324745178, "reward_std": 0.08332928642630577, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.480598047375679, "step": 1933 }, { "completion_length": 112.921875, "epoch": 1.246937459703417, "grad_norm": 54.74165725708008, "kl": 0.162841796875, "learning_rate": 3.765312701482914e-07, "loss": 0.0065, "reward": 1.706473171710968, "reward_std": 0.09516463428735733, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7064732313156128, "step": 1934 }, { "completion_length": 107.96875, "epoch": 1.2475822050290135, "grad_norm": 22.8502254486084, "kl": 0.10888671875, "learning_rate": 3.762088974854932e-07, "loss": 0.0044, "reward": 1.5394970774650574, "reward_std": 0.08196517452597618, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5394971370697021, "step": 1935 }, { "completion_length": 108.953125, "epoch": 1.24822695035461, "grad_norm": 13.624421119689941, "kl": 0.125, "learning_rate": 3.75886524822695e-07, "loss": 0.005, "reward": 1.4955153465270996, "reward_std": 0.0882682166993618, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4955153614282608, "step": 1936 }, { "completion_length": 98.75, "epoch": 1.2488716956802064, "grad_norm": 12.699457168579102, "kl": 0.114501953125, "learning_rate": 3.7556415215989683e-07, "loss": 0.0046, "reward": 1.7613229155540466, "reward_std": 0.05144652538001537, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7613229155540466, "step": 1937 }, { "completion_length": 113.28125, "epoch": 1.2495164410058026, "grad_norm": 233.15621948242188, "kl": 0.10888671875, "learning_rate": 3.752417794970986e-07, "loss": 0.0044, "reward": 1.5607260465621948, "reward_std": 0.10854167118668556, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5607260465621948, "step": 1938 }, { "completion_length": 102.078125, "epoch": 1.250161186331399, "grad_norm": 12.188179969787598, "kl": 0.115234375, "learning_rate": 3.7491940683430044e-07, "loss": 0.0046, "reward": 1.6384233236312866, "reward_std": 0.0817994475364685, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6384232938289642, "step": 1939 }, { "completion_length": 107.4375, "epoch": 1.2508059316569955, "grad_norm": 13.628211975097656, "kl": 0.123291015625, "learning_rate": 3.745970341715022e-07, "loss": 0.0049, "reward": 1.6886098980903625, "reward_std": 0.09688212350010872, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6886099278926849, "step": 1940 }, { "completion_length": 114.46875, "epoch": 1.251450676982592, "grad_norm": 25.029949188232422, "kl": 0.12646484375, "learning_rate": 3.7427466150870406e-07, "loss": 0.005, "reward": 1.6348047256469727, "reward_std": 0.11835089325904846, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6348046958446503, "step": 1941 }, { "completion_length": 115.5625, "epoch": 1.2520954223081882, "grad_norm": 8.084458351135254, "kl": 0.108642578125, "learning_rate": 3.7395228884590584e-07, "loss": 0.0043, "reward": 1.6494329571723938, "reward_std": 0.06459452584385872, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6494330167770386, "step": 1942 }, { "completion_length": 116.90625, "epoch": 1.2527401676337846, "grad_norm": 12.821890830993652, "kl": 0.10595703125, "learning_rate": 3.736299161831076e-07, "loss": 0.0042, "reward": 1.5637995600700378, "reward_std": 0.06873523443937302, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5637995451688766, "step": 1943 }, { "completion_length": 121.9375, "epoch": 1.253384912959381, "grad_norm": 10.103633880615234, "kl": 0.1142578125, "learning_rate": 3.7330754352030945e-07, "loss": 0.0046, "reward": 1.4810506105422974, "reward_std": 0.08668520301580429, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48105068504810333, "step": 1944 }, { "completion_length": 121.0, "epoch": 1.2540296582849775, "grad_norm": 12.35737419128418, "kl": 0.11767578125, "learning_rate": 3.7298517085751123e-07, "loss": 0.0047, "reward": 1.585673987865448, "reward_std": 0.12142669595777988, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5856739580631256, "step": 1945 }, { "completion_length": 117.4375, "epoch": 1.254674403610574, "grad_norm": 11.945507049560547, "kl": 0.1455078125, "learning_rate": 3.7266279819471306e-07, "loss": 0.0058, "reward": 1.4567408561706543, "reward_std": 0.1808721274137497, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.47236596047878265, "step": 1946 }, { "completion_length": 103.265625, "epoch": 1.2553191489361701, "grad_norm": 18.38748550415039, "kl": 0.1181640625, "learning_rate": 3.7234042553191484e-07, "loss": 0.0047, "reward": 1.580788016319275, "reward_std": 0.08059585839509964, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5807879716157913, "step": 1947 }, { "completion_length": 110.203125, "epoch": 1.2559638942617666, "grad_norm": 20.399362564086914, "kl": 0.116943359375, "learning_rate": 3.720180528691167e-07, "loss": 0.0047, "reward": 1.5159632563591003, "reward_std": 0.09451697021722794, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5159633159637451, "step": 1948 }, { "completion_length": 112.65625, "epoch": 1.256608639587363, "grad_norm": 21.889610290527344, "kl": 0.104736328125, "learning_rate": 3.7169568020631846e-07, "loss": 0.0042, "reward": 1.558445394039154, "reward_std": 0.11743486672639847, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5584454238414764, "step": 1949 }, { "completion_length": 120.78125, "epoch": 1.2572533849129595, "grad_norm": 12.497200965881348, "kl": 0.1220703125, "learning_rate": 3.713733075435203e-07, "loss": 0.0049, "reward": 1.7180161476135254, "reward_std": 0.07873895950615406, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7180160582065582, "step": 1950 }, { "completion_length": 114.359375, "epoch": 1.257898130238556, "grad_norm": 41.40201187133789, "kl": 0.13134765625, "learning_rate": 3.7105093488072207e-07, "loss": 0.0053, "reward": 1.5681995153427124, "reward_std": 0.06578772515058517, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5681995451450348, "step": 1951 }, { "completion_length": 115.03125, "epoch": 1.2585428755641521, "grad_norm": 14.451946258544922, "kl": 0.1181640625, "learning_rate": 3.707285622179239e-07, "loss": 0.0047, "reward": 1.5138095617294312, "reward_std": 0.17110786586999893, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.5294344574213028, "step": 1952 }, { "completion_length": 106.9375, "epoch": 1.2591876208897486, "grad_norm": 14.541790008544922, "kl": 0.126220703125, "learning_rate": 3.704061895551257e-07, "loss": 0.0051, "reward": 1.6685959696769714, "reward_std": 0.10416876897215843, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6842210292816162, "step": 1953 }, { "completion_length": 105.984375, "epoch": 1.259832366215345, "grad_norm": 10.058271408081055, "kl": 0.124755859375, "learning_rate": 3.700838168923275e-07, "loss": 0.005, "reward": 1.6186620593070984, "reward_std": 0.11036109924316406, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6186620444059372, "step": 1954 }, { "completion_length": 95.25, "epoch": 1.2604771115409412, "grad_norm": 9.15190315246582, "kl": 0.1474609375, "learning_rate": 3.697614442295293e-07, "loss": 0.0059, "reward": 1.6321035623550415, "reward_std": 0.10482674837112427, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6321035325527191, "step": 1955 }, { "completion_length": 100.53125, "epoch": 1.2611218568665377, "grad_norm": 10.478346824645996, "kl": 0.146484375, "learning_rate": 3.694390715667311e-07, "loss": 0.0059, "reward": 1.7911347150802612, "reward_std": 0.06305089220404625, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.791134774684906, "step": 1956 }, { "completion_length": 114.5625, "epoch": 1.261766602192134, "grad_norm": 17.663482666015625, "kl": 0.11083984375, "learning_rate": 3.691166989039329e-07, "loss": 0.0044, "reward": 1.5123714208602905, "reward_std": 0.10252119600772858, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5123714208602905, "step": 1957 }, { "completion_length": 117.234375, "epoch": 1.2624113475177305, "grad_norm": 6.887454509735107, "kl": 0.112548828125, "learning_rate": 3.687943262411347e-07, "loss": 0.0045, "reward": 1.6729698181152344, "reward_std": 0.07313475385308266, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6729699075222015, "step": 1958 }, { "completion_length": 113.265625, "epoch": 1.263056092843327, "grad_norm": 18.4481201171875, "kl": 0.1220703125, "learning_rate": 3.6847195357833653e-07, "loss": 0.0049, "reward": 1.4612666964530945, "reward_std": 0.07354892790317535, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4612666964530945, "step": 1959 }, { "completion_length": 110.921875, "epoch": 1.2637008381689232, "grad_norm": 37.00850296020508, "kl": 0.1123046875, "learning_rate": 3.681495809155383e-07, "loss": 0.0045, "reward": 1.581781268119812, "reward_std": 0.08708008006215096, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5817812830209732, "step": 1960 }, { "completion_length": 108.765625, "epoch": 1.2643455834945196, "grad_norm": 20.20766830444336, "kl": 0.126953125, "learning_rate": 3.6782720825274014e-07, "loss": 0.0051, "reward": 1.7111400365829468, "reward_std": 0.08487332798540592, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7111401557922363, "step": 1961 }, { "completion_length": 128.984375, "epoch": 1.264990328820116, "grad_norm": 14.24345588684082, "kl": 0.1005859375, "learning_rate": 3.675048355899419e-07, "loss": 0.004, "reward": 1.4533016085624695, "reward_std": 0.08253934234380722, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4533015191555023, "step": 1962 }, { "completion_length": 117.53125, "epoch": 1.2656350741457125, "grad_norm": 6.561637878417969, "kl": 0.107177734375, "learning_rate": 3.6718246292714376e-07, "loss": 0.0043, "reward": 1.3847861886024475, "reward_std": 0.07416202872991562, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.3847862035036087, "step": 1963 }, { "completion_length": 117.140625, "epoch": 1.266279819471309, "grad_norm": 13.04010009765625, "kl": 0.12109375, "learning_rate": 3.6686009026434554e-07, "loss": 0.0048, "reward": 1.555762767791748, "reward_std": 0.08999788016080856, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.555762767791748, "step": 1964 }, { "completion_length": 126.1875, "epoch": 1.2669245647969052, "grad_norm": 7.148639678955078, "kl": 0.1572265625, "learning_rate": 3.6653771760154737e-07, "loss": 0.0063, "reward": 1.5946932435035706, "reward_std": 0.08358455076813698, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5946933031082153, "step": 1965 }, { "completion_length": 117.359375, "epoch": 1.2675693101225016, "grad_norm": 16.43146324157715, "kl": 0.102783203125, "learning_rate": 3.6621534493874915e-07, "loss": 0.0041, "reward": 1.6910881996154785, "reward_std": 0.07531457021832466, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6910882294178009, "step": 1966 }, { "completion_length": 113.46875, "epoch": 1.268214055448098, "grad_norm": 12.970282554626465, "kl": 0.115966796875, "learning_rate": 3.65892972275951e-07, "loss": 0.0046, "reward": 1.5166686177253723, "reward_std": 0.10853827744722366, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5166686773300171, "step": 1967 }, { "completion_length": 102.703125, "epoch": 1.2688588007736943, "grad_norm": 10.751826286315918, "kl": 0.123779296875, "learning_rate": 3.6557059961315276e-07, "loss": 0.0049, "reward": 1.6648629307746887, "reward_std": 0.09130343794822693, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6648629009723663, "step": 1968 }, { "completion_length": 114.078125, "epoch": 1.2695035460992907, "grad_norm": 13.697860717773438, "kl": 0.1650390625, "learning_rate": 3.652482269503546e-07, "loss": 0.0066, "reward": 1.531709372997284, "reward_std": 0.09723073989152908, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5317093729972839, "step": 1969 }, { "completion_length": 131.328125, "epoch": 1.2701482914248872, "grad_norm": 13.180706024169922, "kl": 0.11083984375, "learning_rate": 3.649258542875564e-07, "loss": 0.0044, "reward": 1.6077740788459778, "reward_std": 0.12473616376519203, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6077740788459778, "step": 1970 }, { "completion_length": 119.546875, "epoch": 1.2707930367504836, "grad_norm": 12.488205909729004, "kl": 0.12353515625, "learning_rate": 3.6460348162475816e-07, "loss": 0.0049, "reward": 1.6692855954170227, "reward_std": 0.07351256161928177, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6692855060100555, "step": 1971 }, { "completion_length": 117.671875, "epoch": 1.27143778207608, "grad_norm": 7.643041610717773, "kl": 0.12353515625, "learning_rate": 3.6428110896196e-07, "loss": 0.0049, "reward": 1.496739387512207, "reward_std": 0.03722511604428291, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.49673932790756226, "step": 1972 }, { "completion_length": 113.75, "epoch": 1.2720825274016763, "grad_norm": 8.244843482971191, "kl": 0.1162109375, "learning_rate": 3.6395873629916177e-07, "loss": 0.0046, "reward": 1.5505931377410889, "reward_std": 0.06616784632205963, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5505931377410889, "step": 1973 }, { "completion_length": 114.6875, "epoch": 1.2727272727272727, "grad_norm": 9.663063049316406, "kl": 0.111328125, "learning_rate": 3.636363636363636e-07, "loss": 0.0045, "reward": 1.5648088455200195, "reward_std": 0.0582730658352375, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5648089051246643, "step": 1974 }, { "completion_length": 105.453125, "epoch": 1.2733720180528691, "grad_norm": 31.60782241821289, "kl": 0.109130859375, "learning_rate": 3.633139909735654e-07, "loss": 0.0044, "reward": 1.503617525100708, "reward_std": 0.07258416526019573, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.503617525100708, "step": 1975 }, { "completion_length": 111.046875, "epoch": 1.2740167633784656, "grad_norm": 18.72150993347168, "kl": 0.111572265625, "learning_rate": 3.629916183107672e-07, "loss": 0.0045, "reward": 1.7879358530044556, "reward_std": 0.10292981564998627, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7879358530044556, "step": 1976 }, { "completion_length": 119.375, "epoch": 1.274661508704062, "grad_norm": 16.401723861694336, "kl": 0.1171875, "learning_rate": 3.62669245647969e-07, "loss": 0.0047, "reward": 1.7952114343643188, "reward_std": 0.08389570191502571, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7952114343643188, "step": 1977 }, { "completion_length": 116.5625, "epoch": 1.2753062540296582, "grad_norm": 10.445584297180176, "kl": 0.1337890625, "learning_rate": 3.6234687298517083e-07, "loss": 0.0054, "reward": 1.70708167552948, "reward_std": 0.09822971373796463, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7070816159248352, "step": 1978 }, { "completion_length": 126.171875, "epoch": 1.2759509993552547, "grad_norm": 27.000473022460938, "kl": 0.11572265625, "learning_rate": 3.620245003223726e-07, "loss": 0.0046, "reward": 1.487796425819397, "reward_std": 0.10756743885576725, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.48779644072055817, "step": 1979 }, { "completion_length": 112.359375, "epoch": 1.2765957446808511, "grad_norm": 19.835025787353516, "kl": 0.125, "learning_rate": 3.617021276595745e-07, "loss": 0.005, "reward": 1.8274250626564026, "reward_std": 0.15155724436044693, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.8430500030517578, "step": 1980 }, { "completion_length": 113.203125, "epoch": 1.2772404900064473, "grad_norm": 14.025164604187012, "kl": 0.12060546875, "learning_rate": 3.6137975499677623e-07, "loss": 0.0048, "reward": 1.5321632623672485, "reward_std": 0.09312083013355732, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5321632027626038, "step": 1981 }, { "completion_length": 114.5625, "epoch": 1.2778852353320438, "grad_norm": 11.435771942138672, "kl": 0.109375, "learning_rate": 3.610573823339781e-07, "loss": 0.0044, "reward": 1.5915541052818298, "reward_std": 0.06063748896121979, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5915541350841522, "step": 1982 }, { "completion_length": 113.046875, "epoch": 1.2785299806576402, "grad_norm": 8.650049209594727, "kl": 0.12060546875, "learning_rate": 3.607350096711799e-07, "loss": 0.0048, "reward": 1.5912625789642334, "reward_std": 0.07665859535336494, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5912625789642334, "step": 1983 }, { "completion_length": 113.34375, "epoch": 1.2791747259832367, "grad_norm": 16.17777442932129, "kl": 0.108642578125, "learning_rate": 3.604126370083816e-07, "loss": 0.0043, "reward": 1.6600617170333862, "reward_std": 0.06648578122258186, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6600617170333862, "step": 1984 }, { "completion_length": 116.796875, "epoch": 1.2798194713088331, "grad_norm": 12.173382759094238, "kl": 0.0986328125, "learning_rate": 3.600902643455835e-07, "loss": 0.0039, "reward": 1.6574986577033997, "reward_std": 0.1398964375257492, "rewards/format_reward": 0.984375, "rewards/iou_timestamp_reward": 0.6731236279010773, "step": 1985 }, { "completion_length": 119.140625, "epoch": 1.2804642166344293, "grad_norm": 11.117454528808594, "kl": 0.1337890625, "learning_rate": 3.597678916827853e-07, "loss": 0.0053, "reward": 1.4467003345489502, "reward_std": 0.09076781012117863, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.4467003643512726, "step": 1986 }, { "completion_length": 111.703125, "epoch": 1.2811089619600258, "grad_norm": 17.02039337158203, "kl": 0.1435546875, "learning_rate": 3.594455190199871e-07, "loss": 0.0057, "reward": 1.6881715059280396, "reward_std": 0.09348884224891663, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6881715357303619, "step": 1987 }, { "completion_length": 114.0, "epoch": 1.2817537072856222, "grad_norm": 41.62820816040039, "kl": 0.11865234375, "learning_rate": 3.591231463571889e-07, "loss": 0.0047, "reward": 1.599281907081604, "reward_std": 0.05156847834587097, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.599281907081604, "step": 1988 }, { "completion_length": 115.234375, "epoch": 1.2823984526112184, "grad_norm": 17.14328384399414, "kl": 0.11083984375, "learning_rate": 3.5880077369439074e-07, "loss": 0.0044, "reward": 1.708803951740265, "reward_std": 0.11099771410226822, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7088040113449097, "step": 1989 }, { "completion_length": 119.671875, "epoch": 1.283043197936815, "grad_norm": 342.91497802734375, "kl": 0.177734375, "learning_rate": 3.584784010315925e-07, "loss": 0.0071, "reward": 1.5411792993545532, "reward_std": 0.13300441205501556, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5411793291568756, "step": 1990 }, { "completion_length": 113.25, "epoch": 1.2836879432624113, "grad_norm": 14.473013877868652, "kl": 0.1025390625, "learning_rate": 3.5815602836879435e-07, "loss": 0.0041, "reward": 1.6225409507751465, "reward_std": 0.07372570037841797, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6225408911705017, "step": 1991 }, { "completion_length": 110.390625, "epoch": 1.2843326885880078, "grad_norm": 10.521697998046875, "kl": 0.110107421875, "learning_rate": 3.5783365570599613e-07, "loss": 0.0044, "reward": 1.6926472783088684, "reward_std": 0.09197350218892097, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6926472783088684, "step": 1992 }, { "completion_length": 110.484375, "epoch": 1.2849774339136042, "grad_norm": 8.88347339630127, "kl": 0.122802734375, "learning_rate": 3.5751128304319796e-07, "loss": 0.0049, "reward": 1.773346483707428, "reward_std": 0.11623063683509827, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.773346483707428, "step": 1993 }, { "completion_length": 104.796875, "epoch": 1.2856221792392004, "grad_norm": 13.534448623657227, "kl": 0.109619140625, "learning_rate": 3.5718891038039974e-07, "loss": 0.0044, "reward": 1.5154983401298523, "reward_std": 0.05960991233587265, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5154983699321747, "step": 1994 }, { "completion_length": 118.21875, "epoch": 1.2862669245647969, "grad_norm": 11.80124282836914, "kl": 0.12451171875, "learning_rate": 3.568665377176016e-07, "loss": 0.005, "reward": 1.787231683731079, "reward_std": 0.11993472650647163, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.7872316539287567, "step": 1995 }, { "completion_length": 101.765625, "epoch": 1.2869116698903933, "grad_norm": 13.930368423461914, "kl": 0.125732421875, "learning_rate": 3.5654416505480336e-07, "loss": 0.005, "reward": 1.5652704238891602, "reward_std": 0.11598680540919304, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.5652704238891602, "step": 1996 }, { "completion_length": 106.140625, "epoch": 1.2875564152159897, "grad_norm": 8.300092697143555, "kl": 0.1728515625, "learning_rate": 3.5622179239200514e-07, "loss": 0.0069, "reward": 1.6255128383636475, "reward_std": 0.1058376170694828, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6255128383636475, "step": 1997 }, { "completion_length": 113.53125, "epoch": 1.2882011605415862, "grad_norm": 14.051785469055176, "kl": 0.134033203125, "learning_rate": 3.5589941972920697e-07, "loss": 0.0054, "reward": 1.6041387915611267, "reward_std": 0.08702468872070312, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6041387319564819, "step": 1998 }, { "completion_length": 111.484375, "epoch": 1.2888459058671824, "grad_norm": 14.08708667755127, "kl": 0.14501953125, "learning_rate": 3.5557704706640875e-07, "loss": 0.0058, "reward": 1.6865518689155579, "reward_std": 0.07037417963147163, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6865518391132355, "step": 1999 }, { "completion_length": 116.109375, "epoch": 1.2894906511927788, "grad_norm": 13.85488510131836, "kl": 0.109130859375, "learning_rate": 3.552546744036106e-07, "loss": 0.0044, "reward": 1.6934226155281067, "reward_std": 0.08548137731850147, "rewards/format_reward": 1.0, "rewards/iou_timestamp_reward": 0.6934226155281067, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 3102, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }