diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,43966 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3138, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 285.84375, + "epoch": 0.00031867431485022306, + "grad_norm": 7.580958366394043, + "kl": 0.0, + "learning_rate": 9.996813256851498e-07, + "loss": -0.0, + "reward": 0.9926098585128784, + "reward_std": 0.4385250508785248, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 0.71875, + "rewards/iou_glue_reward": 0.19573485851287842, + "step": 1 + }, + { + "completion_length": 265.53125, + "epoch": 0.0006373486297004461, + "grad_norm": 11.604437828063965, + "kl": 0.00079345703125, + "learning_rate": 9.993626513702996e-07, + "loss": 0.0, + "reward": 1.0492684841156006, + "reward_std": 0.5438859462738037, + "rewards/format_reward_tg": 0.71875, + "rewards/iou_timestamp_reward": 0.23676855862140656, + "rewards/pad": 0.09375, + "step": 2 + }, + { + "completion_length": 251.453125, + "epoch": 0.0009560229445506692, + "grad_norm": 12.72636604309082, + "kl": 0.00127410888671875, + "learning_rate": 9.990439770554494e-07, + "loss": 0.0001, + "reward": 1.0591096878051758, + "reward_std": 0.5083183646202087, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.75, + "rewards/tracking_iou_reward": 0.23098470270633698, + "step": 3 + }, + { + "completion_length": 127.15625, + "epoch": 0.0012746972594008922, + "grad_norm": 16.9719295501709, + "kl": 0.00360107421875, + "learning_rate": 9.98725302740599e-07, + "loss": 0.0001, + "reward": 1.0967737436294556, + "reward_std": 0.32875216007232666, + "rewards/format_reward_tg": 0.90625, + "rewards/iou_timestamp_reward": 0.19052375853061676, + "rewards/pad": 0.0, + "step": 4 + }, + { + "completion_length": 209.3125, + "epoch": 0.0015933715742511153, + "grad_norm": 16.323808670043945, + "kl": 0.0026092529296875, + "learning_rate": 9.984066284257488e-07, + "loss": 0.0001, + "reward": 1.1501266956329346, + "reward_std": 0.2969282269477844, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.875, + "rewards/iou_glue_reward": 0.16575169563293457, + "step": 5 + }, + { + "completion_length": 237.34375, + "epoch": 0.0019120458891013384, + "grad_norm": 6.7259602546691895, + "kl": 0.00933837890625, + "learning_rate": 9.980879541108986e-07, + "loss": 0.0004, + "reward": 1.2382855415344238, + "reward_std": 0.2583405375480652, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.1601606011390686, + "step": 6 + }, + { + "completion_length": 237.921875, + "epoch": 0.0022307202039515616, + "grad_norm": 11.441876411437988, + "kl": 0.004364013671875, + "learning_rate": 9.977692797960484e-07, + "loss": 0.0002, + "reward": 1.0197477340698242, + "reward_std": 0.5455303192138672, + "rewards/format_reward_tg": 0.65625, + "rewards/iou_timestamp_reward": 0.191622793674469, + "rewards/pad": 0.171875, + "step": 7 + }, + { + "completion_length": 245.140625, + "epoch": 0.0025493945188017845, + "grad_norm": 37.0315055847168, + "kl": 0.01007080078125, + "learning_rate": 9.974506054811982e-07, + "loss": 0.0004, + "reward": 1.1466755867004395, + "reward_std": 0.32557743787765503, + "rewards/format_reward_tg": 0.875, + "rewards/iou_timestamp_reward": 0.17792558670043945, + "rewards/pad": 0.09375, + "step": 8 + }, + { + "completion_length": 269.09375, + "epoch": 0.0028680688336520078, + "grad_norm": 30.685070037841797, + "kl": 0.01055908203125, + "learning_rate": 9.97131931166348e-07, + "loss": 0.0004, + "reward": 1.16693115234375, + "reward_std": 0.42338114976882935, + "rewards/format_reward_tg": 0.8125, + "rewards/iou_timestamp_reward": 0.21380607783794403, + "rewards/pad": 0.140625, + "step": 9 + }, + { + "completion_length": 267.671875, + "epoch": 0.0031867431485022306, + "grad_norm": 6.045750617980957, + "kl": 0.013916015625, + "learning_rate": 9.968132568514978e-07, + "loss": 0.0006, + "reward": 1.0137124061584473, + "reward_std": 0.5149969458580017, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.75, + "rewards/iou_glue_reward": 0.13871240615844727, + "step": 10 + }, + { + "completion_length": 303.875, + "epoch": 0.003505417463352454, + "grad_norm": 19.550809860229492, + "kl": 0.007476806640625, + "learning_rate": 9.964945825366476e-07, + "loss": 0.0003, + "reward": 0.9201228618621826, + "reward_std": 0.4545382857322693, + "rewards/format_reward_tg": 0.671875, + "rewards/iou_timestamp_reward": 0.24824786186218262, + "rewards/pad": 0.0, + "step": 11 + }, + { + "completion_length": 305.109375, + "epoch": 0.0038240917782026767, + "grad_norm": 10.959343910217285, + "kl": 0.009521484375, + "learning_rate": 9.961759082217972e-07, + "loss": 0.0004, + "reward": 1.1065305471420288, + "reward_std": 0.4581749141216278, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.796875, + "rewards/tracking_iou_reward": 0.21590548753738403, + "step": 12 + }, + { + "completion_length": 254.71875, + "epoch": 0.0041427660930529, + "grad_norm": 61.00483322143555, + "kl": 0.010986328125, + "learning_rate": 9.95857233906947e-07, + "loss": 0.0004, + "reward": 1.1909832954406738, + "reward_std": 0.3574945330619812, + "rewards/format_reward_tg": 0.890625, + "rewards/iou_timestamp_reward": 0.1909833550453186, + "rewards/pad": 0.109375, + "step": 13 + }, + { + "completion_length": 304.65625, + "epoch": 0.004461440407903123, + "grad_norm": 6.863766670227051, + "kl": 0.00927734375, + "learning_rate": 9.955385595920968e-07, + "loss": 0.0004, + "reward": 1.004595398902893, + "reward_std": 0.46202772855758667, + "rewards/format_reward_tg": 0.78125, + "rewards/iou_timestamp_reward": 0.2233453392982483, + "rewards/pad": 0.0, + "step": 14 + }, + { + "completion_length": 274.21875, + "epoch": 0.004780114722753346, + "grad_norm": 11.535690307617188, + "kl": 0.01177978515625, + "learning_rate": 9.952198852772466e-07, + "loss": 0.0005, + "reward": 1.1242345571517944, + "reward_std": 0.3550039231777191, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.828125, + "rewards/iou_glue_reward": 0.17110954225063324, + "step": 15 + }, + { + "completion_length": 201.15625, + "epoch": 0.005098789037603569, + "grad_norm": 18.40079689025879, + "kl": 0.02392578125, + "learning_rate": 9.949012109623964e-07, + "loss": 0.001, + "reward": 1.2128632068634033, + "reward_std": 0.3046284019947052, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.24411317706108093, + "rewards/pad": 0.03125, + "step": 16 + }, + { + "completion_length": 322.9375, + "epoch": 0.005417463352453792, + "grad_norm": 23.227741241455078, + "kl": 0.01165771484375, + "learning_rate": 9.945825366475462e-07, + "loss": 0.0005, + "reward": 1.2675498723983765, + "reward_std": 0.35713836550712585, + "rewards/format_reward_tg": 0.875, + "rewards/iou_timestamp_reward": 0.23629984259605408, + "rewards/pad": 0.15625, + "step": 17 + }, + { + "completion_length": 351.0625, + "epoch": 0.0057361376673040155, + "grad_norm": 6.902748107910156, + "kl": 0.0101318359375, + "learning_rate": 9.94263862332696e-07, + "loss": 0.0004, + "reward": 1.127271056175232, + "reward_std": 0.4339676797389984, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.828125, + "rewards/tracking_iou_reward": 0.2053961157798767, + "step": 18 + }, + { + "completion_length": 298.671875, + "epoch": 0.006054811982154238, + "grad_norm": 8.274727821350098, + "kl": 0.0150146484375, + "learning_rate": 9.939451880178459e-07, + "loss": 0.0006, + "reward": 1.0681308507919312, + "reward_std": 0.4059593677520752, + "rewards/format_reward_tg": 0.828125, + "rewards/iou_timestamp_reward": 0.24000586569309235, + "rewards/pad": 0.0, + "step": 19 + }, + { + "completion_length": 316.703125, + "epoch": 0.006373486297004461, + "grad_norm": 5.176186561584473, + "kl": 0.01361083984375, + "learning_rate": 9.936265137029955e-07, + "loss": 0.0005, + "reward": 1.147798776626587, + "reward_std": 0.20939724147319794, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.21029871702194214, + "rewards/pad": 0.0, + "step": 20 + }, + { + "completion_length": 332.609375, + "epoch": 0.006692160611854685, + "grad_norm": 15.555620193481445, + "kl": 0.0089111328125, + "learning_rate": 9.933078393881453e-07, + "loss": 0.0004, + "reward": 1.0623363256454468, + "reward_std": 0.3388970196247101, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.859375, + "rewards/tracking_iou_reward": 0.20296134054660797, + "step": 21 + }, + { + "completion_length": 338.875, + "epoch": 0.007010834926704908, + "grad_norm": 5.417019367218018, + "kl": 0.01373291015625, + "learning_rate": 9.92989165073295e-07, + "loss": 0.0006, + "reward": 1.1889852285385132, + "reward_std": 0.3602343797683716, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.890625, + "rewards/tracking_iou_reward": 0.2983602285385132, + "step": 22 + }, + { + "completion_length": 301.140625, + "epoch": 0.007329509241555131, + "grad_norm": 8.908356666564941, + "kl": 0.01507568359375, + "learning_rate": 9.926704907584449e-07, + "loss": 0.0006, + "reward": 1.141218662261963, + "reward_std": 0.31373298168182373, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.90625, + "rewards/tracking_iou_reward": 0.2349686622619629, + "step": 23 + }, + { + "completion_length": 257.84375, + "epoch": 0.0076481835564053535, + "grad_norm": 21.931612014770508, + "kl": 0.0223388671875, + "learning_rate": 9.923518164435945e-07, + "loss": 0.0009, + "reward": 1.3234606981277466, + "reward_std": 0.31268778443336487, + "rewards/format_reward_tg": 0.890625, + "rewards/iou_timestamp_reward": 0.1984606385231018, + "rewards/pad": 0.234375, + "step": 24 + }, + { + "completion_length": 321.734375, + "epoch": 0.007966857871255577, + "grad_norm": 13.89256763458252, + "kl": 0.033447265625, + "learning_rate": 9.920331421287443e-07, + "loss": 0.0013, + "reward": 1.2630127668380737, + "reward_std": 0.3075515627861023, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.3255128264427185, + "step": 25 + }, + { + "completion_length": 219.9375, + "epoch": 0.0082855321861058, + "grad_norm": 10.740429878234863, + "kl": 0.0274658203125, + "learning_rate": 9.91714467813894e-07, + "loss": 0.0011, + "reward": 1.4887242317199707, + "reward_std": 0.22904357314109802, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2855991721153259, + "step": 26 + }, + { + "completion_length": 276.5625, + "epoch": 0.008604206500956023, + "grad_norm": 15.421710968017578, + "kl": 0.050048828125, + "learning_rate": 9.91395793499044e-07, + "loss": 0.002, + "reward": 1.297201156616211, + "reward_std": 0.23605427145957947, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.31282609701156616, + "step": 27 + }, + { + "completion_length": 171.46875, + "epoch": 0.008922880815806247, + "grad_norm": 7.487827777862549, + "kl": 0.046630859375, + "learning_rate": 9.910771191841937e-07, + "loss": 0.0019, + "reward": 1.2176066637039185, + "reward_std": 0.13908687233924866, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.21760669350624084, + "step": 28 + }, + { + "completion_length": 198.078125, + "epoch": 0.009241555130656469, + "grad_norm": 17.258358001708984, + "kl": 0.055419921875, + "learning_rate": 9.907584448693435e-07, + "loss": 0.0022, + "reward": 1.3561272621154785, + "reward_std": 0.18805494904518127, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.21550217270851135, + "step": 29 + }, + { + "completion_length": 191.953125, + "epoch": 0.009560229445506692, + "grad_norm": 14.358026504516602, + "kl": 0.035400390625, + "learning_rate": 9.904397705544933e-07, + "loss": 0.0014, + "reward": 1.3451197147369385, + "reward_std": 0.16682513058185577, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34511977434158325, + "rewards/pad": 0.0, + "step": 30 + }, + { + "completion_length": 217.296875, + "epoch": 0.009878903760356916, + "grad_norm": 8.59325885772705, + "kl": 0.033447265625, + "learning_rate": 9.90121096239643e-07, + "loss": 0.0013, + "reward": 1.4028737545013428, + "reward_std": 0.43754905462265015, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.3247487545013428, + "step": 31 + }, + { + "completion_length": 262.6875, + "epoch": 0.010197578075207138, + "grad_norm": 13.470588684082031, + "kl": 0.0269775390625, + "learning_rate": 9.898024219247927e-07, + "loss": 0.0011, + "reward": 1.4790947437286377, + "reward_std": 0.31719809770584106, + "rewards/answer_reward": 0.265625, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.24471977353096008, + "step": 32 + }, + { + "completion_length": 195.5625, + "epoch": 0.010516252390057362, + "grad_norm": 33.77853775024414, + "kl": 0.0390625, + "learning_rate": 9.894837476099425e-07, + "loss": 0.0016, + "reward": 1.470423698425293, + "reward_std": 0.25363314151763916, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3610486388206482, + "step": 33 + }, + { + "completion_length": 260.3125, + "epoch": 0.010834926704907584, + "grad_norm": 5.149546146392822, + "kl": 0.022705078125, + "learning_rate": 9.891650732950923e-07, + "loss": 0.0009, + "reward": 1.2387065887451172, + "reward_std": 0.1923772543668747, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.2699566185474396, + "step": 34 + }, + { + "completion_length": 124.09375, + "epoch": 0.011153601019757807, + "grad_norm": 10.509054183959961, + "kl": 0.058837890625, + "learning_rate": 9.888463989802421e-07, + "loss": 0.0024, + "reward": 1.262293815612793, + "reward_std": 0.13928033411502838, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2622936964035034, + "rewards/pad": 0.0, + "step": 35 + }, + { + "completion_length": 175.25, + "epoch": 0.011472275334608031, + "grad_norm": 40.40633010864258, + "kl": 0.031494140625, + "learning_rate": 9.88527724665392e-07, + "loss": 0.0013, + "reward": 1.3443725109100342, + "reward_std": 0.22584381699562073, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.23499749600887299, + "step": 36 + }, + { + "completion_length": 210.0625, + "epoch": 0.011790949649458253, + "grad_norm": 7.93485689163208, + "kl": 0.04736328125, + "learning_rate": 9.882090503505418e-07, + "loss": 0.0019, + "reward": 1.2812135219573975, + "reward_std": 0.16277143359184265, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.18746356666088104, + "rewards/pad": 0.09375, + "step": 37 + }, + { + "completion_length": 232.78125, + "epoch": 0.012109623964308477, + "grad_norm": 9.572932243347168, + "kl": 0.031982421875, + "learning_rate": 9.878903760356916e-07, + "loss": 0.0013, + "reward": 1.2721017599105835, + "reward_std": 0.2769339978694916, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.921875, + "rewards/iou_glue_reward": 0.30335181951522827, + "step": 38 + }, + { + "completion_length": 159.296875, + "epoch": 0.0124282982791587, + "grad_norm": 15.72573471069336, + "kl": 0.04833984375, + "learning_rate": 9.875717017208412e-07, + "loss": 0.0019, + "reward": 1.2688236236572266, + "reward_std": 0.18624919652938843, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.28444868326187134, + "step": 39 + }, + { + "completion_length": 201.390625, + "epoch": 0.012746972594008922, + "grad_norm": 18.504762649536133, + "kl": 0.05224609375, + "learning_rate": 9.87253027405991e-07, + "loss": 0.0021, + "reward": 1.2598755359649658, + "reward_std": 0.3119271397590637, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.3067505955696106, + "step": 40 + }, + { + "completion_length": 165.9375, + "epoch": 0.013065646908859146, + "grad_norm": 16.481534957885742, + "kl": 0.080078125, + "learning_rate": 9.869343530911408e-07, + "loss": 0.0032, + "reward": 1.279308795928955, + "reward_std": 0.2580498158931732, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.21680885553359985, + "rewards/pad": 0.0625, + "step": 41 + }, + { + "completion_length": 187.375, + "epoch": 0.01338432122370937, + "grad_norm": 18.130592346191406, + "kl": 0.033935546875, + "learning_rate": 9.866156787762906e-07, + "loss": 0.0014, + "reward": 1.4841196537017822, + "reward_std": 0.2194441258907318, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2966196537017822, + "step": 42 + }, + { + "completion_length": 208.734375, + "epoch": 0.013702995538559592, + "grad_norm": 11.812344551086426, + "kl": 0.033203125, + "learning_rate": 9.862970044614404e-07, + "loss": 0.0013, + "reward": 1.5125854015350342, + "reward_std": 0.38584810495376587, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.953125, + "rewards/iou_glue_reward": 0.32508549094200134, + "step": 43 + }, + { + "completion_length": 283.6875, + "epoch": 0.014021669853409816, + "grad_norm": 15.581579208374023, + "kl": 0.026123046875, + "learning_rate": 9.859783301465902e-07, + "loss": 0.001, + "reward": 1.4057618379592896, + "reward_std": 0.24079158902168274, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.32763683795928955, + "rewards/pad": 0.125, + "step": 44 + }, + { + "completion_length": 190.4375, + "epoch": 0.014340344168260038, + "grad_norm": 15.17668342590332, + "kl": 0.049560546875, + "learning_rate": 9.8565965583174e-07, + "loss": 0.002, + "reward": 1.2811412811279297, + "reward_std": 0.24153098464012146, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.31239134073257446, + "rewards/pad": 0.0, + "step": 45 + }, + { + "completion_length": 209.984375, + "epoch": 0.014659018483110261, + "grad_norm": 9.36685848236084, + "kl": 0.034423828125, + "learning_rate": 9.853409815168898e-07, + "loss": 0.0014, + "reward": 1.501969575881958, + "reward_std": 0.2501377463340759, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.345719575881958, + "rewards/pad": 0.15625, + "step": 46 + }, + { + "completion_length": 256.015625, + "epoch": 0.014977692797960485, + "grad_norm": 9.444185256958008, + "kl": 0.042236328125, + "learning_rate": 9.850223072020394e-07, + "loss": 0.0017, + "reward": 1.464492917060852, + "reward_std": 0.209243506193161, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48011791706085205, + "rewards/pad": 0.0, + "step": 47 + }, + { + "completion_length": 198.59375, + "epoch": 0.015296367112810707, + "grad_norm": 15.230579376220703, + "kl": 0.048828125, + "learning_rate": 9.847036328871892e-07, + "loss": 0.002, + "reward": 1.2543593645095825, + "reward_std": 0.22551663219928741, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.26998430490493774, + "rewards/pad": 0.0, + "step": 48 + }, + { + "completion_length": 251.25, + "epoch": 0.01561504142766093, + "grad_norm": 7.851691246032715, + "kl": 0.043212890625, + "learning_rate": 9.84384958572339e-07, + "loss": 0.0017, + "reward": 1.33894944190979, + "reward_std": 0.13577714562416077, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3389494717121124, + "step": 49 + }, + { + "completion_length": 184.046875, + "epoch": 0.015933715742511154, + "grad_norm": 20.711200714111328, + "kl": 0.05224609375, + "learning_rate": 9.840662842574888e-07, + "loss": 0.0021, + "reward": 1.4345719814300537, + "reward_std": 0.21074189245700836, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45019692182540894, + "rewards/pad": 0.0, + "step": 50 + }, + { + "completion_length": 171.546875, + "epoch": 0.016252390057361378, + "grad_norm": 10.879066467285156, + "kl": 0.05322265625, + "learning_rate": 9.837476099426386e-07, + "loss": 0.0021, + "reward": 1.4225932359695435, + "reward_std": 0.27576547861099243, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34446826577186584, + "rewards/pad": 0.078125, + "step": 51 + }, + { + "completion_length": 335.015625, + "epoch": 0.0165710643722116, + "grad_norm": 9.482775688171387, + "kl": 0.0224609375, + "learning_rate": 9.834289356277884e-07, + "loss": 0.0009, + "reward": 1.5082097053527832, + "reward_std": 0.1784358024597168, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39883482456207275, + "rewards/pad": 0.109375, + "step": 52 + }, + { + "completion_length": 250.703125, + "epoch": 0.016889738687061822, + "grad_norm": 14.3960599899292, + "kl": 0.02978515625, + "learning_rate": 9.831102613129382e-07, + "loss": 0.0012, + "reward": 1.435816764831543, + "reward_std": 0.2369682788848877, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.35769182443618774, + "step": 53 + }, + { + "completion_length": 253.4375, + "epoch": 0.017208413001912046, + "grad_norm": 7.352794647216797, + "kl": 0.039794921875, + "learning_rate": 9.82791586998088e-07, + "loss": 0.0016, + "reward": 1.255972146987915, + "reward_std": 0.13122376799583435, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.25597214698791504, + "rewards/pad": 0.0, + "step": 54 + }, + { + "completion_length": 226.0625, + "epoch": 0.01752708731676227, + "grad_norm": 33.61541748046875, + "kl": 0.044677734375, + "learning_rate": 9.824729126832376e-07, + "loss": 0.0018, + "reward": 1.3116388320922852, + "reward_std": 0.314866840839386, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.3741387128829956, + "rewards/pad": 0.0, + "step": 55 + }, + { + "completion_length": 179.171875, + "epoch": 0.017845761631612493, + "grad_norm": 12.783524513244629, + "kl": 0.0517578125, + "learning_rate": 9.821542383683875e-07, + "loss": 0.0021, + "reward": 1.3135948181152344, + "reward_std": 0.18926963210105896, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31359487771987915, + "rewards/pad": 0.0, + "step": 56 + }, + { + "completion_length": 312.171875, + "epoch": 0.018164435946462717, + "grad_norm": 19.495323181152344, + "kl": 0.0235595703125, + "learning_rate": 9.818355640535373e-07, + "loss": 0.0009, + "reward": 1.2919933795928955, + "reward_std": 0.20228049159049988, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2294934093952179, + "step": 57 + }, + { + "completion_length": 246.0625, + "epoch": 0.018483110261312937, + "grad_norm": 5.100099086761475, + "kl": 0.0302734375, + "learning_rate": 9.81516889738687e-07, + "loss": 0.0012, + "reward": 1.3808673620224, + "reward_std": 0.21135839819908142, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2714923918247223, + "step": 58 + }, + { + "completion_length": 253.453125, + "epoch": 0.01880178457616316, + "grad_norm": 26.97967529296875, + "kl": 0.034912109375, + "learning_rate": 9.811982154238367e-07, + "loss": 0.0014, + "reward": 1.2082840204238892, + "reward_std": 0.1777166873216629, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.23953397572040558, + "step": 59 + }, + { + "completion_length": 298.75, + "epoch": 0.019120458891013385, + "grad_norm": 9.286375999450684, + "kl": 0.0274658203125, + "learning_rate": 9.808795411089865e-07, + "loss": 0.0011, + "reward": 1.4248732328414917, + "reward_std": 0.2562997341156006, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.3623732328414917, + "step": 60 + }, + { + "completion_length": 325.6875, + "epoch": 0.019439133205863608, + "grad_norm": 11.21821403503418, + "kl": 0.022216796875, + "learning_rate": 9.805608667941363e-07, + "loss": 0.0009, + "reward": 1.3899298906326294, + "reward_std": 0.22722920775413513, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4211798906326294, + "step": 61 + }, + { + "completion_length": 231.484375, + "epoch": 0.019757807520713832, + "grad_norm": 14.974238395690918, + "kl": 0.0439453125, + "learning_rate": 9.80242192479286e-07, + "loss": 0.0018, + "reward": 1.3300275802612305, + "reward_std": 0.19355839490890503, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34565258026123047, + "rewards/pad": 0.0, + "step": 62 + }, + { + "completion_length": 194.859375, + "epoch": 0.020076481835564052, + "grad_norm": 6.293397426605225, + "kl": 0.040771484375, + "learning_rate": 9.799235181644359e-07, + "loss": 0.0016, + "reward": 1.4031833410263062, + "reward_std": 0.15579763054847717, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2781832814216614, + "rewards/pad": 0.125, + "step": 63 + }, + { + "completion_length": 247.71875, + "epoch": 0.020395156150414276, + "grad_norm": 8.145806312561035, + "kl": 0.038330078125, + "learning_rate": 9.796048438495857e-07, + "loss": 0.0015, + "reward": 1.2597522735595703, + "reward_std": 0.3148002624511719, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.2597523331642151, + "step": 64 + }, + { + "completion_length": 251.5, + "epoch": 0.0207138304652645, + "grad_norm": 17.193103790283203, + "kl": 0.033447265625, + "learning_rate": 9.792861695347355e-07, + "loss": 0.0013, + "reward": 1.4773969650268555, + "reward_std": 0.2409696727991104, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39927199482917786, + "rewards/pad": 0.09375, + "step": 65 + }, + { + "completion_length": 244.671875, + "epoch": 0.021032504780114723, + "grad_norm": 11.186094284057617, + "kl": 0.033447265625, + "learning_rate": 9.78967495219885e-07, + "loss": 0.0013, + "reward": 1.234787940979004, + "reward_std": 0.3220973610877991, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.2972880005836487, + "rewards/pad": 0.0, + "step": 66 + }, + { + "completion_length": 203.84375, + "epoch": 0.021351179094964947, + "grad_norm": 10.883271217346191, + "kl": 0.039306640625, + "learning_rate": 9.78648820905035e-07, + "loss": 0.0016, + "reward": 1.2655227184295654, + "reward_std": 0.1531958132982254, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.23427259922027588, + "step": 67 + }, + { + "completion_length": 205.1875, + "epoch": 0.021669853409815167, + "grad_norm": 20.127857208251953, + "kl": 0.03515625, + "learning_rate": 9.783301465901847e-07, + "loss": 0.0014, + "reward": 1.4338656663894653, + "reward_std": 0.22612062096595764, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2463657557964325, + "step": 68 + }, + { + "completion_length": 220.03125, + "epoch": 0.02198852772466539, + "grad_norm": 8.743712425231934, + "kl": 0.029541015625, + "learning_rate": 9.780114722753345e-07, + "loss": 0.0012, + "reward": 1.3033971786499023, + "reward_std": 0.15178921818733215, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3033972382545471, + "rewards/pad": 0.0, + "step": 69 + }, + { + "completion_length": 324.953125, + "epoch": 0.022307202039515615, + "grad_norm": 8.447614669799805, + "kl": 0.022705078125, + "learning_rate": 9.776927979604843e-07, + "loss": 0.0009, + "reward": 1.5264579057693481, + "reward_std": 0.17748026549816132, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.43270793557167053, + "step": 70 + }, + { + "completion_length": 253.234375, + "epoch": 0.02262587635436584, + "grad_norm": 7.076296806335449, + "kl": 0.0277099609375, + "learning_rate": 9.773741236456341e-07, + "loss": 0.0011, + "reward": 1.4195590019226074, + "reward_std": 0.14488717913627625, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4195590913295746, + "rewards/pad": 0.0, + "step": 71 + }, + { + "completion_length": 271.90625, + "epoch": 0.022944550669216062, + "grad_norm": 10.374483108520508, + "kl": 0.0228271484375, + "learning_rate": 9.77055449330784e-07, + "loss": 0.0009, + "reward": 1.619044542312622, + "reward_std": 0.2657039761543274, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4315445125102997, + "step": 72 + }, + { + "completion_length": 266.125, + "epoch": 0.023263224984066286, + "grad_norm": 11.071622848510742, + "kl": 0.033447265625, + "learning_rate": 9.767367750159337e-07, + "loss": 0.0013, + "reward": 1.3494877815246582, + "reward_std": 0.179287850856781, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3651127815246582, + "step": 73 + }, + { + "completion_length": 186.65625, + "epoch": 0.023581899298916506, + "grad_norm": 22.304418563842773, + "kl": 0.0284423828125, + "learning_rate": 9.764181007010833e-07, + "loss": 0.0012, + "reward": 1.5244197845458984, + "reward_std": 0.2890230417251587, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3837948441505432, + "step": 74 + }, + { + "completion_length": 332.875, + "epoch": 0.02390057361376673, + "grad_norm": 5.554767608642578, + "kl": 0.0257568359375, + "learning_rate": 9.760994263862331e-07, + "loss": 0.001, + "reward": 1.4590568542480469, + "reward_std": 0.12253490090370178, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4746818542480469, + "step": 75 + }, + { + "completion_length": 278.09375, + "epoch": 0.024219247928616953, + "grad_norm": 20.972787857055664, + "kl": 0.033447265625, + "learning_rate": 9.75780752071383e-07, + "loss": 0.0013, + "reward": 1.4379208087921143, + "reward_std": 0.16233232617378235, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34417077898979187, + "step": 76 + }, + { + "completion_length": 209.0625, + "epoch": 0.024537922243467177, + "grad_norm": 6.096325874328613, + "kl": 0.0361328125, + "learning_rate": 9.754620777565328e-07, + "loss": 0.0014, + "reward": 1.504381537437439, + "reward_std": 0.27014750242233276, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45750656723976135, + "step": 77 + }, + { + "completion_length": 182.0, + "epoch": 0.0248565965583174, + "grad_norm": 6.677753925323486, + "kl": 0.031494140625, + "learning_rate": 9.751434034416826e-07, + "loss": 0.0013, + "reward": 1.5277019739151, + "reward_std": 0.22165103256702423, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3714520037174225, + "rewards/pad": 0.171875, + "step": 78 + }, + { + "completion_length": 94.0625, + "epoch": 0.02517527087316762, + "grad_norm": 18.051889419555664, + "kl": 0.0546875, + "learning_rate": 9.748247291268324e-07, + "loss": 0.0022, + "reward": 1.4485448598861694, + "reward_std": 0.2159993201494217, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.33916980028152466, + "rewards/pad": 0.125, + "step": 79 + }, + { + "completion_length": 123.875, + "epoch": 0.025493945188017845, + "grad_norm": 12.451839447021484, + "kl": 0.0537109375, + "learning_rate": 9.745060548119822e-07, + "loss": 0.0022, + "reward": 1.2630850076675415, + "reward_std": 0.2797267436981201, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.24746005237102509, + "rewards/pad": 0.03125, + "step": 80 + }, + { + "completion_length": 96.53125, + "epoch": 0.02581261950286807, + "grad_norm": 20.299448013305664, + "kl": 0.05810546875, + "learning_rate": 9.74187380497132e-07, + "loss": 0.0023, + "reward": 1.3956706523895264, + "reward_std": 0.24089336395263672, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.38004571199417114, + "step": 81 + }, + { + "completion_length": 150.8125, + "epoch": 0.026131293817718292, + "grad_norm": 18.910051345825195, + "kl": 0.060302734375, + "learning_rate": 9.738687061822816e-07, + "loss": 0.0024, + "reward": 1.4209949970245361, + "reward_std": 0.16225898265838623, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42099499702453613, + "rewards/pad": 0.0, + "step": 82 + }, + { + "completion_length": 236.71875, + "epoch": 0.026449968132568516, + "grad_norm": 11.744294166564941, + "kl": 0.046142578125, + "learning_rate": 9.735500318674314e-07, + "loss": 0.0018, + "reward": 1.4210379123687744, + "reward_std": 0.2842930555343628, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.3741629421710968, + "rewards/pad": 0.078125, + "step": 83 + }, + { + "completion_length": 171.515625, + "epoch": 0.02676864244741874, + "grad_norm": 13.975997924804688, + "kl": 0.057373046875, + "learning_rate": 9.732313575525812e-07, + "loss": 0.0023, + "reward": 1.4923350811004639, + "reward_std": 0.20501980185508728, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47671014070510864, + "step": 84 + }, + { + "completion_length": 221.796875, + "epoch": 0.02708731676226896, + "grad_norm": 7.706657409667969, + "kl": 0.04052734375, + "learning_rate": 9.72912683237731e-07, + "loss": 0.0016, + "reward": 1.380906581878662, + "reward_std": 0.25143831968307495, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.41215670108795166, + "rewards/pad": 0.0, + "step": 85 + }, + { + "completion_length": 176.734375, + "epoch": 0.027405991077119184, + "grad_norm": 11.26560115814209, + "kl": 0.064453125, + "learning_rate": 9.725940089228808e-07, + "loss": 0.0026, + "reward": 1.2714415788650513, + "reward_std": 0.19051113724708557, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.30269163846969604, + "rewards/pad": 0.0, + "step": 86 + }, + { + "completion_length": 140.953125, + "epoch": 0.027724665391969407, + "grad_norm": 65.41498565673828, + "kl": 0.0712890625, + "learning_rate": 9.722753346080306e-07, + "loss": 0.0028, + "reward": 1.315810203552246, + "reward_std": 0.17996680736541748, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3158102035522461, + "step": 87 + }, + { + "completion_length": 134.84375, + "epoch": 0.02804333970681963, + "grad_norm": 7.701048374176025, + "kl": 0.0712890625, + "learning_rate": 9.719566602931804e-07, + "loss": 0.0029, + "reward": 1.4724924564361572, + "reward_std": 0.18042278289794922, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36311742663383484, + "rewards/pad": 0.109375, + "step": 88 + }, + { + "completion_length": 149.078125, + "epoch": 0.028362014021669855, + "grad_norm": 9.855350494384766, + "kl": 0.06640625, + "learning_rate": 9.716379859783302e-07, + "loss": 0.0027, + "reward": 1.4240100383758545, + "reward_std": 0.15711775422096252, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4240100681781769, + "step": 89 + }, + { + "completion_length": 114.953125, + "epoch": 0.028680688336520075, + "grad_norm": 7.680419921875, + "kl": 0.07373046875, + "learning_rate": 9.713193116634798e-07, + "loss": 0.0029, + "reward": 1.4028632640838623, + "reward_std": 0.20188677310943604, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40286314487457275, + "step": 90 + }, + { + "completion_length": 194.625, + "epoch": 0.0289993626513703, + "grad_norm": 5.96244478225708, + "kl": 0.0537109375, + "learning_rate": 9.710006373486296e-07, + "loss": 0.0021, + "reward": 1.4750306606292725, + "reward_std": 0.16800019145011902, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47503069043159485, + "rewards/pad": 0.0, + "step": 91 + }, + { + "completion_length": 172.03125, + "epoch": 0.029318036966220522, + "grad_norm": 7.921225070953369, + "kl": 0.056640625, + "learning_rate": 9.706819630337794e-07, + "loss": 0.0023, + "reward": 1.550010323524475, + "reward_std": 0.11083010584115982, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3000103235244751, + "rewards/pad": 0.25, + "step": 92 + }, + { + "completion_length": 240.9375, + "epoch": 0.029636711281070746, + "grad_norm": 9.648941993713379, + "kl": 0.04443359375, + "learning_rate": 9.703632887189293e-07, + "loss": 0.0018, + "reward": 1.5524835586547852, + "reward_std": 0.12360557913780212, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42748352885246277, + "rewards/pad": 0.125, + "step": 93 + }, + { + "completion_length": 162.25, + "epoch": 0.02995538559592097, + "grad_norm": 20.81108283996582, + "kl": 0.0751953125, + "learning_rate": 9.70044614404079e-07, + "loss": 0.003, + "reward": 1.5567106008529663, + "reward_std": 0.10027439892292023, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4317106306552887, + "step": 94 + }, + { + "completion_length": 155.59375, + "epoch": 0.030274059910771194, + "grad_norm": 6.367391109466553, + "kl": 0.06396484375, + "learning_rate": 9.697259400892289e-07, + "loss": 0.0026, + "reward": 1.3457229137420654, + "reward_std": 0.13679799437522888, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3457227945327759, + "rewards/pad": 0.0, + "step": 95 + }, + { + "completion_length": 100.40625, + "epoch": 0.030592734225621414, + "grad_norm": 19.097196578979492, + "kl": 0.0732421875, + "learning_rate": 9.694072657743787e-07, + "loss": 0.0029, + "reward": 1.475441575050354, + "reward_std": 0.1915864497423172, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3504416048526764, + "rewards/pad": 0.125, + "step": 96 + }, + { + "completion_length": 213.34375, + "epoch": 0.030911408540471638, + "grad_norm": 9.829632759094238, + "kl": 0.047119140625, + "learning_rate": 9.690885914595283e-07, + "loss": 0.0019, + "reward": 1.6064780950546265, + "reward_std": 0.13289913535118103, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37210309505462646, + "step": 97 + }, + { + "completion_length": 107.15625, + "epoch": 0.03123008285532186, + "grad_norm": 13.913726806640625, + "kl": 0.10009765625, + "learning_rate": 9.68769917144678e-07, + "loss": 0.004, + "reward": 1.4501903057098389, + "reward_std": 0.1813579499721527, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45019030570983887, + "rewards/pad": 0.0, + "step": 98 + }, + { + "completion_length": 246.109375, + "epoch": 0.03154875717017208, + "grad_norm": 5.717926025390625, + "kl": 0.03564453125, + "learning_rate": 9.684512428298279e-07, + "loss": 0.0014, + "reward": 1.3290218114852905, + "reward_std": 0.15001249313354492, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2977718412876129, + "rewards/pad": 0.03125, + "step": 99 + }, + { + "completion_length": 127.84375, + "epoch": 0.03186743148502231, + "grad_norm": 14.6702241897583, + "kl": 0.12451171875, + "learning_rate": 9.681325685149777e-07, + "loss": 0.005, + "reward": 1.3388731479644775, + "reward_std": 0.18304340541362762, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33887311816215515, + "rewards/pad": 0.0, + "step": 100 + }, + { + "completion_length": 171.640625, + "epoch": 0.03218610579987253, + "grad_norm": 12.464656829833984, + "kl": 0.06494140625, + "learning_rate": 9.678138942001273e-07, + "loss": 0.0025, + "reward": 1.3632150888442993, + "reward_std": 0.1353316754102707, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3788400888442993, + "rewards/pad": 0.0, + "step": 101 + }, + { + "completion_length": 178.390625, + "epoch": 0.032504780114722756, + "grad_norm": 12.3067045211792, + "kl": 0.058349609375, + "learning_rate": 9.67495219885277e-07, + "loss": 0.0023, + "reward": 1.5209242105484009, + "reward_std": 0.21918198466300964, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44279927015304565, + "rewards/pad": 0.078125, + "step": 102 + }, + { + "completion_length": 198.75, + "epoch": 0.032823454429572976, + "grad_norm": 17.653640747070312, + "kl": 0.054931640625, + "learning_rate": 9.67176545570427e-07, + "loss": 0.0021, + "reward": 1.4329454898834229, + "reward_std": 0.15643572807312012, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3704456090927124, + "step": 103 + }, + { + "completion_length": 173.765625, + "epoch": 0.0331421287444232, + "grad_norm": 18.334651947021484, + "kl": 0.06201171875, + "learning_rate": 9.668578712555767e-07, + "loss": 0.0025, + "reward": 1.3778859376907349, + "reward_std": 0.1982203871011734, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.31538593769073486, + "step": 104 + }, + { + "completion_length": 199.53125, + "epoch": 0.033460803059273424, + "grad_norm": 18.57367706298828, + "kl": 0.05517578125, + "learning_rate": 9.665391969407265e-07, + "loss": 0.0022, + "reward": 1.6662395000457764, + "reward_std": 0.13840028643608093, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.35373955965042114, + "step": 105 + }, + { + "completion_length": 232.703125, + "epoch": 0.033779477374123644, + "grad_norm": 7.207043170928955, + "kl": 0.0458984375, + "learning_rate": 9.662205226258763e-07, + "loss": 0.0018, + "reward": 1.347933292388916, + "reward_std": 0.09887917339801788, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3635583817958832, + "step": 106 + }, + { + "completion_length": 162.578125, + "epoch": 0.03409815168897387, + "grad_norm": 9.840560913085938, + "kl": 0.056640625, + "learning_rate": 9.659018483110261e-07, + "loss": 0.0023, + "reward": 1.53261137008667, + "reward_std": 0.16933324933052063, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40761134028434753, + "rewards/pad": 0.125, + "step": 107 + }, + { + "completion_length": 233.71875, + "epoch": 0.03441682600382409, + "grad_norm": 4.3503031730651855, + "kl": 0.0400390625, + "learning_rate": 9.65583173996176e-07, + "loss": 0.0016, + "reward": 1.4072836637496948, + "reward_std": 0.04489172250032425, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4072836637496948, + "rewards/pad": 0.0, + "step": 108 + }, + { + "completion_length": 154.234375, + "epoch": 0.03473550031867431, + "grad_norm": 11.090832710266113, + "kl": 0.056396484375, + "learning_rate": 9.652644996813255e-07, + "loss": 0.0023, + "reward": 1.6570996046066284, + "reward_std": 0.21542125940322876, + "rewards/answer_reward": 0.28125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.39147457480430603, + "step": 109 + }, + { + "completion_length": 108.40625, + "epoch": 0.03505417463352454, + "grad_norm": 20.70996856689453, + "kl": 0.07666015625, + "learning_rate": 9.649458253664753e-07, + "loss": 0.0031, + "reward": 1.3417737483978271, + "reward_std": 0.11198395490646362, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3417738378047943, + "rewards/pad": 0.0, + "step": 110 + }, + { + "completion_length": 166.359375, + "epoch": 0.03537284894837476, + "grad_norm": 6.13435697555542, + "kl": 0.06494140625, + "learning_rate": 9.646271510516251e-07, + "loss": 0.0026, + "reward": 1.451405644416809, + "reward_std": 0.09623386710882187, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4514055550098419, + "step": 111 + }, + { + "completion_length": 134.859375, + "epoch": 0.035691523263224986, + "grad_norm": 15.755718231201172, + "kl": 0.0634765625, + "learning_rate": 9.64308476736775e-07, + "loss": 0.0025, + "reward": 1.7639336585998535, + "reward_std": 0.20902717113494873, + "rewards/answer_reward": 0.328125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4358087182044983, + "step": 112 + }, + { + "completion_length": 215.359375, + "epoch": 0.036010197578075206, + "grad_norm": 13.079955101013184, + "kl": 0.05615234375, + "learning_rate": 9.639898024219248e-07, + "loss": 0.0022, + "reward": 1.4580156803131104, + "reward_std": 0.1539626121520996, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.36426568031311035, + "step": 113 + }, + { + "completion_length": 174.140625, + "epoch": 0.036328871892925434, + "grad_norm": 8.691699028015137, + "kl": 0.087890625, + "learning_rate": 9.636711281070746e-07, + "loss": 0.0035, + "reward": 1.3323192596435547, + "reward_std": 0.12468598783016205, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34794431924819946, + "rewards/pad": 0.0, + "step": 114 + }, + { + "completion_length": 190.140625, + "epoch": 0.036647546207775654, + "grad_norm": 7.315731048583984, + "kl": 0.052734375, + "learning_rate": 9.633524537922244e-07, + "loss": 0.0021, + "reward": 1.51028311252594, + "reward_std": 0.11671674251556396, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33840811252593994, + "step": 115 + }, + { + "completion_length": 177.40625, + "epoch": 0.036966220522625874, + "grad_norm": 10.40539836883545, + "kl": 0.04931640625, + "learning_rate": 9.630337794773742e-07, + "loss": 0.002, + "reward": 1.7965991497039795, + "reward_std": 0.2596476078033447, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4372241795063019, + "step": 116 + }, + { + "completion_length": 197.4375, + "epoch": 0.0372848948374761, + "grad_norm": 17.551198959350586, + "kl": 0.05615234375, + "learning_rate": 9.627151051625238e-07, + "loss": 0.0022, + "reward": 1.487061619758606, + "reward_std": 0.12751880288124084, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48706167936325073, + "rewards/pad": 0.0, + "step": 117 + }, + { + "completion_length": 212.609375, + "epoch": 0.03760356915232632, + "grad_norm": 25.8797550201416, + "kl": 0.06103515625, + "learning_rate": 9.623964308476736e-07, + "loss": 0.0024, + "reward": 1.4586904048919678, + "reward_std": 0.18688003718852997, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4118153750896454, + "step": 118 + }, + { + "completion_length": 198.171875, + "epoch": 0.03792224346717655, + "grad_norm": 11.605658531188965, + "kl": 0.0478515625, + "learning_rate": 9.620777565328234e-07, + "loss": 0.0019, + "reward": 1.5306899547576904, + "reward_std": 0.24100443720817566, + "rewards/answer_reward": 0.1875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3431899845600128, + "step": 119 + }, + { + "completion_length": 209.46875, + "epoch": 0.03824091778202677, + "grad_norm": 3.406437397003174, + "kl": 0.054443359375, + "learning_rate": 9.617590822179732e-07, + "loss": 0.0022, + "reward": 1.437645435333252, + "reward_std": 0.07607047259807587, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43764549493789673, + "step": 120 + }, + { + "completion_length": 162.171875, + "epoch": 0.03855959209687699, + "grad_norm": 23.933696746826172, + "kl": 0.06396484375, + "learning_rate": 9.61440407903123e-07, + "loss": 0.0025, + "reward": 1.439674735069275, + "reward_std": 0.18641090393066406, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3146747350692749, + "rewards/pad": 0.125, + "step": 121 + }, + { + "completion_length": 185.8125, + "epoch": 0.038878266411727216, + "grad_norm": 9.487775802612305, + "kl": 0.056640625, + "learning_rate": 9.611217335882728e-07, + "loss": 0.0023, + "reward": 1.4100788831710815, + "reward_std": 0.11771468818187714, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39445382356643677, + "step": 122 + }, + { + "completion_length": 236.921875, + "epoch": 0.03919694072657744, + "grad_norm": 7.253052711486816, + "kl": 0.1279296875, + "learning_rate": 9.608030592734226e-07, + "loss": 0.0051, + "reward": 1.4906271696090698, + "reward_std": 0.1307528018951416, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3187522292137146, + "step": 123 + }, + { + "completion_length": 199.546875, + "epoch": 0.039515615041427664, + "grad_norm": 8.534823417663574, + "kl": 0.064453125, + "learning_rate": 9.604843849585724e-07, + "loss": 0.0026, + "reward": 1.3232932090759277, + "reward_std": 0.17143693566322327, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.1982932984828949, + "step": 124 + }, + { + "completion_length": 101.796875, + "epoch": 0.039834289356277884, + "grad_norm": 7.8380818367004395, + "kl": 0.08349609375, + "learning_rate": 9.60165710643722e-07, + "loss": 0.0033, + "reward": 1.4448331594467163, + "reward_std": 0.1456798017024994, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4448332190513611, + "rewards/pad": 0.0, + "step": 125 + }, + { + "completion_length": 77.078125, + "epoch": 0.040152963671128104, + "grad_norm": 13.454174995422363, + "kl": 0.10400390625, + "learning_rate": 9.598470363288718e-07, + "loss": 0.0042, + "reward": 1.3266003131866455, + "reward_std": 0.16800163686275482, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32660022377967834, + "rewards/pad": 0.0, + "step": 126 + }, + { + "completion_length": 185.6875, + "epoch": 0.04047163798597833, + "grad_norm": 8.160616874694824, + "kl": 0.055419921875, + "learning_rate": 9.595283620140216e-07, + "loss": 0.0022, + "reward": 1.7484498023986816, + "reward_std": 0.16031304001808167, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4046997129917145, + "rewards/pad": 0.34375, + "step": 127 + }, + { + "completion_length": 215.515625, + "epoch": 0.04079031230082855, + "grad_norm": 9.371366500854492, + "kl": 0.05419921875, + "learning_rate": 9.592096876991714e-07, + "loss": 0.0022, + "reward": 1.3254632949829102, + "reward_std": 0.09307268261909485, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32546335458755493, + "rewards/pad": 0.0, + "step": 128 + }, + { + "completion_length": 222.796875, + "epoch": 0.04110898661567878, + "grad_norm": 16.473413467407227, + "kl": 0.0625, + "learning_rate": 9.588910133843212e-07, + "loss": 0.0025, + "reward": 1.3656400442123413, + "reward_std": 0.1587304174900055, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.38126498460769653, + "rewards/pad": 0.0, + "step": 129 + }, + { + "completion_length": 270.875, + "epoch": 0.041427660930529, + "grad_norm": 5.4612555503845215, + "kl": 0.05419921875, + "learning_rate": 9.58572339069471e-07, + "loss": 0.0022, + "reward": 1.3114277124404907, + "reward_std": 0.06440472602844238, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3114277124404907, + "step": 130 + }, + { + "completion_length": 238.84375, + "epoch": 0.04174633524537922, + "grad_norm": 6.282101631164551, + "kl": 0.04638671875, + "learning_rate": 9.582536647546209e-07, + "loss": 0.0019, + "reward": 1.4994099140167236, + "reward_std": 0.10063318908214569, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3744097948074341, + "rewards/pad": 0.125, + "step": 131 + }, + { + "completion_length": 163.625, + "epoch": 0.04206500956022945, + "grad_norm": 8.125163078308105, + "kl": 0.12109375, + "learning_rate": 9.579349904397705e-07, + "loss": 0.0049, + "reward": 1.4386279582977295, + "reward_std": 0.14473479986190796, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3917529284954071, + "rewards/pad": 0.0625, + "step": 132 + }, + { + "completion_length": 231.890625, + "epoch": 0.04238368387507967, + "grad_norm": 8.46726131439209, + "kl": 0.07275390625, + "learning_rate": 9.576163161249203e-07, + "loss": 0.0029, + "reward": 1.523742914199829, + "reward_std": 0.11945880949497223, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4143679141998291, + "step": 133 + }, + { + "completion_length": 268.34375, + "epoch": 0.042702358189929894, + "grad_norm": 24.495853424072266, + "kl": 0.057861328125, + "learning_rate": 9.5729764181007e-07, + "loss": 0.0023, + "reward": 1.3764296770095825, + "reward_std": 0.12560050189495087, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3139297366142273, + "step": 134 + }, + { + "completion_length": 241.609375, + "epoch": 0.043021032504780114, + "grad_norm": 5.319091796875, + "kl": 0.0498046875, + "learning_rate": 9.569789674952199e-07, + "loss": 0.002, + "reward": 1.4918053150177002, + "reward_std": 0.13128961622714996, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5074301958084106, + "step": 135 + }, + { + "completion_length": 135.6875, + "epoch": 0.043339706819630335, + "grad_norm": 13.785394668579102, + "kl": 0.07421875, + "learning_rate": 9.566602931803697e-07, + "loss": 0.003, + "reward": 1.3160868883132935, + "reward_std": 0.19807332754135132, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3160868287086487, + "step": 136 + }, + { + "completion_length": 272.875, + "epoch": 0.04365838113448056, + "grad_norm": 5.691545486450195, + "kl": 0.045654296875, + "learning_rate": 9.563416188655195e-07, + "loss": 0.0018, + "reward": 1.3066635131835938, + "reward_std": 0.11025525629520416, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.29103854298591614, + "step": 137 + }, + { + "completion_length": 169.25, + "epoch": 0.04397705544933078, + "grad_norm": 10.946203231811523, + "kl": 0.06982421875, + "learning_rate": 9.56022944550669e-07, + "loss": 0.0028, + "reward": 1.563328504562378, + "reward_std": 0.21476612985134125, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45395350456237793, + "rewards/pad": 0.109375, + "step": 138 + }, + { + "completion_length": 242.734375, + "epoch": 0.04429572976418101, + "grad_norm": 7.429897785186768, + "kl": 0.047119140625, + "learning_rate": 9.557042702358189e-07, + "loss": 0.0019, + "reward": 1.4747035503387451, + "reward_std": 0.16535669565200806, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3028285801410675, + "step": 139 + }, + { + "completion_length": 281.90625, + "epoch": 0.04461440407903123, + "grad_norm": 8.921205520629883, + "kl": 0.033447265625, + "learning_rate": 9.553855959209687e-07, + "loss": 0.0013, + "reward": 1.7500858306884766, + "reward_std": 0.13492590188980103, + "rewards/pad": 0.328125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42196089029312134, + "step": 140 + }, + { + "completion_length": 233.671875, + "epoch": 0.044933078393881457, + "grad_norm": 6.8118672370910645, + "kl": 0.043212890625, + "learning_rate": 9.550669216061185e-07, + "loss": 0.0017, + "reward": 1.7276577949523926, + "reward_std": 0.1283380091190338, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4932827949523926, + "step": 141 + }, + { + "completion_length": 339.484375, + "epoch": 0.04525175270873168, + "grad_norm": 3.8644649982452393, + "kl": 0.03125, + "learning_rate": 9.547482472912683e-07, + "loss": 0.0013, + "reward": 1.7625560760498047, + "reward_std": 0.17026743292808533, + "rewards/pad": 0.4375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3250562250614166, + "step": 142 + }, + { + "completion_length": 324.59375, + "epoch": 0.0455704270235819, + "grad_norm": 3.7466437816619873, + "kl": 0.037841796875, + "learning_rate": 9.544295729764181e-07, + "loss": 0.0015, + "reward": 1.369144320487976, + "reward_std": 0.06762483716011047, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3691442906856537, + "step": 143 + }, + { + "completion_length": 323.609375, + "epoch": 0.045889101338432124, + "grad_norm": 13.69135570526123, + "kl": 0.040771484375, + "learning_rate": 9.541108986615677e-07, + "loss": 0.0016, + "reward": 1.3979535102844238, + "reward_std": 0.06589115411043167, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39795345067977905, + "rewards/pad": 0.0, + "step": 144 + }, + { + "completion_length": 299.984375, + "epoch": 0.046207775653282344, + "grad_norm": 13.790980339050293, + "kl": 0.0615234375, + "learning_rate": 9.537922243467175e-07, + "loss": 0.0025, + "reward": 1.449575662612915, + "reward_std": 0.1409299224615097, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4495756924152374, + "step": 145 + }, + { + "completion_length": 224.8125, + "epoch": 0.04652644996813257, + "grad_norm": 17.20113182067871, + "kl": 0.06640625, + "learning_rate": 9.534735500318673e-07, + "loss": 0.0027, + "reward": 1.3060486316680908, + "reward_std": 0.2386862337589264, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.2904236912727356, + "rewards/pad": 0.03125, + "step": 146 + }, + { + "completion_length": 295.859375, + "epoch": 0.04684512428298279, + "grad_norm": 5.707564830780029, + "kl": 0.05517578125, + "learning_rate": 9.531548757170171e-07, + "loss": 0.0022, + "reward": 1.4157187938690186, + "reward_std": 0.19549745321273804, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.33759376406669617, + "step": 147 + }, + { + "completion_length": 196.71875, + "epoch": 0.04716379859783301, + "grad_norm": 7.082728385925293, + "kl": 0.107421875, + "learning_rate": 9.528362014021669e-07, + "loss": 0.0043, + "reward": 1.6351996660232544, + "reward_std": 0.2388363629579544, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.447699636220932, + "step": 148 + }, + { + "completion_length": 209.5, + "epoch": 0.04748247291268324, + "grad_norm": 4.3276190757751465, + "kl": 0.0537109375, + "learning_rate": 9.525175270873167e-07, + "loss": 0.0022, + "reward": 1.6083552837371826, + "reward_std": 0.1997946947813034, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5302302837371826, + "step": 149 + }, + { + "completion_length": 224.359375, + "epoch": 0.04780114722753346, + "grad_norm": 21.417600631713867, + "kl": 0.058349609375, + "learning_rate": 9.521988527724664e-07, + "loss": 0.0023, + "reward": 1.4707787036895752, + "reward_std": 0.13133534789085388, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47077876329421997, + "step": 150 + }, + { + "completion_length": 234.34375, + "epoch": 0.04811982154238369, + "grad_norm": 7.798461437225342, + "kl": 0.0732421875, + "learning_rate": 9.518801784576163e-07, + "loss": 0.0029, + "reward": 1.2438874244689941, + "reward_std": 0.1509024202823639, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.24388736486434937, + "rewards/pad": 0.0, + "step": 151 + }, + { + "completion_length": 228.71875, + "epoch": 0.04843849585723391, + "grad_norm": 7.1418585777282715, + "kl": 0.06591796875, + "learning_rate": 9.515615041427661e-07, + "loss": 0.0026, + "reward": 1.4649324417114258, + "reward_std": 0.1798781454563141, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.433682382106781, + "rewards/pad": 0.03125, + "step": 152 + }, + { + "completion_length": 276.234375, + "epoch": 0.04875717017208413, + "grad_norm": 7.342140197753906, + "kl": 0.05224609375, + "learning_rate": 9.512428298279159e-07, + "loss": 0.0021, + "reward": 1.5144888162612915, + "reward_std": 0.1600225269794464, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4207388162612915, + "step": 153 + }, + { + "completion_length": 353.9375, + "epoch": 0.049075844486934354, + "grad_norm": 6.5262908935546875, + "kl": 0.02587890625, + "learning_rate": 9.509241555130656e-07, + "loss": 0.0011, + "reward": 1.4947094917297363, + "reward_std": 0.11990465223789215, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5103343725204468, + "step": 154 + }, + { + "completion_length": 215.359375, + "epoch": 0.049394518801784575, + "grad_norm": 8.246253967285156, + "kl": 0.060546875, + "learning_rate": 9.506054811982154e-07, + "loss": 0.0024, + "reward": 1.4783300161361694, + "reward_std": 0.16495484113693237, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41582998633384705, + "rewards/pad": 0.0625, + "step": 155 + }, + { + "completion_length": 364.953125, + "epoch": 0.0497131931166348, + "grad_norm": 4.4291911125183105, + "kl": 0.049560546875, + "learning_rate": 9.502868068833652e-07, + "loss": 0.002, + "reward": 1.445652723312378, + "reward_std": 0.15399569272994995, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46127766370773315, + "step": 156 + }, + { + "completion_length": 210.84375, + "epoch": 0.05003186743148502, + "grad_norm": 13.04565715789795, + "kl": 0.049560546875, + "learning_rate": 9.499681325685149e-07, + "loss": 0.002, + "reward": 1.5248757600784302, + "reward_std": 0.16148591041564941, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4155007600784302, + "rewards/pad": 0.109375, + "step": 157 + }, + { + "completion_length": 202.609375, + "epoch": 0.05035054174633524, + "grad_norm": 12.038368225097656, + "kl": 0.05908203125, + "learning_rate": 9.496494582536647e-07, + "loss": 0.0023, + "reward": 1.5142139196395874, + "reward_std": 0.22789838910102844, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4517139196395874, + "rewards/pad": 0.078125, + "step": 158 + }, + { + "completion_length": 173.71875, + "epoch": 0.05066921606118547, + "grad_norm": 25.960351943969727, + "kl": 0.0751953125, + "learning_rate": 9.493307839388145e-07, + "loss": 0.003, + "reward": 1.402698040008545, + "reward_std": 0.17803940176963806, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35582298040390015, + "rewards/pad": 0.046875, + "step": 159 + }, + { + "completion_length": 283.125, + "epoch": 0.05098789037603569, + "grad_norm": 7.958873748779297, + "kl": 0.04345703125, + "learning_rate": 9.490121096239643e-07, + "loss": 0.0017, + "reward": 1.4051318168640137, + "reward_std": 0.04836621135473251, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40513181686401367, + "rewards/pad": 0.0, + "step": 160 + }, + { + "completion_length": 214.46875, + "epoch": 0.05130656469088592, + "grad_norm": 11.376971244812012, + "kl": 0.0595703125, + "learning_rate": 9.48693435309114e-07, + "loss": 0.0024, + "reward": 1.4891833066940308, + "reward_std": 0.15629437565803528, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36418330669403076, + "step": 161 + }, + { + "completion_length": 200.84375, + "epoch": 0.05162523900573614, + "grad_norm": 9.97684383392334, + "kl": 0.058837890625, + "learning_rate": 9.483747609942638e-07, + "loss": 0.0024, + "reward": 1.499054193496704, + "reward_std": 0.16176065802574158, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4990541636943817, + "step": 162 + }, + { + "completion_length": 206.84375, + "epoch": 0.05194391332058636, + "grad_norm": 14.745891571044922, + "kl": 0.06591796875, + "learning_rate": 9.480560866794136e-07, + "loss": 0.0026, + "reward": 1.8148226737976074, + "reward_std": 0.13128326833248138, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5804476141929626, + "step": 163 + }, + { + "completion_length": 247.546875, + "epoch": 0.052262587635436585, + "grad_norm": 10.21143913269043, + "kl": 0.05322265625, + "learning_rate": 9.477374123645634e-07, + "loss": 0.0021, + "reward": 1.4356274604797363, + "reward_std": 0.15665191411972046, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4512525200843811, + "rewards/pad": 0.0, + "step": 164 + }, + { + "completion_length": 320.6875, + "epoch": 0.052581261950286805, + "grad_norm": 3.693413734436035, + "kl": 0.037109375, + "learning_rate": 9.474187380497131e-07, + "loss": 0.0015, + "reward": 1.470198392868042, + "reward_std": 0.10511145740747452, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47019830346107483, + "rewards/pad": 0.0, + "step": 165 + }, + { + "completion_length": 234.46875, + "epoch": 0.05289993626513703, + "grad_norm": 6.604076385498047, + "kl": 0.0517578125, + "learning_rate": 9.471000637348629e-07, + "loss": 0.0021, + "reward": 1.4114763736724854, + "reward_std": 0.1726670116186142, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.41147640347480774, + "step": 166 + }, + { + "completion_length": 371.0625, + "epoch": 0.05321861057998725, + "grad_norm": 12.597403526306152, + "kl": 0.042724609375, + "learning_rate": 9.467813894200127e-07, + "loss": 0.0017, + "reward": 1.3728115558624268, + "reward_std": 0.06814874708652496, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.372811496257782, + "step": 167 + }, + { + "completion_length": 206.71875, + "epoch": 0.05353728489483748, + "grad_norm": 14.53585433959961, + "kl": 0.07275390625, + "learning_rate": 9.464627151051625e-07, + "loss": 0.0029, + "reward": 1.4245121479034424, + "reward_std": 0.2063012272119522, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3620120584964752, + "step": 168 + }, + { + "completion_length": 120.09375, + "epoch": 0.0538559592096877, + "grad_norm": 11.440759658813477, + "kl": 0.09619140625, + "learning_rate": 9.461440407903123e-07, + "loss": 0.0039, + "reward": 1.455107569694519, + "reward_std": 0.20997771620750427, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33010753989219666, + "rewards/pad": 0.125, + "step": 169 + }, + { + "completion_length": 200.703125, + "epoch": 0.05417463352453792, + "grad_norm": 6.415365219116211, + "kl": 0.11328125, + "learning_rate": 9.458253664754621e-07, + "loss": 0.0045, + "reward": 1.4902559518814087, + "reward_std": 0.18468467891216278, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42775604128837585, + "rewards/pad": 0.0625, + "step": 170 + }, + { + "completion_length": 165.421875, + "epoch": 0.05449330783938815, + "grad_norm": 39.82363510131836, + "kl": 0.08837890625, + "learning_rate": 9.455066921606119e-07, + "loss": 0.0035, + "reward": 1.3934909105300903, + "reward_std": 0.11051173508167267, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3934909701347351, + "step": 171 + }, + { + "completion_length": 229.671875, + "epoch": 0.05481198215423837, + "grad_norm": 12.203930854797363, + "kl": 0.056884765625, + "learning_rate": 9.451880178457617e-07, + "loss": 0.0023, + "reward": 1.4572014808654785, + "reward_std": 0.10274035483598709, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4572015404701233, + "rewards/pad": 0.0, + "step": 172 + }, + { + "completion_length": 304.453125, + "epoch": 0.055130656469088594, + "grad_norm": 4.776219844818115, + "kl": 0.0517578125, + "learning_rate": 9.448693435309114e-07, + "loss": 0.0021, + "reward": 1.4793224334716797, + "reward_std": 0.12388372421264648, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4949474334716797, + "rewards/pad": 0.0, + "step": 173 + }, + { + "completion_length": 170.953125, + "epoch": 0.055449330783938815, + "grad_norm": 7.443392753601074, + "kl": 0.061767578125, + "learning_rate": 9.445506692160612e-07, + "loss": 0.0025, + "reward": 1.614640474319458, + "reward_std": 0.30754411220550537, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48964059352874756, + "rewards/pad": 0.140625, + "step": 174 + }, + { + "completion_length": 127.765625, + "epoch": 0.055768005098789035, + "grad_norm": 15.550397872924805, + "kl": 0.095703125, + "learning_rate": 9.44231994901211e-07, + "loss": 0.0038, + "reward": 1.4903450012207031, + "reward_std": 0.2514011859893799, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39659497141838074, + "rewards/pad": 0.109375, + "step": 175 + }, + { + "completion_length": 259.046875, + "epoch": 0.05608667941363926, + "grad_norm": 56.89799880981445, + "kl": 0.06103515625, + "learning_rate": 9.439133205863608e-07, + "loss": 0.0024, + "reward": 1.428381085395813, + "reward_std": 0.10250850021839142, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.428381085395813, + "step": 176 + }, + { + "completion_length": 236.390625, + "epoch": 0.05640535372848948, + "grad_norm": 11.407561302185059, + "kl": 0.049560546875, + "learning_rate": 9.435946462715104e-07, + "loss": 0.002, + "reward": 1.6906650066375732, + "reward_std": 0.22784824669361115, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5187899470329285, + "rewards/pad": 0.203125, + "step": 177 + }, + { + "completion_length": 327.375, + "epoch": 0.05672402804333971, + "grad_norm": 7.302003383636475, + "kl": 0.037841796875, + "learning_rate": 9.432759719566602e-07, + "loss": 0.0015, + "reward": 1.3769761323928833, + "reward_std": 0.01789081282913685, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3769761621952057, + "step": 178 + }, + { + "completion_length": 197.0625, + "epoch": 0.05704270235818993, + "grad_norm": 8.595909118652344, + "kl": 0.06982421875, + "learning_rate": 9.4295729764181e-07, + "loss": 0.0028, + "reward": 1.488729476928711, + "reward_std": 0.16121619939804077, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48872941732406616, + "rewards/pad": 0.015625, + "step": 179 + }, + { + "completion_length": 284.4375, + "epoch": 0.05736137667304015, + "grad_norm": 7.860208988189697, + "kl": 0.053466796875, + "learning_rate": 9.426386233269598e-07, + "loss": 0.0022, + "reward": 1.3685816526412964, + "reward_std": 0.08686651289463043, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.368581622838974, + "step": 180 + }, + { + "completion_length": 257.359375, + "epoch": 0.05768005098789038, + "grad_norm": 6.262935638427734, + "kl": 0.080078125, + "learning_rate": 9.423199490121095e-07, + "loss": 0.0032, + "reward": 1.6427003145217896, + "reward_std": 0.245242178440094, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5489503145217896, + "rewards/pad": 0.125, + "step": 181 + }, + { + "completion_length": 129.53125, + "epoch": 0.0579987253027406, + "grad_norm": 11.011406898498535, + "kl": 0.06640625, + "learning_rate": 9.420012746972593e-07, + "loss": 0.0027, + "reward": 1.786719560623169, + "reward_std": 0.15677408874034882, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.536719560623169, + "rewards/pad": 0.25, + "step": 182 + }, + { + "completion_length": 234.40625, + "epoch": 0.058317399617590825, + "grad_norm": 10.98038101196289, + "kl": 0.05810546875, + "learning_rate": 9.416826003824091e-07, + "loss": 0.0023, + "reward": 1.3131130933761597, + "reward_std": 0.21581920981407166, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.32873809337615967, + "rewards/pad": 0.046875, + "step": 183 + }, + { + "completion_length": 256.359375, + "epoch": 0.058636073932441045, + "grad_norm": 6.828302383422852, + "kl": 0.0625, + "learning_rate": 9.413639260675588e-07, + "loss": 0.0025, + "reward": 1.2020618915557861, + "reward_std": 0.07125719636678696, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.20206193625926971, + "step": 184 + }, + { + "completion_length": 223.703125, + "epoch": 0.058954748247291265, + "grad_norm": 8.792315483093262, + "kl": 0.0654296875, + "learning_rate": 9.410452517527086e-07, + "loss": 0.0026, + "reward": 1.465730905532837, + "reward_std": 0.17245060205459595, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4344809055328369, + "step": 185 + }, + { + "completion_length": 191.0625, + "epoch": 0.05927342256214149, + "grad_norm": 90.65461730957031, + "kl": 0.061279296875, + "learning_rate": 9.407265774378584e-07, + "loss": 0.0025, + "reward": 1.5472661256790161, + "reward_std": 0.1555110514163971, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.31289106607437134, + "rewards/pad": 0.25, + "step": 186 + }, + { + "completion_length": 266.484375, + "epoch": 0.05959209687699171, + "grad_norm": 8.738775253295898, + "kl": 0.053955078125, + "learning_rate": 9.404079031230082e-07, + "loss": 0.0022, + "reward": 1.319126009941101, + "reward_std": 0.08598145842552185, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31912603974342346, + "rewards/pad": 0.0, + "step": 187 + }, + { + "completion_length": 210.984375, + "epoch": 0.05991077119184194, + "grad_norm": 14.283634185791016, + "kl": 0.06201171875, + "learning_rate": 9.40089228808158e-07, + "loss": 0.0025, + "reward": 1.5603177547454834, + "reward_std": 0.16099268198013306, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5759426951408386, + "rewards/pad": 0.0, + "step": 188 + }, + { + "completion_length": 322.046875, + "epoch": 0.06022944550669216, + "grad_norm": 4.540855884552002, + "kl": 0.05810546875, + "learning_rate": 9.397705544933078e-07, + "loss": 0.0023, + "reward": 1.466914176940918, + "reward_std": 0.07637807726860046, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46691423654556274, + "rewards/pad": 0.0, + "step": 189 + }, + { + "completion_length": 223.703125, + "epoch": 0.06054811982154239, + "grad_norm": 8.010270118713379, + "kl": 0.06396484375, + "learning_rate": 9.394518801784576e-07, + "loss": 0.0026, + "reward": 1.5183391571044922, + "reward_std": 0.19354218244552612, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4870891571044922, + "step": 190 + }, + { + "completion_length": 286.796875, + "epoch": 0.06086679413639261, + "grad_norm": 8.864216804504395, + "kl": 0.041748046875, + "learning_rate": 9.391332058636074e-07, + "loss": 0.0017, + "reward": 1.661377191543579, + "reward_std": 0.31189072132110596, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4582522511482239, + "step": 191 + }, + { + "completion_length": 158.6875, + "epoch": 0.06118546845124283, + "grad_norm": 6.764616012573242, + "kl": 0.07275390625, + "learning_rate": 9.388145315487571e-07, + "loss": 0.0029, + "reward": 1.5173699855804443, + "reward_std": 0.1659144163131714, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.28299516439437866, + "step": 192 + }, + { + "completion_length": 211.71875, + "epoch": 0.061504142766093055, + "grad_norm": 9.038619995117188, + "kl": 0.0693359375, + "learning_rate": 9.384958572339069e-07, + "loss": 0.0028, + "reward": 1.4801876544952393, + "reward_std": 0.09755624830722809, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4801877737045288, + "step": 193 + }, + { + "completion_length": 150.46875, + "epoch": 0.061822817080943275, + "grad_norm": 67.90802001953125, + "kl": 0.09228515625, + "learning_rate": 9.381771829190567e-07, + "loss": 0.0037, + "reward": 1.4884817600250244, + "reward_std": 0.12955227494239807, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39473170042037964, + "rewards/pad": 0.09375, + "step": 194 + }, + { + "completion_length": 247.203125, + "epoch": 0.0621414913957935, + "grad_norm": 6.690598011016846, + "kl": 0.064453125, + "learning_rate": 9.378585086042065e-07, + "loss": 0.0026, + "reward": 1.4969897270202637, + "reward_std": 0.06976005434989929, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4969896674156189, + "step": 195 + }, + { + "completion_length": 181.4375, + "epoch": 0.06246016571064372, + "grad_norm": 8.010198593139648, + "kl": 0.1103515625, + "learning_rate": 9.375398342893562e-07, + "loss": 0.0044, + "reward": 1.564711570739746, + "reward_std": 0.11871609091758728, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43971166014671326, + "rewards/pad": 0.125, + "step": 196 + }, + { + "completion_length": 193.390625, + "epoch": 0.06277884002549394, + "grad_norm": 11.752547264099121, + "kl": 0.07275390625, + "learning_rate": 9.37221159974506e-07, + "loss": 0.0029, + "reward": 1.659630298614502, + "reward_std": 0.13107186555862427, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4252552390098572, + "rewards/pad": 0.234375, + "step": 197 + }, + { + "completion_length": 211.46875, + "epoch": 0.06309751434034416, + "grad_norm": 11.705774307250977, + "kl": 0.050048828125, + "learning_rate": 9.369024856596558e-07, + "loss": 0.002, + "reward": 1.81485915184021, + "reward_std": 0.2242487072944641, + "rewards/answer_reward": 0.265625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5492340922355652, + "step": 198 + }, + { + "completion_length": 264.84375, + "epoch": 0.0634161886551944, + "grad_norm": 11.895368576049805, + "kl": 0.052490234375, + "learning_rate": 9.365838113448056e-07, + "loss": 0.0021, + "reward": 1.466489315032959, + "reward_std": 0.1925201416015625, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4196142852306366, + "step": 199 + }, + { + "completion_length": 224.015625, + "epoch": 0.06373486297004462, + "grad_norm": 8.283370971679688, + "kl": 0.057861328125, + "learning_rate": 9.362651370299553e-07, + "loss": 0.0023, + "reward": 1.607632040977478, + "reward_std": 0.16389986872673035, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3732570707798004, + "step": 200 + }, + { + "completion_length": 329.734375, + "epoch": 0.06405353728489484, + "grad_norm": 3.239655017852783, + "kl": 0.0306396484375, + "learning_rate": 9.359464627151051e-07, + "loss": 0.0012, + "reward": 1.567150354385376, + "reward_std": 0.06768325716257095, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33277541399002075, + "step": 201 + }, + { + "completion_length": 195.8125, + "epoch": 0.06437221159974506, + "grad_norm": 24.049251556396484, + "kl": 0.06982421875, + "learning_rate": 9.356277884002549e-07, + "loss": 0.0028, + "reward": 1.3611292839050293, + "reward_std": 0.13703155517578125, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3298793137073517, + "rewards/pad": 0.03125, + "step": 202 + }, + { + "completion_length": 290.625, + "epoch": 0.06469088591459528, + "grad_norm": 9.575765609741211, + "kl": 0.06298828125, + "learning_rate": 9.353091140854047e-07, + "loss": 0.0025, + "reward": 1.4079358577728271, + "reward_std": 0.08651718497276306, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4235607981681824, + "rewards/pad": 0.0, + "step": 203 + }, + { + "completion_length": 230.625, + "epoch": 0.06500956022944551, + "grad_norm": 15.252603530883789, + "kl": 0.05517578125, + "learning_rate": 9.349904397705544e-07, + "loss": 0.0022, + "reward": 1.5752151012420654, + "reward_std": 0.10122407972812653, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45021528005599976, + "rewards/pad": 0.125, + "step": 204 + }, + { + "completion_length": 195.625, + "epoch": 0.06532823454429573, + "grad_norm": 6.833919525146484, + "kl": 0.08056640625, + "learning_rate": 9.346717654557042e-07, + "loss": 0.0032, + "reward": 1.4168448448181152, + "reward_std": 0.1276351809501648, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41684478521347046, + "step": 205 + }, + { + "completion_length": 152.125, + "epoch": 0.06564690885914595, + "grad_norm": 11.947805404663086, + "kl": 0.0712890625, + "learning_rate": 9.34353091140854e-07, + "loss": 0.0028, + "reward": 1.6025731563568115, + "reward_std": 0.13310229778289795, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4619481861591339, + "rewards/pad": 0.140625, + "step": 206 + }, + { + "completion_length": 152.140625, + "epoch": 0.06596558317399617, + "grad_norm": 13.497575759887695, + "kl": 0.09033203125, + "learning_rate": 9.340344168260039e-07, + "loss": 0.0036, + "reward": 1.3815317153930664, + "reward_std": 0.12820778787136078, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38153162598609924, + "step": 207 + }, + { + "completion_length": 206.203125, + "epoch": 0.0662842574888464, + "grad_norm": 6.831051349639893, + "kl": 0.07470703125, + "learning_rate": 9.337157425111536e-07, + "loss": 0.003, + "reward": 1.5932207107543945, + "reward_std": 0.14225542545318604, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45259571075439453, + "rewards/pad": 0.140625, + "step": 208 + }, + { + "completion_length": 335.296875, + "epoch": 0.06660293180369663, + "grad_norm": 6.019325256347656, + "kl": 0.052978515625, + "learning_rate": 9.333970681963034e-07, + "loss": 0.0021, + "reward": 1.528432846069336, + "reward_std": 0.11231175065040588, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5440578460693359, + "step": 209 + }, + { + "completion_length": 253.203125, + "epoch": 0.06692160611854685, + "grad_norm": 7.904732704162598, + "kl": 0.05517578125, + "learning_rate": 9.330783938814532e-07, + "loss": 0.0022, + "reward": 1.5680043697357178, + "reward_std": 0.10188771784305573, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5680044293403625, + "rewards/pad": 0.0, + "step": 210 + }, + { + "completion_length": 202.203125, + "epoch": 0.06724028043339707, + "grad_norm": 6.209277153015137, + "kl": 0.059326171875, + "learning_rate": 9.32759719566603e-07, + "loss": 0.0024, + "reward": 1.5635247230529785, + "reward_std": 0.1443396955728531, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5322748422622681, + "rewards/pad": 0.03125, + "step": 211 + }, + { + "completion_length": 254.203125, + "epoch": 0.06755895474824729, + "grad_norm": 6.30163049697876, + "kl": 0.048583984375, + "learning_rate": 9.324410452517527e-07, + "loss": 0.0019, + "reward": 1.519893765449524, + "reward_std": 0.23284552991390228, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4105188250541687, + "rewards/pad": 0.125, + "step": 212 + }, + { + "completion_length": 271.28125, + "epoch": 0.06787762906309751, + "grad_norm": 15.06171989440918, + "kl": 0.060791015625, + "learning_rate": 9.321223709369025e-07, + "loss": 0.0024, + "reward": 1.4648241996765137, + "reward_std": 0.07391152530908585, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4648240804672241, + "rewards/pad": 0.0, + "step": 213 + }, + { + "completion_length": 197.5625, + "epoch": 0.06819630337794774, + "grad_norm": 17.046770095825195, + "kl": 0.07080078125, + "learning_rate": 9.318036966220523e-07, + "loss": 0.0028, + "reward": 1.5187010765075684, + "reward_std": 0.1929798126220703, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44057604670524597, + "rewards/pad": 0.09375, + "step": 214 + }, + { + "completion_length": 287.140625, + "epoch": 0.06851497769279796, + "grad_norm": 4.82719612121582, + "kl": 0.038330078125, + "learning_rate": 9.314850223072021e-07, + "loss": 0.0015, + "reward": 1.410596251487732, + "reward_std": 0.11355704069137573, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.42622125148773193, + "step": 215 + }, + { + "completion_length": 260.78125, + "epoch": 0.06883365200764818, + "grad_norm": 17.187711715698242, + "kl": 0.052001953125, + "learning_rate": 9.311663479923517e-07, + "loss": 0.0021, + "reward": 1.840174913406372, + "reward_std": 0.11517071723937988, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49642491340637207, + "rewards/pad": 0.34375, + "step": 216 + }, + { + "completion_length": 271.21875, + "epoch": 0.0691523263224984, + "grad_norm": 5.477590560913086, + "kl": 0.052490234375, + "learning_rate": 9.308476736775015e-07, + "loss": 0.0021, + "reward": 1.3956875801086426, + "reward_std": 0.15309034287929535, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.31756263971328735, + "rewards/pad": 0.09375, + "step": 217 + }, + { + "completion_length": 248.046875, + "epoch": 0.06947100063734862, + "grad_norm": 29.749284744262695, + "kl": 0.05078125, + "learning_rate": 9.305289993626513e-07, + "loss": 0.002, + "reward": 1.5329116582870483, + "reward_std": 0.10812871158123016, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4235367476940155, + "rewards/pad": 0.109375, + "step": 218 + }, + { + "completion_length": 320.375, + "epoch": 0.06978967495219886, + "grad_norm": 6.243754863739014, + "kl": 0.0361328125, + "learning_rate": 9.30210325047801e-07, + "loss": 0.0014, + "reward": 1.5504281520843506, + "reward_std": 0.04979484900832176, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4254281222820282, + "step": 219 + }, + { + "completion_length": 232.671875, + "epoch": 0.07010834926704908, + "grad_norm": 12.06491756439209, + "kl": 0.10205078125, + "learning_rate": 9.298916507329508e-07, + "loss": 0.0041, + "reward": 1.6985371112823486, + "reward_std": 0.19800494611263275, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5422872304916382, + "step": 220 + }, + { + "completion_length": 251.9375, + "epoch": 0.0704270235818993, + "grad_norm": 9.655678749084473, + "kl": 0.052490234375, + "learning_rate": 9.295729764181006e-07, + "loss": 0.0021, + "reward": 1.524951696395874, + "reward_std": 0.1220092698931694, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43120163679122925, + "step": 221 + }, + { + "completion_length": 195.0, + "epoch": 0.07074569789674952, + "grad_norm": 11.081148147583008, + "kl": 0.072265625, + "learning_rate": 9.292543021032504e-07, + "loss": 0.0029, + "reward": 1.3704537153244019, + "reward_std": 0.16427022218704224, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33920374512672424, + "step": 222 + }, + { + "completion_length": 270.34375, + "epoch": 0.07106437221159974, + "grad_norm": 4.369564533233643, + "kl": 0.044189453125, + "learning_rate": 9.289356277884001e-07, + "loss": 0.0018, + "reward": 1.6683555841445923, + "reward_std": 0.24814534187316895, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5121055245399475, + "step": 223 + }, + { + "completion_length": 317.8125, + "epoch": 0.07138304652644997, + "grad_norm": 10.045781135559082, + "kl": 0.042236328125, + "learning_rate": 9.286169534735499e-07, + "loss": 0.0017, + "reward": 1.2989885807037354, + "reward_std": 0.11577681452035904, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2677384614944458, + "step": 224 + }, + { + "completion_length": 211.5, + "epoch": 0.07170172084130019, + "grad_norm": 6.441803932189941, + "kl": 0.06591796875, + "learning_rate": 9.282982791586997e-07, + "loss": 0.0026, + "reward": 1.5342642068862915, + "reward_std": 0.12945051491260529, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.534264087677002, + "rewards/pad": 0.0, + "step": 225 + }, + { + "completion_length": 294.078125, + "epoch": 0.07202039515615041, + "grad_norm": 8.124902725219727, + "kl": 0.0556640625, + "learning_rate": 9.279796048438496e-07, + "loss": 0.0022, + "reward": 1.4743280410766602, + "reward_std": 0.060752347111701965, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47432801127433777, + "step": 226 + }, + { + "completion_length": 259.953125, + "epoch": 0.07233906947100063, + "grad_norm": 22.559650421142578, + "kl": 0.06005859375, + "learning_rate": 9.276609305289993e-07, + "loss": 0.0024, + "reward": 1.4046584367752075, + "reward_std": 0.16901271045207977, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2640334963798523, + "step": 227 + }, + { + "completion_length": 283.515625, + "epoch": 0.07265774378585087, + "grad_norm": 10.921895980834961, + "kl": 0.056396484375, + "learning_rate": 9.273422562141491e-07, + "loss": 0.0023, + "reward": 1.2822096347808838, + "reward_std": 0.04250604659318924, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2822096049785614, + "rewards/pad": 0.0, + "step": 228 + }, + { + "completion_length": 191.0, + "epoch": 0.07297641810070109, + "grad_norm": 14.872382164001465, + "kl": 0.0732421875, + "learning_rate": 9.270235818992989e-07, + "loss": 0.0029, + "reward": 1.5225872993469238, + "reward_std": 0.10570458322763443, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.522587239742279, + "rewards/pad": 0.0, + "step": 229 + }, + { + "completion_length": 291.3125, + "epoch": 0.07329509241555131, + "grad_norm": 5.075259685516357, + "kl": 0.04248046875, + "learning_rate": 9.267049075844487e-07, + "loss": 0.0017, + "reward": 1.3570164442062378, + "reward_std": 0.1311841905117035, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3726414740085602, + "rewards/pad": 0.0, + "step": 230 + }, + { + "completion_length": 240.15625, + "epoch": 0.07361376673040153, + "grad_norm": 47.22294235229492, + "kl": 0.0654296875, + "learning_rate": 9.263862332695984e-07, + "loss": 0.0026, + "reward": 1.4281563758850098, + "reward_std": 0.06662596762180328, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4281563460826874, + "rewards/pad": 0.0, + "step": 231 + }, + { + "completion_length": 208.578125, + "epoch": 0.07393244104525175, + "grad_norm": 13.75542163848877, + "kl": 0.06298828125, + "learning_rate": 9.260675589547482e-07, + "loss": 0.0025, + "reward": 1.466019868850708, + "reward_std": 0.1753520667552948, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.43476995825767517, + "step": 232 + }, + { + "completion_length": 222.59375, + "epoch": 0.07425111536010198, + "grad_norm": 8.445175170898438, + "kl": 0.07958984375, + "learning_rate": 9.25748884639898e-07, + "loss": 0.0032, + "reward": 1.443455696105957, + "reward_std": 0.21538135409355164, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.24033081531524658, + "rewards/pad": 0.21875, + "step": 233 + }, + { + "completion_length": 106.125, + "epoch": 0.0745697896749522, + "grad_norm": 12.408512115478516, + "kl": 0.10400390625, + "learning_rate": 9.254302103250478e-07, + "loss": 0.0042, + "reward": 1.5548592805862427, + "reward_std": 0.16731616854667664, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5548592209815979, + "rewards/pad": 0.0, + "step": 234 + }, + { + "completion_length": 178.796875, + "epoch": 0.07488846398980242, + "grad_norm": 10.482146263122559, + "kl": 0.06982421875, + "learning_rate": 9.251115360101975e-07, + "loss": 0.0028, + "reward": 1.7268046140670776, + "reward_std": 0.13564898073673248, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3830545246601105, + "rewards/pad": 0.34375, + "step": 235 + }, + { + "completion_length": 187.765625, + "epoch": 0.07520713830465264, + "grad_norm": 8.685198783874512, + "kl": 0.0673828125, + "learning_rate": 9.247928616953473e-07, + "loss": 0.0027, + "reward": 1.4304745197296143, + "reward_std": 0.10561563819646835, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43047448992729187, + "rewards/pad": 0.0, + "step": 236 + }, + { + "completion_length": 216.78125, + "epoch": 0.07552581261950286, + "grad_norm": 8.801995277404785, + "kl": 0.0703125, + "learning_rate": 9.244741873804971e-07, + "loss": 0.0028, + "reward": 1.4420864582061768, + "reward_std": 0.2987205684185028, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.39521145820617676, + "step": 237 + }, + { + "completion_length": 221.625, + "epoch": 0.0758444869343531, + "grad_norm": 9.968708038330078, + "kl": 0.076171875, + "learning_rate": 9.241555130656469e-07, + "loss": 0.0031, + "reward": 1.555429220199585, + "reward_std": 0.21994394063949585, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.46167925000190735, + "step": 238 + }, + { + "completion_length": 197.375, + "epoch": 0.07616316124920332, + "grad_norm": 42.812137603759766, + "kl": 0.07177734375, + "learning_rate": 9.238368387507966e-07, + "loss": 0.0029, + "reward": 1.5328187942504883, + "reward_std": 0.12525755167007446, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4234437346458435, + "rewards/pad": 0.109375, + "step": 239 + }, + { + "completion_length": 289.734375, + "epoch": 0.07648183556405354, + "grad_norm": 7.226988792419434, + "kl": 0.051025390625, + "learning_rate": 9.235181644359464e-07, + "loss": 0.002, + "reward": 1.4006308317184448, + "reward_std": 0.20406073331832886, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.32250580191612244, + "step": 240 + }, + { + "completion_length": 174.109375, + "epoch": 0.07680050987890376, + "grad_norm": 9.643024444580078, + "kl": 0.0791015625, + "learning_rate": 9.231994901210962e-07, + "loss": 0.0032, + "reward": 1.610644817352295, + "reward_std": 0.10245459526777267, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47001978754997253, + "step": 241 + }, + { + "completion_length": 216.890625, + "epoch": 0.07711918419375398, + "grad_norm": 10.71285343170166, + "kl": 0.06298828125, + "learning_rate": 9.22880815806246e-07, + "loss": 0.0025, + "reward": 1.7045223712921143, + "reward_std": 0.17062969505786896, + "rewards/answer_reward": 0.328125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3763972520828247, + "step": 242 + }, + { + "completion_length": 239.46875, + "epoch": 0.07743785850860421, + "grad_norm": 20.129676818847656, + "kl": 0.06298828125, + "learning_rate": 9.225621414913957e-07, + "loss": 0.0025, + "reward": 1.4195454120635986, + "reward_std": 0.09258443117141724, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41954541206359863, + "step": 243 + }, + { + "completion_length": 215.234375, + "epoch": 0.07775653282345443, + "grad_norm": 54.93705749511719, + "kl": 0.06787109375, + "learning_rate": 9.222434671765456e-07, + "loss": 0.0027, + "reward": 1.3729498386383057, + "reward_std": 0.07894681394100189, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37294989824295044, + "rewards/pad": 0.0, + "step": 244 + }, + { + "completion_length": 248.703125, + "epoch": 0.07807520713830465, + "grad_norm": 4.213068962097168, + "kl": 0.051025390625, + "learning_rate": 9.219247928616954e-07, + "loss": 0.002, + "reward": 1.441326379776001, + "reward_std": 0.11559078842401505, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.378826379776001, + "rewards/pad": 0.0625, + "step": 245 + }, + { + "completion_length": 192.578125, + "epoch": 0.07839388145315487, + "grad_norm": 6.598721027374268, + "kl": 0.06787109375, + "learning_rate": 9.216061185468452e-07, + "loss": 0.0027, + "reward": 1.6929112672805786, + "reward_std": 0.22981858253479004, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44291120767593384, + "rewards/pad": 0.265625, + "step": 246 + }, + { + "completion_length": 239.59375, + "epoch": 0.0787125557680051, + "grad_norm": 7.554116725921631, + "kl": 0.07763671875, + "learning_rate": 9.212874442319949e-07, + "loss": 0.0031, + "reward": 1.4620293378829956, + "reward_std": 0.15765343606472015, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41515427827835083, + "step": 247 + }, + { + "completion_length": 263.984375, + "epoch": 0.07903123008285533, + "grad_norm": 17.3502254486084, + "kl": 0.056884765625, + "learning_rate": 9.209687699171447e-07, + "loss": 0.0023, + "reward": 1.506328821182251, + "reward_std": 0.10069683194160461, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.396953821182251, + "step": 248 + }, + { + "completion_length": 276.125, + "epoch": 0.07934990439770555, + "grad_norm": 5.626839637756348, + "kl": 0.043212890625, + "learning_rate": 9.206500956022945e-07, + "loss": 0.0017, + "reward": 1.58784019947052, + "reward_std": 0.20608378946781158, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.43159013986587524, + "step": 249 + }, + { + "completion_length": 222.15625, + "epoch": 0.07966857871255577, + "grad_norm": 7.744899749755859, + "kl": 0.060791015625, + "learning_rate": 9.203314212874442e-07, + "loss": 0.0024, + "reward": 1.5529812574386597, + "reward_std": 0.1434810906648636, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4592312276363373, + "rewards/pad": 0.125, + "step": 250 + }, + { + "completion_length": 194.96875, + "epoch": 0.07998725302740599, + "grad_norm": 8.183343887329102, + "kl": 0.06396484375, + "learning_rate": 9.20012746972594e-07, + "loss": 0.0026, + "reward": 1.4947509765625, + "reward_std": 0.1750367283821106, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3853759467601776, + "rewards/pad": 0.109375, + "step": 251 + }, + { + "completion_length": 298.640625, + "epoch": 0.08030592734225621, + "grad_norm": 24.4941349029541, + "kl": 0.045166015625, + "learning_rate": 9.196940726577438e-07, + "loss": 0.0018, + "reward": 1.520071268081665, + "reward_std": 0.11857470124959946, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45757126808166504, + "step": 252 + }, + { + "completion_length": 192.84375, + "epoch": 0.08062460165710644, + "grad_norm": 9.189983367919922, + "kl": 0.064453125, + "learning_rate": 9.193753983428936e-07, + "loss": 0.0026, + "reward": 1.8057680130004883, + "reward_std": 0.13417977094650269, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5870180130004883, + "step": 253 + }, + { + "completion_length": 251.90625, + "epoch": 0.08094327597195666, + "grad_norm": 11.186515808105469, + "kl": 0.052978515625, + "learning_rate": 9.190567240280433e-07, + "loss": 0.0021, + "reward": 1.3663510084152222, + "reward_std": 0.18374572694301605, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3507259488105774, + "rewards/pad": 0.03125, + "step": 254 + }, + { + "completion_length": 204.75, + "epoch": 0.08126195028680688, + "grad_norm": 20.661352157592773, + "kl": 0.0859375, + "learning_rate": 9.18738049713193e-07, + "loss": 0.0034, + "reward": 1.3985034227371216, + "reward_std": 0.0647159069776535, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3985033333301544, + "rewards/pad": 0.0, + "step": 255 + }, + { + "completion_length": 184.46875, + "epoch": 0.0815806246016571, + "grad_norm": 9.674245834350586, + "kl": 0.06396484375, + "learning_rate": 9.184193753983428e-07, + "loss": 0.0026, + "reward": 1.5792434215545654, + "reward_std": 0.21538616716861725, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.422993540763855, + "rewards/pad": 0.15625, + "step": 256 + }, + { + "completion_length": 253.640625, + "epoch": 0.08189929891650732, + "grad_norm": 117.39447784423828, + "kl": 0.068359375, + "learning_rate": 9.181007010834926e-07, + "loss": 0.0027, + "reward": 1.5494431257247925, + "reward_std": 0.10500036925077438, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5494431257247925, + "step": 257 + }, + { + "completion_length": 262.453125, + "epoch": 0.08221797323135756, + "grad_norm": 15.066034317016602, + "kl": 0.04345703125, + "learning_rate": 9.177820267686423e-07, + "loss": 0.0017, + "reward": 1.6858816146850586, + "reward_std": 0.11864806711673737, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5765066146850586, + "step": 258 + }, + { + "completion_length": 214.140625, + "epoch": 0.08253664754620778, + "grad_norm": 14.4360933303833, + "kl": 0.06689453125, + "learning_rate": 9.174633524537921e-07, + "loss": 0.0027, + "reward": 1.4065463542938232, + "reward_std": 0.13927623629570007, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40654638409614563, + "rewards/pad": 0.015625, + "step": 259 + }, + { + "completion_length": 136.421875, + "epoch": 0.082855321861058, + "grad_norm": 14.50709342956543, + "kl": 0.0947265625, + "learning_rate": 9.171446781389419e-07, + "loss": 0.0038, + "reward": 1.6555917263031006, + "reward_std": 0.169988214969635, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4837167263031006, + "step": 260 + }, + { + "completion_length": 211.5625, + "epoch": 0.08317399617590822, + "grad_norm": 27.106956481933594, + "kl": 0.055419921875, + "learning_rate": 9.168260038240917e-07, + "loss": 0.0022, + "reward": 1.4652085304260254, + "reward_std": 0.07502000033855438, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34020841121673584, + "rewards/pad": 0.125, + "step": 261 + }, + { + "completion_length": 207.0, + "epoch": 0.08349267049075844, + "grad_norm": 8.570914268493652, + "kl": 0.05615234375, + "learning_rate": 9.165073295092414e-07, + "loss": 0.0022, + "reward": 1.5496916770935059, + "reward_std": 0.12746258080005646, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5496916770935059, + "step": 262 + }, + { + "completion_length": 290.109375, + "epoch": 0.08381134480560867, + "grad_norm": 10.006585121154785, + "kl": 0.048583984375, + "learning_rate": 9.161886551943912e-07, + "loss": 0.0019, + "reward": 1.3973890542984009, + "reward_std": 0.13302114605903625, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4286390244960785, + "step": 263 + }, + { + "completion_length": 264.984375, + "epoch": 0.0841300191204589, + "grad_norm": 5.002647876739502, + "kl": 0.0703125, + "learning_rate": 9.158699808795411e-07, + "loss": 0.0028, + "reward": 1.3228042125701904, + "reward_std": 0.1068376898765564, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.33842921257019043, + "rewards/pad": 0.0, + "step": 264 + }, + { + "completion_length": 142.734375, + "epoch": 0.08444869343530911, + "grad_norm": 16.07257843017578, + "kl": 0.0859375, + "learning_rate": 9.155513065646909e-07, + "loss": 0.0034, + "reward": 1.4866881370544434, + "reward_std": 0.2174568474292755, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4554380774497986, + "step": 265 + }, + { + "completion_length": 185.0, + "epoch": 0.08476736775015933, + "grad_norm": 14.378037452697754, + "kl": 0.064453125, + "learning_rate": 9.152326322498406e-07, + "loss": 0.0026, + "reward": 1.4063767194747925, + "reward_std": 0.2589850127696991, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.37512677907943726, + "rewards/pad": 0.0625, + "step": 266 + }, + { + "completion_length": 258.453125, + "epoch": 0.08508604206500955, + "grad_norm": 14.581949234008789, + "kl": 0.04541015625, + "learning_rate": 9.149139579349904e-07, + "loss": 0.0018, + "reward": 1.425885796546936, + "reward_std": 0.09042633324861526, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42588573694229126, + "step": 267 + }, + { + "completion_length": 190.171875, + "epoch": 0.08540471637985979, + "grad_norm": 8.301403999328613, + "kl": 0.064453125, + "learning_rate": 9.145952836201402e-07, + "loss": 0.0026, + "reward": 1.5484635829925537, + "reward_std": 0.17337355017662048, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4390886723995209, + "rewards/pad": 0.125, + "step": 268 + }, + { + "completion_length": 208.59375, + "epoch": 0.08572339069471001, + "grad_norm": 9.300251007080078, + "kl": 0.0771484375, + "learning_rate": 9.1427660930529e-07, + "loss": 0.0031, + "reward": 1.4223666191101074, + "reward_std": 0.0689803957939148, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4223666787147522, + "rewards/pad": 0.0, + "step": 269 + }, + { + "completion_length": 337.84375, + "epoch": 0.08604206500956023, + "grad_norm": 7.203383922576904, + "kl": 0.037109375, + "learning_rate": 9.139579349904397e-07, + "loss": 0.0015, + "reward": 1.3084739446640015, + "reward_std": 0.07721705734729767, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.32409897446632385, + "step": 270 + }, + { + "completion_length": 222.78125, + "epoch": 0.08636073932441045, + "grad_norm": 17.609630584716797, + "kl": 0.07763671875, + "learning_rate": 9.136392606755895e-07, + "loss": 0.0031, + "reward": 1.4136435985565186, + "reward_std": 0.11169826984405518, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39801862835884094, + "step": 271 + }, + { + "completion_length": 185.484375, + "epoch": 0.08667941363926067, + "grad_norm": 12.315672874450684, + "kl": 0.06396484375, + "learning_rate": 9.133205863607393e-07, + "loss": 0.0026, + "reward": 1.6809338331222534, + "reward_std": 0.2243930697441101, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4621838927268982, + "step": 272 + }, + { + "completion_length": 156.875, + "epoch": 0.0869980879541109, + "grad_norm": 11.391555786132812, + "kl": 0.07666015625, + "learning_rate": 9.130019120458891e-07, + "loss": 0.0031, + "reward": 1.4851157665252686, + "reward_std": 0.11955823004245758, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48511582612991333, + "rewards/pad": 0.0, + "step": 273 + }, + { + "completion_length": 142.203125, + "epoch": 0.08731676226896112, + "grad_norm": 11.662199020385742, + "kl": 0.0849609375, + "learning_rate": 9.126832377310388e-07, + "loss": 0.0034, + "reward": 1.606271505355835, + "reward_std": 0.13789045810699463, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49689656496047974, + "rewards/pad": 0.109375, + "step": 274 + }, + { + "completion_length": 214.796875, + "epoch": 0.08763543658381134, + "grad_norm": 8.317606925964355, + "kl": 0.064453125, + "learning_rate": 9.123645634161886e-07, + "loss": 0.0026, + "reward": 1.5963852405548096, + "reward_std": 0.1202874481678009, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5026353001594543, + "step": 275 + }, + { + "completion_length": 215.0, + "epoch": 0.08795411089866156, + "grad_norm": 11.95240592956543, + "kl": 0.06787109375, + "learning_rate": 9.120458891013384e-07, + "loss": 0.0027, + "reward": 1.4960030317306519, + "reward_std": 0.0837111845612526, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49600303173065186, + "rewards/pad": 0.0, + "step": 276 + }, + { + "completion_length": 253.28125, + "epoch": 0.08827278521351178, + "grad_norm": 8.151103019714355, + "kl": 0.06689453125, + "learning_rate": 9.117272147864882e-07, + "loss": 0.0027, + "reward": 1.4131813049316406, + "reward_std": 0.12128612399101257, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42880627512931824, + "rewards/pad": 0.0, + "step": 277 + }, + { + "completion_length": 102.109375, + "epoch": 0.08859145952836202, + "grad_norm": 14.23055362701416, + "kl": 0.08056640625, + "learning_rate": 9.114085404716379e-07, + "loss": 0.0032, + "reward": 1.7072069644927979, + "reward_std": 0.04941394180059433, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5822069644927979, + "rewards/pad": 0.125, + "step": 278 + }, + { + "completion_length": 257.078125, + "epoch": 0.08891013384321224, + "grad_norm": 50.22057342529297, + "kl": 0.06005859375, + "learning_rate": 9.110898661567877e-07, + "loss": 0.0024, + "reward": 1.3935086727142334, + "reward_std": 0.1538911759853363, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3310086131095886, + "step": 279 + }, + { + "completion_length": 226.375, + "epoch": 0.08922880815806246, + "grad_norm": 6.466733455657959, + "kl": 0.06396484375, + "learning_rate": 9.107711918419375e-07, + "loss": 0.0026, + "reward": 1.5158886909484863, + "reward_std": 0.0630214735865593, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5158886313438416, + "step": 280 + }, + { + "completion_length": 256.375, + "epoch": 0.08954748247291268, + "grad_norm": 13.729325294494629, + "kl": 0.0693359375, + "learning_rate": 9.104525175270872e-07, + "loss": 0.0028, + "reward": 1.4866538047790527, + "reward_std": 0.0790114551782608, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48665374517440796, + "step": 281 + }, + { + "completion_length": 87.53125, + "epoch": 0.08986615678776291, + "grad_norm": 12.788779258728027, + "kl": 0.12255859375, + "learning_rate": 9.10133843212237e-07, + "loss": 0.0049, + "reward": 1.6292393207550049, + "reward_std": 0.18508005142211914, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5198642015457153, + "rewards/pad": 0.109375, + "step": 282 + }, + { + "completion_length": 160.359375, + "epoch": 0.09018483110261313, + "grad_norm": 10.025726318359375, + "kl": 0.06396484375, + "learning_rate": 9.098151688973869e-07, + "loss": 0.0026, + "reward": 1.8845949172973633, + "reward_std": 0.2100738286972046, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5720949172973633, + "rewards/pad": 0.3125, + "step": 283 + }, + { + "completion_length": 234.46875, + "epoch": 0.09050350541746335, + "grad_norm": 16.763338088989258, + "kl": 0.05419921875, + "learning_rate": 9.094964945825367e-07, + "loss": 0.0022, + "reward": 1.6589397192001343, + "reward_std": 0.19485995173454285, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4401897192001343, + "rewards/pad": 0.21875, + "step": 284 + }, + { + "completion_length": 215.84375, + "epoch": 0.09082217973231357, + "grad_norm": 38.65030288696289, + "kl": 0.055908203125, + "learning_rate": 9.091778202676864e-07, + "loss": 0.0022, + "reward": 1.6431481838226318, + "reward_std": 0.07203955203294754, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5181481838226318, + "rewards/pad": 0.125, + "step": 285 + }, + { + "completion_length": 224.546875, + "epoch": 0.0911408540471638, + "grad_norm": 6.317727088928223, + "kl": 0.052978515625, + "learning_rate": 9.088591459528362e-07, + "loss": 0.0021, + "reward": 1.4984824657440186, + "reward_std": 0.13541144132614136, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.498482346534729, + "step": 286 + }, + { + "completion_length": 241.5, + "epoch": 0.09145952836201403, + "grad_norm": 11.916752815246582, + "kl": 0.059814453125, + "learning_rate": 9.08540471637986e-07, + "loss": 0.0024, + "reward": 1.5799572467803955, + "reward_std": 0.12866558134555817, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4862072765827179, + "step": 287 + }, + { + "completion_length": 322.78125, + "epoch": 0.09177820267686425, + "grad_norm": 13.54063892364502, + "kl": 0.0576171875, + "learning_rate": 9.082217973231358e-07, + "loss": 0.0023, + "reward": 1.5151567459106445, + "reward_std": 0.10878373682498932, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5151568651199341, + "step": 288 + }, + { + "completion_length": 147.203125, + "epoch": 0.09209687699171447, + "grad_norm": 11.628636360168457, + "kl": 0.08544921875, + "learning_rate": 9.079031230082855e-07, + "loss": 0.0034, + "reward": 1.461251974105835, + "reward_std": 0.18991222977638245, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3675019145011902, + "rewards/pad": 0.109375, + "step": 289 + }, + { + "completion_length": 236.890625, + "epoch": 0.09241555130656469, + "grad_norm": 6.667603015899658, + "kl": 0.052490234375, + "learning_rate": 9.075844486934353e-07, + "loss": 0.0021, + "reward": 1.4545420408248901, + "reward_std": 0.11373404413461685, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.28266704082489014, + "step": 290 + }, + { + "completion_length": 266.078125, + "epoch": 0.09273422562141491, + "grad_norm": 23.1329345703125, + "kl": 0.052734375, + "learning_rate": 9.072657743785851e-07, + "loss": 0.0021, + "reward": 1.391666054725647, + "reward_std": 0.0459078848361969, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.391666054725647, + "step": 291 + }, + { + "completion_length": 92.171875, + "epoch": 0.09305289993626514, + "grad_norm": 8.88923168182373, + "kl": 0.0908203125, + "learning_rate": 9.069471000637349e-07, + "loss": 0.0036, + "reward": 1.671968698501587, + "reward_std": 0.13037461042404175, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6719686985015869, + "rewards/pad": 0.0, + "step": 292 + }, + { + "completion_length": 194.09375, + "epoch": 0.09337157425111536, + "grad_norm": 15.927318572998047, + "kl": 0.072265625, + "learning_rate": 9.066284257488846e-07, + "loss": 0.0029, + "reward": 1.463443398475647, + "reward_std": 0.09356062859296799, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46344345808029175, + "step": 293 + }, + { + "completion_length": 185.359375, + "epoch": 0.09369024856596558, + "grad_norm": 19.33968162536621, + "kl": 0.0751953125, + "learning_rate": 9.063097514340344e-07, + "loss": 0.003, + "reward": 1.5570530891418457, + "reward_std": 0.12107745558023453, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5414280295372009, + "rewards/pad": 0.015625, + "step": 294 + }, + { + "completion_length": 228.8125, + "epoch": 0.0940089228808158, + "grad_norm": 13.032269477844238, + "kl": 0.068359375, + "learning_rate": 9.059910771191841e-07, + "loss": 0.0027, + "reward": 1.4693000316619873, + "reward_std": 0.06390385329723358, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4693000912666321, + "rewards/pad": 0.0, + "step": 295 + }, + { + "completion_length": 222.296875, + "epoch": 0.09432759719566602, + "grad_norm": 7.67185115814209, + "kl": 0.054931640625, + "learning_rate": 9.056724028043339e-07, + "loss": 0.0022, + "reward": 1.4528157711029053, + "reward_std": 0.12174256145954132, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39031583070755005, + "step": 296 + }, + { + "completion_length": 262.03125, + "epoch": 0.09464627151051626, + "grad_norm": 9.583202362060547, + "kl": 0.056640625, + "learning_rate": 9.053537284894836e-07, + "loss": 0.0023, + "reward": 1.4682092666625977, + "reward_std": 0.07929195463657379, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46820923686027527, + "rewards/pad": 0.0, + "step": 297 + }, + { + "completion_length": 230.375, + "epoch": 0.09496494582536648, + "grad_norm": 7.358712196350098, + "kl": 0.064453125, + "learning_rate": 9.050350541746334e-07, + "loss": 0.0026, + "reward": 1.418821930885315, + "reward_std": 0.04992415010929108, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.29382190108299255, + "rewards/pad": 0.125, + "step": 298 + }, + { + "completion_length": 247.1875, + "epoch": 0.0952836201402167, + "grad_norm": 5.215261936187744, + "kl": 0.057373046875, + "learning_rate": 9.047163798597832e-07, + "loss": 0.0023, + "reward": 1.459688663482666, + "reward_std": 0.08986417204141617, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47531357407569885, + "rewards/pad": 0.0, + "step": 299 + }, + { + "completion_length": 242.0, + "epoch": 0.09560229445506692, + "grad_norm": 11.121061325073242, + "kl": 0.0703125, + "learning_rate": 9.04397705544933e-07, + "loss": 0.0028, + "reward": 1.7235416173934937, + "reward_std": 0.08684414625167847, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5985416769981384, + "step": 300 + }, + { + "completion_length": 220.203125, + "epoch": 0.09592096876991714, + "grad_norm": 15.279075622558594, + "kl": 0.06640625, + "learning_rate": 9.040790312300827e-07, + "loss": 0.0027, + "reward": 1.3698844909667969, + "reward_std": 0.10006135702133179, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3855094313621521, + "rewards/pad": 0.0, + "step": 301 + }, + { + "completion_length": 252.265625, + "epoch": 0.09623964308476737, + "grad_norm": 9.165273666381836, + "kl": 0.062255859375, + "learning_rate": 9.037603569152326e-07, + "loss": 0.0025, + "reward": 1.6778218746185303, + "reward_std": 0.15550962090492249, + "rewards/answer_reward": 0.359375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3184468150138855, + "step": 302 + }, + { + "completion_length": 277.03125, + "epoch": 0.0965583173996176, + "grad_norm": 4.425095558166504, + "kl": 0.06005859375, + "learning_rate": 9.034416826003824e-07, + "loss": 0.0024, + "reward": 1.3882136344909668, + "reward_std": 0.10006508976221085, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3882136344909668, + "step": 303 + }, + { + "completion_length": 209.78125, + "epoch": 0.09687699171446781, + "grad_norm": 6.989660739898682, + "kl": 0.06640625, + "learning_rate": 9.031230082855322e-07, + "loss": 0.0027, + "reward": 1.6118898391723633, + "reward_std": 0.1484118551015854, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4868898093700409, + "step": 304 + }, + { + "completion_length": 327.5625, + "epoch": 0.09719566602931803, + "grad_norm": 4.546972751617432, + "kl": 0.04248046875, + "learning_rate": 9.028043339706819e-07, + "loss": 0.0017, + "reward": 1.434516191482544, + "reward_std": 0.04103871434926987, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4345163106918335, + "rewards/pad": 0.0, + "step": 305 + }, + { + "completion_length": 159.15625, + "epoch": 0.09751434034416825, + "grad_norm": 7.944620132446289, + "kl": 0.08203125, + "learning_rate": 9.024856596558317e-07, + "loss": 0.0033, + "reward": 1.6820859909057617, + "reward_std": 0.1810736060142517, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5883360505104065, + "rewards/pad": 0.09375, + "step": 306 + }, + { + "completion_length": 188.140625, + "epoch": 0.09783301465901849, + "grad_norm": 123.36129760742188, + "kl": 0.07470703125, + "learning_rate": 9.021669853409815e-07, + "loss": 0.003, + "reward": 1.84897780418396, + "reward_std": 0.17008383572101593, + "rewards/answer_reward": 0.4375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41147780418395996, + "step": 307 + }, + { + "completion_length": 237.53125, + "epoch": 0.09815168897386871, + "grad_norm": 11.163032531738281, + "kl": 0.0654296875, + "learning_rate": 9.018483110261312e-07, + "loss": 0.0026, + "reward": 1.4214814901351929, + "reward_std": 0.14145858585834503, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42148149013519287, + "rewards/pad": 0.0, + "step": 308 + }, + { + "completion_length": 262.546875, + "epoch": 0.09847036328871893, + "grad_norm": 9.466226577758789, + "kl": 0.072265625, + "learning_rate": 9.01529636711281e-07, + "loss": 0.0029, + "reward": 1.4231048822402954, + "reward_std": 0.08867865800857544, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4231048822402954, + "step": 309 + }, + { + "completion_length": 344.421875, + "epoch": 0.09878903760356915, + "grad_norm": 5.422881126403809, + "kl": 0.03564453125, + "learning_rate": 9.012109623964308e-07, + "loss": 0.0014, + "reward": 1.530600666999817, + "reward_std": 0.12080751359462738, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.2962256968021393, + "step": 310 + }, + { + "completion_length": 209.078125, + "epoch": 0.09910771191841937, + "grad_norm": 5.95306921005249, + "kl": 0.07763671875, + "learning_rate": 9.008922880815806e-07, + "loss": 0.0031, + "reward": 1.460267186164856, + "reward_std": 0.0819513350725174, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4602671265602112, + "rewards/pad": 0.0, + "step": 311 + }, + { + "completion_length": 396.609375, + "epoch": 0.0994263862332696, + "grad_norm": 3.307825803756714, + "kl": 0.0296630859375, + "learning_rate": 9.005736137667303e-07, + "loss": 0.0012, + "reward": 1.4927701950073242, + "reward_std": 0.008223021402955055, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36777013540267944, + "step": 312 + }, + { + "completion_length": 235.984375, + "epoch": 0.09974506054811982, + "grad_norm": 10.863787651062012, + "kl": 0.061767578125, + "learning_rate": 9.002549394518801e-07, + "loss": 0.0025, + "reward": 1.4562087059020996, + "reward_std": 0.1860886812210083, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47183382511138916, + "step": 313 + }, + { + "completion_length": 288.796875, + "epoch": 0.10006373486297004, + "grad_norm": 5.0147881507873535, + "kl": 0.078125, + "learning_rate": 8.999362651370299e-07, + "loss": 0.0031, + "reward": 1.5954546928405762, + "reward_std": 0.18140065670013428, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.517329752445221, + "step": 314 + }, + { + "completion_length": 259.4375, + "epoch": 0.10038240917782026, + "grad_norm": 8.11809253692627, + "kl": 0.060546875, + "learning_rate": 8.996175908221797e-07, + "loss": 0.0024, + "reward": 1.235037922859192, + "reward_std": 0.1669473648071289, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2037878930568695, + "step": 315 + }, + { + "completion_length": 207.078125, + "epoch": 0.10070108349267048, + "grad_norm": 17.18282699584961, + "kl": 0.08203125, + "learning_rate": 8.992989165073294e-07, + "loss": 0.0033, + "reward": 1.6323869228363037, + "reward_std": 0.13421547412872314, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6011369228363037, + "rewards/pad": 0.03125, + "step": 316 + }, + { + "completion_length": 204.21875, + "epoch": 0.10101975780752072, + "grad_norm": 14.23183536529541, + "kl": 0.07568359375, + "learning_rate": 8.989802421924792e-07, + "loss": 0.003, + "reward": 1.4993996620178223, + "reward_std": 0.09073391556739807, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49939966201782227, + "rewards/pad": 0.0, + "step": 317 + }, + { + "completion_length": 230.40625, + "epoch": 0.10133843212237094, + "grad_norm": 9.19079875946045, + "kl": 0.1103515625, + "learning_rate": 8.98661567877629e-07, + "loss": 0.0044, + "reward": 1.5880275964736938, + "reward_std": 0.17680257558822632, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4942775368690491, + "step": 318 + }, + { + "completion_length": 300.578125, + "epoch": 0.10165710643722116, + "grad_norm": 4.534358501434326, + "kl": 0.046875, + "learning_rate": 8.983428935627788e-07, + "loss": 0.0019, + "reward": 1.7708877325057983, + "reward_std": 0.1038035899400711, + "rewards/answer_reward": 0.1875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5833878517150879, + "step": 319 + }, + { + "completion_length": 244.796875, + "epoch": 0.10197578075207138, + "grad_norm": 7.014145374298096, + "kl": 0.0751953125, + "learning_rate": 8.980242192479286e-07, + "loss": 0.003, + "reward": 1.6308785676956177, + "reward_std": 0.07405470311641693, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6308785080909729, + "step": 320 + }, + { + "completion_length": 257.0625, + "epoch": 0.1022944550669216, + "grad_norm": 9.099129676818848, + "kl": 0.0556640625, + "learning_rate": 8.977055449330784e-07, + "loss": 0.0022, + "reward": 1.7587946653366089, + "reward_std": 0.14023171365261078, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5556697249412537, + "step": 321 + }, + { + "completion_length": 229.03125, + "epoch": 0.10261312938177183, + "grad_norm": 105.9485855102539, + "kl": 0.076171875, + "learning_rate": 8.973868706182282e-07, + "loss": 0.003, + "reward": 1.485621452331543, + "reward_std": 0.10602043569087982, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48562151193618774, + "rewards/pad": 0.0, + "step": 322 + }, + { + "completion_length": 187.78125, + "epoch": 0.10293180369662205, + "grad_norm": 10.623984336853027, + "kl": 0.1025390625, + "learning_rate": 8.97068196303378e-07, + "loss": 0.0041, + "reward": 1.484621524810791, + "reward_std": 0.20591312646865845, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5002465844154358, + "rewards/pad": 0.0, + "step": 323 + }, + { + "completion_length": 318.578125, + "epoch": 0.10325047801147227, + "grad_norm": 34.81837844848633, + "kl": 0.044189453125, + "learning_rate": 8.967495219885277e-07, + "loss": 0.0018, + "reward": 1.6456243991851807, + "reward_std": 0.19282664358615875, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4424992799758911, + "step": 324 + }, + { + "completion_length": 228.359375, + "epoch": 0.1035691523263225, + "grad_norm": 26.958513259887695, + "kl": 0.06640625, + "learning_rate": 8.964308476736775e-07, + "loss": 0.0027, + "reward": 1.664658546447754, + "reward_std": 0.06885822117328644, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4146585166454315, + "rewards/pad": 0.25, + "step": 325 + }, + { + "completion_length": 161.21875, + "epoch": 0.10388782664117271, + "grad_norm": 13.990256309509277, + "kl": 0.08203125, + "learning_rate": 8.961121733588273e-07, + "loss": 0.0033, + "reward": 1.5286167860031128, + "reward_std": 0.20374968647956848, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.497366726398468, + "rewards/pad": 0.03125, + "step": 326 + }, + { + "completion_length": 377.734375, + "epoch": 0.10420650095602295, + "grad_norm": 5.246281623840332, + "kl": 0.0263671875, + "learning_rate": 8.957934990439771e-07, + "loss": 0.0011, + "reward": 1.5219590663909912, + "reward_std": 0.02962879277765751, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.396959125995636, + "step": 327 + }, + { + "completion_length": 199.171875, + "epoch": 0.10452517527087317, + "grad_norm": 12.192336082458496, + "kl": 0.06396484375, + "learning_rate": 8.954748247291268e-07, + "loss": 0.0026, + "reward": 1.5578429698944092, + "reward_std": 0.16721788048744202, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4328429698944092, + "rewards/pad": 0.125, + "step": 328 + }, + { + "completion_length": 299.09375, + "epoch": 0.10484384958572339, + "grad_norm": 6.608733177185059, + "kl": 0.03857421875, + "learning_rate": 8.951561504142766e-07, + "loss": 0.0015, + "reward": 1.5235612392425537, + "reward_std": 0.12073105573654175, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3673113286495209, + "step": 329 + }, + { + "completion_length": 118.078125, + "epoch": 0.10516252390057361, + "grad_norm": 8.536271095275879, + "kl": 0.08349609375, + "learning_rate": 8.948374760994264e-07, + "loss": 0.0033, + "reward": 1.59187912940979, + "reward_std": 0.12341618537902832, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5918790698051453, + "rewards/pad": 0.0, + "step": 330 + }, + { + "completion_length": 265.359375, + "epoch": 0.10548119821542384, + "grad_norm": 7.807065486907959, + "kl": 0.05419921875, + "learning_rate": 8.945188017845762e-07, + "loss": 0.0022, + "reward": 1.397184133529663, + "reward_std": 0.06614439189434052, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39718419313430786, + "step": 331 + }, + { + "completion_length": 214.828125, + "epoch": 0.10579987253027406, + "grad_norm": 19.004209518432617, + "kl": 0.056640625, + "learning_rate": 8.942001274697259e-07, + "loss": 0.0023, + "reward": 1.6144123077392578, + "reward_std": 0.16007590293884277, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4581623673439026, + "rewards/pad": 0.15625, + "step": 332 + }, + { + "completion_length": 267.265625, + "epoch": 0.10611854684512428, + "grad_norm": 14.096921920776367, + "kl": 0.064453125, + "learning_rate": 8.938814531548757e-07, + "loss": 0.0026, + "reward": 1.404233455657959, + "reward_std": 0.05870746821165085, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40423351526260376, + "rewards/pad": 0.0, + "step": 333 + }, + { + "completion_length": 346.796875, + "epoch": 0.1064372211599745, + "grad_norm": 4.083738803863525, + "kl": 0.0400390625, + "learning_rate": 8.935627788400254e-07, + "loss": 0.0016, + "reward": 1.4418816566467285, + "reward_std": 0.08311553299427032, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4262566566467285, + "step": 334 + }, + { + "completion_length": 345.625, + "epoch": 0.10675589547482472, + "grad_norm": 11.4860200881958, + "kl": 0.042236328125, + "learning_rate": 8.932441045251752e-07, + "loss": 0.0017, + "reward": 1.4205396175384521, + "reward_std": 0.1086416095495224, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4361645579338074, + "step": 335 + }, + { + "completion_length": 330.78125, + "epoch": 0.10707456978967496, + "grad_norm": 6.222726821899414, + "kl": 0.048828125, + "learning_rate": 8.929254302103249e-07, + "loss": 0.002, + "reward": 1.408251166343689, + "reward_std": 0.05332936346530914, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40825116634368896, + "step": 336 + }, + { + "completion_length": 351.40625, + "epoch": 0.10739324410452518, + "grad_norm": 4.140896320343018, + "kl": 0.034423828125, + "learning_rate": 8.926067558954747e-07, + "loss": 0.0014, + "reward": 1.4266270399093628, + "reward_std": 0.14883245527744293, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.364126980304718, + "step": 337 + }, + { + "completion_length": 282.296875, + "epoch": 0.1077119184193754, + "grad_norm": 12.383210182189941, + "kl": 0.060302734375, + "learning_rate": 8.922880815806245e-07, + "loss": 0.0024, + "reward": 1.5008018016815186, + "reward_std": 0.1287446767091751, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4383017420768738, + "rewards/pad": 0.0625, + "step": 338 + }, + { + "completion_length": 245.546875, + "epoch": 0.10803059273422562, + "grad_norm": 11.367423057556152, + "kl": 0.07080078125, + "learning_rate": 8.919694072657742e-07, + "loss": 0.0028, + "reward": 1.3609724044799805, + "reward_std": 0.16549429297447205, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31409746408462524, + "rewards/pad": 0.046875, + "step": 339 + }, + { + "completion_length": 259.6875, + "epoch": 0.10834926704907584, + "grad_norm": 8.234755516052246, + "kl": 0.0615234375, + "learning_rate": 8.916507329509241e-07, + "loss": 0.0025, + "reward": 1.5450584888458252, + "reward_std": 0.13844075798988342, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4825584590435028, + "rewards/pad": 0.0625, + "step": 340 + }, + { + "completion_length": 173.46875, + "epoch": 0.10866794136392607, + "grad_norm": 21.999162673950195, + "kl": 0.0966796875, + "learning_rate": 8.913320586360739e-07, + "loss": 0.0039, + "reward": 1.5725913047790527, + "reward_std": 0.22323568165302277, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5257163047790527, + "step": 341 + }, + { + "completion_length": 225.90625, + "epoch": 0.1089866156787763, + "grad_norm": 11.320805549621582, + "kl": 0.056396484375, + "learning_rate": 8.910133843212237e-07, + "loss": 0.0023, + "reward": 1.5097086429595947, + "reward_std": 0.15257683396339417, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.2909587025642395, + "step": 342 + }, + { + "completion_length": 208.484375, + "epoch": 0.10930528999362651, + "grad_norm": 5.40504789352417, + "kl": 0.06884765625, + "learning_rate": 8.906947100063734e-07, + "loss": 0.0028, + "reward": 1.46799635887146, + "reward_std": 0.11990050971508026, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45237141847610474, + "rewards/pad": 0.015625, + "step": 343 + }, + { + "completion_length": 174.84375, + "epoch": 0.10962396430847673, + "grad_norm": 17.96059799194336, + "kl": 0.08544921875, + "learning_rate": 8.903760356915232e-07, + "loss": 0.0034, + "reward": 1.5519987344741821, + "reward_std": 0.12879955768585205, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44262367486953735, + "step": 344 + }, + { + "completion_length": 249.40625, + "epoch": 0.10994263862332695, + "grad_norm": 9.05187702178955, + "kl": 0.053466796875, + "learning_rate": 8.90057361376673e-07, + "loss": 0.0021, + "reward": 1.4286737442016602, + "reward_std": 0.18673668801784515, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.953125, + "rewards/iou_glue_reward": 0.334923654794693, + "step": 345 + }, + { + "completion_length": 286.1875, + "epoch": 0.11026131293817719, + "grad_norm": 80.49254608154297, + "kl": 0.439453125, + "learning_rate": 8.897386870618228e-07, + "loss": 0.0175, + "reward": 1.314738154411316, + "reward_std": 0.1686519980430603, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2991131544113159, + "step": 346 + }, + { + "completion_length": 260.796875, + "epoch": 0.11057998725302741, + "grad_norm": 13.089540481567383, + "kl": 0.05322265625, + "learning_rate": 8.894200127469725e-07, + "loss": 0.0021, + "reward": 1.3983441591262817, + "reward_std": 0.20051315426826477, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.41396912932395935, + "rewards/pad": 0.015625, + "step": 347 + }, + { + "completion_length": 216.28125, + "epoch": 0.11089866156787763, + "grad_norm": 8.443741798400879, + "kl": 0.11865234375, + "learning_rate": 8.891013384321223e-07, + "loss": 0.0047, + "reward": 1.6128265857696533, + "reward_std": 0.16433218121528625, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5347015857696533, + "step": 348 + }, + { + "completion_length": 272.296875, + "epoch": 0.11121733588272785, + "grad_norm": 10.677943229675293, + "kl": 0.060791015625, + "learning_rate": 8.887826641172721e-07, + "loss": 0.0024, + "reward": 1.3177645206451416, + "reward_std": 0.11974099278450012, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3333895206451416, + "rewards/pad": 0.0, + "step": 349 + }, + { + "completion_length": 273.75, + "epoch": 0.11153601019757807, + "grad_norm": 24.402652740478516, + "kl": 0.060302734375, + "learning_rate": 8.884639898024219e-07, + "loss": 0.0024, + "reward": 1.3256800174713135, + "reward_std": 0.1037040650844574, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2163049727678299, + "rewards/pad": 0.109375, + "step": 350 + }, + { + "completion_length": 322.640625, + "epoch": 0.1118546845124283, + "grad_norm": 8.839086532592773, + "kl": 0.0458984375, + "learning_rate": 8.881453154875716e-07, + "loss": 0.0018, + "reward": 1.4524686336517334, + "reward_std": 0.14583848416805267, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.389968603849411, + "step": 351 + }, + { + "completion_length": 272.03125, + "epoch": 0.11217335882727852, + "grad_norm": 6.449258804321289, + "kl": 0.0537109375, + "learning_rate": 8.878266411727214e-07, + "loss": 0.0021, + "reward": 1.493812084197998, + "reward_std": 0.16063782572746277, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.41568708419799805, + "step": 352 + }, + { + "completion_length": 241.96875, + "epoch": 0.11249203314212874, + "grad_norm": 7.824343204498291, + "kl": 0.0654296875, + "learning_rate": 8.875079668578712e-07, + "loss": 0.0026, + "reward": 1.4458305835723877, + "reward_std": 0.08239156007766724, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4458305835723877, + "step": 353 + }, + { + "completion_length": 275.515625, + "epoch": 0.11281070745697896, + "grad_norm": 4.663790225982666, + "kl": 0.06884765625, + "learning_rate": 8.87189292543021e-07, + "loss": 0.0028, + "reward": 1.4097020626068115, + "reward_std": 0.0783160850405693, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4097020626068115, + "rewards/pad": 0.0, + "step": 354 + }, + { + "completion_length": 254.234375, + "epoch": 0.11312938177182918, + "grad_norm": 6.693781852722168, + "kl": 0.0673828125, + "learning_rate": 8.868706182281707e-07, + "loss": 0.0027, + "reward": 1.324296474456787, + "reward_std": 0.07587652653455734, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32429641485214233, + "rewards/pad": 0.0, + "step": 355 + }, + { + "completion_length": 212.15625, + "epoch": 0.11344805608667942, + "grad_norm": 11.825392723083496, + "kl": 0.1328125, + "learning_rate": 8.865519439133205e-07, + "loss": 0.0053, + "reward": 1.5467828512191772, + "reward_std": 0.0978686735033989, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42178288102149963, + "rewards/pad": 0.125, + "step": 356 + }, + { + "completion_length": 292.421875, + "epoch": 0.11376673040152964, + "grad_norm": 14.117955207824707, + "kl": 0.052734375, + "learning_rate": 8.862332695984703e-07, + "loss": 0.0021, + "reward": 1.4794337749481201, + "reward_std": 0.03357456997036934, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47943371534347534, + "step": 357 + }, + { + "completion_length": 367.53125, + "epoch": 0.11408540471637986, + "grad_norm": 4.155237674713135, + "kl": 0.035888671875, + "learning_rate": 8.859145952836202e-07, + "loss": 0.0014, + "reward": 1.467307448387146, + "reward_std": 0.07496380805969238, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.43605750799179077, + "step": 358 + }, + { + "completion_length": 227.203125, + "epoch": 0.11440407903123008, + "grad_norm": 7.234963893890381, + "kl": 0.05859375, + "learning_rate": 8.855959209687699e-07, + "loss": 0.0024, + "reward": 1.510168194770813, + "reward_std": 0.09282274544239044, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5101682543754578, + "rewards/pad": 0.0, + "step": 359 + }, + { + "completion_length": 203.203125, + "epoch": 0.1147227533460803, + "grad_norm": 9.693031311035156, + "kl": 0.146484375, + "learning_rate": 8.852772466539197e-07, + "loss": 0.0059, + "reward": 1.6317123174667358, + "reward_std": 0.09545093774795532, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5067123770713806, + "rewards/pad": 0.125, + "step": 360 + }, + { + "completion_length": 253.171875, + "epoch": 0.11504142766093053, + "grad_norm": 9.964339256286621, + "kl": 0.0595703125, + "learning_rate": 8.849585723390695e-07, + "loss": 0.0024, + "reward": 1.4393465518951416, + "reward_std": 0.16357487440109253, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39247167110443115, + "step": 361 + }, + { + "completion_length": 290.328125, + "epoch": 0.11536010197578075, + "grad_norm": 10.486550331115723, + "kl": 0.04345703125, + "learning_rate": 8.846398980242193e-07, + "loss": 0.0017, + "reward": 1.4310604333877563, + "reward_std": 0.1469384729862213, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.38418546319007874, + "step": 362 + }, + { + "completion_length": 171.390625, + "epoch": 0.11567877629063097, + "grad_norm": 12.362403869628906, + "kl": 0.09375, + "learning_rate": 8.84321223709369e-07, + "loss": 0.0038, + "reward": 1.4338560104370117, + "reward_std": 0.2927459180355072, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.21510589122772217, + "step": 363 + }, + { + "completion_length": 256.15625, + "epoch": 0.1159974506054812, + "grad_norm": 20.424320220947266, + "kl": 0.050537109375, + "learning_rate": 8.840025493945188e-07, + "loss": 0.002, + "reward": 1.4041390419006348, + "reward_std": 0.09797428548336029, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40413904190063477, + "rewards/pad": 0.0, + "step": 364 + }, + { + "completion_length": 311.4375, + "epoch": 0.11631612492033142, + "grad_norm": 22.563579559326172, + "kl": 0.045166015625, + "learning_rate": 8.836838750796686e-07, + "loss": 0.0018, + "reward": 1.4641354084014893, + "reward_std": 0.14457935094833374, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37038543820381165, + "step": 365 + }, + { + "completion_length": 194.265625, + "epoch": 0.11663479923518165, + "grad_norm": 10.244625091552734, + "kl": 0.06689453125, + "learning_rate": 8.833652007648184e-07, + "loss": 0.0027, + "reward": 1.493947982788086, + "reward_std": 0.14087332785129547, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46269798278808594, + "rewards/pad": 0.03125, + "step": 366 + }, + { + "completion_length": 222.171875, + "epoch": 0.11695347355003187, + "grad_norm": 9.311911582946777, + "kl": 0.060546875, + "learning_rate": 8.830465264499681e-07, + "loss": 0.0024, + "reward": 1.7048492431640625, + "reward_std": 0.10070245712995529, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5798492431640625, + "rewards/pad": 0.125, + "step": 367 + }, + { + "completion_length": 237.078125, + "epoch": 0.11727214786488209, + "grad_norm": 17.15612030029297, + "kl": 0.06640625, + "learning_rate": 8.827278521351179e-07, + "loss": 0.0027, + "reward": 1.4977259635925293, + "reward_std": 0.12879906594753265, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4664759337902069, + "rewards/pad": 0.03125, + "step": 368 + }, + { + "completion_length": 280.4375, + "epoch": 0.11759082217973231, + "grad_norm": 9.260272026062012, + "kl": 0.048583984375, + "learning_rate": 8.824091778202677e-07, + "loss": 0.0019, + "reward": 1.4719653129577637, + "reward_std": 0.12129160016775131, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44071537256240845, + "step": 369 + }, + { + "completion_length": 193.40625, + "epoch": 0.11790949649458253, + "grad_norm": 9.00613021850586, + "kl": 0.068359375, + "learning_rate": 8.820905035054175e-07, + "loss": 0.0027, + "reward": 1.5651819705963135, + "reward_std": 0.12391842901706696, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5651820302009583, + "rewards/pad": 0.0, + "step": 370 + }, + { + "completion_length": 176.5625, + "epoch": 0.11822817080943276, + "grad_norm": 8.101126670837402, + "kl": 0.07080078125, + "learning_rate": 8.817718291905672e-07, + "loss": 0.0028, + "reward": 1.5040981769561768, + "reward_std": 0.22234085202217102, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.316598117351532, + "step": 371 + }, + { + "completion_length": 194.0625, + "epoch": 0.11854684512428298, + "grad_norm": 24.81196403503418, + "kl": 0.06884765625, + "learning_rate": 8.81453154875717e-07, + "loss": 0.0027, + "reward": 1.6260745525360107, + "reward_std": 0.1592978686094284, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5323246121406555, + "step": 372 + }, + { + "completion_length": 159.390625, + "epoch": 0.1188655194391332, + "grad_norm": 19.618276596069336, + "kl": 0.078125, + "learning_rate": 8.811344805608667e-07, + "loss": 0.0031, + "reward": 1.5059666633605957, + "reward_std": 0.15724095702171326, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38096657395362854, + "step": 373 + }, + { + "completion_length": 152.96875, + "epoch": 0.11918419375398343, + "grad_norm": 7.2891716957092285, + "kl": 0.0888671875, + "learning_rate": 8.808158062460164e-07, + "loss": 0.0036, + "reward": 1.4844080209732056, + "reward_std": 0.11266922950744629, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4844079315662384, + "rewards/pad": 0.0, + "step": 374 + }, + { + "completion_length": 201.46875, + "epoch": 0.11950286806883365, + "grad_norm": 10.135858535766602, + "kl": 0.07568359375, + "learning_rate": 8.804971319311662e-07, + "loss": 0.003, + "reward": 1.5312637090682983, + "reward_std": 0.17054462432861328, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5156386494636536, + "rewards/pad": 0.03125, + "step": 375 + }, + { + "completion_length": 211.265625, + "epoch": 0.11982154238368388, + "grad_norm": 34.30707931518555, + "kl": 0.06982421875, + "learning_rate": 8.80178457616316e-07, + "loss": 0.0028, + "reward": 1.6000438928604126, + "reward_std": 0.1634560525417328, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3812939524650574, + "rewards/pad": 0.21875, + "step": 376 + }, + { + "completion_length": 356.96875, + "epoch": 0.1201402166985341, + "grad_norm": 14.304428100585938, + "kl": 0.0546875, + "learning_rate": 8.798597833014659e-07, + "loss": 0.0022, + "reward": 1.4964876174926758, + "reward_std": 0.09968910366296768, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.38711249828338623, + "step": 377 + }, + { + "completion_length": 209.75, + "epoch": 0.12045889101338432, + "grad_norm": 7.223150730133057, + "kl": 0.06201171875, + "learning_rate": 8.795411089866156e-07, + "loss": 0.0025, + "reward": 1.6077044010162354, + "reward_std": 0.21932819485664368, + "rewards/pad": 0.265625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.35770437121391296, + "step": 378 + }, + { + "completion_length": 156.109375, + "epoch": 0.12077756532823454, + "grad_norm": 48.64055633544922, + "kl": 0.0888671875, + "learning_rate": 8.792224346717654e-07, + "loss": 0.0036, + "reward": 1.6563538312911987, + "reward_std": 0.12122064083814621, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.500103771686554, + "rewards/pad": 0.15625, + "step": 379 + }, + { + "completion_length": 239.0625, + "epoch": 0.12109623964308477, + "grad_norm": 18.189565658569336, + "kl": 0.06884765625, + "learning_rate": 8.789037603569152e-07, + "loss": 0.0027, + "reward": 1.4749139547348022, + "reward_std": 0.18104201555252075, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.506164014339447, + "step": 380 + }, + { + "completion_length": 249.109375, + "epoch": 0.121414913957935, + "grad_norm": 7.9685139656066895, + "kl": 0.06396484375, + "learning_rate": 8.78585086042065e-07, + "loss": 0.0026, + "reward": 1.3926160335540771, + "reward_std": 0.1556987464427948, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40824100375175476, + "rewards/pad": 0.0, + "step": 381 + }, + { + "completion_length": 235.421875, + "epoch": 0.12173358827278521, + "grad_norm": 16.168167114257812, + "kl": 0.0556640625, + "learning_rate": 8.782664117272147e-07, + "loss": 0.0022, + "reward": 1.5087082386016846, + "reward_std": 0.17400582134723663, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4462081789970398, + "step": 382 + }, + { + "completion_length": 247.765625, + "epoch": 0.12205226258763544, + "grad_norm": 7.005702972412109, + "kl": 0.05517578125, + "learning_rate": 8.779477374123645e-07, + "loss": 0.0022, + "reward": 1.4749236106872559, + "reward_std": 0.1308993399143219, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4280485510826111, + "rewards/pad": 0.046875, + "step": 383 + }, + { + "completion_length": 262.9375, + "epoch": 0.12237093690248566, + "grad_norm": 31.693580627441406, + "kl": 0.055419921875, + "learning_rate": 8.776290630975143e-07, + "loss": 0.0022, + "reward": 1.641816258430481, + "reward_std": 0.12899675965309143, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5480663776397705, + "rewards/pad": 0.109375, + "step": 384 + }, + { + "completion_length": 242.15625, + "epoch": 0.12268961121733589, + "grad_norm": 11.220101356506348, + "kl": 0.09619140625, + "learning_rate": 8.773103887826641e-07, + "loss": 0.0039, + "reward": 1.4588701725006104, + "reward_std": 0.14389871060848236, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4119952321052551, + "step": 385 + }, + { + "completion_length": 183.5625, + "epoch": 0.12300828553218611, + "grad_norm": 34.87577438354492, + "kl": 0.1025390625, + "learning_rate": 8.769917144678138e-07, + "loss": 0.0041, + "reward": 1.3601453304290771, + "reward_std": 0.10742165893316269, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3757703900337219, + "rewards/pad": 0.0, + "step": 386 + }, + { + "completion_length": 163.171875, + "epoch": 0.12332695984703633, + "grad_norm": 11.912552833557129, + "kl": 0.08154296875, + "learning_rate": 8.766730401529636e-07, + "loss": 0.0033, + "reward": 1.6810275316238403, + "reward_std": 0.17379043996334076, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5247775912284851, + "rewards/pad": 0.15625, + "step": 387 + }, + { + "completion_length": 218.671875, + "epoch": 0.12364563416188655, + "grad_norm": 11.059039115905762, + "kl": 0.061767578125, + "learning_rate": 8.763543658381134e-07, + "loss": 0.0025, + "reward": 1.9036428928375244, + "reward_std": 0.17060169577598572, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4973929524421692, + "rewards/pad": 0.40625, + "step": 388 + }, + { + "completion_length": 271.8125, + "epoch": 0.12396430847673677, + "grad_norm": 10.898370742797852, + "kl": 0.047607421875, + "learning_rate": 8.760356915232632e-07, + "loss": 0.0019, + "reward": 1.503767728805542, + "reward_std": 0.1309741884469986, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4881427586078644, + "step": 389 + }, + { + "completion_length": 294.71875, + "epoch": 0.124282982791587, + "grad_norm": 12.679078102111816, + "kl": 0.050048828125, + "learning_rate": 8.757170172084129e-07, + "loss": 0.002, + "reward": 1.5640192031860352, + "reward_std": 0.07892128080129623, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45464423298835754, + "step": 390 + }, + { + "completion_length": 206.59375, + "epoch": 0.12460165710643722, + "grad_norm": 9.037927627563477, + "kl": 0.064453125, + "learning_rate": 8.753983428935627e-07, + "loss": 0.0026, + "reward": 1.6654486656188965, + "reward_std": 0.15028820931911469, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43107372522354126, + "step": 391 + }, + { + "completion_length": 336.921875, + "epoch": 0.12492033142128744, + "grad_norm": 12.565431594848633, + "kl": 0.047607421875, + "learning_rate": 8.750796685787125e-07, + "loss": 0.0019, + "reward": 1.3818833827972412, + "reward_std": 0.11399512737989426, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.39750826358795166, + "step": 392 + }, + { + "completion_length": 284.0625, + "epoch": 0.12523900573613767, + "grad_norm": 6.1361894607543945, + "kl": 0.054443359375, + "learning_rate": 8.747609942638623e-07, + "loss": 0.0022, + "reward": 1.4880211353302002, + "reward_std": 0.21052920818328857, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3005211055278778, + "step": 393 + }, + { + "completion_length": 267.375, + "epoch": 0.12555768005098789, + "grad_norm": 10.701099395751953, + "kl": 0.0556640625, + "learning_rate": 8.74442319949012e-07, + "loss": 0.0022, + "reward": 1.5392203330993652, + "reward_std": 0.12588217854499817, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.42984533309936523, + "step": 394 + }, + { + "completion_length": 369.78125, + "epoch": 0.1258763543658381, + "grad_norm": 5.1260247230529785, + "kl": 0.035888671875, + "learning_rate": 8.741236456341619e-07, + "loss": 0.0014, + "reward": 1.3278656005859375, + "reward_std": 0.07222297787666321, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3434906005859375, + "step": 395 + }, + { + "completion_length": 151.984375, + "epoch": 0.12619502868068833, + "grad_norm": 11.32140064239502, + "kl": 0.10009765625, + "learning_rate": 8.738049713193117e-07, + "loss": 0.004, + "reward": 1.3771092891693115, + "reward_std": 0.22868841886520386, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2833593487739563, + "step": 396 + }, + { + "completion_length": 156.859375, + "epoch": 0.12651370299553855, + "grad_norm": 6.877212047576904, + "kl": 0.0869140625, + "learning_rate": 8.734862970044615e-07, + "loss": 0.0035, + "reward": 1.7244150638580322, + "reward_std": 0.2154211401939392, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47441503405570984, + "rewards/pad": 0.25, + "step": 397 + }, + { + "completion_length": 295.65625, + "epoch": 0.1268323773103888, + "grad_norm": 10.377167701721191, + "kl": 0.0458984375, + "learning_rate": 8.731676226896112e-07, + "loss": 0.0018, + "reward": 1.5047334432601929, + "reward_std": 0.17024171352386475, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3328584134578705, + "step": 398 + }, + { + "completion_length": 256.75, + "epoch": 0.12715105162523901, + "grad_norm": 14.790071487426758, + "kl": 0.06494140625, + "learning_rate": 8.72848948374761e-07, + "loss": 0.0026, + "reward": 1.4586857557296753, + "reward_std": 0.08622744679450989, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45868581533432007, + "step": 399 + }, + { + "completion_length": 110.484375, + "epoch": 0.12746972594008923, + "grad_norm": 26.442523956298828, + "kl": 0.0986328125, + "learning_rate": 8.725302740599108e-07, + "loss": 0.004, + "reward": 1.4176801443099976, + "reward_std": 0.20065349340438843, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.33955517411231995, + "step": 400 + }, + { + "completion_length": 315.40625, + "epoch": 0.12778840025493945, + "grad_norm": 9.916601181030273, + "kl": 0.0888671875, + "learning_rate": 8.722115997450606e-07, + "loss": 0.0035, + "reward": 1.4531784057617188, + "reward_std": 0.13725990056991577, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4688034653663635, + "step": 401 + }, + { + "completion_length": 150.46875, + "epoch": 0.12810707456978968, + "grad_norm": 11.95114517211914, + "kl": 0.0888671875, + "learning_rate": 8.718929254302103e-07, + "loss": 0.0036, + "reward": 1.3613560199737549, + "reward_std": 0.15858888626098633, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3301060199737549, + "step": 402 + }, + { + "completion_length": 275.046875, + "epoch": 0.1284257488846399, + "grad_norm": 9.421660423278809, + "kl": 0.06103515625, + "learning_rate": 8.715742511153601e-07, + "loss": 0.0024, + "reward": 1.4936609268188477, + "reward_std": 0.1545773297548294, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.3842858672142029, + "rewards/pad": 0.140625, + "step": 403 + }, + { + "completion_length": 194.421875, + "epoch": 0.12874442319949012, + "grad_norm": 11.03164005279541, + "kl": 0.0849609375, + "learning_rate": 8.712555768005099e-07, + "loss": 0.0034, + "reward": 1.454899787902832, + "reward_std": 0.10735487937927246, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45489975810050964, + "rewards/pad": 0.0, + "step": 404 + }, + { + "completion_length": 178.15625, + "epoch": 0.12906309751434034, + "grad_norm": 11.80660343170166, + "kl": 0.07666015625, + "learning_rate": 8.709369024856596e-07, + "loss": 0.0031, + "reward": 1.655862808227539, + "reward_std": 0.16924072802066803, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5464878678321838, + "rewards/pad": 0.125, + "step": 405 + }, + { + "completion_length": 237.390625, + "epoch": 0.12938177182919056, + "grad_norm": 11.708005905151367, + "kl": 0.052490234375, + "learning_rate": 8.706182281708094e-07, + "loss": 0.0021, + "reward": 1.5820250511169434, + "reward_std": 0.18636751174926758, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4882751405239105, + "step": 406 + }, + { + "completion_length": 194.171875, + "epoch": 0.1297004461440408, + "grad_norm": 8.571614265441895, + "kl": 0.0830078125, + "learning_rate": 8.702995538559592e-07, + "loss": 0.0033, + "reward": 1.6992355585098267, + "reward_std": 0.23588824272155762, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4961106479167938, + "step": 407 + }, + { + "completion_length": 223.0625, + "epoch": 0.13001912045889102, + "grad_norm": 8.831110000610352, + "kl": 0.07568359375, + "learning_rate": 8.69980879541109e-07, + "loss": 0.003, + "reward": 1.5184645652770996, + "reward_std": 0.06349757313728333, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5184646844863892, + "step": 408 + }, + { + "completion_length": 347.59375, + "epoch": 0.13033779477374124, + "grad_norm": 4.942235469818115, + "kl": 0.032470703125, + "learning_rate": 8.696622052262587e-07, + "loss": 0.0013, + "reward": 1.588958740234375, + "reward_std": 0.07658353447914124, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4795836806297302, + "step": 409 + }, + { + "completion_length": 211.40625, + "epoch": 0.13065646908859146, + "grad_norm": 22.08603286743164, + "kl": 0.0634765625, + "learning_rate": 8.693435309114085e-07, + "loss": 0.0025, + "reward": 1.6493875980377197, + "reward_std": 0.09739524126052856, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3993876278400421, + "step": 410 + }, + { + "completion_length": 107.5625, + "epoch": 0.13097514340344169, + "grad_norm": 15.189661026000977, + "kl": 0.09619140625, + "learning_rate": 8.690248565965583e-07, + "loss": 0.0038, + "reward": 1.763694167137146, + "reward_std": 0.19237318634986877, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7011940479278564, + "rewards/pad": 0.0625, + "step": 411 + }, + { + "completion_length": 231.03125, + "epoch": 0.1312938177182919, + "grad_norm": 9.504136085510254, + "kl": 0.056640625, + "learning_rate": 8.68706182281708e-07, + "loss": 0.0023, + "reward": 1.6230957508087158, + "reward_std": 0.14532506465911865, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.6230956315994263, + "step": 412 + }, + { + "completion_length": 229.5, + "epoch": 0.13161249203314213, + "grad_norm": 37.026771545410156, + "kl": 0.06787109375, + "learning_rate": 8.683875079668577e-07, + "loss": 0.0027, + "reward": 1.5715396404266357, + "reward_std": 0.10729014128446579, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44653964042663574, + "rewards/pad": 0.125, + "step": 413 + }, + { + "completion_length": 243.46875, + "epoch": 0.13193116634799235, + "grad_norm": 20.16276741027832, + "kl": 0.0693359375, + "learning_rate": 8.680688336520075e-07, + "loss": 0.0028, + "reward": 1.6400110721588135, + "reward_std": 0.1305033415555954, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5462610125541687, + "step": 414 + }, + { + "completion_length": 239.890625, + "epoch": 0.13224984066284257, + "grad_norm": 8.99725341796875, + "kl": 0.06396484375, + "learning_rate": 8.677501593371574e-07, + "loss": 0.0025, + "reward": 1.3902020454406738, + "reward_std": 0.0927668884396553, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2964521050453186, + "step": 415 + }, + { + "completion_length": 164.96875, + "epoch": 0.1325685149776928, + "grad_norm": 15.773292541503906, + "kl": 0.0830078125, + "learning_rate": 8.674314850223072e-07, + "loss": 0.0033, + "reward": 1.677088737487793, + "reward_std": 0.21771536767482758, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5052136778831482, + "step": 416 + }, + { + "completion_length": 232.4375, + "epoch": 0.13288718929254303, + "grad_norm": 18.085857391357422, + "kl": 0.0615234375, + "learning_rate": 8.671128107074569e-07, + "loss": 0.0025, + "reward": 1.6815763711929321, + "reward_std": 0.17520847916603088, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47845137119293213, + "rewards/pad": 0.203125, + "step": 417 + }, + { + "completion_length": 200.625, + "epoch": 0.13320586360739325, + "grad_norm": 43.0606803894043, + "kl": 0.08544921875, + "learning_rate": 8.667941363926067e-07, + "loss": 0.0034, + "reward": 1.5185892581939697, + "reward_std": 0.08287880569696426, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5185892581939697, + "rewards/pad": 0.0, + "step": 418 + }, + { + "completion_length": 236.75, + "epoch": 0.13352453792224347, + "grad_norm": 10.71466064453125, + "kl": 0.07666015625, + "learning_rate": 8.664754620777565e-07, + "loss": 0.0031, + "reward": 1.409736156463623, + "reward_std": 0.10538657009601593, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40973618626594543, + "step": 419 + }, + { + "completion_length": 191.15625, + "epoch": 0.1338432122370937, + "grad_norm": 24.71515655517578, + "kl": 0.07861328125, + "learning_rate": 8.661567877629063e-07, + "loss": 0.0031, + "reward": 1.4340004920959473, + "reward_std": 0.10084269940853119, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43400052189826965, + "rewards/pad": 0.0, + "step": 420 + }, + { + "completion_length": 268.671875, + "epoch": 0.13416188655194392, + "grad_norm": 14.787396430969238, + "kl": 0.057373046875, + "learning_rate": 8.65838113448056e-07, + "loss": 0.0023, + "reward": 1.55492103099823, + "reward_std": 0.08806140720844269, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5549209713935852, + "rewards/pad": 0.0, + "step": 421 + }, + { + "completion_length": 234.140625, + "epoch": 0.13448056086679414, + "grad_norm": 35.28914260864258, + "kl": 0.060302734375, + "learning_rate": 8.655194391332058e-07, + "loss": 0.0024, + "reward": 1.6005067825317383, + "reward_std": 0.11722254008054733, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6161317825317383, + "rewards/pad": 0.0, + "step": 422 + }, + { + "completion_length": 158.609375, + "epoch": 0.13479923518164436, + "grad_norm": 10.974730491638184, + "kl": 0.07666015625, + "learning_rate": 8.652007648183556e-07, + "loss": 0.0031, + "reward": 1.6964198350906372, + "reward_std": 0.10421520471572876, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4464198350906372, + "rewards/pad": 0.25, + "step": 423 + }, + { + "completion_length": 237.71875, + "epoch": 0.13511790949649458, + "grad_norm": 6.558365345001221, + "kl": 0.064453125, + "learning_rate": 8.648820905035054e-07, + "loss": 0.0026, + "reward": 1.498208999633789, + "reward_std": 0.0822606012225151, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49820899963378906, + "rewards/pad": 0.0, + "step": 424 + }, + { + "completion_length": 225.234375, + "epoch": 0.1354365838113448, + "grad_norm": 40.3632926940918, + "kl": 0.0703125, + "learning_rate": 8.645634161886551e-07, + "loss": 0.0028, + "reward": 1.5778706073760986, + "reward_std": 0.1300792247056961, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5934954881668091, + "step": 425 + }, + { + "completion_length": 362.8125, + "epoch": 0.13575525812619502, + "grad_norm": 7.21042013168335, + "kl": 0.035888671875, + "learning_rate": 8.642447418738049e-07, + "loss": 0.0015, + "reward": 1.5087440013885498, + "reward_std": 0.03255070373415947, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5087440609931946, + "rewards/pad": 0.0, + "step": 426 + }, + { + "completion_length": 242.03125, + "epoch": 0.13607393244104526, + "grad_norm": 11.770732879638672, + "kl": 0.060546875, + "learning_rate": 8.639260675589547e-07, + "loss": 0.0024, + "reward": 1.4291597604751587, + "reward_std": 0.09730469435453415, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.30415982007980347, + "rewards/pad": 0.125, + "step": 427 + }, + { + "completion_length": 291.734375, + "epoch": 0.13639260675589548, + "grad_norm": 19.2504940032959, + "kl": 0.0595703125, + "learning_rate": 8.636073932441045e-07, + "loss": 0.0024, + "reward": 1.5188581943511963, + "reward_std": 0.06681855022907257, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3938581347465515, + "step": 428 + }, + { + "completion_length": 264.296875, + "epoch": 0.1367112810707457, + "grad_norm": 9.25992202758789, + "kl": 0.0625, + "learning_rate": 8.632887189292542e-07, + "loss": 0.0025, + "reward": 1.3473349809646606, + "reward_std": 0.09401439875364304, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.36296001076698303, + "rewards/pad": 0.0, + "step": 429 + }, + { + "completion_length": 194.921875, + "epoch": 0.13702995538559593, + "grad_norm": 27.35711669921875, + "kl": 0.07958984375, + "learning_rate": 8.62970044614404e-07, + "loss": 0.0032, + "reward": 1.5259891748428345, + "reward_std": 0.17146514356136322, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.40098920464515686, + "step": 430 + }, + { + "completion_length": 169.28125, + "epoch": 0.13734862970044615, + "grad_norm": 14.931100845336914, + "kl": 0.08056640625, + "learning_rate": 8.626513702995538e-07, + "loss": 0.0032, + "reward": 1.5208649635314941, + "reward_std": 0.13065671920776367, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5208649635314941, + "rewards/pad": 0.0, + "step": 431 + }, + { + "completion_length": 336.796875, + "epoch": 0.13766730401529637, + "grad_norm": 12.129999160766602, + "kl": 0.05078125, + "learning_rate": 8.623326959847035e-07, + "loss": 0.002, + "reward": 1.339362621307373, + "reward_std": 0.12774814665317535, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3549875020980835, + "rewards/pad": 0.0, + "step": 432 + }, + { + "completion_length": 167.75, + "epoch": 0.13798597833014659, + "grad_norm": 9.612985610961914, + "kl": 0.09033203125, + "learning_rate": 8.620140216698534e-07, + "loss": 0.0036, + "reward": 1.7826504707336426, + "reward_std": 0.1886083334684372, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4857754409313202, + "rewards/pad": 0.296875, + "step": 433 + }, + { + "completion_length": 365.578125, + "epoch": 0.1383046526449968, + "grad_norm": 5.653805255889893, + "kl": 0.035888671875, + "learning_rate": 8.616953473550032e-07, + "loss": 0.0014, + "reward": 1.5266468524932861, + "reward_std": 0.03666418045759201, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5266469717025757, + "rewards/pad": 0.0, + "step": 434 + }, + { + "completion_length": 261.953125, + "epoch": 0.13862332695984703, + "grad_norm": 14.368305206298828, + "kl": 0.130859375, + "learning_rate": 8.61376673040153e-07, + "loss": 0.0053, + "reward": 1.4595415592193604, + "reward_std": 0.11646923422813416, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36579152941703796, + "step": 435 + }, + { + "completion_length": 287.796875, + "epoch": 0.13894200127469725, + "grad_norm": 6.295452117919922, + "kl": 0.0732421875, + "learning_rate": 8.610579987253027e-07, + "loss": 0.0029, + "reward": 1.5171446800231934, + "reward_std": 0.08354522287845612, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39214473962783813, + "step": 436 + }, + { + "completion_length": 266.453125, + "epoch": 0.1392606755895475, + "grad_norm": 8.192060470581055, + "kl": 0.055908203125, + "learning_rate": 8.607393244104525e-07, + "loss": 0.0022, + "reward": 1.730386734008789, + "reward_std": 0.18066661059856415, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5585117340087891, + "step": 437 + }, + { + "completion_length": 155.703125, + "epoch": 0.13957934990439771, + "grad_norm": 38.294921875, + "kl": 0.1015625, + "learning_rate": 8.604206500956023e-07, + "loss": 0.0041, + "reward": 1.4056483507156372, + "reward_std": 0.09939147531986237, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4056483507156372, + "rewards/pad": 0.0, + "step": 438 + }, + { + "completion_length": 140.390625, + "epoch": 0.13989802421924794, + "grad_norm": 14.03580093383789, + "kl": 0.25390625, + "learning_rate": 8.601019757807521e-07, + "loss": 0.0102, + "reward": 1.5623379945755005, + "reward_std": 0.24287503957748413, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5310879349708557, + "step": 439 + }, + { + "completion_length": 216.875, + "epoch": 0.14021669853409816, + "grad_norm": 22.822328567504883, + "kl": 0.06982421875, + "learning_rate": 8.597833014659018e-07, + "loss": 0.0028, + "reward": 1.5086784362792969, + "reward_std": 0.07392227649688721, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3836785554885864, + "step": 440 + }, + { + "completion_length": 176.796875, + "epoch": 0.14053537284894838, + "grad_norm": 43.985252380371094, + "kl": 0.1015625, + "learning_rate": 8.594646271510516e-07, + "loss": 0.0041, + "reward": 1.6859261989593506, + "reward_std": 0.11247699707746506, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5765512585639954, + "rewards/pad": 0.125, + "step": 441 + }, + { + "completion_length": 271.8125, + "epoch": 0.1408540471637986, + "grad_norm": 33.14993667602539, + "kl": 0.06591796875, + "learning_rate": 8.591459528362014e-07, + "loss": 0.0026, + "reward": 1.4822893142700195, + "reward_std": 0.07760314643383026, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48228925466537476, + "rewards/pad": 0.0, + "step": 442 + }, + { + "completion_length": 200.96875, + "epoch": 0.14117272147864882, + "grad_norm": 10.074478149414062, + "kl": 0.07568359375, + "learning_rate": 8.588272785213512e-07, + "loss": 0.003, + "reward": 1.4677287340164185, + "reward_std": 0.12547984719276428, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48335376381874084, + "rewards/pad": 0.0, + "step": 443 + }, + { + "completion_length": 271.46875, + "epoch": 0.14149139579349904, + "grad_norm": 7.2311015129089355, + "kl": 0.10009765625, + "learning_rate": 8.585086042065009e-07, + "loss": 0.004, + "reward": 1.4389028549194336, + "reward_std": 0.06437624990940094, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4389027953147888, + "step": 444 + }, + { + "completion_length": 209.609375, + "epoch": 0.14181007010834926, + "grad_norm": 36.39683151245117, + "kl": 0.08837890625, + "learning_rate": 8.581899298916507e-07, + "loss": 0.0035, + "reward": 1.5782438516616821, + "reward_std": 0.18926000595092773, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46886885166168213, + "rewards/pad": 0.109375, + "step": 445 + }, + { + "completion_length": 293.234375, + "epoch": 0.14212874442319948, + "grad_norm": 8.683002471923828, + "kl": 0.064453125, + "learning_rate": 8.578712555768005e-07, + "loss": 0.0026, + "reward": 1.3747520446777344, + "reward_std": 0.15450097620487213, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.390377014875412, + "step": 446 + }, + { + "completion_length": 319.296875, + "epoch": 0.14244741873804972, + "grad_norm": 11.498235702514648, + "kl": 0.06591796875, + "learning_rate": 8.575525812619503e-07, + "loss": 0.0027, + "reward": 1.410198450088501, + "reward_std": 0.0504818931221962, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4101985692977905, + "step": 447 + }, + { + "completion_length": 177.375, + "epoch": 0.14276609305289995, + "grad_norm": 5.232941150665283, + "kl": 0.0908203125, + "learning_rate": 8.572339069471e-07, + "loss": 0.0036, + "reward": 1.4534001350402832, + "reward_std": 0.053990766406059265, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.453400194644928, + "rewards/pad": 0.0, + "step": 448 + }, + { + "completion_length": 302.578125, + "epoch": 0.14308476736775017, + "grad_norm": 34.351104736328125, + "kl": 0.051513671875, + "learning_rate": 8.569152326322498e-07, + "loss": 0.0021, + "reward": 1.6452140808105469, + "reward_std": 0.06540853530168533, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5202141404151917, + "step": 449 + }, + { + "completion_length": 176.640625, + "epoch": 0.14340344168260039, + "grad_norm": 9.729473114013672, + "kl": 0.08203125, + "learning_rate": 8.565965583173996e-07, + "loss": 0.0033, + "reward": 1.4867746829986572, + "reward_std": 0.07899289578199387, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.486774742603302, + "rewards/pad": 0.0, + "step": 450 + }, + { + "completion_length": 169.0, + "epoch": 0.1437221159974506, + "grad_norm": 9.015345573425293, + "kl": 0.07080078125, + "learning_rate": 8.562778840025495e-07, + "loss": 0.0028, + "reward": 1.9677799940109253, + "reward_std": 0.1721034198999405, + "rewards/answer_reward": 0.53125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.43653005361557007, + "step": 451 + }, + { + "completion_length": 199.875, + "epoch": 0.14404079031230083, + "grad_norm": 8.44957447052002, + "kl": 0.076171875, + "learning_rate": 8.55959209687699e-07, + "loss": 0.003, + "reward": 1.7060662508010864, + "reward_std": 0.12800918519496918, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5029412508010864, + "step": 452 + }, + { + "completion_length": 189.890625, + "epoch": 0.14435946462715105, + "grad_norm": 10.389759063720703, + "kl": 0.06982421875, + "learning_rate": 8.556405353728489e-07, + "loss": 0.0028, + "reward": 1.3262500762939453, + "reward_std": 0.10323597490787506, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3106251060962677, + "rewards/pad": 0.015625, + "step": 453 + }, + { + "completion_length": 333.3125, + "epoch": 0.14467813894200127, + "grad_norm": 6.098585605621338, + "kl": 0.041748046875, + "learning_rate": 8.553218610579987e-07, + "loss": 0.0017, + "reward": 1.5114164352416992, + "reward_std": 0.09109494835138321, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5270413160324097, + "step": 454 + }, + { + "completion_length": 245.8125, + "epoch": 0.1449968132568515, + "grad_norm": 13.058919906616211, + "kl": 0.0771484375, + "learning_rate": 8.550031867431485e-07, + "loss": 0.0031, + "reward": 1.5187058448791504, + "reward_std": 0.0741514340043068, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5187058448791504, + "rewards/pad": 0.0, + "step": 455 + }, + { + "completion_length": 247.71875, + "epoch": 0.14531548757170173, + "grad_norm": 49.04275131225586, + "kl": 0.0771484375, + "learning_rate": 8.546845124282982e-07, + "loss": 0.0031, + "reward": 1.6912260055541992, + "reward_std": 0.06264584511518478, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5662259459495544, + "rewards/pad": 0.125, + "step": 456 + }, + { + "completion_length": 207.390625, + "epoch": 0.14563416188655195, + "grad_norm": 17.323837280273438, + "kl": 0.08544921875, + "learning_rate": 8.54365838113448e-07, + "loss": 0.0034, + "reward": 1.5239768028259277, + "reward_std": 0.0447956919670105, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5239768028259277, + "step": 457 + }, + { + "completion_length": 151.703125, + "epoch": 0.14595283620140218, + "grad_norm": 12.530643463134766, + "kl": 0.09765625, + "learning_rate": 8.540471637985978e-07, + "loss": 0.0039, + "reward": 1.595104455947876, + "reward_std": 0.238202303647995, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4544795751571655, + "rewards/pad": 0.15625, + "step": 458 + }, + { + "completion_length": 178.015625, + "epoch": 0.1462715105162524, + "grad_norm": 24.093931198120117, + "kl": 0.10791015625, + "learning_rate": 8.537284894837476e-07, + "loss": 0.0043, + "reward": 1.462281584739685, + "reward_std": 0.09704331308603287, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46228155493736267, + "rewards/pad": 0.0, + "step": 459 + }, + { + "completion_length": 183.578125, + "epoch": 0.14659018483110262, + "grad_norm": 8.605521202087402, + "kl": 0.1015625, + "learning_rate": 8.534098151688973e-07, + "loss": 0.0041, + "reward": 1.644413948059082, + "reward_std": 0.19280801713466644, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.519413948059082, + "step": 460 + }, + { + "completion_length": 185.109375, + "epoch": 0.14690885914595284, + "grad_norm": 9.727154731750488, + "kl": 0.09521484375, + "learning_rate": 8.530911408540471e-07, + "loss": 0.0038, + "reward": 1.4764418601989746, + "reward_std": 0.1266682744026184, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4608168601989746, + "step": 461 + }, + { + "completion_length": 213.90625, + "epoch": 0.14722753346080306, + "grad_norm": 26.93505096435547, + "kl": 0.07275390625, + "learning_rate": 8.527724665391969e-07, + "loss": 0.0029, + "reward": 1.388505220413208, + "reward_std": 0.15996669232845306, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.31038016080856323, + "step": 462 + }, + { + "completion_length": 340.046875, + "epoch": 0.14754620777565328, + "grad_norm": 5.181948184967041, + "kl": 0.04736328125, + "learning_rate": 8.524537922243466e-07, + "loss": 0.0019, + "reward": 1.4414582252502441, + "reward_std": 0.13214264810085297, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.47270816564559937, + "step": 463 + }, + { + "completion_length": 365.53125, + "epoch": 0.1478648820905035, + "grad_norm": 6.609642505645752, + "kl": 0.03662109375, + "learning_rate": 8.521351179094964e-07, + "loss": 0.0015, + "reward": 1.3399658203125, + "reward_std": 0.1445762664079666, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3712157905101776, + "step": 464 + }, + { + "completion_length": 132.265625, + "epoch": 0.14818355640535372, + "grad_norm": 43.083251953125, + "kl": 0.10009765625, + "learning_rate": 8.518164435946462e-07, + "loss": 0.004, + "reward": 1.3067584037780762, + "reward_std": 0.11628291010856628, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.30675846338272095, + "rewards/pad": 0.0, + "step": 465 + }, + { + "completion_length": 202.140625, + "epoch": 0.14850223072020396, + "grad_norm": 6.811500072479248, + "kl": 0.0869140625, + "learning_rate": 8.51497769279796e-07, + "loss": 0.0035, + "reward": 1.5276761054992676, + "reward_std": 0.2129315733909607, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.35580113530158997, + "rewards/pad": 0.203125, + "step": 466 + }, + { + "completion_length": 209.265625, + "epoch": 0.14882090503505419, + "grad_norm": 13.386691093444824, + "kl": 0.07568359375, + "learning_rate": 8.511790949649457e-07, + "loss": 0.003, + "reward": 1.4336509704589844, + "reward_std": 0.19461283087730408, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.3555260896682739, + "rewards/pad": 0.109375, + "step": 467 + }, + { + "completion_length": 248.515625, + "epoch": 0.1491395793499044, + "grad_norm": 8.822103500366211, + "kl": 0.0703125, + "learning_rate": 8.508604206500955e-07, + "loss": 0.0028, + "reward": 1.490767240524292, + "reward_std": 0.06920873373746872, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49076735973358154, + "rewards/pad": 0.0, + "step": 468 + }, + { + "completion_length": 251.171875, + "epoch": 0.14945825366475463, + "grad_norm": 10.474466323852539, + "kl": 0.07470703125, + "learning_rate": 8.505417463352453e-07, + "loss": 0.003, + "reward": 1.6712453365325928, + "reward_std": 0.20213072001934052, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4524953365325928, + "step": 469 + }, + { + "completion_length": 216.984375, + "epoch": 0.14977692797960485, + "grad_norm": 7.846851825714111, + "kl": 0.0849609375, + "learning_rate": 8.502230720203951e-07, + "loss": 0.0034, + "reward": 1.558816909790039, + "reward_std": 0.2822313904762268, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.35569196939468384, + "step": 470 + }, + { + "completion_length": 231.125, + "epoch": 0.15009560229445507, + "grad_norm": 38.57760238647461, + "kl": 0.06884765625, + "learning_rate": 8.499043977055449e-07, + "loss": 0.0027, + "reward": 1.519878625869751, + "reward_std": 0.0842830240726471, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39487871527671814, + "rewards/pad": 0.125, + "step": 471 + }, + { + "completion_length": 143.25, + "epoch": 0.1504142766093053, + "grad_norm": 32.709808349609375, + "kl": 0.1025390625, + "learning_rate": 8.495857233906947e-07, + "loss": 0.0041, + "reward": 1.6208330392837524, + "reward_std": 0.14068816602230072, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6364580392837524, + "rewards/pad": 0.0, + "step": 472 + }, + { + "completion_length": 276.171875, + "epoch": 0.1507329509241555, + "grad_norm": 51.76963806152344, + "kl": 0.05126953125, + "learning_rate": 8.492670490758445e-07, + "loss": 0.002, + "reward": 1.3026278018951416, + "reward_std": 0.19959302246570587, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.25575268268585205, + "rewards/pad": 0.078125, + "step": 473 + }, + { + "completion_length": 240.546875, + "epoch": 0.15105162523900573, + "grad_norm": 12.2163724899292, + "kl": 0.08935546875, + "learning_rate": 8.489483747609943e-07, + "loss": 0.0036, + "reward": 1.482216715812683, + "reward_std": 0.14593639969825745, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41971665620803833, + "step": 474 + }, + { + "completion_length": 264.0625, + "epoch": 0.15137029955385595, + "grad_norm": 11.082056045532227, + "kl": 0.064453125, + "learning_rate": 8.48629700446144e-07, + "loss": 0.0026, + "reward": 1.5266757011413574, + "reward_std": 0.045626167207956314, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5266757011413574, + "step": 475 + }, + { + "completion_length": 243.375, + "epoch": 0.1516889738687062, + "grad_norm": 10.438226699829102, + "kl": 0.06201171875, + "learning_rate": 8.483110261312938e-07, + "loss": 0.0025, + "reward": 1.5385844707489014, + "reward_std": 0.13940951228141785, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.476084440946579, + "step": 476 + }, + { + "completion_length": 193.578125, + "epoch": 0.15200764818355642, + "grad_norm": 9.08781909942627, + "kl": 0.07763671875, + "learning_rate": 8.479923518164436e-07, + "loss": 0.0031, + "reward": 1.4773859977722168, + "reward_std": 0.10028444975614548, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.477385938167572, + "step": 477 + }, + { + "completion_length": 244.546875, + "epoch": 0.15232632249840664, + "grad_norm": 10.916855812072754, + "kl": 0.072265625, + "learning_rate": 8.476736775015934e-07, + "loss": 0.0029, + "reward": 1.606418490409851, + "reward_std": 0.17406152188777924, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5439184904098511, + "step": 478 + }, + { + "completion_length": 273.5, + "epoch": 0.15264499681325686, + "grad_norm": 5.499225616455078, + "kl": 0.060546875, + "learning_rate": 8.473550031867431e-07, + "loss": 0.0024, + "reward": 1.5092673301696777, + "reward_std": 0.08618798106908798, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5092673301696777, + "step": 479 + }, + { + "completion_length": 236.703125, + "epoch": 0.15296367112810708, + "grad_norm": 19.99932098388672, + "kl": 0.056396484375, + "learning_rate": 8.470363288718929e-07, + "loss": 0.0023, + "reward": 1.6022439002990723, + "reward_std": 0.13475370407104492, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4616188406944275, + "step": 480 + }, + { + "completion_length": 326.90625, + "epoch": 0.1532823454429573, + "grad_norm": 10.373702049255371, + "kl": 0.05810546875, + "learning_rate": 8.467176545570427e-07, + "loss": 0.0023, + "reward": 1.3631759881973267, + "reward_std": 0.11231576651334763, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37880098819732666, + "step": 481 + }, + { + "completion_length": 244.359375, + "epoch": 0.15360101975780752, + "grad_norm": 10.808770179748535, + "kl": 0.0712890625, + "learning_rate": 8.463989802421925e-07, + "loss": 0.0028, + "reward": 1.432814598083496, + "reward_std": 0.14196133613586426, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.35468965768814087, + "step": 482 + }, + { + "completion_length": 307.0, + "epoch": 0.15391969407265774, + "grad_norm": 22.456748962402344, + "kl": 0.0673828125, + "learning_rate": 8.460803059273422e-07, + "loss": 0.0027, + "reward": 1.431779384613037, + "reward_std": 0.07337789237499237, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43177950382232666, + "step": 483 + }, + { + "completion_length": 372.8125, + "epoch": 0.15423836838750796, + "grad_norm": 7.4762139320373535, + "kl": 0.037841796875, + "learning_rate": 8.45761631612492e-07, + "loss": 0.0015, + "reward": 1.5914397239685059, + "reward_std": 0.0908648818731308, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48206475377082825, + "step": 484 + }, + { + "completion_length": 319.0, + "epoch": 0.15455704270235818, + "grad_norm": 7.34827995300293, + "kl": 0.053955078125, + "learning_rate": 8.454429572976418e-07, + "loss": 0.0022, + "reward": 1.6976406574249268, + "reward_std": 0.19344541430473328, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49451562762260437, + "rewards/pad": 0.21875, + "step": 485 + }, + { + "completion_length": 241.71875, + "epoch": 0.15487571701720843, + "grad_norm": 9.271265029907227, + "kl": 0.07763671875, + "learning_rate": 8.451242829827916e-07, + "loss": 0.0031, + "reward": 1.671446442604065, + "reward_std": 0.125516876578331, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5620712637901306, + "rewards/pad": 0.109375, + "step": 486 + }, + { + "completion_length": 307.890625, + "epoch": 0.15519439133205865, + "grad_norm": 5.0968017578125, + "kl": 0.05615234375, + "learning_rate": 8.448056086679413e-07, + "loss": 0.0022, + "reward": 1.4919250011444092, + "reward_std": 0.07325821369886398, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5075501203536987, + "step": 487 + }, + { + "completion_length": 281.625, + "epoch": 0.15551306564690887, + "grad_norm": 8.375843048095703, + "kl": 0.048583984375, + "learning_rate": 8.444869343530911e-07, + "loss": 0.0019, + "reward": 1.793757677078247, + "reward_std": 0.09240252524614334, + "rewards/pad": 0.296875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5125076770782471, + "step": 488 + }, + { + "completion_length": 144.453125, + "epoch": 0.15583173996175909, + "grad_norm": 35.22758865356445, + "kl": 0.1015625, + "learning_rate": 8.44168260038241e-07, + "loss": 0.0041, + "reward": 1.523176670074463, + "reward_std": 0.10526008903980255, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5231766700744629, + "rewards/pad": 0.0, + "step": 489 + }, + { + "completion_length": 159.625, + "epoch": 0.1561504142766093, + "grad_norm": 13.647049903869629, + "kl": 0.08642578125, + "learning_rate": 8.438495857233908e-07, + "loss": 0.0035, + "reward": 1.471238136291504, + "reward_std": 0.06791481375694275, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47123825550079346, + "step": 490 + }, + { + "completion_length": 306.46875, + "epoch": 0.15646908859145953, + "grad_norm": 8.69845962524414, + "kl": 0.05615234375, + "learning_rate": 8.435309114085404e-07, + "loss": 0.0023, + "reward": 1.3391271829605103, + "reward_std": 0.0694485455751419, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33912718296051025, + "rewards/pad": 0.0, + "step": 491 + }, + { + "completion_length": 232.046875, + "epoch": 0.15678776290630975, + "grad_norm": 10.28249740600586, + "kl": 0.06787109375, + "learning_rate": 8.432122370936902e-07, + "loss": 0.0027, + "reward": 1.5181519985198975, + "reward_std": 0.06104717031121254, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39315202832221985, + "rewards/pad": 0.125, + "step": 492 + }, + { + "completion_length": 142.765625, + "epoch": 0.15710643722115997, + "grad_norm": 10.863986015319824, + "kl": 0.09033203125, + "learning_rate": 8.4289356277884e-07, + "loss": 0.0036, + "reward": 1.5782145261764526, + "reward_std": 0.12296774983406067, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5000894665718079, + "step": 493 + }, + { + "completion_length": 182.34375, + "epoch": 0.1574251115360102, + "grad_norm": 85.28020477294922, + "kl": 0.0966796875, + "learning_rate": 8.425748884639897e-07, + "loss": 0.0038, + "reward": 1.455416202545166, + "reward_std": 0.08780718594789505, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45541617274284363, + "step": 494 + }, + { + "completion_length": 146.9375, + "epoch": 0.1577437858508604, + "grad_norm": 34.29008865356445, + "kl": 0.08837890625, + "learning_rate": 8.422562141491395e-07, + "loss": 0.0035, + "reward": 1.630502700805664, + "reward_std": 0.13765683770179749, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5055026412010193, + "rewards/pad": 0.125, + "step": 495 + }, + { + "completion_length": 223.4375, + "epoch": 0.15806246016571066, + "grad_norm": 7.224981784820557, + "kl": 0.07958984375, + "learning_rate": 8.419375398342893e-07, + "loss": 0.0032, + "reward": 1.4344799518585205, + "reward_std": 0.07560813426971436, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4344799518585205, + "rewards/pad": 0.0, + "step": 496 + }, + { + "completion_length": 287.734375, + "epoch": 0.15838113448056088, + "grad_norm": 5.872875690460205, + "kl": 0.072265625, + "learning_rate": 8.416188655194391e-07, + "loss": 0.0029, + "reward": 1.2482273578643799, + "reward_std": 0.10398498177528381, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.26385238766670227, + "step": 497 + }, + { + "completion_length": 190.484375, + "epoch": 0.1586998087954111, + "grad_norm": 17.003299713134766, + "kl": 0.07421875, + "learning_rate": 8.413001912045888e-07, + "loss": 0.003, + "reward": 1.635709524154663, + "reward_std": 0.15149308741092682, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4325844943523407, + "rewards/pad": 0.203125, + "step": 498 + }, + { + "completion_length": 167.03125, + "epoch": 0.15901848311026132, + "grad_norm": 26.2998104095459, + "kl": 0.08447265625, + "learning_rate": 8.409815168897386e-07, + "loss": 0.0034, + "reward": 1.633731484413147, + "reward_std": 0.21327605843544006, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4306064546108246, + "rewards/pad": 0.21875, + "step": 499 + }, + { + "completion_length": 300.09375, + "epoch": 0.15933715742511154, + "grad_norm": 5.521867752075195, + "kl": 0.054931640625, + "learning_rate": 8.406628425748884e-07, + "loss": 0.0022, + "reward": 1.4881134033203125, + "reward_std": 0.034483470022678375, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4881134629249573, + "step": 500 + }, + { + "completion_length": 256.046875, + "epoch": 0.15965583173996176, + "grad_norm": 10.656488418579102, + "kl": 0.06884765625, + "learning_rate": 8.403441682600382e-07, + "loss": 0.0028, + "reward": 1.7126858234405518, + "reward_std": 0.09016523510217667, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6033110022544861, + "step": 501 + }, + { + "completion_length": 302.796875, + "epoch": 0.15997450605481198, + "grad_norm": 7.291167259216309, + "kl": 0.054443359375, + "learning_rate": 8.400254939451879e-07, + "loss": 0.0022, + "reward": 1.5585349798202515, + "reward_std": 0.10088232904672623, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48041003942489624, + "step": 502 + }, + { + "completion_length": 225.375, + "epoch": 0.1602931803696622, + "grad_norm": 17.202632904052734, + "kl": 0.064453125, + "learning_rate": 8.397068196303377e-07, + "loss": 0.0026, + "reward": 1.43354070186615, + "reward_std": 0.03540854528546333, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4335406720638275, + "step": 503 + }, + { + "completion_length": 275.703125, + "epoch": 0.16061185468451242, + "grad_norm": 11.276266098022461, + "kl": 0.08154296875, + "learning_rate": 8.393881453154875e-07, + "loss": 0.0033, + "reward": 1.45798659324646, + "reward_std": 0.15015798807144165, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4267365634441376, + "rewards/pad": 0.046875, + "step": 504 + }, + { + "completion_length": 343.984375, + "epoch": 0.16093052899936264, + "grad_norm": 3.738642454147339, + "kl": 0.039306640625, + "learning_rate": 8.390694710006373e-07, + "loss": 0.0016, + "reward": 1.6080937385559082, + "reward_std": 0.022641608491539955, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6080936193466187, + "step": 505 + }, + { + "completion_length": 314.25, + "epoch": 0.16124920331421289, + "grad_norm": 21.043798446655273, + "kl": 0.04150390625, + "learning_rate": 8.38750796685787e-07, + "loss": 0.0017, + "reward": 1.453988790512085, + "reward_std": 0.08246888220310211, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34461385011672974, + "step": 506 + }, + { + "completion_length": 187.671875, + "epoch": 0.1615678776290631, + "grad_norm": 7.886463642120361, + "kl": 0.0859375, + "learning_rate": 8.384321223709368e-07, + "loss": 0.0034, + "reward": 1.611316442489624, + "reward_std": 0.17711031436920166, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5175663232803345, + "rewards/pad": 0.09375, + "step": 507 + }, + { + "completion_length": 243.3125, + "epoch": 0.16188655194391333, + "grad_norm": 5.370201110839844, + "kl": 0.06201171875, + "learning_rate": 8.381134480560866e-07, + "loss": 0.0025, + "reward": 1.4405754804611206, + "reward_std": 0.13271909952163696, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3155754804611206, + "step": 508 + }, + { + "completion_length": 284.84375, + "epoch": 0.16220522625876355, + "grad_norm": 5.100447177886963, + "kl": 0.060791015625, + "learning_rate": 8.377947737412365e-07, + "loss": 0.0024, + "reward": 1.561161756515503, + "reward_std": 0.08327622711658478, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43616175651550293, + "step": 509 + }, + { + "completion_length": 351.1875, + "epoch": 0.16252390057361377, + "grad_norm": 3.4032809734344482, + "kl": 0.037109375, + "learning_rate": 8.374760994263862e-07, + "loss": 0.0015, + "reward": 1.5396835803985596, + "reward_std": 0.06057661026716232, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43030864000320435, + "step": 510 + }, + { + "completion_length": 259.0, + "epoch": 0.162842574888464, + "grad_norm": 5.124754905700684, + "kl": 0.0634765625, + "learning_rate": 8.37157425111536e-07, + "loss": 0.0025, + "reward": 1.4768354892730713, + "reward_std": 0.10499419271945953, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3674604296684265, + "rewards/pad": 0.109375, + "step": 511 + }, + { + "completion_length": 341.5, + "epoch": 0.1631612492033142, + "grad_norm": 11.388672828674316, + "kl": 0.038818359375, + "learning_rate": 8.368387507966858e-07, + "loss": 0.0015, + "reward": 1.487513542175293, + "reward_std": 0.04100422561168671, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4875136613845825, + "step": 512 + }, + { + "completion_length": 271.578125, + "epoch": 0.16347992351816443, + "grad_norm": 17.304515838623047, + "kl": 0.0615234375, + "learning_rate": 8.365200764818356e-07, + "loss": 0.0025, + "reward": 1.50333833694458, + "reward_std": 0.03414406254887581, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5033382773399353, + "rewards/pad": 0.0, + "step": 513 + }, + { + "completion_length": 215.875, + "epoch": 0.16379859783301465, + "grad_norm": 11.562214851379395, + "kl": 0.08154296875, + "learning_rate": 8.362014021669853e-07, + "loss": 0.0033, + "reward": 1.6597778797149658, + "reward_std": 0.1694626361131668, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5660279989242554, + "step": 514 + }, + { + "completion_length": 148.09375, + "epoch": 0.1641172721478649, + "grad_norm": 7.4554901123046875, + "kl": 0.1015625, + "learning_rate": 8.358827278521351e-07, + "loss": 0.0041, + "reward": 1.4301645755767822, + "reward_std": 0.07950934767723083, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4301646053791046, + "rewards/pad": 0.0, + "step": 515 + }, + { + "completion_length": 233.640625, + "epoch": 0.16443594646271512, + "grad_norm": 25.945159912109375, + "kl": 0.06982421875, + "learning_rate": 8.355640535372849e-07, + "loss": 0.0028, + "reward": 1.3905309438705444, + "reward_std": 0.14231663942337036, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37490594387054443, + "step": 516 + }, + { + "completion_length": 276.703125, + "epoch": 0.16475462077756534, + "grad_norm": 23.2886905670166, + "kl": 0.06494140625, + "learning_rate": 8.352453792224347e-07, + "loss": 0.0026, + "reward": 1.654345989227295, + "reward_std": 0.13660773634910583, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4824710786342621, + "step": 517 + }, + { + "completion_length": 314.53125, + "epoch": 0.16507329509241556, + "grad_norm": 8.694071769714355, + "kl": 0.0576171875, + "learning_rate": 8.349267049075844e-07, + "loss": 0.0023, + "reward": 1.5877230167388916, + "reward_std": 0.05514439195394516, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.337723046541214, + "rewards/pad": 0.25, + "step": 518 + }, + { + "completion_length": 337.296875, + "epoch": 0.16539196940726578, + "grad_norm": 6.448973655700684, + "kl": 0.06494140625, + "learning_rate": 8.346080305927342e-07, + "loss": 0.0026, + "reward": 1.4133797883987427, + "reward_std": 0.026479611173272133, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4133797287940979, + "step": 519 + }, + { + "completion_length": 148.125, + "epoch": 0.165710643722116, + "grad_norm": 6.242334365844727, + "kl": 0.0791015625, + "learning_rate": 8.34289356277884e-07, + "loss": 0.0032, + "reward": 1.4564950466156006, + "reward_std": 0.05193711072206497, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3314949870109558, + "rewards/pad": 0.125, + "step": 520 + }, + { + "completion_length": 267.734375, + "epoch": 0.16602931803696622, + "grad_norm": 20.728370666503906, + "kl": 0.055419921875, + "learning_rate": 8.339706819630338e-07, + "loss": 0.0022, + "reward": 1.562483787536621, + "reward_std": 0.042223796248435974, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5624837875366211, + "step": 521 + }, + { + "completion_length": 251.484375, + "epoch": 0.16634799235181644, + "grad_norm": 30.452831268310547, + "kl": 0.08349609375, + "learning_rate": 8.336520076481835e-07, + "loss": 0.0033, + "reward": 1.466627836227417, + "reward_std": 0.08504165709018707, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46662774682044983, + "rewards/pad": 0.0, + "step": 522 + }, + { + "completion_length": 285.390625, + "epoch": 0.16666666666666666, + "grad_norm": 7.482173919677734, + "kl": 0.06689453125, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0027, + "reward": 1.398820161819458, + "reward_std": 0.10944493114948273, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39882004261016846, + "rewards/pad": 0.0, + "step": 523 + }, + { + "completion_length": 212.734375, + "epoch": 0.16698534098151688, + "grad_norm": 14.110750198364258, + "kl": 0.11669921875, + "learning_rate": 8.330146590184831e-07, + "loss": 0.0047, + "reward": 1.3691809177398682, + "reward_std": 0.15498223900794983, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.21293091773986816, + "rewards/pad": 0.171875, + "step": 524 + }, + { + "completion_length": 427.375, + "epoch": 0.16730401529636713, + "grad_norm": 5.056336402893066, + "kl": 0.031982421875, + "learning_rate": 8.326959847036329e-07, + "loss": 0.0013, + "reward": 1.4527101516723633, + "reward_std": 0.054359886795282364, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4527100920677185, + "step": 525 + }, + { + "completion_length": 248.453125, + "epoch": 0.16762268961121735, + "grad_norm": 5.743110179901123, + "kl": 0.058349609375, + "learning_rate": 8.323773103887826e-07, + "loss": 0.0023, + "reward": 1.8058514595031738, + "reward_std": 0.02524116262793541, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6808514595031738, + "step": 526 + }, + { + "completion_length": 208.25, + "epoch": 0.16794136392606757, + "grad_norm": 8.54845905303955, + "kl": 0.08642578125, + "learning_rate": 8.320586360739325e-07, + "loss": 0.0035, + "reward": 1.3544914722442627, + "reward_std": 0.06807970255613327, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3544915020465851, + "rewards/pad": 0.0, + "step": 527 + }, + { + "completion_length": 249.640625, + "epoch": 0.1682600382409178, + "grad_norm": 23.088241577148438, + "kl": 0.06787109375, + "learning_rate": 8.317399617590823e-07, + "loss": 0.0027, + "reward": 1.5373032093048096, + "reward_std": 0.05144185945391655, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5373032093048096, + "step": 528 + }, + { + "completion_length": 321.75, + "epoch": 0.168578712555768, + "grad_norm": 19.688156127929688, + "kl": 0.052490234375, + "learning_rate": 8.31421287444232e-07, + "loss": 0.0021, + "reward": 1.4172598123550415, + "reward_std": 0.08434939384460449, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3235098421573639, + "rewards/pad": 0.09375, + "step": 529 + }, + { + "completion_length": 293.890625, + "epoch": 0.16889738687061823, + "grad_norm": 11.647110939025879, + "kl": 0.05224609375, + "learning_rate": 8.311026131293817e-07, + "loss": 0.0021, + "reward": 1.609508752822876, + "reward_std": 0.13939915597438812, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6251336932182312, + "rewards/pad": 0.0, + "step": 530 + }, + { + "completion_length": 277.0, + "epoch": 0.16921606118546845, + "grad_norm": 34.01567077636719, + "kl": 0.05712890625, + "learning_rate": 8.307839388145315e-07, + "loss": 0.0023, + "reward": 1.577805995941162, + "reward_std": 0.10225144028663635, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3746810853481293, + "rewards/pad": 0.203125, + "step": 531 + }, + { + "completion_length": 313.71875, + "epoch": 0.16953473550031867, + "grad_norm": 5.426848888397217, + "kl": 0.0732421875, + "learning_rate": 8.304652644996813e-07, + "loss": 0.0029, + "reward": 1.470942497253418, + "reward_std": 0.09922796487808228, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48656758666038513, + "rewards/pad": 0.0, + "step": 532 + }, + { + "completion_length": 176.21875, + "epoch": 0.1698534098151689, + "grad_norm": 12.14452075958252, + "kl": 0.0888671875, + "learning_rate": 8.30146590184831e-07, + "loss": 0.0035, + "reward": 1.6665148735046387, + "reward_std": 0.23820053040981293, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4008897840976715, + "rewards/pad": 0.265625, + "step": 533 + }, + { + "completion_length": 247.984375, + "epoch": 0.1701720841300191, + "grad_norm": 4.657873630523682, + "kl": 0.0693359375, + "learning_rate": 8.298279158699808e-07, + "loss": 0.0028, + "reward": 1.4569358825683594, + "reward_std": 0.11433446407318115, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36318591237068176, + "rewards/pad": 0.09375, + "step": 534 + }, + { + "completion_length": 242.4375, + "epoch": 0.17049075844486936, + "grad_norm": 10.610453605651855, + "kl": 0.0732421875, + "learning_rate": 8.295092415551306e-07, + "loss": 0.0029, + "reward": 1.4943509101867676, + "reward_std": 0.06179783120751381, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4943508505821228, + "step": 535 + }, + { + "completion_length": 264.75, + "epoch": 0.17080943275971958, + "grad_norm": 5.561173439025879, + "kl": 0.06396484375, + "learning_rate": 8.291905672402804e-07, + "loss": 0.0026, + "reward": 1.734907627105713, + "reward_std": 0.1684616506099701, + "rewards/pad": 0.328125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4067826569080353, + "step": 536 + }, + { + "completion_length": 125.484375, + "epoch": 0.1711281070745698, + "grad_norm": 11.028822898864746, + "kl": 0.0888671875, + "learning_rate": 8.288718929254301e-07, + "loss": 0.0036, + "reward": 1.6976487636566162, + "reward_std": 0.18358567357063293, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.369523823261261, + "rewards/pad": 0.34375, + "step": 537 + }, + { + "completion_length": 203.015625, + "epoch": 0.17144678138942002, + "grad_norm": 9.298152923583984, + "kl": 0.07421875, + "learning_rate": 8.285532186105799e-07, + "loss": 0.003, + "reward": 1.596775770187378, + "reward_std": 0.14144155383110046, + "rewards/pad": 0.296875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.29990074038505554, + "step": 538 + }, + { + "completion_length": 304.765625, + "epoch": 0.17176545570427024, + "grad_norm": 6.667783737182617, + "kl": 0.046630859375, + "learning_rate": 8.282345442957297e-07, + "loss": 0.0019, + "reward": 1.5704073905944824, + "reward_std": 0.15934810042381287, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49228233098983765, + "rewards/pad": 0.09375, + "step": 539 + }, + { + "completion_length": 342.109375, + "epoch": 0.17208413001912046, + "grad_norm": 4.383968830108643, + "kl": 0.059326171875, + "learning_rate": 8.279158699808795e-07, + "loss": 0.0024, + "reward": 1.5653839111328125, + "reward_std": 0.10253358632326126, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.33100882172584534, + "rewards/pad": 0.25, + "step": 540 + }, + { + "completion_length": 386.6875, + "epoch": 0.17240280433397068, + "grad_norm": 8.986936569213867, + "kl": 0.06298828125, + "learning_rate": 8.275971956660292e-07, + "loss": 0.0025, + "reward": 1.4542319774627686, + "reward_std": 0.1005234345793724, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46985694766044617, + "step": 541 + }, + { + "completion_length": 246.953125, + "epoch": 0.1727214786488209, + "grad_norm": 7.305197715759277, + "kl": 0.091796875, + "learning_rate": 8.27278521351179e-07, + "loss": 0.0037, + "reward": 1.3748791217803955, + "reward_std": 0.12443351745605469, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3748791813850403, + "rewards/pad": 0.0, + "step": 542 + }, + { + "completion_length": 284.890625, + "epoch": 0.17304015296367112, + "grad_norm": 8.34105396270752, + "kl": 0.06298828125, + "learning_rate": 8.269598470363288e-07, + "loss": 0.0025, + "reward": 1.3564362525939941, + "reward_std": 0.15209169685840607, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2939361333847046, + "step": 543 + }, + { + "completion_length": 222.640625, + "epoch": 0.17335882727852134, + "grad_norm": 7.7216796875, + "kl": 0.09521484375, + "learning_rate": 8.266411727214786e-07, + "loss": 0.0038, + "reward": 1.6124627590179443, + "reward_std": 0.0886450707912445, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36246275901794434, + "rewards/pad": 0.25, + "step": 544 + }, + { + "completion_length": 249.765625, + "epoch": 0.17367750159337159, + "grad_norm": 10.911834716796875, + "kl": 0.0810546875, + "learning_rate": 8.263224984066283e-07, + "loss": 0.0032, + "reward": 1.4904203414916992, + "reward_std": 0.049107346683740616, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.490420401096344, + "rewards/pad": 0.0, + "step": 545 + }, + { + "completion_length": 185.984375, + "epoch": 0.1739961759082218, + "grad_norm": 9.049338340759277, + "kl": 0.1025390625, + "learning_rate": 8.260038240917782e-07, + "loss": 0.0041, + "reward": 1.7010513544082642, + "reward_std": 0.07424585521221161, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5760514736175537, + "rewards/pad": 0.125, + "step": 546 + }, + { + "completion_length": 244.1875, + "epoch": 0.17431485022307203, + "grad_norm": 7.223100185394287, + "kl": 0.0771484375, + "learning_rate": 8.25685149776928e-07, + "loss": 0.0031, + "reward": 1.5383145809173584, + "reward_std": 0.10397733002901077, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.553939700126648, + "step": 547 + }, + { + "completion_length": 255.75, + "epoch": 0.17463352453792225, + "grad_norm": 8.514344215393066, + "kl": 0.072265625, + "learning_rate": 8.253664754620778e-07, + "loss": 0.0029, + "reward": 1.389475703239441, + "reward_std": 0.07426249980926514, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3738507032394409, + "step": 548 + }, + { + "completion_length": 313.265625, + "epoch": 0.17495219885277247, + "grad_norm": 7.112410545349121, + "kl": 0.06982421875, + "learning_rate": 8.250478011472275e-07, + "loss": 0.0028, + "reward": 1.432603120803833, + "reward_std": 0.08065502345561981, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.432603120803833, + "step": 549 + }, + { + "completion_length": 192.671875, + "epoch": 0.1752708731676227, + "grad_norm": 16.948640823364258, + "kl": 0.10107421875, + "learning_rate": 8.247291268323773e-07, + "loss": 0.004, + "reward": 1.6080808639526367, + "reward_std": 0.11161001026630402, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.23308099806308746, + "step": 550 + }, + { + "completion_length": 314.828125, + "epoch": 0.1755895474824729, + "grad_norm": 5.996268272399902, + "kl": 0.06689453125, + "learning_rate": 8.244104525175271e-07, + "loss": 0.0027, + "reward": 1.3713405132293701, + "reward_std": 0.06664109230041504, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37134063243865967, + "step": 551 + }, + { + "completion_length": 266.265625, + "epoch": 0.17590822179732313, + "grad_norm": 9.64345645904541, + "kl": 0.0947265625, + "learning_rate": 8.240917782026769e-07, + "loss": 0.0038, + "reward": 1.412318229675293, + "reward_std": 0.06251288205385208, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4123181998729706, + "step": 552 + }, + { + "completion_length": 397.734375, + "epoch": 0.17622689611217335, + "grad_norm": 7.888392448425293, + "kl": 0.036376953125, + "learning_rate": 8.237731038878266e-07, + "loss": 0.0014, + "reward": 1.5837773084640503, + "reward_std": 0.043806686997413635, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4587773382663727, + "rewards/pad": 0.125, + "step": 553 + }, + { + "completion_length": 215.296875, + "epoch": 0.17654557042702357, + "grad_norm": 9.22640609741211, + "kl": 0.07763671875, + "learning_rate": 8.234544295729764e-07, + "loss": 0.0031, + "reward": 1.536142110824585, + "reward_std": 0.1304551213979721, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42676711082458496, + "rewards/pad": 0.109375, + "step": 554 + }, + { + "completion_length": 223.859375, + "epoch": 0.17686424474187382, + "grad_norm": 29.19223976135254, + "kl": 0.0849609375, + "learning_rate": 8.231357552581262e-07, + "loss": 0.0034, + "reward": 1.5908797979354858, + "reward_std": 0.06541752815246582, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5908798575401306, + "rewards/pad": 0.0, + "step": 555 + }, + { + "completion_length": 319.859375, + "epoch": 0.17718291905672404, + "grad_norm": 8.462791442871094, + "kl": 0.058837890625, + "learning_rate": 8.228170809432759e-07, + "loss": 0.0024, + "reward": 1.4492942094802856, + "reward_std": 0.06144850328564644, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44929423928260803, + "step": 556 + }, + { + "completion_length": 317.234375, + "epoch": 0.17750159337157426, + "grad_norm": 9.007328033447266, + "kl": 0.06494140625, + "learning_rate": 8.224984066284257e-07, + "loss": 0.0026, + "reward": 1.3446791172027588, + "reward_std": 0.13469094038009644, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3290541172027588, + "step": 557 + }, + { + "completion_length": 235.78125, + "epoch": 0.17782026768642448, + "grad_norm": 29.220151901245117, + "kl": 0.08544921875, + "learning_rate": 8.221797323135755e-07, + "loss": 0.0034, + "reward": 1.438340425491333, + "reward_std": 0.10674619674682617, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43834036588668823, + "rewards/pad": 0.015625, + "step": 558 + }, + { + "completion_length": 174.921875, + "epoch": 0.1781389420012747, + "grad_norm": 27.95044708251953, + "kl": 0.08837890625, + "learning_rate": 8.218610579987253e-07, + "loss": 0.0035, + "reward": 1.6616170406341553, + "reward_std": 0.10026431083679199, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6616171002388, + "rewards/pad": 0.0, + "step": 559 + }, + { + "completion_length": 387.671875, + "epoch": 0.17845761631612492, + "grad_norm": 12.593073844909668, + "kl": 0.05615234375, + "learning_rate": 8.21542383683875e-07, + "loss": 0.0022, + "reward": 1.4177520275115967, + "reward_std": 0.05932632088661194, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41775208711624146, + "step": 560 + }, + { + "completion_length": 222.484375, + "epoch": 0.17877629063097514, + "grad_norm": 20.837419509887695, + "kl": 0.07958984375, + "learning_rate": 8.212237093690248e-07, + "loss": 0.0032, + "reward": 1.5061814785003662, + "reward_std": 0.10648869723081589, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3968064785003662, + "rewards/pad": 0.109375, + "step": 561 + }, + { + "completion_length": 238.15625, + "epoch": 0.17909496494582536, + "grad_norm": 7.644387722015381, + "kl": 0.1123046875, + "learning_rate": 8.209050350541746e-07, + "loss": 0.0045, + "reward": 1.3573124408721924, + "reward_std": 0.06864476948976517, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3573123514652252, + "step": 562 + }, + { + "completion_length": 231.3125, + "epoch": 0.17941363926067558, + "grad_norm": 9.32486629486084, + "kl": 0.091796875, + "learning_rate": 8.205863607393244e-07, + "loss": 0.0037, + "reward": 1.5059893131256104, + "reward_std": 0.2075025886297226, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45911431312561035, + "rewards/pad": 0.046875, + "step": 563 + }, + { + "completion_length": 273.6875, + "epoch": 0.17973231357552583, + "grad_norm": 19.53805923461914, + "kl": 0.0673828125, + "learning_rate": 8.202676864244741e-07, + "loss": 0.0027, + "reward": 1.6068167686462402, + "reward_std": 0.10275942087173462, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.513066828250885, + "rewards/pad": 0.09375, + "step": 564 + }, + { + "completion_length": 290.625, + "epoch": 0.18005098789037605, + "grad_norm": 4.721208572387695, + "kl": 0.130859375, + "learning_rate": 8.19949012109624e-07, + "loss": 0.0052, + "reward": 1.5925767421722412, + "reward_std": 0.1044643372297287, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4675767123699188, + "step": 565 + }, + { + "completion_length": 309.234375, + "epoch": 0.18036966220522627, + "grad_norm": 7.534607410430908, + "kl": 0.0693359375, + "learning_rate": 8.196303377947738e-07, + "loss": 0.0028, + "reward": 1.3615877628326416, + "reward_std": 0.07354053854942322, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.361587792634964, + "step": 566 + }, + { + "completion_length": 232.921875, + "epoch": 0.1806883365200765, + "grad_norm": 18.523197174072266, + "kl": 0.08056640625, + "learning_rate": 8.193116634799236e-07, + "loss": 0.0032, + "reward": 1.4811151027679443, + "reward_std": 0.1094781905412674, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3717402219772339, + "rewards/pad": 0.109375, + "step": 567 + }, + { + "completion_length": 328.984375, + "epoch": 0.1810070108349267, + "grad_norm": 28.020334243774414, + "kl": 0.0498046875, + "learning_rate": 8.189929891650733e-07, + "loss": 0.002, + "reward": 1.4994876384735107, + "reward_std": 0.0968121737241745, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.35886263847351074, + "step": 568 + }, + { + "completion_length": 382.9375, + "epoch": 0.18132568514977693, + "grad_norm": 5.482840061187744, + "kl": 0.045654296875, + "learning_rate": 8.18674314850223e-07, + "loss": 0.0018, + "reward": 1.4488133192062378, + "reward_std": 0.10397167503833771, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4019383192062378, + "rewards/pad": 0.046875, + "step": 569 + }, + { + "completion_length": 328.40625, + "epoch": 0.18164435946462715, + "grad_norm": 12.763792037963867, + "kl": 0.062255859375, + "learning_rate": 8.183556405353728e-07, + "loss": 0.0025, + "reward": 1.6313751935958862, + "reward_std": 0.19210481643676758, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.30325016379356384, + "rewards/pad": 0.34375, + "step": 570 + }, + { + "completion_length": 365.875, + "epoch": 0.18196303377947737, + "grad_norm": 6.801854610443115, + "kl": 0.0517578125, + "learning_rate": 8.180369662205226e-07, + "loss": 0.0021, + "reward": 1.2952144145965576, + "reward_std": 0.05773142725229263, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.29521435499191284, + "rewards/pad": 0.0, + "step": 571 + }, + { + "completion_length": 301.984375, + "epoch": 0.1822817080943276, + "grad_norm": 9.7481689453125, + "kl": 0.0654296875, + "learning_rate": 8.177182919056723e-07, + "loss": 0.0026, + "reward": 1.4989292621612549, + "reward_std": 0.14893344044685364, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4364292621612549, + "step": 572 + }, + { + "completion_length": 313.953125, + "epoch": 0.1826003824091778, + "grad_norm": 5.001101016998291, + "kl": 0.058837890625, + "learning_rate": 8.173996175908221e-07, + "loss": 0.0024, + "reward": 1.522552728652954, + "reward_std": 0.06399345397949219, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5225528478622437, + "rewards/pad": 0.0, + "step": 573 + }, + { + "completion_length": 324.203125, + "epoch": 0.18291905672402806, + "grad_norm": 6.752080917358398, + "kl": 0.058349609375, + "learning_rate": 8.170809432759719e-07, + "loss": 0.0023, + "reward": 1.5995622873306274, + "reward_std": 0.14907418191432953, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5370622277259827, + "rewards/pad": 0.078125, + "step": 574 + }, + { + "completion_length": 303.546875, + "epoch": 0.18323773103887828, + "grad_norm": 6.67637300491333, + "kl": 0.07080078125, + "learning_rate": 8.167622689611217e-07, + "loss": 0.0028, + "reward": 1.6408474445343018, + "reward_std": 0.10487158596515656, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.422097384929657, + "rewards/pad": 0.21875, + "step": 575 + }, + { + "completion_length": 222.40625, + "epoch": 0.1835564053537285, + "grad_norm": 7.407890796661377, + "kl": 0.08544921875, + "learning_rate": 8.164435946462714e-07, + "loss": 0.0034, + "reward": 1.5350260734558105, + "reward_std": 0.12564942240715027, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3944009840488434, + "rewards/pad": 0.140625, + "step": 576 + }, + { + "completion_length": 247.796875, + "epoch": 0.18387507966857872, + "grad_norm": 9.383212089538574, + "kl": 0.091796875, + "learning_rate": 8.161249203314212e-07, + "loss": 0.0037, + "reward": 1.7426847219467163, + "reward_std": 0.1123775988817215, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5083097219467163, + "step": 577 + }, + { + "completion_length": 363.25, + "epoch": 0.18419375398342894, + "grad_norm": 17.369152069091797, + "kl": 0.0595703125, + "learning_rate": 8.15806246016571e-07, + "loss": 0.0024, + "reward": 1.4469183683395386, + "reward_std": 0.06069286912679672, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44691839814186096, + "step": 578 + }, + { + "completion_length": 197.15625, + "epoch": 0.18451242829827916, + "grad_norm": 10.02291488647461, + "kl": 0.10009765625, + "learning_rate": 8.154875717017208e-07, + "loss": 0.004, + "reward": 1.4997634887695312, + "reward_std": 0.14637836813926697, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4216384291648865, + "step": 579 + }, + { + "completion_length": 319.875, + "epoch": 0.18483110261312938, + "grad_norm": 5.547020435333252, + "kl": 0.0712890625, + "learning_rate": 8.151688973868705e-07, + "loss": 0.0029, + "reward": 1.5564733743667603, + "reward_std": 0.21002715826034546, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.587723433971405, + "rewards/pad": 0.0, + "step": 580 + }, + { + "completion_length": 228.546875, + "epoch": 0.1851497769279796, + "grad_norm": 7.936356544494629, + "kl": 0.0859375, + "learning_rate": 8.148502230720203e-07, + "loss": 0.0034, + "reward": 1.9707231521606445, + "reward_std": 0.17871692776679993, + "rewards/pad": 0.453125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5332232117652893, + "step": 581 + }, + { + "completion_length": 443.796875, + "epoch": 0.18546845124282982, + "grad_norm": 11.128722190856934, + "kl": 0.03466796875, + "learning_rate": 8.145315487571701e-07, + "loss": 0.0014, + "reward": 1.5804551839828491, + "reward_std": 0.06354731321334839, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5648301243782043, + "step": 582 + }, + { + "completion_length": 176.03125, + "epoch": 0.18578712555768004, + "grad_norm": 10.60787296295166, + "kl": 0.08935546875, + "learning_rate": 8.1421287444232e-07, + "loss": 0.0036, + "reward": 1.6280150413513184, + "reward_std": 0.07694578170776367, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5030151009559631, + "step": 583 + }, + { + "completion_length": 329.921875, + "epoch": 0.1861057998725303, + "grad_norm": 12.799588203430176, + "kl": 0.06494140625, + "learning_rate": 8.138942001274697e-07, + "loss": 0.0026, + "reward": 1.241806983947754, + "reward_std": 0.1604565978050232, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.24180711805820465, + "step": 584 + }, + { + "completion_length": 455.765625, + "epoch": 0.1864244741873805, + "grad_norm": 15.1790132522583, + "kl": 0.03515625, + "learning_rate": 8.135755258126195e-07, + "loss": 0.0014, + "reward": 1.3431535959243774, + "reward_std": 0.08880558609962463, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.29627859592437744, + "step": 585 + }, + { + "completion_length": 440.984375, + "epoch": 0.18674314850223073, + "grad_norm": 12.594864845275879, + "kl": 0.0341796875, + "learning_rate": 8.132568514977693e-07, + "loss": 0.0013, + "reward": 1.6465810537338257, + "reward_std": 0.024850212037563324, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5215809345245361, + "step": 586 + }, + { + "completion_length": 395.734375, + "epoch": 0.18706182281708095, + "grad_norm": 7.501439571380615, + "kl": 0.047119140625, + "learning_rate": 8.12938177182919e-07, + "loss": 0.0019, + "reward": 1.5232059955596924, + "reward_std": 0.06539130210876465, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5232060551643372, + "rewards/pad": 0.0, + "step": 587 + }, + { + "completion_length": 453.578125, + "epoch": 0.18738049713193117, + "grad_norm": 9.147506713867188, + "kl": 0.04443359375, + "learning_rate": 8.126195028680688e-07, + "loss": 0.0018, + "reward": 1.4919631481170654, + "reward_std": 0.17542997002601624, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5232131481170654, + "rewards/pad": 0.0, + "step": 588 + }, + { + "completion_length": 256.046875, + "epoch": 0.1876991714467814, + "grad_norm": 32.91367721557617, + "kl": 0.061279296875, + "learning_rate": 8.123008285532186e-07, + "loss": 0.0025, + "reward": 1.458876371383667, + "reward_std": 0.06437614560127258, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.333876371383667, + "rewards/pad": 0.125, + "step": 589 + }, + { + "completion_length": 372.671875, + "epoch": 0.1880178457616316, + "grad_norm": 7.198757648468018, + "kl": 0.05029296875, + "learning_rate": 8.119821542383684e-07, + "loss": 0.002, + "reward": 1.3763469457626343, + "reward_std": 0.02698800340294838, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3763468563556671, + "step": 590 + }, + { + "completion_length": 326.828125, + "epoch": 0.18833652007648183, + "grad_norm": 7.202263355255127, + "kl": 0.05810546875, + "learning_rate": 8.116634799235181e-07, + "loss": 0.0023, + "reward": 1.6532502174377441, + "reward_std": 0.10701927542686462, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5438752770423889, + "step": 591 + }, + { + "completion_length": 314.234375, + "epoch": 0.18865519439133205, + "grad_norm": 6.360630035400391, + "kl": 0.068359375, + "learning_rate": 8.113448056086679e-07, + "loss": 0.0027, + "reward": 1.6760071516036987, + "reward_std": 0.08180226385593414, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5510071516036987, + "rewards/pad": 0.125, + "step": 592 + }, + { + "completion_length": 357.375, + "epoch": 0.18897386870618227, + "grad_norm": 8.12671947479248, + "kl": 0.05615234375, + "learning_rate": 8.110261312938177e-07, + "loss": 0.0023, + "reward": 1.4480271339416504, + "reward_std": 0.1161215677857399, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4480271339416504, + "step": 593 + }, + { + "completion_length": 367.203125, + "epoch": 0.18929254302103252, + "grad_norm": 5.130788326263428, + "kl": 0.05322265625, + "learning_rate": 8.107074569789675e-07, + "loss": 0.0021, + "reward": 1.4342541694641113, + "reward_std": 0.1340075582265854, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34050410985946655, + "step": 594 + }, + { + "completion_length": 161.765625, + "epoch": 0.18961121733588274, + "grad_norm": 10.24941349029541, + "kl": 0.0908203125, + "learning_rate": 8.103887826641172e-07, + "loss": 0.0036, + "reward": 1.6754908561706543, + "reward_std": 0.12678924202919006, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5504908561706543, + "rewards/pad": 0.125, + "step": 595 + }, + { + "completion_length": 260.359375, + "epoch": 0.18992989165073296, + "grad_norm": 19.758913040161133, + "kl": 0.0810546875, + "learning_rate": 8.10070108349267e-07, + "loss": 0.0032, + "reward": 1.43495512008667, + "reward_std": 0.20263132452964783, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3568301796913147, + "rewards/pad": 0.09375, + "step": 596 + }, + { + "completion_length": 468.34375, + "epoch": 0.19024856596558318, + "grad_norm": 5.984434127807617, + "kl": 0.03515625, + "learning_rate": 8.097514340344168e-07, + "loss": 0.0014, + "reward": 1.4179304838180542, + "reward_std": 0.07944416999816895, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4335554540157318, + "step": 597 + }, + { + "completion_length": 187.96875, + "epoch": 0.1905672402804334, + "grad_norm": 79.40922546386719, + "kl": 0.080078125, + "learning_rate": 8.094327597195666e-07, + "loss": 0.0032, + "reward": 1.5468125343322754, + "reward_std": 0.17716550827026367, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4374375343322754, + "step": 598 + }, + { + "completion_length": 276.34375, + "epoch": 0.19088591459528362, + "grad_norm": 13.014153480529785, + "kl": 0.0654296875, + "learning_rate": 8.091140854047163e-07, + "loss": 0.0026, + "reward": 1.6321213245391846, + "reward_std": 0.18163417279720306, + "rewards/pad": 0.265625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36649632453918457, + "step": 599 + }, + { + "completion_length": 140.703125, + "epoch": 0.19120458891013384, + "grad_norm": 88.6712417602539, + "kl": 0.08154296875, + "learning_rate": 8.087954110898661e-07, + "loss": 0.0033, + "reward": 1.6089478731155396, + "reward_std": 0.1710633635520935, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39019784331321716, + "rewards/pad": 0.21875, + "step": 600 + }, + { + "completion_length": 283.28125, + "epoch": 0.19152326322498406, + "grad_norm": 8.6228609085083, + "kl": 0.0791015625, + "learning_rate": 8.084767367750159e-07, + "loss": 0.0032, + "reward": 1.5642704963684082, + "reward_std": 0.11657722294330597, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.454895555973053, + "step": 601 + }, + { + "completion_length": 319.125, + "epoch": 0.19184193753983428, + "grad_norm": 10.412154197692871, + "kl": 0.06591796875, + "learning_rate": 8.081580624601658e-07, + "loss": 0.0026, + "reward": 1.352163314819336, + "reward_std": 0.07580342888832092, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3521634042263031, + "rewards/pad": 0.0, + "step": 602 + }, + { + "completion_length": 278.3125, + "epoch": 0.1921606118546845, + "grad_norm": 7.997784614562988, + "kl": 0.060546875, + "learning_rate": 8.078393881453155e-07, + "loss": 0.0024, + "reward": 1.5723495483398438, + "reward_std": 0.10167841613292694, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5723496079444885, + "rewards/pad": 0.0, + "step": 603 + }, + { + "completion_length": 179.96875, + "epoch": 0.19247928616953475, + "grad_norm": 7.610307693481445, + "kl": 0.07666015625, + "learning_rate": 8.075207138304653e-07, + "loss": 0.0031, + "reward": 1.6623280048370361, + "reward_std": 0.1471826136112213, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41232794523239136, + "rewards/pad": 0.25, + "step": 604 + }, + { + "completion_length": 243.203125, + "epoch": 0.19279796048438497, + "grad_norm": 9.530278205871582, + "kl": 0.07275390625, + "learning_rate": 8.072020395156151e-07, + "loss": 0.0029, + "reward": 1.700823187828064, + "reward_std": 0.11023609340190887, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46644818782806396, + "step": 605 + }, + { + "completion_length": 267.640625, + "epoch": 0.1931166347992352, + "grad_norm": 4.225679397583008, + "kl": 0.076171875, + "learning_rate": 8.068833652007649e-07, + "loss": 0.003, + "reward": 1.633243441581726, + "reward_std": 0.1748502105474472, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4144933819770813, + "step": 606 + }, + { + "completion_length": 405.625, + "epoch": 0.1934353091140854, + "grad_norm": 16.529685974121094, + "kl": 0.051025390625, + "learning_rate": 8.065646908859146e-07, + "loss": 0.002, + "reward": 1.3954849243164062, + "reward_std": 0.08648538589477539, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2861098051071167, + "step": 607 + }, + { + "completion_length": 263.96875, + "epoch": 0.19375398342893563, + "grad_norm": 6.294305324554443, + "kl": 0.0732421875, + "learning_rate": 8.062460165710643e-07, + "loss": 0.0029, + "reward": 1.2127344608306885, + "reward_std": 0.0745549276471138, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.21273449063301086, + "step": 608 + }, + { + "completion_length": 229.03125, + "epoch": 0.19407265774378585, + "grad_norm": 12.357077598571777, + "kl": 0.068359375, + "learning_rate": 8.059273422562141e-07, + "loss": 0.0027, + "reward": 1.5320799350738525, + "reward_std": 0.0908544585108757, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5320799350738525, + "step": 609 + }, + { + "completion_length": 250.765625, + "epoch": 0.19439133205863607, + "grad_norm": 9.141507148742676, + "kl": 0.0732421875, + "learning_rate": 8.056086679413639e-07, + "loss": 0.0029, + "reward": 1.5585919618606567, + "reward_std": 0.1449308544397354, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5273419618606567, + "rewards/pad": 0.03125, + "step": 610 + }, + { + "completion_length": 320.9375, + "epoch": 0.1947100063734863, + "grad_norm": 4.861262798309326, + "kl": 0.06005859375, + "learning_rate": 8.052899936265136e-07, + "loss": 0.0024, + "reward": 1.5317165851593018, + "reward_std": 0.15812718868255615, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.46921658515930176, + "rewards/pad": 0.09375, + "step": 611 + }, + { + "completion_length": 268.875, + "epoch": 0.1950286806883365, + "grad_norm": 28.3754825592041, + "kl": 0.06884765625, + "learning_rate": 8.049713193116634e-07, + "loss": 0.0028, + "reward": 1.4912457466125488, + "reward_std": 0.04721307381987572, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49124568700790405, + "rewards/pad": 0.0, + "step": 612 + }, + { + "completion_length": 235.453125, + "epoch": 0.19534735500318676, + "grad_norm": 7.317533493041992, + "kl": 0.0771484375, + "learning_rate": 8.046526449968132e-07, + "loss": 0.0031, + "reward": 1.3803117275238037, + "reward_std": 0.11250771582126617, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39593687653541565, + "rewards/pad": 0.0, + "step": 613 + }, + { + "completion_length": 226.03125, + "epoch": 0.19566602931803698, + "grad_norm": 38.98457336425781, + "kl": 0.0703125, + "learning_rate": 8.043339706819629e-07, + "loss": 0.0028, + "reward": 1.6466970443725586, + "reward_std": 0.07787991315126419, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5216969847679138, + "step": 614 + }, + { + "completion_length": 232.640625, + "epoch": 0.1959847036328872, + "grad_norm": 9.064290046691895, + "kl": 0.072265625, + "learning_rate": 8.040152963671127e-07, + "loss": 0.0029, + "reward": 1.4208617210388184, + "reward_std": 0.2527421712875366, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.29586169123649597, + "step": 615 + }, + { + "completion_length": 233.796875, + "epoch": 0.19630337794773742, + "grad_norm": 5.284163951873779, + "kl": 0.08740234375, + "learning_rate": 8.036966220522625e-07, + "loss": 0.0035, + "reward": 1.4077630043029785, + "reward_std": 0.13147412240505219, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42338788509368896, + "rewards/pad": 0.0, + "step": 616 + }, + { + "completion_length": 284.546875, + "epoch": 0.19662205226258764, + "grad_norm": 25.7481632232666, + "kl": 0.072265625, + "learning_rate": 8.033779477374123e-07, + "loss": 0.0029, + "reward": 1.5234477519989014, + "reward_std": 0.060542307794094086, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5234477519989014, + "step": 617 + }, + { + "completion_length": 264.3125, + "epoch": 0.19694072657743786, + "grad_norm": 7.553581237792969, + "kl": 0.07275390625, + "learning_rate": 8.03059273422562e-07, + "loss": 0.0029, + "reward": 1.4492477178573608, + "reward_std": 0.09408283978700638, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4492477774620056, + "rewards/pad": 0.0, + "step": 618 + }, + { + "completion_length": 266.9375, + "epoch": 0.19725940089228808, + "grad_norm": 6.398976802825928, + "kl": 0.07958984375, + "learning_rate": 8.027405991077118e-07, + "loss": 0.0032, + "reward": 1.5198715925216675, + "reward_std": 0.15117725729942322, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5354965925216675, + "step": 619 + }, + { + "completion_length": 330.71875, + "epoch": 0.1975780752071383, + "grad_norm": 5.78820276260376, + "kl": 0.07568359375, + "learning_rate": 8.024219247928616e-07, + "loss": 0.003, + "reward": 1.3081625699996948, + "reward_std": 0.09341893345117569, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3081625699996948, + "rewards/pad": 0.0, + "step": 620 + }, + { + "completion_length": 389.359375, + "epoch": 0.19789674952198852, + "grad_norm": 13.41144847869873, + "kl": 0.055419921875, + "learning_rate": 8.021032504780114e-07, + "loss": 0.0022, + "reward": 1.5052798986434937, + "reward_std": 0.21025700867176056, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5209048390388489, + "rewards/pad": 0.015625, + "step": 621 + }, + { + "completion_length": 312.203125, + "epoch": 0.19821542383683874, + "grad_norm": 41.177154541015625, + "kl": 0.1015625, + "learning_rate": 8.017845761631612e-07, + "loss": 0.0041, + "reward": 1.566091537475586, + "reward_std": 0.1455710232257843, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5348414778709412, + "step": 622 + }, + { + "completion_length": 491.96875, + "epoch": 0.198534098151689, + "grad_norm": 2.699554920196533, + "kl": 0.037353515625, + "learning_rate": 8.01465901848311e-07, + "loss": 0.0015, + "reward": 1.5370073318481445, + "reward_std": 0.14047783613204956, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3182574212551117, + "step": 623 + }, + { + "completion_length": 431.703125, + "epoch": 0.1988527724665392, + "grad_norm": 5.324953556060791, + "kl": 0.0498046875, + "learning_rate": 8.011472275334608e-07, + "loss": 0.002, + "reward": 1.475203275680542, + "reward_std": 0.15671223402023315, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.3814533054828644, + "step": 624 + }, + { + "completion_length": 307.0625, + "epoch": 0.19917144678138943, + "grad_norm": 6.110250949859619, + "kl": 0.14453125, + "learning_rate": 8.008285532186106e-07, + "loss": 0.0058, + "reward": 1.4999936819076538, + "reward_std": 0.1042509377002716, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4999937415122986, + "step": 625 + }, + { + "completion_length": 255.578125, + "epoch": 0.19949012109623965, + "grad_norm": 10.814800262451172, + "kl": 0.07958984375, + "learning_rate": 8.005098789037603e-07, + "loss": 0.0032, + "reward": 1.4932688474655151, + "reward_std": 0.08080501109361649, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3682689070701599, + "step": 626 + }, + { + "completion_length": 157.046875, + "epoch": 0.19980879541108987, + "grad_norm": 13.435721397399902, + "kl": 0.1025390625, + "learning_rate": 8.001912045889101e-07, + "loss": 0.0041, + "reward": 1.4771438837051392, + "reward_std": 0.11090736836194992, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47714388370513916, + "rewards/pad": 0.0, + "step": 627 + }, + { + "completion_length": 341.984375, + "epoch": 0.2001274697259401, + "grad_norm": 7.098702430725098, + "kl": 0.0712890625, + "learning_rate": 7.998725302740599e-07, + "loss": 0.0028, + "reward": 1.6881647109985352, + "reward_std": 0.14802853763103485, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6256648302078247, + "step": 628 + }, + { + "completion_length": 205.40625, + "epoch": 0.2004461440407903, + "grad_norm": 8.152799606323242, + "kl": 0.08837890625, + "learning_rate": 7.995538559592097e-07, + "loss": 0.0035, + "reward": 1.4634058475494385, + "reward_std": 0.12990927696228027, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4321558475494385, + "rewards/pad": 0.03125, + "step": 629 + }, + { + "completion_length": 288.375, + "epoch": 0.20076481835564053, + "grad_norm": 7.801412582397461, + "kl": 0.07958984375, + "learning_rate": 7.992351816443594e-07, + "loss": 0.0032, + "reward": 1.6792851686477661, + "reward_std": 0.12204693257808685, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3199102282524109, + "rewards/pad": 0.359375, + "step": 630 + }, + { + "completion_length": 332.359375, + "epoch": 0.20108349267049075, + "grad_norm": 56.13440704345703, + "kl": 0.08154296875, + "learning_rate": 7.989165073295092e-07, + "loss": 0.0033, + "reward": 1.4205741882324219, + "reward_std": 0.10428917407989502, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42057424783706665, + "step": 631 + }, + { + "completion_length": 342.0625, + "epoch": 0.20140216698534097, + "grad_norm": 48.81986618041992, + "kl": 0.0830078125, + "learning_rate": 7.98597833014659e-07, + "loss": 0.0033, + "reward": 1.4643616676330566, + "reward_std": 0.1833011955022812, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4956117570400238, + "step": 632 + }, + { + "completion_length": 326.984375, + "epoch": 0.20172084130019122, + "grad_norm": 11.626622200012207, + "kl": 0.06884765625, + "learning_rate": 7.982791586998088e-07, + "loss": 0.0027, + "reward": 1.444153904914856, + "reward_std": 0.10968662798404694, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44415396451950073, + "rewards/pad": 0.0, + "step": 633 + }, + { + "completion_length": 498.578125, + "epoch": 0.20203951561504144, + "grad_norm": 9.971516609191895, + "kl": 0.061767578125, + "learning_rate": 7.979604843849585e-07, + "loss": 0.0025, + "reward": 1.4112811088562012, + "reward_std": 0.22392134368419647, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.45815610885620117, + "rewards/pad": 0.0, + "step": 634 + }, + { + "completion_length": 241.34375, + "epoch": 0.20235818992989166, + "grad_norm": 4.857548713684082, + "kl": 0.08447265625, + "learning_rate": 7.976418100701083e-07, + "loss": 0.0034, + "reward": 1.5090776681900024, + "reward_std": 0.2434341311454773, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.36845266819000244, + "rewards/pad": 0.171875, + "step": 635 + }, + { + "completion_length": 480.46875, + "epoch": 0.20267686424474188, + "grad_norm": 11.361845970153809, + "kl": 0.053955078125, + "learning_rate": 7.973231357552581e-07, + "loss": 0.0022, + "reward": 1.4582862854003906, + "reward_std": 0.056097157299518585, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45828622579574585, + "rewards/pad": 0.0, + "step": 636 + }, + { + "completion_length": 309.1875, + "epoch": 0.2029955385595921, + "grad_norm": 17.00067901611328, + "kl": 0.07275390625, + "learning_rate": 7.970044614404079e-07, + "loss": 0.0029, + "reward": 1.5798858404159546, + "reward_std": 0.09062710404396057, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5798859000205994, + "rewards/pad": 0.0, + "step": 637 + }, + { + "completion_length": 386.1875, + "epoch": 0.20331421287444232, + "grad_norm": 5.623465538024902, + "kl": 0.05810546875, + "learning_rate": 7.966857871255576e-07, + "loss": 0.0023, + "reward": 1.492444396018982, + "reward_std": 0.04644552618265152, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49244433641433716, + "step": 638 + }, + { + "completion_length": 340.109375, + "epoch": 0.20363288718929254, + "grad_norm": 6.998098373413086, + "kl": 0.0693359375, + "learning_rate": 7.963671128107074e-07, + "loss": 0.0028, + "reward": 1.4608345031738281, + "reward_std": 0.11724086850881577, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4764595329761505, + "rewards/pad": 0.0, + "step": 639 + }, + { + "completion_length": 251.234375, + "epoch": 0.20395156150414276, + "grad_norm": 7.686694145202637, + "kl": 0.103515625, + "learning_rate": 7.960484384958573e-07, + "loss": 0.0041, + "reward": 1.6384403705596924, + "reward_std": 0.12739157676696777, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5290653705596924, + "rewards/pad": 0.125, + "step": 640 + }, + { + "completion_length": 280.984375, + "epoch": 0.20427023581899298, + "grad_norm": 9.52418327331543, + "kl": 0.07958984375, + "learning_rate": 7.957297641810071e-07, + "loss": 0.0032, + "reward": 1.3519306182861328, + "reward_std": 0.13024069368839264, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35193055868148804, + "rewards/pad": 0.0, + "step": 641 + }, + { + "completion_length": 382.640625, + "epoch": 0.2045889101338432, + "grad_norm": 6.8839874267578125, + "kl": 0.0859375, + "learning_rate": 7.954110898661568e-07, + "loss": 0.0034, + "reward": 1.5839730501174927, + "reward_std": 0.13091622292995453, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5995981097221375, + "rewards/pad": 0.0, + "step": 642 + }, + { + "completion_length": 219.859375, + "epoch": 0.20490758444869345, + "grad_norm": 8.28393840789795, + "kl": 0.08984375, + "learning_rate": 7.950924155513066e-07, + "loss": 0.0036, + "reward": 1.5429143905639648, + "reward_std": 0.17061397433280945, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5741643905639648, + "rewards/pad": 0.0, + "step": 643 + }, + { + "completion_length": 446.3125, + "epoch": 0.20522625876354367, + "grad_norm": 7.347504615783691, + "kl": 0.044189453125, + "learning_rate": 7.947737412364564e-07, + "loss": 0.0018, + "reward": 1.511324405670166, + "reward_std": 0.14396220445632935, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.417574405670166, + "step": 644 + }, + { + "completion_length": 311.671875, + "epoch": 0.2055449330783939, + "grad_norm": 11.247817039489746, + "kl": 0.06982421875, + "learning_rate": 7.944550669216062e-07, + "loss": 0.0028, + "reward": 1.5525568723678589, + "reward_std": 0.1326773762702942, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5681818127632141, + "rewards/pad": 0.0, + "step": 645 + }, + { + "completion_length": 325.859375, + "epoch": 0.2058636073932441, + "grad_norm": 14.660234451293945, + "kl": 0.07470703125, + "learning_rate": 7.941363926067559e-07, + "loss": 0.003, + "reward": 1.2435545921325684, + "reward_std": 0.06781746447086334, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.25917962193489075, + "rewards/pad": 0.0, + "step": 646 + }, + { + "completion_length": 291.4375, + "epoch": 0.20618228170809433, + "grad_norm": 4.295619964599609, + "kl": 0.056884765625, + "learning_rate": 7.938177182919057e-07, + "loss": 0.0023, + "reward": 1.562973976135254, + "reward_std": 0.0560150146484375, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4379739761352539, + "rewards/pad": 0.125, + "step": 647 + }, + { + "completion_length": 393.671875, + "epoch": 0.20650095602294455, + "grad_norm": 8.650598526000977, + "kl": 0.048583984375, + "learning_rate": 7.934990439770554e-07, + "loss": 0.0019, + "reward": 1.2792221307754517, + "reward_std": 0.04953842982649803, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.27922216057777405, + "step": 648 + }, + { + "completion_length": 247.828125, + "epoch": 0.20681963033779477, + "grad_norm": 14.719244003295898, + "kl": 0.0859375, + "learning_rate": 7.931803696622051e-07, + "loss": 0.0034, + "reward": 1.6231489181518555, + "reward_std": 0.13309356570243835, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4043988883495331, + "rewards/pad": 0.21875, + "step": 649 + }, + { + "completion_length": 289.953125, + "epoch": 0.207138304652645, + "grad_norm": 10.855441093444824, + "kl": 0.072265625, + "learning_rate": 7.928616953473549e-07, + "loss": 0.0029, + "reward": 1.5013923645019531, + "reward_std": 0.11773170530796051, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40764227509498596, + "rewards/pad": 0.109375, + "step": 650 + }, + { + "completion_length": 231.125, + "epoch": 0.2074569789674952, + "grad_norm": 9.067728042602539, + "kl": 0.07177734375, + "learning_rate": 7.925430210325047e-07, + "loss": 0.0029, + "reward": 1.5861713886260986, + "reward_std": 0.12338827550411224, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5392964482307434, + "rewards/pad": 0.046875, + "step": 651 + }, + { + "completion_length": 306.40625, + "epoch": 0.20777565328234543, + "grad_norm": 6.772564888000488, + "kl": 0.06689453125, + "learning_rate": 7.922243467176545e-07, + "loss": 0.0027, + "reward": 1.4238431453704834, + "reward_std": 0.08837040513753891, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4082180857658386, + "rewards/pad": 0.015625, + "step": 652 + }, + { + "completion_length": 296.8125, + "epoch": 0.20809432759719568, + "grad_norm": 6.618319034576416, + "kl": 0.06494140625, + "learning_rate": 7.919056724028042e-07, + "loss": 0.0026, + "reward": 1.3240604400634766, + "reward_std": 0.1518493890762329, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3396855592727661, + "rewards/pad": 0.0, + "step": 653 + }, + { + "completion_length": 300.421875, + "epoch": 0.2084130019120459, + "grad_norm": 11.761967658996582, + "kl": 0.07666015625, + "learning_rate": 7.91586998087954e-07, + "loss": 0.0031, + "reward": 1.5044121742248535, + "reward_std": 0.06377843022346497, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5044121742248535, + "rewards/pad": 0.0, + "step": 654 + }, + { + "completion_length": 414.296875, + "epoch": 0.20873167622689612, + "grad_norm": 14.195477485656738, + "kl": 0.08642578125, + "learning_rate": 7.912683237731038e-07, + "loss": 0.0035, + "reward": 1.5149987936019897, + "reward_std": 0.07117429375648499, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.514998733997345, + "step": 655 + }, + { + "completion_length": 234.09375, + "epoch": 0.20905035054174634, + "grad_norm": 11.114954948425293, + "kl": 0.07666015625, + "learning_rate": 7.909496494582536e-07, + "loss": 0.0031, + "reward": 1.4249340295791626, + "reward_std": 0.1590331494808197, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.424934059381485, + "rewards/pad": 0.015625, + "step": 656 + }, + { + "completion_length": 388.84375, + "epoch": 0.20936902485659656, + "grad_norm": 4.132210731506348, + "kl": 0.06494140625, + "learning_rate": 7.906309751434033e-07, + "loss": 0.0026, + "reward": 1.378748893737793, + "reward_std": 0.2675632834434509, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.39437395334243774, + "rewards/pad": 0.03125, + "step": 657 + }, + { + "completion_length": 257.328125, + "epoch": 0.20968769917144678, + "grad_norm": 8.155116081237793, + "kl": 0.080078125, + "learning_rate": 7.903123008285531e-07, + "loss": 0.0032, + "reward": 1.4638173580169678, + "reward_std": 0.17092521488666534, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.29194238781929016, + "rewards/pad": 0.1875, + "step": 658 + }, + { + "completion_length": 364.8125, + "epoch": 0.210006373486297, + "grad_norm": 8.514001846313477, + "kl": 0.06884765625, + "learning_rate": 7.89993626513703e-07, + "loss": 0.0028, + "reward": 1.4162960052490234, + "reward_std": 0.055003244429826736, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4162960648536682, + "rewards/pad": 0.0, + "step": 659 + }, + { + "completion_length": 302.5625, + "epoch": 0.21032504780114722, + "grad_norm": 5.264654159545898, + "kl": 0.07421875, + "learning_rate": 7.896749521988528e-07, + "loss": 0.003, + "reward": 1.4601562023162842, + "reward_std": 0.07050922513008118, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46015623211860657, + "rewards/pad": 0.0, + "step": 660 + }, + { + "completion_length": 300.1875, + "epoch": 0.21064372211599744, + "grad_norm": 4.894284248352051, + "kl": 0.06982421875, + "learning_rate": 7.893562778840025e-07, + "loss": 0.0028, + "reward": 1.594339370727539, + "reward_std": 0.08459107577800751, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5787143707275391, + "rewards/pad": 0.015625, + "step": 661 + }, + { + "completion_length": 361.890625, + "epoch": 0.2109623964308477, + "grad_norm": 27.475067138671875, + "kl": 0.0517578125, + "learning_rate": 7.890376035691523e-07, + "loss": 0.0021, + "reward": 1.4996387958526611, + "reward_std": 0.07242294400930405, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3746388852596283, + "rewards/pad": 0.125, + "step": 662 + }, + { + "completion_length": 232.5625, + "epoch": 0.2112810707456979, + "grad_norm": 5.580474853515625, + "kl": 0.08251953125, + "learning_rate": 7.887189292543021e-07, + "loss": 0.0033, + "reward": 1.6235365867614746, + "reward_std": 0.08368790149688721, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6235365867614746, + "rewards/pad": 0.0, + "step": 663 + }, + { + "completion_length": 273.9375, + "epoch": 0.21159974506054813, + "grad_norm": 6.202267169952393, + "kl": 0.0751953125, + "learning_rate": 7.884002549394519e-07, + "loss": 0.003, + "reward": 1.5605616569519043, + "reward_std": 0.07850479334592819, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5605616569519043, + "rewards/pad": 0.0, + "step": 664 + }, + { + "completion_length": 287.671875, + "epoch": 0.21191841937539835, + "grad_norm": 14.803051948547363, + "kl": 0.0673828125, + "learning_rate": 7.880815806246016e-07, + "loss": 0.0027, + "reward": 1.4791467189788818, + "reward_std": 0.19891056418418884, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3697718381881714, + "step": 665 + }, + { + "completion_length": 361.96875, + "epoch": 0.21223709369024857, + "grad_norm": 6.08893346786499, + "kl": 0.051025390625, + "learning_rate": 7.877629063097514e-07, + "loss": 0.002, + "reward": 1.542660117149353, + "reward_std": 0.10150934010744095, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.433285117149353, + "step": 666 + }, + { + "completion_length": 272.828125, + "epoch": 0.2125557680050988, + "grad_norm": 10.113510131835938, + "kl": 0.10400390625, + "learning_rate": 7.874442319949012e-07, + "loss": 0.0042, + "reward": 1.5347037315368652, + "reward_std": 0.05837291479110718, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5347037315368652, + "rewards/pad": 0.0, + "step": 667 + }, + { + "completion_length": 348.390625, + "epoch": 0.212874442319949, + "grad_norm": 6.178546905517578, + "kl": 0.052978515625, + "learning_rate": 7.87125557680051e-07, + "loss": 0.0021, + "reward": 1.5060725212097168, + "reward_std": 0.1880113184452057, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.365447461605072, + "step": 668 + }, + { + "completion_length": 383.359375, + "epoch": 0.21319311663479923, + "grad_norm": 10.284262657165527, + "kl": 0.055908203125, + "learning_rate": 7.868068833652007e-07, + "loss": 0.0022, + "reward": 1.4481096267700195, + "reward_std": 0.109918974339962, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46373462677001953, + "step": 669 + }, + { + "completion_length": 283.953125, + "epoch": 0.21351179094964945, + "grad_norm": 17.789011001586914, + "kl": 0.076171875, + "learning_rate": 7.864882090503505e-07, + "loss": 0.0031, + "reward": 1.293283224105835, + "reward_std": 0.12071557343006134, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.30890828371047974, + "rewards/pad": 0.0, + "step": 670 + }, + { + "completion_length": 225.4375, + "epoch": 0.21383046526449967, + "grad_norm": 13.195700645446777, + "kl": 0.08544921875, + "learning_rate": 7.861695347355003e-07, + "loss": 0.0034, + "reward": 1.5755658149719238, + "reward_std": 0.10046328604221344, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5755658745765686, + "rewards/pad": 0.0, + "step": 671 + }, + { + "completion_length": 261.46875, + "epoch": 0.21414913957934992, + "grad_norm": 5.178892135620117, + "kl": 0.0654296875, + "learning_rate": 7.858508604206501e-07, + "loss": 0.0026, + "reward": 1.61923086643219, + "reward_std": 0.06453816592693329, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2442309409379959, + "step": 672 + }, + { + "completion_length": 316.890625, + "epoch": 0.21446781389420014, + "grad_norm": 12.810659408569336, + "kl": 0.06640625, + "learning_rate": 7.855321861057998e-07, + "loss": 0.0027, + "reward": 1.6415035724639893, + "reward_std": 0.046421363949775696, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5165035724639893, + "step": 673 + }, + { + "completion_length": 327.6875, + "epoch": 0.21478648820905036, + "grad_norm": 13.431648254394531, + "kl": 0.064453125, + "learning_rate": 7.852135117909496e-07, + "loss": 0.0026, + "reward": 1.3542231321334839, + "reward_std": 0.10205789655447006, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33859819173812866, + "step": 674 + }, + { + "completion_length": 278.578125, + "epoch": 0.21510516252390058, + "grad_norm": 7.597766399383545, + "kl": 0.07958984375, + "learning_rate": 7.848948374760994e-07, + "loss": 0.0032, + "reward": 1.4646666049957275, + "reward_std": 0.26397255063056946, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.44904154539108276, + "step": 675 + }, + { + "completion_length": 238.296875, + "epoch": 0.2154238368387508, + "grad_norm": 25.516254425048828, + "kl": 0.076171875, + "learning_rate": 7.845761631612492e-07, + "loss": 0.003, + "reward": 1.469029188156128, + "reward_std": 0.21218442916870117, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39090412855148315, + "rewards/pad": 0.09375, + "step": 676 + }, + { + "completion_length": 239.734375, + "epoch": 0.21574251115360102, + "grad_norm": 5.068884372711182, + "kl": 0.087890625, + "learning_rate": 7.842574888463989e-07, + "loss": 0.0035, + "reward": 1.4459609985351562, + "reward_std": 0.10616542398929596, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3678360879421234, + "step": 677 + }, + { + "completion_length": 303.71875, + "epoch": 0.21606118546845124, + "grad_norm": 6.922590255737305, + "kl": 0.07177734375, + "learning_rate": 7.839388145315488e-07, + "loss": 0.0029, + "reward": 1.3253346681594849, + "reward_std": 0.08155903965234756, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34095969796180725, + "step": 678 + }, + { + "completion_length": 164.375, + "epoch": 0.21637985978330146, + "grad_norm": 12.585428237915039, + "kl": 0.119140625, + "learning_rate": 7.836201402166986e-07, + "loss": 0.0048, + "reward": 1.6467983722686768, + "reward_std": 0.15064901113510132, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4124232530593872, + "rewards/pad": 0.234375, + "step": 679 + }, + { + "completion_length": 330.078125, + "epoch": 0.21669853409815168, + "grad_norm": 18.918582916259766, + "kl": 0.06396484375, + "learning_rate": 7.833014659018483e-07, + "loss": 0.0026, + "reward": 1.3851571083068848, + "reward_std": 0.1387689709663391, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.30703213810920715, + "rewards/pad": 0.109375, + "step": 680 + }, + { + "completion_length": 298.328125, + "epoch": 0.2170172084130019, + "grad_norm": 8.898725509643555, + "kl": 0.0751953125, + "learning_rate": 7.829827915869981e-07, + "loss": 0.003, + "reward": 1.3700380325317383, + "reward_std": 0.06074301898479462, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3700379729270935, + "step": 681 + }, + { + "completion_length": 322.640625, + "epoch": 0.21733588272785215, + "grad_norm": 14.546353340148926, + "kl": 0.064453125, + "learning_rate": 7.826641172721479e-07, + "loss": 0.0026, + "reward": 1.4926016330718994, + "reward_std": 0.08959314972162247, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.367601603269577, + "rewards/pad": 0.125, + "step": 682 + }, + { + "completion_length": 256.671875, + "epoch": 0.21765455704270237, + "grad_norm": 5.957823753356934, + "kl": 0.08056640625, + "learning_rate": 7.823454429572977e-07, + "loss": 0.0032, + "reward": 1.5421984195709229, + "reward_std": 0.08622148633003235, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5421984791755676, + "step": 683 + }, + { + "completion_length": 307.234375, + "epoch": 0.2179732313575526, + "grad_norm": 6.484907150268555, + "kl": 0.09033203125, + "learning_rate": 7.820267686424474e-07, + "loss": 0.0036, + "reward": 1.4343141317367554, + "reward_std": 0.09127238392829895, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44993916153907776, + "rewards/pad": 0.0, + "step": 684 + }, + { + "completion_length": 377.796875, + "epoch": 0.2182919056724028, + "grad_norm": 4.331064701080322, + "kl": 0.053955078125, + "learning_rate": 7.817080943275972e-07, + "loss": 0.0022, + "reward": 1.450018286705017, + "reward_std": 0.07817307859659195, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.29376834630966187, + "step": 685 + }, + { + "completion_length": 259.765625, + "epoch": 0.21861057998725303, + "grad_norm": 7.207475185394287, + "kl": 0.07958984375, + "learning_rate": 7.81389420012747e-07, + "loss": 0.0032, + "reward": 1.5709216594696045, + "reward_std": 0.10550111532211304, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46154657006263733, + "rewards/pad": 0.109375, + "step": 686 + }, + { + "completion_length": 300.4375, + "epoch": 0.21892925430210325, + "grad_norm": 34.08000564575195, + "kl": 0.09228515625, + "learning_rate": 7.810707456978967e-07, + "loss": 0.0037, + "reward": 1.5559680461883545, + "reward_std": 0.1384851634502411, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5559679269790649, + "rewards/pad": 0.015625, + "step": 687 + }, + { + "completion_length": 272.15625, + "epoch": 0.21924792861695347, + "grad_norm": 8.619714736938477, + "kl": 0.0791015625, + "learning_rate": 7.807520713830464e-07, + "loss": 0.0032, + "reward": 1.5306808948516846, + "reward_std": 0.11045212298631668, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4681808650493622, + "rewards/pad": 0.0625, + "step": 688 + }, + { + "completion_length": 356.859375, + "epoch": 0.2195666029318037, + "grad_norm": 13.284117698669434, + "kl": 0.1142578125, + "learning_rate": 7.804333970681962e-07, + "loss": 0.0046, + "reward": 1.48612642288208, + "reward_std": 0.10852101445198059, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3923763036727905, + "step": 689 + }, + { + "completion_length": 257.390625, + "epoch": 0.2198852772466539, + "grad_norm": 15.725584983825684, + "kl": 0.072265625, + "learning_rate": 7.80114722753346e-07, + "loss": 0.0029, + "reward": 1.5707192420959473, + "reward_std": 0.18728512525558472, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4769693613052368, + "rewards/pad": 0.09375, + "step": 690 + }, + { + "completion_length": 204.015625, + "epoch": 0.22020395156150413, + "grad_norm": 16.46550941467285, + "kl": 0.09814453125, + "learning_rate": 7.797960484384958e-07, + "loss": 0.0039, + "reward": 1.4956278800964355, + "reward_std": 0.06200110539793968, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49562788009643555, + "step": 691 + }, + { + "completion_length": 253.015625, + "epoch": 0.22052262587635438, + "grad_norm": 10.900519371032715, + "kl": 0.09375, + "learning_rate": 7.794773741236455e-07, + "loss": 0.0037, + "reward": 1.4226266145706177, + "reward_std": 0.08624820411205292, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42262664437294006, + "rewards/pad": 0.0, + "step": 692 + }, + { + "completion_length": 255.9375, + "epoch": 0.2208413001912046, + "grad_norm": 16.583126068115234, + "kl": 0.08544921875, + "learning_rate": 7.791586998087953e-07, + "loss": 0.0034, + "reward": 1.4481335878372192, + "reward_std": 0.09301026910543442, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44813355803489685, + "rewards/pad": 0.0, + "step": 693 + }, + { + "completion_length": 377.390625, + "epoch": 0.22115997450605482, + "grad_norm": 4.421432018280029, + "kl": 0.0556640625, + "learning_rate": 7.788400254939451e-07, + "loss": 0.0022, + "reward": 1.5260976552963257, + "reward_std": 0.12610724568367004, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5417227149009705, + "step": 694 + }, + { + "completion_length": 397.59375, + "epoch": 0.22147864882090504, + "grad_norm": 6.189242362976074, + "kl": 0.0537109375, + "learning_rate": 7.785213511790949e-07, + "loss": 0.0021, + "reward": 1.4347798824310303, + "reward_std": 0.09408040344715118, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4035297632217407, + "step": 695 + }, + { + "completion_length": 166.515625, + "epoch": 0.22179732313575526, + "grad_norm": 16.723281860351562, + "kl": 0.08984375, + "learning_rate": 7.782026768642446e-07, + "loss": 0.0036, + "reward": 1.554368495941162, + "reward_std": 0.2356569468975067, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.28874337673187256, + "rewards/pad": 0.296875, + "step": 696 + }, + { + "completion_length": 258.390625, + "epoch": 0.22211599745060548, + "grad_norm": 9.294581413269043, + "kl": 0.06884765625, + "learning_rate": 7.778840025493945e-07, + "loss": 0.0028, + "reward": 1.6144590377807617, + "reward_std": 0.06388699263334274, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48945894837379456, + "step": 697 + }, + { + "completion_length": 345.328125, + "epoch": 0.2224346717654557, + "grad_norm": 9.66220760345459, + "kl": 0.05859375, + "learning_rate": 7.775653282345443e-07, + "loss": 0.0023, + "reward": 1.3565130233764648, + "reward_std": 0.18195012211799622, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3877629339694977, + "step": 698 + }, + { + "completion_length": 250.3125, + "epoch": 0.22275334608030592, + "grad_norm": 6.309233665466309, + "kl": 0.0869140625, + "learning_rate": 7.772466539196941e-07, + "loss": 0.0035, + "reward": 1.4840912818908691, + "reward_std": 0.10586719214916229, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37471622228622437, + "step": 699 + }, + { + "completion_length": 351.03125, + "epoch": 0.22307202039515614, + "grad_norm": 21.303956985473633, + "kl": 0.060546875, + "learning_rate": 7.769279796048438e-07, + "loss": 0.0024, + "reward": 1.451223611831665, + "reward_std": 0.12037895619869232, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46684861183166504, + "rewards/pad": 0.0, + "step": 700 + }, + { + "completion_length": 304.6875, + "epoch": 0.22339069471000636, + "grad_norm": 11.189263343811035, + "kl": 0.0693359375, + "learning_rate": 7.766093052899936e-07, + "loss": 0.0028, + "reward": 1.632739543914795, + "reward_std": 0.13571488857269287, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5233644247055054, + "rewards/pad": 0.125, + "step": 701 + }, + { + "completion_length": 203.375, + "epoch": 0.2237093690248566, + "grad_norm": 20.83745002746582, + "kl": 0.09619140625, + "learning_rate": 7.762906309751434e-07, + "loss": 0.0039, + "reward": 1.6446971893310547, + "reward_std": 0.117695152759552, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6446971893310547, + "rewards/pad": 0.0, + "step": 702 + }, + { + "completion_length": 168.8125, + "epoch": 0.22402804333970683, + "grad_norm": 10.630196571350098, + "kl": 0.091796875, + "learning_rate": 7.759719566602932e-07, + "loss": 0.0037, + "reward": 1.5714305639266968, + "reward_std": 0.21625912189483643, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.46205565333366394, + "step": 703 + }, + { + "completion_length": 352.703125, + "epoch": 0.22434671765455705, + "grad_norm": 13.083415985107422, + "kl": 0.052978515625, + "learning_rate": 7.756532823454429e-07, + "loss": 0.0021, + "reward": 1.4256035089492798, + "reward_std": 0.0655084028840065, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3006035089492798, + "rewards/pad": 0.125, + "step": 704 + }, + { + "completion_length": 321.890625, + "epoch": 0.22466539196940727, + "grad_norm": 8.3506441116333, + "kl": 0.07470703125, + "learning_rate": 7.753346080305927e-07, + "loss": 0.003, + "reward": 1.4344754219055176, + "reward_std": 0.13548129796981812, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4501004219055176, + "rewards/pad": 0.0, + "step": 705 + }, + { + "completion_length": 122.609375, + "epoch": 0.2249840662842575, + "grad_norm": 23.514019012451172, + "kl": 0.0986328125, + "learning_rate": 7.750159337157425e-07, + "loss": 0.0039, + "reward": 1.5593478679656982, + "reward_std": 0.3233591616153717, + "rewards/answer_reward": 0.1875, + "rewards/format_reward_gqa": 0.9375, + "rewards/iou_glue_reward": 0.434347927570343, + "step": 706 + }, + { + "completion_length": 261.96875, + "epoch": 0.2253027405991077, + "grad_norm": 14.593097686767578, + "kl": 0.080078125, + "learning_rate": 7.746972594008923e-07, + "loss": 0.0032, + "reward": 1.6064164638519287, + "reward_std": 0.2630051374435425, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4657914638519287, + "rewards/pad": 0.15625, + "step": 707 + }, + { + "completion_length": 269.46875, + "epoch": 0.22562141491395793, + "grad_norm": 6.744601726531982, + "kl": 0.08642578125, + "learning_rate": 7.74378585086042e-07, + "loss": 0.0034, + "reward": 1.5797683000564575, + "reward_std": 0.20066973567008972, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4860183000564575, + "rewards/pad": 0.109375, + "step": 708 + }, + { + "completion_length": 151.21875, + "epoch": 0.22594008922880815, + "grad_norm": 8.127285957336426, + "kl": 0.11083984375, + "learning_rate": 7.740599107711918e-07, + "loss": 0.0044, + "reward": 1.563246250152588, + "reward_std": 0.17007926106452942, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4694961905479431, + "step": 709 + }, + { + "completion_length": 189.953125, + "epoch": 0.22625876354365837, + "grad_norm": 4.988349914550781, + "kl": 0.0927734375, + "learning_rate": 7.737412364563416e-07, + "loss": 0.0037, + "reward": 1.4685695171356201, + "reward_std": 0.13317400217056274, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4841945171356201, + "step": 710 + }, + { + "completion_length": 166.859375, + "epoch": 0.22657743785850862, + "grad_norm": 18.31987190246582, + "kl": 0.0888671875, + "learning_rate": 7.734225621414913e-07, + "loss": 0.0035, + "reward": 1.537738561630249, + "reward_std": 0.1872149109840393, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.381488561630249, + "step": 711 + }, + { + "completion_length": 273.859375, + "epoch": 0.22689611217335884, + "grad_norm": 6.696666717529297, + "kl": 0.0732421875, + "learning_rate": 7.731038878266411e-07, + "loss": 0.0029, + "reward": 1.527390718460083, + "reward_std": 0.16815558075904846, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5586407780647278, + "step": 712 + }, + { + "completion_length": 287.9375, + "epoch": 0.22721478648820906, + "grad_norm": 13.334807395935059, + "kl": 0.06494140625, + "learning_rate": 7.727852135117909e-07, + "loss": 0.0026, + "reward": 1.5138880014419556, + "reward_std": 0.11974107474088669, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4201379418373108, + "rewards/pad": 0.109375, + "step": 713 + }, + { + "completion_length": 256.140625, + "epoch": 0.22753346080305928, + "grad_norm": 9.865249633789062, + "kl": 0.06982421875, + "learning_rate": 7.724665391969407e-07, + "loss": 0.0028, + "reward": 1.3564677238464355, + "reward_std": 0.1476736217737198, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.26271772384643555, + "rewards/pad": 0.109375, + "step": 714 + }, + { + "completion_length": 207.171875, + "epoch": 0.2278521351179095, + "grad_norm": 15.958372116088867, + "kl": 0.109375, + "learning_rate": 7.721478648820904e-07, + "loss": 0.0044, + "reward": 1.476958155632019, + "reward_std": 0.1478617787361145, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49258315563201904, + "rewards/pad": 0.0, + "step": 715 + }, + { + "completion_length": 367.640625, + "epoch": 0.22817080943275972, + "grad_norm": 15.024460792541504, + "kl": 0.038818359375, + "learning_rate": 7.718291905672403e-07, + "loss": 0.0016, + "reward": 1.4593737125396729, + "reward_std": 0.17769308388233185, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.42812371253967285, + "step": 716 + }, + { + "completion_length": 244.96875, + "epoch": 0.22848948374760994, + "grad_norm": 13.350499153137207, + "kl": 0.07958984375, + "learning_rate": 7.715105162523901e-07, + "loss": 0.0032, + "reward": 1.5930790901184082, + "reward_std": 0.13378813862800598, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48370397090911865, + "rewards/pad": 0.125, + "step": 717 + }, + { + "completion_length": 182.25, + "epoch": 0.22880815806246016, + "grad_norm": 37.75197219848633, + "kl": 0.09619140625, + "learning_rate": 7.711918419375399e-07, + "loss": 0.0038, + "reward": 1.5131914615631104, + "reward_std": 0.22410151362419128, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5131914615631104, + "step": 718 + }, + { + "completion_length": 186.75, + "epoch": 0.22912683237731038, + "grad_norm": 25.390933990478516, + "kl": 0.10595703125, + "learning_rate": 7.708731676226896e-07, + "loss": 0.0042, + "reward": 1.4066598415374756, + "reward_std": 0.12678267061710358, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4222848117351532, + "rewards/pad": 0.0, + "step": 719 + }, + { + "completion_length": 166.078125, + "epoch": 0.2294455066921606, + "grad_norm": 20.202503204345703, + "kl": 0.1025390625, + "learning_rate": 7.705544933078394e-07, + "loss": 0.0041, + "reward": 1.506861925125122, + "reward_std": 0.12078607827425003, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5068618059158325, + "step": 720 + }, + { + "completion_length": 233.359375, + "epoch": 0.22976418100701085, + "grad_norm": 24.11056900024414, + "kl": 0.078125, + "learning_rate": 7.702358189929892e-07, + "loss": 0.0031, + "reward": 1.4882006645202637, + "reward_std": 0.13335324823856354, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5038256049156189, + "rewards/pad": 0.0, + "step": 721 + }, + { + "completion_length": 275.6875, + "epoch": 0.23008285532186107, + "grad_norm": 4.728917121887207, + "kl": 0.07275390625, + "learning_rate": 7.69917144678139e-07, + "loss": 0.0029, + "reward": 1.3744094371795654, + "reward_std": 0.11881347000598907, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32753440737724304, + "step": 722 + }, + { + "completion_length": 251.9375, + "epoch": 0.2304015296367113, + "grad_norm": 8.52653694152832, + "kl": 0.08154296875, + "learning_rate": 7.695984703632887e-07, + "loss": 0.0033, + "reward": 1.6842291355133057, + "reward_std": 0.20814046263694763, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48110413551330566, + "rewards/pad": 0.21875, + "step": 723 + }, + { + "completion_length": 306.109375, + "epoch": 0.2307202039515615, + "grad_norm": 11.247517585754395, + "kl": 0.0888671875, + "learning_rate": 7.692797960484385e-07, + "loss": 0.0036, + "reward": 1.4908294677734375, + "reward_std": 0.06939513236284256, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3658294081687927, + "step": 724 + }, + { + "completion_length": 337.0625, + "epoch": 0.23103887826641173, + "grad_norm": 6.2073445320129395, + "kl": 0.0791015625, + "learning_rate": 7.689611217335883e-07, + "loss": 0.0031, + "reward": 1.376875877380371, + "reward_std": 0.033518463373184204, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.25187593698501587, + "step": 725 + }, + { + "completion_length": 276.9375, + "epoch": 0.23135755258126195, + "grad_norm": 17.229686737060547, + "kl": 0.10009765625, + "learning_rate": 7.68642447418738e-07, + "loss": 0.004, + "reward": 1.5234543085098267, + "reward_std": 0.08890549838542938, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5234543681144714, + "step": 726 + }, + { + "completion_length": 176.5625, + "epoch": 0.23167622689611217, + "grad_norm": 7.54797887802124, + "kl": 0.10546875, + "learning_rate": 7.683237731038877e-07, + "loss": 0.0042, + "reward": 1.566821813583374, + "reward_std": 0.06920469552278519, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5668217539787292, + "rewards/pad": 0.0, + "step": 727 + }, + { + "completion_length": 184.5625, + "epoch": 0.2319949012109624, + "grad_norm": 11.765900611877441, + "kl": 0.12158203125, + "learning_rate": 7.680050987890375e-07, + "loss": 0.0049, + "reward": 1.5227802991867065, + "reward_std": 0.18534991145133972, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44465523958206177, + "rewards/pad": 0.078125, + "step": 728 + }, + { + "completion_length": 327.484375, + "epoch": 0.2323135755258126, + "grad_norm": 7.624211311340332, + "kl": 0.059814453125, + "learning_rate": 7.676864244741873e-07, + "loss": 0.0024, + "reward": 1.540311574935913, + "reward_std": 0.11048051714897156, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4309366047382355, + "step": 729 + }, + { + "completion_length": 179.96875, + "epoch": 0.23263224984066283, + "grad_norm": 12.753997802734375, + "kl": 0.1015625, + "learning_rate": 7.673677501593371e-07, + "loss": 0.0041, + "reward": 1.4728918075561523, + "reward_std": 0.12791694700717926, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4728918671607971, + "rewards/pad": 0.0, + "step": 730 + }, + { + "completion_length": 218.59375, + "epoch": 0.23295092415551308, + "grad_norm": 13.48316478729248, + "kl": 0.10205078125, + "learning_rate": 7.670490758444868e-07, + "loss": 0.0041, + "reward": 1.6227314472198486, + "reward_std": 0.08450779318809509, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4977312684059143, + "step": 731 + }, + { + "completion_length": 158.90625, + "epoch": 0.2332695984703633, + "grad_norm": 159.3242950439453, + "kl": 0.103515625, + "learning_rate": 7.667304015296366e-07, + "loss": 0.0041, + "reward": 1.6386592388153076, + "reward_std": 0.1035318672657013, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5136591196060181, + "rewards/pad": 0.125, + "step": 732 + }, + { + "completion_length": 363.671875, + "epoch": 0.23358827278521352, + "grad_norm": 5.775212287902832, + "kl": 0.0556640625, + "learning_rate": 7.664117272147864e-07, + "loss": 0.0022, + "reward": 1.324002742767334, + "reward_std": 0.01590358465909958, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32400280237197876, + "rewards/pad": 0.0, + "step": 733 + }, + { + "completion_length": 273.5625, + "epoch": 0.23390694710006374, + "grad_norm": 9.607227325439453, + "kl": 0.078125, + "learning_rate": 7.660930528999362e-07, + "loss": 0.0031, + "reward": 1.46985924243927, + "reward_std": 0.11552847921848297, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4542343020439148, + "rewards/pad": 0.03125, + "step": 734 + }, + { + "completion_length": 264.625, + "epoch": 0.23422562141491396, + "grad_norm": 11.888554573059082, + "kl": 0.076171875, + "learning_rate": 7.65774378585086e-07, + "loss": 0.003, + "reward": 1.63518226146698, + "reward_std": 0.11117585748434067, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49455729126930237, + "rewards/pad": 0.140625, + "step": 735 + }, + { + "completion_length": 212.28125, + "epoch": 0.23454429572976418, + "grad_norm": 8.31882381439209, + "kl": 0.091796875, + "learning_rate": 7.654557042702358e-07, + "loss": 0.0037, + "reward": 1.5230109691619873, + "reward_std": 0.09353186190128326, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5230109691619873, + "step": 736 + }, + { + "completion_length": 247.890625, + "epoch": 0.2348629700446144, + "grad_norm": 8.512290954589844, + "kl": 0.099609375, + "learning_rate": 7.651370299553856e-07, + "loss": 0.004, + "reward": 1.6262836456298828, + "reward_std": 0.09318853914737701, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5012836456298828, + "step": 737 + }, + { + "completion_length": 112.421875, + "epoch": 0.23518164435946462, + "grad_norm": 15.200469017028809, + "kl": 0.12158203125, + "learning_rate": 7.648183556405353e-07, + "loss": 0.0049, + "reward": 1.7096667289733887, + "reward_std": 0.13319414854049683, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33466672897338867, + "rewards/pad": 0.375, + "step": 738 + }, + { + "completion_length": 145.859375, + "epoch": 0.23550031867431484, + "grad_norm": 9.961045265197754, + "kl": 0.11279296875, + "learning_rate": 7.644996813256851e-07, + "loss": 0.0045, + "reward": 1.5271918773651123, + "reward_std": 0.060988496989011765, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5271918773651123, + "rewards/pad": 0.0, + "step": 739 + }, + { + "completion_length": 339.8125, + "epoch": 0.23581899298916506, + "grad_norm": 15.928852081298828, + "kl": 0.0615234375, + "learning_rate": 7.641810070108349e-07, + "loss": 0.0025, + "reward": 1.3527032136917114, + "reward_std": 0.06639071553945541, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3527032136917114, + "step": 740 + }, + { + "completion_length": 264.21875, + "epoch": 0.2361376673040153, + "grad_norm": 12.001644134521484, + "kl": 0.08154296875, + "learning_rate": 7.638623326959847e-07, + "loss": 0.0033, + "reward": 1.316277027130127, + "reward_std": 0.14961186051368713, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.33190202713012695, + "step": 741 + }, + { + "completion_length": 178.5, + "epoch": 0.23645634161886553, + "grad_norm": 14.840734481811523, + "kl": 0.12353515625, + "learning_rate": 7.635436583811344e-07, + "loss": 0.0049, + "reward": 1.619857907295227, + "reward_std": 0.17236697673797607, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5261078476905823, + "rewards/pad": 0.125, + "step": 742 + }, + { + "completion_length": 174.9375, + "epoch": 0.23677501593371575, + "grad_norm": 9.551653861999512, + "kl": 0.10205078125, + "learning_rate": 7.632249840662842e-07, + "loss": 0.0041, + "reward": 1.6346447467803955, + "reward_std": 0.21686974167823792, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43151968717575073, + "rewards/pad": 0.21875, + "step": 743 + }, + { + "completion_length": 309.453125, + "epoch": 0.23709369024856597, + "grad_norm": 18.017536163330078, + "kl": 0.0703125, + "learning_rate": 7.62906309751434e-07, + "loss": 0.0028, + "reward": 1.5284950733184814, + "reward_std": 0.11433098465204239, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5128699541091919, + "step": 744 + }, + { + "completion_length": 213.6875, + "epoch": 0.2374123645634162, + "grad_norm": 11.528428077697754, + "kl": 0.10546875, + "learning_rate": 7.625876354365838e-07, + "loss": 0.0042, + "reward": 1.6650524139404297, + "reward_std": 0.09348995238542557, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5400524139404297, + "step": 745 + }, + { + "completion_length": 309.265625, + "epoch": 0.2377310388782664, + "grad_norm": 20.259113311767578, + "kl": 0.0712890625, + "learning_rate": 7.622689611217335e-07, + "loss": 0.0029, + "reward": 1.4184303283691406, + "reward_std": 0.10087809711694717, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38718029856681824, + "step": 746 + }, + { + "completion_length": 388.25, + "epoch": 0.23804971319311663, + "grad_norm": 15.798137664794922, + "kl": 0.171875, + "learning_rate": 7.619502868068833e-07, + "loss": 0.0069, + "reward": 1.6084961891174316, + "reward_std": 0.27900782227516174, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4366213083267212, + "step": 747 + }, + { + "completion_length": 251.15625, + "epoch": 0.23836838750796685, + "grad_norm": 27.81381607055664, + "kl": 0.07421875, + "learning_rate": 7.616316124920331e-07, + "loss": 0.003, + "reward": 1.7127543687820435, + "reward_std": 0.13633526861667633, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46275442838668823, + "rewards/pad": 0.25, + "step": 748 + }, + { + "completion_length": 234.53125, + "epoch": 0.23868706182281707, + "grad_norm": 21.107439041137695, + "kl": 0.1171875, + "learning_rate": 7.613129381771829e-07, + "loss": 0.0047, + "reward": 1.5439451932907104, + "reward_std": 0.11229214072227478, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5439450740814209, + "rewards/pad": 0.0, + "step": 749 + }, + { + "completion_length": 424.484375, + "epoch": 0.2390057361376673, + "grad_norm": 13.354806900024414, + "kl": 0.05224609375, + "learning_rate": 7.609942638623326e-07, + "loss": 0.0021, + "reward": 1.4798873662948608, + "reward_std": 0.10450975596904755, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49551236629486084, + "rewards/pad": 0.0, + "step": 750 + }, + { + "completion_length": 335.078125, + "epoch": 0.23932441045251754, + "grad_norm": 7.3585333824157715, + "kl": 0.0849609375, + "learning_rate": 7.606755895474824e-07, + "loss": 0.0034, + "reward": 1.5674034357070923, + "reward_std": 0.14845271408557892, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4580284357070923, + "rewards/pad": 0.125, + "step": 751 + }, + { + "completion_length": 361.0625, + "epoch": 0.23964308476736776, + "grad_norm": 6.82198429107666, + "kl": 0.07861328125, + "learning_rate": 7.603569152326322e-07, + "loss": 0.0032, + "reward": 1.4816830158233643, + "reward_std": 0.16714411973953247, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.512933075428009, + "rewards/pad": 0.0, + "step": 752 + }, + { + "completion_length": 221.421875, + "epoch": 0.23996175908221798, + "grad_norm": 9.491703987121582, + "kl": 0.10986328125, + "learning_rate": 7.60038240917782e-07, + "loss": 0.0044, + "reward": 1.6149952411651611, + "reward_std": 0.12568673491477966, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48999521136283875, + "rewards/pad": 0.125, + "step": 753 + }, + { + "completion_length": 211.296875, + "epoch": 0.2402804333970682, + "grad_norm": 20.870731353759766, + "kl": 0.10498046875, + "learning_rate": 7.597195666029318e-07, + "loss": 0.0042, + "reward": 1.5477931499481201, + "reward_std": 0.22584594786167145, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4540431499481201, + "rewards/pad": 0.125, + "step": 754 + }, + { + "completion_length": 276.09375, + "epoch": 0.24059910771191842, + "grad_norm": 10.502787590026855, + "kl": 0.09326171875, + "learning_rate": 7.594008922880816e-07, + "loss": 0.0037, + "reward": 1.3897948265075684, + "reward_std": 0.262864351272583, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.43666979670524597, + "rewards/pad": 0.0, + "step": 755 + }, + { + "completion_length": 205.359375, + "epoch": 0.24091778202676864, + "grad_norm": 22.47008514404297, + "kl": 0.1044921875, + "learning_rate": 7.590822179732314e-07, + "loss": 0.0042, + "reward": 1.4155709743499756, + "reward_std": 0.27894821763038635, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.3843209445476532, + "rewards/pad": 0.078125, + "step": 756 + }, + { + "completion_length": 271.828125, + "epoch": 0.24123645634161886, + "grad_norm": 11.023628234863281, + "kl": 0.078125, + "learning_rate": 7.587635436583812e-07, + "loss": 0.0031, + "reward": 1.4424021244049072, + "reward_std": 0.211650550365448, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.3799021244049072, + "step": 757 + }, + { + "completion_length": 219.71875, + "epoch": 0.24155513065646908, + "grad_norm": 13.070090293884277, + "kl": 0.09814453125, + "learning_rate": 7.584448693435309e-07, + "loss": 0.0039, + "reward": 1.633631944656372, + "reward_std": 0.2932754456996918, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.5242569446563721, + "step": 758 + }, + { + "completion_length": 165.84375, + "epoch": 0.2418738049713193, + "grad_norm": 11.352629661560059, + "kl": 0.1533203125, + "learning_rate": 7.581261950286807e-07, + "loss": 0.0061, + "reward": 1.5862655639648438, + "reward_std": 0.16199254989624023, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4925156235694885, + "rewards/pad": 0.125, + "step": 759 + }, + { + "completion_length": 421.359375, + "epoch": 0.24219247928616955, + "grad_norm": 4.43811559677124, + "kl": 0.0615234375, + "learning_rate": 7.578075207138305e-07, + "loss": 0.0025, + "reward": 1.3123204708099365, + "reward_std": 0.34193655848503113, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.765625, + "rewards/iou_glue_reward": 0.4216954708099365, + "step": 760 + }, + { + "completion_length": 112.921875, + "epoch": 0.24251115360101977, + "grad_norm": 7.119170188903809, + "kl": 0.12158203125, + "learning_rate": 7.574888463989803e-07, + "loss": 0.0049, + "reward": 1.7790417671203613, + "reward_std": 0.1314246654510498, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5446667671203613, + "rewards/pad": 0.234375, + "step": 761 + }, + { + "completion_length": 456.25, + "epoch": 0.24282982791587, + "grad_norm": 3.5768837928771973, + "kl": 0.046875, + "learning_rate": 7.5717017208413e-07, + "loss": 0.0019, + "reward": 1.3192476034164429, + "reward_std": 0.2558511197566986, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.875, + "rewards/tracking_iou_reward": 0.44424766302108765, + "step": 762 + }, + { + "completion_length": 277.578125, + "epoch": 0.2431485022307202, + "grad_norm": 15.019842147827148, + "kl": 0.08251953125, + "learning_rate": 7.568514977692798e-07, + "loss": 0.0033, + "reward": 1.4773023128509521, + "reward_std": 0.1882389783859253, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.39917728304862976, + "step": 763 + }, + { + "completion_length": 241.515625, + "epoch": 0.24346717654557043, + "grad_norm": 36.11360549926758, + "kl": 0.091796875, + "learning_rate": 7.565328234544296e-07, + "loss": 0.0037, + "reward": 1.727245569229126, + "reward_std": 0.22436536848545074, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.555370569229126, + "step": 764 + }, + { + "completion_length": 330.703125, + "epoch": 0.24378585086042065, + "grad_norm": 6.199359893798828, + "kl": 0.07080078125, + "learning_rate": 7.562141491395793e-07, + "loss": 0.0028, + "reward": 1.5140249729156494, + "reward_std": 0.20898380875587463, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.420274943113327, + "step": 765 + }, + { + "completion_length": 347.546875, + "epoch": 0.24410452517527087, + "grad_norm": 5.358139514923096, + "kl": 0.08203125, + "learning_rate": 7.55895474824729e-07, + "loss": 0.0033, + "reward": 1.5010960102081299, + "reward_std": 0.15243446826934814, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.5479710102081299, + "rewards/pad": 0.0, + "step": 766 + }, + { + "completion_length": 249.953125, + "epoch": 0.2444231994901211, + "grad_norm": 19.22927474975586, + "kl": 0.10107421875, + "learning_rate": 7.555768005098788e-07, + "loss": 0.004, + "reward": 1.586197853088379, + "reward_std": 0.1058286726474762, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46119794249534607, + "rewards/pad": 0.125, + "step": 767 + }, + { + "completion_length": 265.515625, + "epoch": 0.2447418738049713, + "grad_norm": 12.999495506286621, + "kl": 0.09375, + "learning_rate": 7.552581261950286e-07, + "loss": 0.0038, + "reward": 1.595004677772522, + "reward_std": 0.17190062999725342, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.516879677772522, + "rewards/pad": 0.09375, + "step": 768 + }, + { + "completion_length": 227.625, + "epoch": 0.24506054811982153, + "grad_norm": 10.447826385498047, + "kl": 0.08349609375, + "learning_rate": 7.549394518801783e-07, + "loss": 0.0033, + "reward": 1.4319894313812256, + "reward_std": 0.20667222142219543, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3382394015789032, + "step": 769 + }, + { + "completion_length": 322.34375, + "epoch": 0.24537922243467178, + "grad_norm": 18.960357666015625, + "kl": 0.0712890625, + "learning_rate": 7.546207775653281e-07, + "loss": 0.0029, + "reward": 1.3130937814712524, + "reward_std": 0.03393920511007309, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31309378147125244, + "rewards/pad": 0.0, + "step": 770 + }, + { + "completion_length": 217.875, + "epoch": 0.245697896749522, + "grad_norm": 11.330354690551758, + "kl": 0.10986328125, + "learning_rate": 7.543021032504779e-07, + "loss": 0.0044, + "reward": 1.6246107816696167, + "reward_std": 0.11168281733989716, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4371107518672943, + "rewards/pad": 0.1875, + "step": 771 + }, + { + "completion_length": 165.9375, + "epoch": 0.24601657106437222, + "grad_norm": 14.55956745147705, + "kl": 0.10205078125, + "learning_rate": 7.539834289356277e-07, + "loss": 0.0041, + "reward": 1.569342017173767, + "reward_std": 0.15387828648090363, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4599670171737671, + "rewards/pad": 0.125, + "step": 772 + }, + { + "completion_length": 263.03125, + "epoch": 0.24633524537922244, + "grad_norm": 4.5672101974487305, + "kl": 0.08056640625, + "learning_rate": 7.536647546207775e-07, + "loss": 0.0032, + "reward": 1.6846940517425537, + "reward_std": 0.05211274325847626, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5596940517425537, + "rewards/pad": 0.125, + "step": 773 + }, + { + "completion_length": 324.390625, + "epoch": 0.24665391969407266, + "grad_norm": 10.738593101501465, + "kl": 0.0771484375, + "learning_rate": 7.533460803059273e-07, + "loss": 0.0031, + "reward": 1.5184435844421387, + "reward_std": 0.19243696331977844, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5496935844421387, + "rewards/pad": 0.0, + "step": 774 + }, + { + "completion_length": 228.078125, + "epoch": 0.24697259400892288, + "grad_norm": 8.436495780944824, + "kl": 0.10888671875, + "learning_rate": 7.530274059910771e-07, + "loss": 0.0044, + "reward": 1.4948652982711792, + "reward_std": 0.1509888470172882, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.510490357875824, + "rewards/pad": 0.0, + "step": 775 + }, + { + "completion_length": 174.34375, + "epoch": 0.2472912683237731, + "grad_norm": 35.71000289916992, + "kl": 0.369140625, + "learning_rate": 7.527087316762269e-07, + "loss": 0.0149, + "reward": 1.3583810329437256, + "reward_std": 0.09419108927249908, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3583810329437256, + "rewards/pad": 0.0, + "step": 776 + }, + { + "completion_length": 281.59375, + "epoch": 0.24760994263862332, + "grad_norm": 9.188167572021484, + "kl": 0.08056640625, + "learning_rate": 7.523900573613766e-07, + "loss": 0.0032, + "reward": 1.4431560039520264, + "reward_std": 0.22239582240581512, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.49003103375434875, + "rewards/pad": 0.0, + "step": 777 + }, + { + "completion_length": 165.984375, + "epoch": 0.24792861695347354, + "grad_norm": 7.557226657867432, + "kl": 0.1162109375, + "learning_rate": 7.520713830465264e-07, + "loss": 0.0047, + "reward": 1.5415880680084229, + "reward_std": 0.22100627422332764, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46346306800842285, + "rewards/pad": 0.09375, + "step": 778 + }, + { + "completion_length": 328.5, + "epoch": 0.24824729126832376, + "grad_norm": 12.88247013092041, + "kl": 0.08740234375, + "learning_rate": 7.517527087316762e-07, + "loss": 0.0035, + "reward": 1.4905920028686523, + "reward_std": 0.046114400029182434, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4905920624732971, + "rewards/pad": 0.0, + "step": 779 + }, + { + "completion_length": 326.34375, + "epoch": 0.248565965583174, + "grad_norm": 5.975739479064941, + "kl": 0.06884765625, + "learning_rate": 7.51434034416826e-07, + "loss": 0.0028, + "reward": 1.367580533027649, + "reward_std": 0.14677515625953674, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3519555330276489, + "step": 780 + }, + { + "completion_length": 280.015625, + "epoch": 0.24888463989802423, + "grad_norm": 6.762017726898193, + "kl": 0.076171875, + "learning_rate": 7.511153601019757e-07, + "loss": 0.0031, + "reward": 1.6784141063690186, + "reward_std": 0.08459261059761047, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5690390467643738, + "rewards/pad": 0.109375, + "step": 781 + }, + { + "completion_length": 287.09375, + "epoch": 0.24920331421287445, + "grad_norm": 9.65837287902832, + "kl": 0.083984375, + "learning_rate": 7.507966857871255e-07, + "loss": 0.0034, + "reward": 1.3916666507720947, + "reward_std": 0.05018036067485809, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3916667401790619, + "rewards/pad": 0.0, + "step": 782 + }, + { + "completion_length": 252.90625, + "epoch": 0.24952198852772467, + "grad_norm": 15.337854385375977, + "kl": 0.0810546875, + "learning_rate": 7.504780114722753e-07, + "loss": 0.0032, + "reward": 1.4898476600646973, + "reward_std": 0.17910003662109375, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.38047271966934204, + "step": 783 + }, + { + "completion_length": 221.5, + "epoch": 0.2498406628425749, + "grad_norm": 24.365131378173828, + "kl": 0.09619140625, + "learning_rate": 7.501593371574251e-07, + "loss": 0.0038, + "reward": 1.6819980144500732, + "reward_std": 0.18875043094158173, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5726230144500732, + "rewards/pad": 0.125, + "step": 784 + }, + { + "completion_length": 278.296875, + "epoch": 0.2501593371574251, + "grad_norm": 10.942834854125977, + "kl": 0.08056640625, + "learning_rate": 7.498406628425748e-07, + "loss": 0.0032, + "reward": 1.412353754043579, + "reward_std": 0.07322700321674347, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4123537838459015, + "rewards/pad": 0.0, + "step": 785 + }, + { + "completion_length": 269.3125, + "epoch": 0.25047801147227533, + "grad_norm": 8.739012718200684, + "kl": 0.0732421875, + "learning_rate": 7.495219885277246e-07, + "loss": 0.0029, + "reward": 1.7182860374450684, + "reward_std": 0.08406949788331985, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5932859778404236, + "rewards/pad": 0.125, + "step": 786 + }, + { + "completion_length": 289.34375, + "epoch": 0.25079668578712555, + "grad_norm": 9.329911231994629, + "kl": 0.11083984375, + "learning_rate": 7.492033142128744e-07, + "loss": 0.0044, + "reward": 1.573399305343628, + "reward_std": 0.0749359279870987, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5733993053436279, + "step": 787 + }, + { + "completion_length": 469.59375, + "epoch": 0.25111536010197577, + "grad_norm": 6.025957107543945, + "kl": 0.052001953125, + "learning_rate": 7.488846398980242e-07, + "loss": 0.0021, + "reward": 1.4300410747528076, + "reward_std": 0.08247831463813782, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43004101514816284, + "rewards/pad": 0.0, + "step": 788 + }, + { + "completion_length": 302.46875, + "epoch": 0.251434034416826, + "grad_norm": 9.100362777709961, + "kl": 0.0771484375, + "learning_rate": 7.485659655831739e-07, + "loss": 0.0031, + "reward": 1.7954503297805786, + "reward_std": 0.0859527662396431, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6704503297805786, + "rewards/pad": 0.125, + "step": 789 + }, + { + "completion_length": 356.828125, + "epoch": 0.2517527087316762, + "grad_norm": 30.79294776916504, + "kl": 0.07080078125, + "learning_rate": 7.482472912683237e-07, + "loss": 0.0028, + "reward": 1.555119514465332, + "reward_std": 0.1339489072561264, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44574448466300964, + "step": 790 + }, + { + "completion_length": 380.453125, + "epoch": 0.25207138304652643, + "grad_norm": 8.440163612365723, + "kl": 0.04833984375, + "learning_rate": 7.479286169534736e-07, + "loss": 0.0019, + "reward": 1.7130658626556396, + "reward_std": 0.09868164360523224, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4786908030509949, + "step": 791 + }, + { + "completion_length": 134.921875, + "epoch": 0.25239005736137665, + "grad_norm": 20.89670181274414, + "kl": 0.1279296875, + "learning_rate": 7.476099426386234e-07, + "loss": 0.0051, + "reward": 1.4510223865509033, + "reward_std": 0.10243634879589081, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45102232694625854, + "rewards/pad": 0.0, + "step": 792 + }, + { + "completion_length": 368.234375, + "epoch": 0.25270873167622687, + "grad_norm": 9.967374801635742, + "kl": 0.1787109375, + "learning_rate": 7.472912683237731e-07, + "loss": 0.0071, + "reward": 1.417081356048584, + "reward_std": 0.09499022364616394, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43270638585090637, + "step": 793 + }, + { + "completion_length": 348.078125, + "epoch": 0.2530274059910771, + "grad_norm": 9.970160484313965, + "kl": 0.054443359375, + "learning_rate": 7.469725940089229e-07, + "loss": 0.0022, + "reward": 1.6479089260101318, + "reward_std": 0.21434885263442993, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.42915892601013184, + "step": 794 + }, + { + "completion_length": 443.890625, + "epoch": 0.25334608030592737, + "grad_norm": 5.636565208435059, + "kl": 0.06005859375, + "learning_rate": 7.466539196940727e-07, + "loss": 0.0024, + "reward": 1.5904240608215332, + "reward_std": 0.1765565574169159, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4029240608215332, + "step": 795 + }, + { + "completion_length": 364.765625, + "epoch": 0.2536647546207776, + "grad_norm": 4.0662126541137695, + "kl": 0.0693359375, + "learning_rate": 7.463352453792225e-07, + "loss": 0.0028, + "reward": 1.3498592376708984, + "reward_std": 0.21396957337856293, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.3967343270778656, + "rewards/pad": 0.0, + "step": 796 + }, + { + "completion_length": 382.546875, + "epoch": 0.2539834289356278, + "grad_norm": 6.166215896606445, + "kl": 0.07763671875, + "learning_rate": 7.460165710643722e-07, + "loss": 0.0031, + "reward": 1.6516375541687012, + "reward_std": 0.07979218661785126, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.651637613773346, + "rewards/pad": 0.0, + "step": 797 + }, + { + "completion_length": 203.703125, + "epoch": 0.25430210325047803, + "grad_norm": 8.784749031066895, + "kl": 0.0908203125, + "learning_rate": 7.45697896749522e-07, + "loss": 0.0036, + "reward": 1.5942569971084595, + "reward_std": 0.19434666633605957, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.39113202691078186, + "step": 798 + }, + { + "completion_length": 250.40625, + "epoch": 0.25462077756532825, + "grad_norm": 12.700088500976562, + "kl": 0.09814453125, + "learning_rate": 7.453792224346718e-07, + "loss": 0.0039, + "reward": 1.5345667600631714, + "reward_std": 0.12197552621364594, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.425191730260849, + "rewards/pad": 0.125, + "step": 799 + }, + { + "completion_length": 324.671875, + "epoch": 0.25493945188017847, + "grad_norm": 45.329933166503906, + "kl": 0.09912109375, + "learning_rate": 7.450605481198216e-07, + "loss": 0.004, + "reward": 1.6125965118408203, + "reward_std": 0.11991085857152939, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5032214522361755, + "rewards/pad": 0.125, + "step": 800 + }, + { + "completion_length": 314.046875, + "epoch": 0.2552581261950287, + "grad_norm": 9.435230255126953, + "kl": 0.0751953125, + "learning_rate": 7.447418738049713e-07, + "loss": 0.003, + "reward": 1.6179510354995728, + "reward_std": 0.10827474296092987, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49295106530189514, + "rewards/pad": 0.125, + "step": 801 + }, + { + "completion_length": 252.265625, + "epoch": 0.2555768005098789, + "grad_norm": 7.633427143096924, + "kl": 0.09619140625, + "learning_rate": 7.444231994901211e-07, + "loss": 0.0038, + "reward": 1.540982723236084, + "reward_std": 0.14200761914253235, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.556607723236084, + "rewards/pad": 0.0, + "step": 802 + }, + { + "completion_length": 227.34375, + "epoch": 0.25589547482472913, + "grad_norm": 113.0080795288086, + "kl": 0.09814453125, + "learning_rate": 7.441045251752709e-07, + "loss": 0.0039, + "reward": 1.6083734035491943, + "reward_std": 0.10746157169342041, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5614984035491943, + "step": 803 + }, + { + "completion_length": 353.703125, + "epoch": 0.25621414913957935, + "grad_norm": 13.557099342346191, + "kl": 0.0703125, + "learning_rate": 7.437858508604206e-07, + "loss": 0.0028, + "reward": 1.6362968683242798, + "reward_std": 0.1419190913438797, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6519218683242798, + "rewards/pad": 0.0, + "step": 804 + }, + { + "completion_length": 249.703125, + "epoch": 0.25653282345442957, + "grad_norm": 7.443843364715576, + "kl": 0.07861328125, + "learning_rate": 7.434671765455703e-07, + "loss": 0.0032, + "reward": 1.5101332664489746, + "reward_std": 0.11596070230007172, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4788833260536194, + "rewards/pad": 0.03125, + "step": 805 + }, + { + "completion_length": 300.8125, + "epoch": 0.2568514977692798, + "grad_norm": 5.476566791534424, + "kl": 0.08740234375, + "learning_rate": 7.431485022307201e-07, + "loss": 0.0035, + "reward": 1.482029676437378, + "reward_std": 0.08476638793945312, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48202961683273315, + "step": 806 + }, + { + "completion_length": 384.265625, + "epoch": 0.25717017208413, + "grad_norm": 8.814421653747559, + "kl": 0.049072265625, + "learning_rate": 7.428298279158699e-07, + "loss": 0.002, + "reward": 1.6688939332962036, + "reward_std": 0.11245816946029663, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43451887369155884, + "step": 807 + }, + { + "completion_length": 557.03125, + "epoch": 0.25748884639898023, + "grad_norm": 4.086426734924316, + "kl": 0.025146484375, + "learning_rate": 7.425111536010196e-07, + "loss": 0.001, + "reward": 1.4199795722961426, + "reward_std": 0.0845545083284378, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43560463190078735, + "step": 808 + }, + { + "completion_length": 185.625, + "epoch": 0.25780752071383045, + "grad_norm": 16.384538650512695, + "kl": 0.09423828125, + "learning_rate": 7.421924792861694e-07, + "loss": 0.0038, + "reward": 1.9109994173049927, + "reward_std": 0.20538440346717834, + "rewards/answer_reward": 0.34375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5828745365142822, + "step": 809 + }, + { + "completion_length": 299.78125, + "epoch": 0.25812619502868067, + "grad_norm": 10.312644958496094, + "kl": 0.0859375, + "learning_rate": 7.418738049713192e-07, + "loss": 0.0034, + "reward": 1.4104437828063965, + "reward_std": 0.12108160555362701, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3323187530040741, + "step": 810 + }, + { + "completion_length": 283.796875, + "epoch": 0.2584448693435309, + "grad_norm": 8.11568546295166, + "kl": 0.09521484375, + "learning_rate": 7.415551306564691e-07, + "loss": 0.0038, + "reward": 1.3238170146942139, + "reward_std": 0.11337171494960785, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.27694201469421387, + "rewards/pad": 0.046875, + "step": 811 + }, + { + "completion_length": 222.328125, + "epoch": 0.2587635436583811, + "grad_norm": 5.870856285095215, + "kl": 0.09228515625, + "learning_rate": 7.412364563416188e-07, + "loss": 0.0037, + "reward": 1.6045132875442505, + "reward_std": 0.17492227256298065, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4951382875442505, + "step": 812 + }, + { + "completion_length": 175.265625, + "epoch": 0.25908221797323133, + "grad_norm": 9.085700035095215, + "kl": 0.1123046875, + "learning_rate": 7.409177820267686e-07, + "loss": 0.0045, + "reward": 1.6282504796981812, + "reward_std": 0.17620429396629333, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42512547969818115, + "rewards/pad": 0.203125, + "step": 813 + }, + { + "completion_length": 242.953125, + "epoch": 0.2594008922880816, + "grad_norm": 7.416430950164795, + "kl": 0.08740234375, + "learning_rate": 7.405991077119184e-07, + "loss": 0.0035, + "reward": 1.7750552892684937, + "reward_std": 0.16323842108249664, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6344302296638489, + "step": 814 + }, + { + "completion_length": 284.890625, + "epoch": 0.25971956660293183, + "grad_norm": 13.476738929748535, + "kl": 0.076171875, + "learning_rate": 7.402804333970682e-07, + "loss": 0.003, + "reward": 1.6334511041641235, + "reward_std": 0.09985249489545822, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38345110416412354, + "rewards/pad": 0.25, + "step": 815 + }, + { + "completion_length": 491.9375, + "epoch": 0.26003824091778205, + "grad_norm": 5.566437244415283, + "kl": 0.0439453125, + "learning_rate": 7.399617590822179e-07, + "loss": 0.0018, + "reward": 1.2809858322143555, + "reward_std": 0.19581392407417297, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.2966108024120331, + "step": 816 + }, + { + "completion_length": 368.796875, + "epoch": 0.26035691523263227, + "grad_norm": 6.461248397827148, + "kl": 0.06005859375, + "learning_rate": 7.396430847673677e-07, + "loss": 0.0024, + "reward": 1.3516507148742676, + "reward_std": 0.0791245549917221, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3516506552696228, + "step": 817 + }, + { + "completion_length": 213.984375, + "epoch": 0.2606755895474825, + "grad_norm": 7.895339488983154, + "kl": 0.091796875, + "learning_rate": 7.393244104525175e-07, + "loss": 0.0037, + "reward": 1.6286418437957764, + "reward_std": 0.1728130280971527, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45676690340042114, + "rewards/pad": 0.171875, + "step": 818 + }, + { + "completion_length": 238.34375, + "epoch": 0.2609942638623327, + "grad_norm": 6.759038925170898, + "kl": 0.08740234375, + "learning_rate": 7.390057361376673e-07, + "loss": 0.0035, + "reward": 1.5328681468963623, + "reward_std": 0.060063887387514114, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5328680872917175, + "rewards/pad": 0.0, + "step": 819 + }, + { + "completion_length": 272.8125, + "epoch": 0.26131293817718293, + "grad_norm": 15.274812698364258, + "kl": 0.07177734375, + "learning_rate": 7.38687061822817e-07, + "loss": 0.0029, + "reward": 1.5370078086853027, + "reward_std": 0.24703297019004822, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3495078682899475, + "step": 820 + }, + { + "completion_length": 297.90625, + "epoch": 0.26163161249203315, + "grad_norm": 46.81049346923828, + "kl": 0.07958984375, + "learning_rate": 7.383683875079668e-07, + "loss": 0.0032, + "reward": 1.7406848669052124, + "reward_std": 0.13599297404289246, + "rewards/pad": 0.359375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3969349265098572, + "step": 821 + }, + { + "completion_length": 189.015625, + "epoch": 0.26195028680688337, + "grad_norm": 7.546118259429932, + "kl": 0.09326171875, + "learning_rate": 7.380497131931166e-07, + "loss": 0.0037, + "reward": 1.5959341526031494, + "reward_std": 0.05791737139225006, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47093406319618225, + "step": 822 + }, + { + "completion_length": 254.265625, + "epoch": 0.2622689611217336, + "grad_norm": 6.935544490814209, + "kl": 0.09375, + "learning_rate": 7.377310388782664e-07, + "loss": 0.0037, + "reward": 1.5564478635787964, + "reward_std": 0.0899038165807724, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5564479231834412, + "rewards/pad": 0.0, + "step": 823 + }, + { + "completion_length": 343.359375, + "epoch": 0.2625876354365838, + "grad_norm": 6.846329689025879, + "kl": 0.06396484375, + "learning_rate": 7.374123645634161e-07, + "loss": 0.0026, + "reward": 1.5198067426681519, + "reward_std": 0.1529795527458191, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.33230674266815186, + "step": 824 + }, + { + "completion_length": 214.453125, + "epoch": 0.26290630975143403, + "grad_norm": 14.635757446289062, + "kl": 0.09423828125, + "learning_rate": 7.370936902485659e-07, + "loss": 0.0038, + "reward": 1.3967777490615845, + "reward_std": 0.2116951197385788, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33427777886390686, + "rewards/pad": 0.0625, + "step": 825 + }, + { + "completion_length": 299.140625, + "epoch": 0.26322498406628425, + "grad_norm": 21.44894027709961, + "kl": 0.0654296875, + "learning_rate": 7.367750159337157e-07, + "loss": 0.0026, + "reward": 1.5391669273376465, + "reward_std": 0.18289467692375183, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.41416695713996887, + "step": 826 + }, + { + "completion_length": 222.34375, + "epoch": 0.26354365838113447, + "grad_norm": 7.3377180099487305, + "kl": 0.1015625, + "learning_rate": 7.364563416188655e-07, + "loss": 0.0041, + "reward": 1.3520796298980713, + "reward_std": 0.0663057416677475, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3520796298980713, + "rewards/pad": 0.0, + "step": 827 + }, + { + "completion_length": 245.6875, + "epoch": 0.2638623326959847, + "grad_norm": 11.794517517089844, + "kl": 0.150390625, + "learning_rate": 7.361376673040152e-07, + "loss": 0.006, + "reward": 1.4975171089172363, + "reward_std": 0.09263218194246292, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49751707911491394, + "rewards/pad": 0.0, + "step": 828 + }, + { + "completion_length": 163.625, + "epoch": 0.2641810070108349, + "grad_norm": 10.353131294250488, + "kl": 0.1220703125, + "learning_rate": 7.35818992989165e-07, + "loss": 0.0049, + "reward": 1.4297380447387695, + "reward_std": 0.11561650782823563, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3984881043434143, + "rewards/pad": 0.03125, + "step": 829 + }, + { + "completion_length": 366.953125, + "epoch": 0.26449968132568513, + "grad_norm": 5.801058292388916, + "kl": 0.045166015625, + "learning_rate": 7.355003186743149e-07, + "loss": 0.0018, + "reward": 1.5223000049591064, + "reward_std": 0.05760771036148071, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3972998559474945, + "rewards/pad": 0.125, + "step": 830 + }, + { + "completion_length": 317.1875, + "epoch": 0.26481835564053535, + "grad_norm": 5.280611038208008, + "kl": 0.052734375, + "learning_rate": 7.351816443594647e-07, + "loss": 0.0021, + "reward": 1.3706518411636353, + "reward_std": 0.049535803496837616, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37065184116363525, + "rewards/pad": 0.0, + "step": 831 + }, + { + "completion_length": 245.953125, + "epoch": 0.2651370299553856, + "grad_norm": 13.63158130645752, + "kl": 0.0771484375, + "learning_rate": 7.348629700446144e-07, + "loss": 0.0031, + "reward": 1.5924638509750366, + "reward_std": 0.1569618582725525, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4987138509750366, + "step": 832 + }, + { + "completion_length": 306.5, + "epoch": 0.2654557042702358, + "grad_norm": 5.494880676269531, + "kl": 0.0712890625, + "learning_rate": 7.345442957297642e-07, + "loss": 0.0028, + "reward": 1.4672374725341797, + "reward_std": 0.05155729874968529, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4672374725341797, + "rewards/pad": 0.0, + "step": 833 + }, + { + "completion_length": 140.21875, + "epoch": 0.26577437858508607, + "grad_norm": 7.520358085632324, + "kl": 0.1171875, + "learning_rate": 7.34225621414914e-07, + "loss": 0.0047, + "reward": 1.6826391220092773, + "reward_std": 0.14447268843650818, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5732640624046326, + "step": 834 + }, + { + "completion_length": 138.9375, + "epoch": 0.2660930528999363, + "grad_norm": 13.060035705566406, + "kl": 0.1044921875, + "learning_rate": 7.339069471000637e-07, + "loss": 0.0042, + "reward": 1.6643251180648804, + "reward_std": 0.17719486355781555, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5705751180648804, + "rewards/pad": 0.09375, + "step": 835 + }, + { + "completion_length": 344.078125, + "epoch": 0.2664117272147865, + "grad_norm": 11.838889122009277, + "kl": 0.0712890625, + "learning_rate": 7.335882727852135e-07, + "loss": 0.0029, + "reward": 1.3920658826828003, + "reward_std": 0.11614827811717987, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3139408528804779, + "step": 836 + }, + { + "completion_length": 219.90625, + "epoch": 0.26673040152963673, + "grad_norm": 15.134505271911621, + "kl": 0.09716796875, + "learning_rate": 7.332695984703633e-07, + "loss": 0.0039, + "reward": 1.1549283266067505, + "reward_std": 0.12660345435142517, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.1705532819032669, + "step": 837 + }, + { + "completion_length": 292.984375, + "epoch": 0.26704907584448695, + "grad_norm": 7.054495334625244, + "kl": 0.064453125, + "learning_rate": 7.329509241555131e-07, + "loss": 0.0026, + "reward": 1.4501910209655762, + "reward_std": 0.1041649878025055, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4345659911632538, + "rewards/pad": 0.015625, + "step": 838 + }, + { + "completion_length": 229.09375, + "epoch": 0.26736775015933717, + "grad_norm": 11.810515403747559, + "kl": 0.10498046875, + "learning_rate": 7.326322498406628e-07, + "loss": 0.0042, + "reward": 1.550565242767334, + "reward_std": 0.14647451043128967, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5661901235580444, + "rewards/pad": 0.0, + "step": 839 + }, + { + "completion_length": 283.015625, + "epoch": 0.2676864244741874, + "grad_norm": 5.316693305969238, + "kl": 0.06298828125, + "learning_rate": 7.323135755258126e-07, + "loss": 0.0025, + "reward": 1.310988187789917, + "reward_std": 0.17328143119812012, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.24848829209804535, + "step": 840 + }, + { + "completion_length": 235.046875, + "epoch": 0.2680050987890376, + "grad_norm": 8.124117851257324, + "kl": 0.09375, + "learning_rate": 7.319949012109624e-07, + "loss": 0.0038, + "reward": 1.5256720781326294, + "reward_std": 0.09780138731002808, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5256721377372742, + "step": 841 + }, + { + "completion_length": 192.15625, + "epoch": 0.26832377310388783, + "grad_norm": 15.035517692565918, + "kl": 0.09375, + "learning_rate": 7.316762268961122e-07, + "loss": 0.0038, + "reward": 1.4462971687316895, + "reward_std": 0.07383842766284943, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44629716873168945, + "rewards/pad": 0.0, + "step": 842 + }, + { + "completion_length": 194.171875, + "epoch": 0.26864244741873805, + "grad_norm": 14.58903980255127, + "kl": 0.09619140625, + "learning_rate": 7.313575525812619e-07, + "loss": 0.0039, + "reward": 1.7406694889068604, + "reward_std": 0.08720827102661133, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4906695485115051, + "step": 843 + }, + { + "completion_length": 204.609375, + "epoch": 0.26896112173358827, + "grad_norm": 15.045029640197754, + "kl": 0.07568359375, + "learning_rate": 7.310388782664116e-07, + "loss": 0.003, + "reward": 1.4478912353515625, + "reward_std": 0.1405174881219864, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41664132475852966, + "step": 844 + }, + { + "completion_length": 233.0625, + "epoch": 0.2692797960484385, + "grad_norm": 7.217484474182129, + "kl": 0.076171875, + "learning_rate": 7.307202039515614e-07, + "loss": 0.0031, + "reward": 1.3446323871612549, + "reward_std": 0.048448316752910614, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3446323573589325, + "rewards/pad": 0.0, + "step": 845 + }, + { + "completion_length": 316.203125, + "epoch": 0.2695984703632887, + "grad_norm": 7.464609146118164, + "kl": 0.059326171875, + "learning_rate": 7.304015296367112e-07, + "loss": 0.0024, + "reward": 1.4307100772857666, + "reward_std": 0.08219650387763977, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43070995807647705, + "rewards/pad": 0.0, + "step": 846 + }, + { + "completion_length": 240.34375, + "epoch": 0.26991714467813893, + "grad_norm": 24.719892501831055, + "kl": 0.0830078125, + "learning_rate": 7.300828553218609e-07, + "loss": 0.0033, + "reward": 1.3982957601547241, + "reward_std": 0.12989813089370728, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4139207601547241, + "step": 847 + }, + { + "completion_length": 259.9375, + "epoch": 0.27023581899298915, + "grad_norm": 6.298829555511475, + "kl": 0.064453125, + "learning_rate": 7.297641810070108e-07, + "loss": 0.0026, + "reward": 1.451304316520691, + "reward_std": 0.09524716436862946, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4669293463230133, + "step": 848 + }, + { + "completion_length": 193.5, + "epoch": 0.27055449330783937, + "grad_norm": 10.255414962768555, + "kl": 0.09521484375, + "learning_rate": 7.294455066921606e-07, + "loss": 0.0038, + "reward": 1.4266576766967773, + "reward_std": 0.12549547851085663, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39540770649909973, + "step": 849 + }, + { + "completion_length": 196.390625, + "epoch": 0.2708731676226896, + "grad_norm": 11.679965019226074, + "kl": 0.10546875, + "learning_rate": 7.291268323773104e-07, + "loss": 0.0042, + "reward": 1.5198023319244385, + "reward_std": 0.17359545826911926, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5198023319244385, + "rewards/pad": 0.0, + "step": 850 + }, + { + "completion_length": 215.90625, + "epoch": 0.2711918419375398, + "grad_norm": 12.998888969421387, + "kl": 0.109375, + "learning_rate": 7.288081580624601e-07, + "loss": 0.0044, + "reward": 1.6264359951019287, + "reward_std": 0.07382266968488693, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5014359354972839, + "step": 851 + }, + { + "completion_length": 260.921875, + "epoch": 0.27151051625239003, + "grad_norm": 11.922432899475098, + "kl": 0.0810546875, + "learning_rate": 7.284894837476099e-07, + "loss": 0.0032, + "reward": 1.342989444732666, + "reward_std": 0.15005718171596527, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34298938512802124, + "rewards/pad": 0.015625, + "step": 852 + }, + { + "completion_length": 192.109375, + "epoch": 0.27182919056724025, + "grad_norm": 8.800082206726074, + "kl": 0.09033203125, + "learning_rate": 7.281708094327597e-07, + "loss": 0.0036, + "reward": 1.6281015872955322, + "reward_std": 0.08344338089227676, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6281015872955322, + "rewards/pad": 0.0, + "step": 853 + }, + { + "completion_length": 140.328125, + "epoch": 0.27214786488209053, + "grad_norm": 448.08642578125, + "kl": 0.0966796875, + "learning_rate": 7.278521351179095e-07, + "loss": 0.0039, + "reward": 1.8013361692428589, + "reward_std": 0.09915797412395477, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.8013361692428589, + "rewards/pad": 0.0, + "step": 854 + }, + { + "completion_length": 185.484375, + "epoch": 0.27246653919694075, + "grad_norm": 18.73372459411621, + "kl": 0.10693359375, + "learning_rate": 7.275334608030592e-07, + "loss": 0.0043, + "reward": 1.503127932548523, + "reward_std": 0.08186651021242142, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5031278729438782, + "step": 855 + }, + { + "completion_length": 183.21875, + "epoch": 0.27278521351179097, + "grad_norm": 7.774461269378662, + "kl": 0.0927734375, + "learning_rate": 7.27214786488209e-07, + "loss": 0.0037, + "reward": 1.4252768754959106, + "reward_std": 0.07498372346162796, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42527687549591064, + "rewards/pad": 0.0, + "step": 856 + }, + { + "completion_length": 146.203125, + "epoch": 0.2731038878266412, + "grad_norm": 19.676912307739258, + "kl": 0.11279296875, + "learning_rate": 7.268961121733588e-07, + "loss": 0.0045, + "reward": 1.4774161577224731, + "reward_std": 0.08733925223350525, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47741612792015076, + "step": 857 + }, + { + "completion_length": 256.875, + "epoch": 0.2734225621414914, + "grad_norm": 6.925992488861084, + "kl": 0.09130859375, + "learning_rate": 7.265774378585086e-07, + "loss": 0.0036, + "reward": 1.4271292686462402, + "reward_std": 0.08978510648012161, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.427129328250885, + "rewards/pad": 0.0, + "step": 858 + }, + { + "completion_length": 152.0625, + "epoch": 0.27374123645634163, + "grad_norm": 7.952486991882324, + "kl": 0.08740234375, + "learning_rate": 7.262587635436583e-07, + "loss": 0.0035, + "reward": 1.6472711563110352, + "reward_std": 0.11177260428667068, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.41289621591567993, + "rewards/pad": 0.25, + "step": 859 + }, + { + "completion_length": 257.03125, + "epoch": 0.27405991077119185, + "grad_norm": 9.25637435913086, + "kl": 0.08447265625, + "learning_rate": 7.259400892288081e-07, + "loss": 0.0034, + "reward": 1.4652546644210815, + "reward_std": 0.06389153748750687, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46525469422340393, + "step": 860 + }, + { + "completion_length": 141.109375, + "epoch": 0.27437858508604207, + "grad_norm": 9.085542678833008, + "kl": 0.10791015625, + "learning_rate": 7.256214149139579e-07, + "loss": 0.0043, + "reward": 1.6769859790802002, + "reward_std": 0.14516539871692657, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.614486038684845, + "rewards/pad": 0.0625, + "step": 861 + }, + { + "completion_length": 237.296875, + "epoch": 0.2746972594008923, + "grad_norm": 7.306424140930176, + "kl": 0.07958984375, + "learning_rate": 7.253027405991076e-07, + "loss": 0.0032, + "reward": 1.5290510654449463, + "reward_std": 0.08280518651008606, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5290510654449463, + "rewards/pad": 0.0, + "step": 862 + }, + { + "completion_length": 193.671875, + "epoch": 0.2750159337157425, + "grad_norm": 12.244972229003906, + "kl": 0.0849609375, + "learning_rate": 7.249840662842574e-07, + "loss": 0.0034, + "reward": 1.6881325244903564, + "reward_std": 0.16697387397289276, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4693824350833893, + "rewards/pad": 0.234375, + "step": 863 + }, + { + "completion_length": 176.578125, + "epoch": 0.27533460803059273, + "grad_norm": 29.74585723876953, + "kl": 0.07763671875, + "learning_rate": 7.246653919694072e-07, + "loss": 0.0031, + "reward": 1.7264668941497803, + "reward_std": 0.14079639315605164, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.6170920133590698, + "rewards/pad": 0.140625, + "step": 864 + }, + { + "completion_length": 275.171875, + "epoch": 0.27565328234544295, + "grad_norm": 14.265174865722656, + "kl": 0.06494140625, + "learning_rate": 7.24346717654557e-07, + "loss": 0.0026, + "reward": 1.6074364185333252, + "reward_std": 0.19189007580280304, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.38868647813796997, + "step": 865 + }, + { + "completion_length": 186.53125, + "epoch": 0.27597195666029317, + "grad_norm": 5.63706636428833, + "kl": 0.0673828125, + "learning_rate": 7.240280433397067e-07, + "loss": 0.0027, + "reward": 1.4767136573791504, + "reward_std": 0.1421107053756714, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3517136573791504, + "step": 866 + }, + { + "completion_length": 253.46875, + "epoch": 0.2762906309751434, + "grad_norm": 214.72650146484375, + "kl": 0.076171875, + "learning_rate": 7.237093690248566e-07, + "loss": 0.003, + "reward": 1.5148383378982544, + "reward_std": 0.11171391606330872, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5148383378982544, + "step": 867 + }, + { + "completion_length": 242.109375, + "epoch": 0.2766093052899936, + "grad_norm": 17.771141052246094, + "kl": 0.107421875, + "learning_rate": 7.233906947100064e-07, + "loss": 0.0043, + "reward": 1.527846097946167, + "reward_std": 0.10529676079750061, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40284615755081177, + "step": 868 + }, + { + "completion_length": 236.796875, + "epoch": 0.27692797960484383, + "grad_norm": 8.183026313781738, + "kl": 0.0625, + "learning_rate": 7.230720203951562e-07, + "loss": 0.0025, + "reward": 1.6474695205688477, + "reward_std": 0.07472085952758789, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39746958017349243, + "rewards/pad": 0.25, + "step": 869 + }, + { + "completion_length": 97.1875, + "epoch": 0.27724665391969405, + "grad_norm": 14.428912162780762, + "kl": 0.1298828125, + "learning_rate": 7.227533460803059e-07, + "loss": 0.0052, + "reward": 1.3619276285171509, + "reward_std": 0.1646977812051773, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37755271792411804, + "rewards/pad": 0.0, + "step": 870 + }, + { + "completion_length": 140.953125, + "epoch": 0.2775653282345443, + "grad_norm": 9.364572525024414, + "kl": 0.166015625, + "learning_rate": 7.224346717654557e-07, + "loss": 0.0067, + "reward": 1.5766136646270752, + "reward_std": 0.1297195702791214, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4828636944293976, + "rewards/pad": 0.09375, + "step": 871 + }, + { + "completion_length": 227.78125, + "epoch": 0.2778840025493945, + "grad_norm": 53.78718185424805, + "kl": 0.08984375, + "learning_rate": 7.221159974506055e-07, + "loss": 0.0036, + "reward": 1.5020396709442139, + "reward_std": 0.06564459204673767, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5020396113395691, + "rewards/pad": 0.0, + "step": 872 + }, + { + "completion_length": 215.78125, + "epoch": 0.27820267686424477, + "grad_norm": 19.729019165039062, + "kl": 0.08349609375, + "learning_rate": 7.217973231357553e-07, + "loss": 0.0033, + "reward": 1.5267720222473145, + "reward_std": 0.13667306303977966, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47989708185195923, + "rewards/pad": 0.0625, + "step": 873 + }, + { + "completion_length": 203.4375, + "epoch": 0.278521351179095, + "grad_norm": 22.31565284729004, + "kl": 0.07080078125, + "learning_rate": 7.21478648820905e-07, + "loss": 0.0028, + "reward": 1.5913581848144531, + "reward_std": 0.09478233754634857, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4663581848144531, + "step": 874 + }, + { + "completion_length": 168.90625, + "epoch": 0.2788400254939452, + "grad_norm": 8.827746391296387, + "kl": 0.10986328125, + "learning_rate": 7.211599745060548e-07, + "loss": 0.0044, + "reward": 1.4382688999176025, + "reward_std": 0.1273890733718872, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4382690191268921, + "rewards/pad": 0.0, + "step": 875 + }, + { + "completion_length": 296.96875, + "epoch": 0.27915869980879543, + "grad_norm": 7.749444484710693, + "kl": 0.07421875, + "learning_rate": 7.208413001912046e-07, + "loss": 0.003, + "reward": 1.5172462463378906, + "reward_std": 0.10500482469797134, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5172461867332458, + "rewards/pad": 0.0, + "step": 876 + }, + { + "completion_length": 272.921875, + "epoch": 0.27947737412364565, + "grad_norm": 7.46060848236084, + "kl": 0.0654296875, + "learning_rate": 7.205226258763544e-07, + "loss": 0.0026, + "reward": 1.5742771625518799, + "reward_std": 0.04187481105327606, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44927719235420227, + "rewards/pad": 0.125, + "step": 877 + }, + { + "completion_length": 258.484375, + "epoch": 0.27979604843849587, + "grad_norm": 27.48328971862793, + "kl": 0.0693359375, + "learning_rate": 7.202039515615041e-07, + "loss": 0.0028, + "reward": 1.642381191253662, + "reward_std": 0.05967498570680618, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3923811912536621, + "rewards/pad": 0.25, + "step": 878 + }, + { + "completion_length": 246.8125, + "epoch": 0.2801147227533461, + "grad_norm": 10.169364929199219, + "kl": 0.06494140625, + "learning_rate": 7.198852772466539e-07, + "loss": 0.0026, + "reward": 1.5621225833892822, + "reward_std": 0.11288601160049438, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5308725833892822, + "step": 879 + }, + { + "completion_length": 247.578125, + "epoch": 0.2804333970681963, + "grad_norm": 8.097737312316895, + "kl": 0.0703125, + "learning_rate": 7.195666029318037e-07, + "loss": 0.0028, + "reward": 1.6014553308486938, + "reward_std": 0.06341607868671417, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47645536065101624, + "step": 880 + }, + { + "completion_length": 238.84375, + "epoch": 0.28075207138304653, + "grad_norm": 16.25465202331543, + "kl": 0.07666015625, + "learning_rate": 7.192479286169535e-07, + "loss": 0.0031, + "reward": 1.5091499090194702, + "reward_std": 0.077355295419693, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5091499090194702, + "step": 881 + }, + { + "completion_length": 206.546875, + "epoch": 0.28107074569789675, + "grad_norm": 26.487606048583984, + "kl": 0.07177734375, + "learning_rate": 7.189292543021032e-07, + "loss": 0.0029, + "reward": 1.6797242164611816, + "reward_std": 0.10398919880390167, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42972421646118164, + "step": 882 + }, + { + "completion_length": 191.9375, + "epoch": 0.28138942001274697, + "grad_norm": 11.197914123535156, + "kl": 0.087890625, + "learning_rate": 7.186105799872529e-07, + "loss": 0.0035, + "reward": 1.8811314105987549, + "reward_std": 0.09780030697584152, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6311314105987549, + "rewards/pad": 0.25, + "step": 883 + }, + { + "completion_length": 248.859375, + "epoch": 0.2817080943275972, + "grad_norm": 11.701794624328613, + "kl": 0.0791015625, + "learning_rate": 7.182919056724027e-07, + "loss": 0.0032, + "reward": 1.3616845607757568, + "reward_std": 0.11649852991104126, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37730950117111206, + "step": 884 + }, + { + "completion_length": 176.015625, + "epoch": 0.2820267686424474, + "grad_norm": 13.844351768493652, + "kl": 0.1103515625, + "learning_rate": 7.179732313575525e-07, + "loss": 0.0044, + "reward": 1.5564985275268555, + "reward_std": 0.0744241252541542, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5564984679222107, + "rewards/pad": 0.0, + "step": 885 + }, + { + "completion_length": 216.5, + "epoch": 0.28234544295729763, + "grad_norm": 16.154176712036133, + "kl": 0.08056640625, + "learning_rate": 7.176545570427023e-07, + "loss": 0.0032, + "reward": 1.5752778053283691, + "reward_std": 0.09883183240890503, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5909029245376587, + "rewards/pad": 0.0, + "step": 886 + }, + { + "completion_length": 170.671875, + "epoch": 0.28266411727214785, + "grad_norm": 15.331693649291992, + "kl": 0.091796875, + "learning_rate": 7.173358827278521e-07, + "loss": 0.0037, + "reward": 1.3783342838287354, + "reward_std": 0.18478818237781525, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3627093434333801, + "step": 887 + }, + { + "completion_length": 302.640625, + "epoch": 0.2829827915869981, + "grad_norm": 11.60513973236084, + "kl": 0.06787109375, + "learning_rate": 7.170172084130019e-07, + "loss": 0.0027, + "reward": 1.373932123184204, + "reward_std": 0.09270681440830231, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3739321827888489, + "rewards/pad": 0.0, + "step": 888 + }, + { + "completion_length": 144.0625, + "epoch": 0.2833014659018483, + "grad_norm": 15.91201400756836, + "kl": 0.08203125, + "learning_rate": 7.166985340981517e-07, + "loss": 0.0033, + "reward": 1.8487534523010254, + "reward_std": 0.1471112072467804, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5362533926963806, + "step": 889 + }, + { + "completion_length": 182.625, + "epoch": 0.2836201402166985, + "grad_norm": 19.750171661376953, + "kl": 0.103515625, + "learning_rate": 7.163798597833014e-07, + "loss": 0.0042, + "reward": 1.5224273204803467, + "reward_std": 0.11883044987916946, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3974272608757019, + "rewards/pad": 0.125, + "step": 890 + }, + { + "completion_length": 238.671875, + "epoch": 0.28393881453154873, + "grad_norm": 16.4511661529541, + "kl": 0.07421875, + "learning_rate": 7.160611854684512e-07, + "loss": 0.003, + "reward": 1.7034804821014404, + "reward_std": 0.14997044205665588, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48473045229911804, + "step": 891 + }, + { + "completion_length": 259.921875, + "epoch": 0.28425748884639895, + "grad_norm": 10.787720680236816, + "kl": 0.08056640625, + "learning_rate": 7.15742511153601e-07, + "loss": 0.0032, + "reward": 1.3430256843566895, + "reward_std": 0.13406524062156677, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.24927571415901184, + "step": 892 + }, + { + "completion_length": 123.34375, + "epoch": 0.28457616316124923, + "grad_norm": 14.076187133789062, + "kl": 0.09912109375, + "learning_rate": 7.154238368387507e-07, + "loss": 0.004, + "reward": 1.7401636838912964, + "reward_std": 0.17341656982898712, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4276636838912964, + "rewards/pad": 0.3125, + "step": 893 + }, + { + "completion_length": 320.71875, + "epoch": 0.28489483747609945, + "grad_norm": 12.595187187194824, + "kl": 0.0673828125, + "learning_rate": 7.151051625239005e-07, + "loss": 0.0027, + "reward": 1.4644830226898193, + "reward_std": 0.07389900088310242, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4644829034805298, + "step": 894 + }, + { + "completion_length": 217.75, + "epoch": 0.28521351179094967, + "grad_norm": 23.805479049682617, + "kl": 0.072265625, + "learning_rate": 7.147864882090503e-07, + "loss": 0.0029, + "reward": 1.5951404571533203, + "reward_std": 0.19070225954055786, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4076404869556427, + "rewards/pad": 0.1875, + "step": 895 + }, + { + "completion_length": 274.15625, + "epoch": 0.2855321861057999, + "grad_norm": 11.52724552154541, + "kl": 0.057861328125, + "learning_rate": 7.144678138942001e-07, + "loss": 0.0023, + "reward": 1.6295645236968994, + "reward_std": 0.12622328102588654, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4733145833015442, + "step": 896 + }, + { + "completion_length": 266.609375, + "epoch": 0.2858508604206501, + "grad_norm": 9.084197044372559, + "kl": 0.07568359375, + "learning_rate": 7.141491395793498e-07, + "loss": 0.003, + "reward": 1.4604337215423584, + "reward_std": 0.048416588455438614, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4604337811470032, + "rewards/pad": 0.0, + "step": 897 + }, + { + "completion_length": 195.125, + "epoch": 0.28616953473550033, + "grad_norm": 129.77342224121094, + "kl": 0.09033203125, + "learning_rate": 7.138304652644996e-07, + "loss": 0.0036, + "reward": 1.7572461366653442, + "reward_std": 0.1534266173839569, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5384961366653442, + "rewards/pad": 0.234375, + "step": 898 + }, + { + "completion_length": 303.796875, + "epoch": 0.28648820905035055, + "grad_norm": 6.984428405761719, + "kl": 0.062255859375, + "learning_rate": 7.135117909496494e-07, + "loss": 0.0025, + "reward": 1.5522358417510986, + "reward_std": 0.1406959742307663, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3334859311580658, + "step": 899 + }, + { + "completion_length": 177.53125, + "epoch": 0.28680688336520077, + "grad_norm": 10.488786697387695, + "kl": 0.08642578125, + "learning_rate": 7.131931166347992e-07, + "loss": 0.0035, + "reward": 1.5821726322174072, + "reward_std": 0.06763388216495514, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.33217257261276245, + "step": 900 + }, + { + "completion_length": 236.484375, + "epoch": 0.287125557680051, + "grad_norm": 12.510342597961426, + "kl": 0.083984375, + "learning_rate": 7.128744423199489e-07, + "loss": 0.0033, + "reward": 1.6787164211273193, + "reward_std": 0.1262408196926117, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5693413615226746, + "step": 901 + }, + { + "completion_length": 238.9375, + "epoch": 0.2874442319949012, + "grad_norm": 9.29166316986084, + "kl": 0.07421875, + "learning_rate": 7.125557680050987e-07, + "loss": 0.003, + "reward": 1.4003498554229736, + "reward_std": 0.14340917766094208, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.41597479581832886, + "rewards/pad": 0.0, + "step": 902 + }, + { + "completion_length": 367.359375, + "epoch": 0.28776290630975143, + "grad_norm": 8.235332489013672, + "kl": 0.0439453125, + "learning_rate": 7.122370936902485e-07, + "loss": 0.0018, + "reward": 1.6242144107818604, + "reward_std": 0.21928203105926514, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.5460892915725708, + "step": 903 + }, + { + "completion_length": 156.9375, + "epoch": 0.28808158062460165, + "grad_norm": 15.566165924072266, + "kl": 0.1142578125, + "learning_rate": 7.119184193753984e-07, + "loss": 0.0046, + "reward": 1.7355973720550537, + "reward_std": 0.16478106379508972, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6730973720550537, + "rewards/pad": 0.0625, + "step": 904 + }, + { + "completion_length": 344.46875, + "epoch": 0.28840025493945187, + "grad_norm": 5.945833206176758, + "kl": 0.06298828125, + "learning_rate": 7.115997450605481e-07, + "loss": 0.0025, + "reward": 1.3633050918579102, + "reward_std": 0.12290232628583908, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37893012166023254, + "rewards/pad": 0.0, + "step": 905 + }, + { + "completion_length": 116.40625, + "epoch": 0.2887189292543021, + "grad_norm": 10.99114990234375, + "kl": 0.1484375, + "learning_rate": 7.112810707456979e-07, + "loss": 0.0059, + "reward": 1.7536451816558838, + "reward_std": 0.169193834066391, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.628645122051239, + "rewards/pad": 0.125, + "step": 906 + }, + { + "completion_length": 288.375, + "epoch": 0.2890376035691523, + "grad_norm": 6.997311115264893, + "kl": 0.06689453125, + "learning_rate": 7.109623964308477e-07, + "loss": 0.0027, + "reward": 1.539267897605896, + "reward_std": 0.12635095417499542, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.554892897605896, + "step": 907 + }, + { + "completion_length": 315.0625, + "epoch": 0.28935627788400253, + "grad_norm": 8.444843292236328, + "kl": 0.06591796875, + "learning_rate": 7.106437221159975e-07, + "loss": 0.0026, + "reward": 1.4306590557098389, + "reward_std": 0.094997838139534, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44628414511680603, + "rewards/pad": 0.0, + "step": 908 + }, + { + "completion_length": 232.84375, + "epoch": 0.28967495219885275, + "grad_norm": 10.925509452819824, + "kl": 0.0751953125, + "learning_rate": 7.103250478011472e-07, + "loss": 0.003, + "reward": 1.534977912902832, + "reward_std": 0.06576971709728241, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.534977912902832, + "rewards/pad": 0.0, + "step": 909 + }, + { + "completion_length": 153.984375, + "epoch": 0.289993626513703, + "grad_norm": 9.139330863952637, + "kl": 0.0947265625, + "learning_rate": 7.10006373486297e-07, + "loss": 0.0038, + "reward": 1.4619795083999634, + "reward_std": 0.0989057868719101, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4619795083999634, + "rewards/pad": 0.0, + "step": 910 + }, + { + "completion_length": 303.1875, + "epoch": 0.2903123008285532, + "grad_norm": 10.7050142288208, + "kl": 0.06396484375, + "learning_rate": 7.096876991714468e-07, + "loss": 0.0026, + "reward": 1.5856952667236328, + "reward_std": 0.08746597915887833, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.585695207118988, + "step": 911 + }, + { + "completion_length": 228.046875, + "epoch": 0.29063097514340347, + "grad_norm": 8.583359718322754, + "kl": 0.064453125, + "learning_rate": 7.093690248565966e-07, + "loss": 0.0026, + "reward": 1.718181848526001, + "reward_std": 0.1566476970911026, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46818190813064575, + "step": 912 + }, + { + "completion_length": 194.46875, + "epoch": 0.2909496494582537, + "grad_norm": 14.333474159240723, + "kl": 0.09912109375, + "learning_rate": 7.090503505417463e-07, + "loss": 0.004, + "reward": 1.5772305727005005, + "reward_std": 0.15488551557064056, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4678555428981781, + "rewards/pad": 0.125, + "step": 913 + }, + { + "completion_length": 313.671875, + "epoch": 0.2912683237731039, + "grad_norm": 15.910902976989746, + "kl": 0.0576171875, + "learning_rate": 7.087316762268961e-07, + "loss": 0.0023, + "reward": 1.5254408121109009, + "reward_std": 0.11735684424638748, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4785658121109009, + "rewards/pad": 0.046875, + "step": 914 + }, + { + "completion_length": 346.765625, + "epoch": 0.29158699808795413, + "grad_norm": 4.648716449737549, + "kl": 0.05615234375, + "learning_rate": 7.084130019120459e-07, + "loss": 0.0023, + "reward": 1.4281669855117798, + "reward_std": 0.051169656217098236, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4281669855117798, + "step": 915 + }, + { + "completion_length": 272.890625, + "epoch": 0.29190567240280435, + "grad_norm": 8.32939338684082, + "kl": 0.08642578125, + "learning_rate": 7.080943275971957e-07, + "loss": 0.0035, + "reward": 1.4463294744491577, + "reward_std": 0.0748964250087738, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4463294744491577, + "rewards/pad": 0.0, + "step": 916 + }, + { + "completion_length": 218.453125, + "epoch": 0.29222434671765457, + "grad_norm": 19.580448150634766, + "kl": 0.07763671875, + "learning_rate": 7.077756532823454e-07, + "loss": 0.0031, + "reward": 1.6393697261810303, + "reward_std": 0.17384447157382965, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5299947261810303, + "rewards/pad": 0.125, + "step": 917 + }, + { + "completion_length": 255.40625, + "epoch": 0.2925430210325048, + "grad_norm": 8.05824089050293, + "kl": 0.07421875, + "learning_rate": 7.074569789674952e-07, + "loss": 0.003, + "reward": 1.5728120803833008, + "reward_std": 0.15007048845291138, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4321870505809784, + "step": 918 + }, + { + "completion_length": 320.453125, + "epoch": 0.292861695347355, + "grad_norm": 8.89892578125, + "kl": 0.0517578125, + "learning_rate": 7.07138304652645e-07, + "loss": 0.0021, + "reward": 1.5870790481567383, + "reward_std": 0.1334977000951767, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5245789289474487, + "step": 919 + }, + { + "completion_length": 245.453125, + "epoch": 0.29318036966220523, + "grad_norm": 15.274248123168945, + "kl": 0.103515625, + "learning_rate": 7.068196303377948e-07, + "loss": 0.0041, + "reward": 1.6676104068756104, + "reward_std": 0.2029799222946167, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4176103174686432, + "rewards/pad": 0.25, + "step": 920 + }, + { + "completion_length": 287.90625, + "epoch": 0.29349904397705545, + "grad_norm": 6.506035804748535, + "kl": 0.0888671875, + "learning_rate": 7.065009560229445e-07, + "loss": 0.0036, + "reward": 1.3093442916870117, + "reward_std": 0.17159739136695862, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3405943810939789, + "step": 921 + }, + { + "completion_length": 230.25, + "epoch": 0.29381771829190567, + "grad_norm": 6.8670806884765625, + "kl": 0.1015625, + "learning_rate": 7.061822817080942e-07, + "loss": 0.0041, + "reward": 1.4682257175445557, + "reward_std": 0.23254211246967316, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3744756579399109, + "step": 922 + }, + { + "completion_length": 216.6875, + "epoch": 0.2941363926067559, + "grad_norm": 13.328472137451172, + "kl": 0.07861328125, + "learning_rate": 7.05863607393244e-07, + "loss": 0.0031, + "reward": 1.7020766735076904, + "reward_std": 0.0875585600733757, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5770766735076904, + "step": 923 + }, + { + "completion_length": 109.640625, + "epoch": 0.2944550669216061, + "grad_norm": 8.600102424621582, + "kl": 0.11279296875, + "learning_rate": 7.055449330783938e-07, + "loss": 0.0045, + "reward": 1.7592469453811646, + "reward_std": 0.1416284590959549, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5248720049858093, + "rewards/pad": 0.234375, + "step": 924 + }, + { + "completion_length": 282.90625, + "epoch": 0.29477374123645633, + "grad_norm": 7.736202716827393, + "kl": 0.06689453125, + "learning_rate": 7.052262587635436e-07, + "loss": 0.0027, + "reward": 1.674477219581604, + "reward_std": 0.12267719209194183, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4244771897792816, + "rewards/pad": 0.25, + "step": 925 + }, + { + "completion_length": 420.171875, + "epoch": 0.29509241555130655, + "grad_norm": 5.292170524597168, + "kl": 0.044189453125, + "learning_rate": 7.049075844486934e-07, + "loss": 0.0018, + "reward": 1.5019543170928955, + "reward_std": 0.08003868907690048, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5019543170928955, + "rewards/pad": 0.0, + "step": 926 + }, + { + "completion_length": 300.0625, + "epoch": 0.2954110898661568, + "grad_norm": 11.51095962524414, + "kl": 0.07666015625, + "learning_rate": 7.045889101338432e-07, + "loss": 0.0031, + "reward": 1.627131700515747, + "reward_std": 0.07033547013998032, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5021317005157471, + "step": 927 + }, + { + "completion_length": 202.125, + "epoch": 0.295729764181007, + "grad_norm": 10.411168098449707, + "kl": 0.08447265625, + "learning_rate": 7.042702358189929e-07, + "loss": 0.0034, + "reward": 1.411263108253479, + "reward_std": 0.11484085023403168, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3643881380558014, + "rewards/pad": 0.046875, + "step": 928 + }, + { + "completion_length": 237.90625, + "epoch": 0.2960484384958572, + "grad_norm": 22.574514389038086, + "kl": 0.115234375, + "learning_rate": 7.039515615041427e-07, + "loss": 0.0046, + "reward": 1.503699779510498, + "reward_std": 0.17890988290309906, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4880746901035309, + "rewards/pad": 0.03125, + "step": 929 + }, + { + "completion_length": 209.921875, + "epoch": 0.29636711281070743, + "grad_norm": 6.581096172332764, + "kl": 0.0849609375, + "learning_rate": 7.036328871892925e-07, + "loss": 0.0034, + "reward": 1.4850600957870483, + "reward_std": 0.18157213926315308, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37568503618240356, + "rewards/pad": 0.109375, + "step": 930 + }, + { + "completion_length": 385.5, + "epoch": 0.29668578712555765, + "grad_norm": 4.934545516967773, + "kl": 0.055419921875, + "learning_rate": 7.033142128744423e-07, + "loss": 0.0022, + "reward": 1.3227486610412598, + "reward_std": 0.1512432098388672, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.24462367594242096, + "step": 931 + }, + { + "completion_length": 258.609375, + "epoch": 0.29700446144040793, + "grad_norm": 9.287094116210938, + "kl": 0.1123046875, + "learning_rate": 7.02995538559592e-07, + "loss": 0.0045, + "reward": 1.538500189781189, + "reward_std": 0.14638297259807587, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46037521958351135, + "rewards/pad": 0.078125, + "step": 932 + }, + { + "completion_length": 200.71875, + "epoch": 0.29732313575525815, + "grad_norm": 8.858744621276855, + "kl": 0.09375, + "learning_rate": 7.026768642447418e-07, + "loss": 0.0038, + "reward": 1.552154302597046, + "reward_std": 0.07690747827291489, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5521542429924011, + "step": 933 + }, + { + "completion_length": 269.921875, + "epoch": 0.29764181007010837, + "grad_norm": 11.787874221801758, + "kl": 0.07373046875, + "learning_rate": 7.023581899298916e-07, + "loss": 0.0029, + "reward": 1.6634650230407715, + "reward_std": 0.13022476434707642, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36659008264541626, + "rewards/pad": 0.296875, + "step": 934 + }, + { + "completion_length": 350.765625, + "epoch": 0.2979604843849586, + "grad_norm": 11.997943878173828, + "kl": 0.0859375, + "learning_rate": 7.020395156150414e-07, + "loss": 0.0034, + "reward": 1.3943145275115967, + "reward_std": 0.18118952214717865, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3474395275115967, + "step": 935 + }, + { + "completion_length": 480.125, + "epoch": 0.2982791586998088, + "grad_norm": 4.998562335968018, + "kl": 0.040771484375, + "learning_rate": 7.017208413001911e-07, + "loss": 0.0016, + "reward": 1.3589204549789429, + "reward_std": 0.010401003994047642, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.35892045497894287, + "step": 936 + }, + { + "completion_length": 206.6875, + "epoch": 0.29859783301465903, + "grad_norm": 10.145219802856445, + "kl": 0.08984375, + "learning_rate": 7.014021669853409e-07, + "loss": 0.0036, + "reward": 1.6010687351226807, + "reward_std": 0.15740467607975006, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4760686159133911, + "rewards/pad": 0.125, + "step": 937 + }, + { + "completion_length": 254.484375, + "epoch": 0.29891650732950925, + "grad_norm": 8.828103065490723, + "kl": 0.08544921875, + "learning_rate": 7.010834926704907e-07, + "loss": 0.0034, + "reward": 1.708954930305481, + "reward_std": 0.05463279038667679, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.583954930305481, + "rewards/pad": 0.125, + "step": 938 + }, + { + "completion_length": 208.828125, + "epoch": 0.29923518164435947, + "grad_norm": 9.465192794799805, + "kl": 0.08349609375, + "learning_rate": 7.007648183556405e-07, + "loss": 0.0033, + "reward": 1.4708402156829834, + "reward_std": 0.15929856896400452, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4552152156829834, + "step": 939 + }, + { + "completion_length": 264.234375, + "epoch": 0.2995538559592097, + "grad_norm": 5.2820024490356445, + "kl": 0.072265625, + "learning_rate": 7.004461440407902e-07, + "loss": 0.0029, + "reward": 1.665083646774292, + "reward_std": 0.14066141843795776, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5088337063789368, + "step": 940 + }, + { + "completion_length": 341.015625, + "epoch": 0.2998725302740599, + "grad_norm": 3.734297275543213, + "kl": 0.05712890625, + "learning_rate": 7.0012746972594e-07, + "loss": 0.0023, + "reward": 1.414062738418579, + "reward_std": 0.04965207725763321, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41406285762786865, + "step": 941 + }, + { + "completion_length": 198.28125, + "epoch": 0.30019120458891013, + "grad_norm": 13.466355323791504, + "kl": 0.06982421875, + "learning_rate": 6.998087954110899e-07, + "loss": 0.0028, + "reward": 1.7419767379760742, + "reward_std": 0.22685980796813965, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44510167837142944, + "rewards/pad": 0.3125, + "step": 942 + }, + { + "completion_length": 252.84375, + "epoch": 0.30050987890376035, + "grad_norm": 8.96711254119873, + "kl": 0.09521484375, + "learning_rate": 6.994901210962397e-07, + "loss": 0.0038, + "reward": 1.5406872034072876, + "reward_std": 0.22085599601268768, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5406872034072876, + "rewards/pad": 0.03125, + "step": 943 + }, + { + "completion_length": 164.78125, + "epoch": 0.3008285532186106, + "grad_norm": 33.10133361816406, + "kl": 0.10009765625, + "learning_rate": 6.991714467813894e-07, + "loss": 0.004, + "reward": 1.4604114294052124, + "reward_std": 0.23422494530677795, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4291614294052124, + "rewards/pad": 0.046875, + "step": 944 + }, + { + "completion_length": 267.5, + "epoch": 0.3011472275334608, + "grad_norm": 9.718307495117188, + "kl": 0.09130859375, + "learning_rate": 6.988527724665392e-07, + "loss": 0.0037, + "reward": 1.4997873306274414, + "reward_std": 0.08561849594116211, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.499787300825119, + "rewards/pad": 0.0, + "step": 945 + }, + { + "completion_length": 322.53125, + "epoch": 0.301465901848311, + "grad_norm": 6.480549335479736, + "kl": 0.08447265625, + "learning_rate": 6.98534098151689e-07, + "loss": 0.0034, + "reward": 1.5078763961791992, + "reward_std": 0.18586468696594238, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44537633657455444, + "rewards/pad": 0.0625, + "step": 946 + }, + { + "completion_length": 241.09375, + "epoch": 0.30178457616316123, + "grad_norm": 102.33425903320312, + "kl": 0.091796875, + "learning_rate": 6.982154238368388e-07, + "loss": 0.0037, + "reward": 1.5356953144073486, + "reward_std": 0.0902363583445549, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5356953144073486, + "rewards/pad": 0.0, + "step": 947 + }, + { + "completion_length": 360.828125, + "epoch": 0.30210325047801145, + "grad_norm": 8.094786643981934, + "kl": 0.07275390625, + "learning_rate": 6.978967495219885e-07, + "loss": 0.0029, + "reward": 1.5371003150939941, + "reward_std": 0.11551005393266678, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4277253746986389, + "step": 948 + }, + { + "completion_length": 185.453125, + "epoch": 0.3024219247928617, + "grad_norm": 14.498106002807617, + "kl": 0.09765625, + "learning_rate": 6.975780752071383e-07, + "loss": 0.0039, + "reward": 1.489546537399292, + "reward_std": 0.1398780643939972, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.473921537399292, + "rewards/pad": 0.015625, + "step": 949 + }, + { + "completion_length": 255.578125, + "epoch": 0.3027405991077119, + "grad_norm": 6.012439727783203, + "kl": 0.10498046875, + "learning_rate": 6.972594008922881e-07, + "loss": 0.0042, + "reward": 1.3009319305419922, + "reward_std": 0.11145664751529694, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2853069305419922, + "rewards/pad": 0.015625, + "step": 950 + }, + { + "completion_length": 307.671875, + "epoch": 0.3030592734225621, + "grad_norm": 8.013301849365234, + "kl": 0.07421875, + "learning_rate": 6.969407265774379e-07, + "loss": 0.003, + "reward": 1.384726881980896, + "reward_std": 0.06668713688850403, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.384726881980896, + "step": 951 + }, + { + "completion_length": 421.75, + "epoch": 0.3033779477374124, + "grad_norm": 8.324105262756348, + "kl": 0.06494140625, + "learning_rate": 6.966220522625876e-07, + "loss": 0.0026, + "reward": 1.3232536315917969, + "reward_std": 0.1271124929189682, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.35450369119644165, + "step": 952 + }, + { + "completion_length": 200.96875, + "epoch": 0.3036966220522626, + "grad_norm": 13.980183601379395, + "kl": 0.078125, + "learning_rate": 6.963033779477374e-07, + "loss": 0.0031, + "reward": 1.4275410175323486, + "reward_std": 0.1396736055612564, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42754098773002625, + "rewards/pad": 0.0, + "step": 953 + }, + { + "completion_length": 337.875, + "epoch": 0.30401529636711283, + "grad_norm": 9.594482421875, + "kl": 0.08203125, + "learning_rate": 6.959847036328872e-07, + "loss": 0.0033, + "reward": 1.5653290748596191, + "reward_std": 0.13285420835018158, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4715789556503296, + "step": 954 + }, + { + "completion_length": 319.203125, + "epoch": 0.30433397068196305, + "grad_norm": 19.658885955810547, + "kl": 0.06396484375, + "learning_rate": 6.95666029318037e-07, + "loss": 0.0026, + "reward": 1.576867938041687, + "reward_std": 0.08937421441078186, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.467492938041687, + "step": 955 + }, + { + "completion_length": 446.875, + "epoch": 0.30465264499681327, + "grad_norm": 4.599447250366211, + "kl": 0.049072265625, + "learning_rate": 6.953473550031867e-07, + "loss": 0.002, + "reward": 1.4357045888900757, + "reward_std": 0.11575381457805634, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3419545590877533, + "step": 956 + }, + { + "completion_length": 283.515625, + "epoch": 0.3049713193116635, + "grad_norm": 13.620803833007812, + "kl": 0.0869140625, + "learning_rate": 6.950286806883365e-07, + "loss": 0.0035, + "reward": 1.4528107643127441, + "reward_std": 0.076294906437397, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45281076431274414, + "rewards/pad": 0.0, + "step": 957 + }, + { + "completion_length": 236.9375, + "epoch": 0.3052899936265137, + "grad_norm": 9.338239669799805, + "kl": 0.0859375, + "learning_rate": 6.947100063734863e-07, + "loss": 0.0034, + "reward": 1.412969946861267, + "reward_std": 0.15528462827205658, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2879698872566223, + "rewards/pad": 0.125, + "step": 958 + }, + { + "completion_length": 278.078125, + "epoch": 0.30560866794136393, + "grad_norm": 7.011009693145752, + "kl": 0.07861328125, + "learning_rate": 6.94391332058636e-07, + "loss": 0.0031, + "reward": 1.4641616344451904, + "reward_std": 0.1217249259352684, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47978660464286804, + "step": 959 + }, + { + "completion_length": 203.078125, + "epoch": 0.30592734225621415, + "grad_norm": 12.84834098815918, + "kl": 0.08837890625, + "learning_rate": 6.940726577437858e-07, + "loss": 0.0035, + "reward": 1.4572542905807495, + "reward_std": 0.22139707207679749, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4728792905807495, + "rewards/pad": 0.015625, + "step": 960 + }, + { + "completion_length": 292.71875, + "epoch": 0.3062460165710644, + "grad_norm": 17.683250427246094, + "kl": 0.08203125, + "learning_rate": 6.937539834289357e-07, + "loss": 0.0033, + "reward": 1.4127620458602905, + "reward_std": 0.11772623658180237, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3190120756626129, + "step": 961 + }, + { + "completion_length": 159.9375, + "epoch": 0.3065646908859146, + "grad_norm": 12.655012130737305, + "kl": 0.10546875, + "learning_rate": 6.934353091140854e-07, + "loss": 0.0042, + "reward": 1.7172095775604248, + "reward_std": 0.14537625014781952, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4984596073627472, + "rewards/pad": 0.234375, + "step": 962 + }, + { + "completion_length": 157.9375, + "epoch": 0.3068833652007648, + "grad_norm": 10.002306938171387, + "kl": 0.111328125, + "learning_rate": 6.931166347992351e-07, + "loss": 0.0045, + "reward": 1.5363670587539673, + "reward_std": 0.08813170343637466, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5363670587539673, + "rewards/pad": 0.0, + "step": 963 + }, + { + "completion_length": 441.34375, + "epoch": 0.30720203951561503, + "grad_norm": 7.946275234222412, + "kl": 0.052490234375, + "learning_rate": 6.927979604843849e-07, + "loss": 0.0021, + "reward": 1.5722520351409912, + "reward_std": 0.08100847154855728, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5878770351409912, + "step": 964 + }, + { + "completion_length": 269.84375, + "epoch": 0.30752071383046525, + "grad_norm": 6.707942962646484, + "kl": 0.0859375, + "learning_rate": 6.924792861695347e-07, + "loss": 0.0034, + "reward": 1.5477776527404785, + "reward_std": 0.10902424156665802, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4540277421474457, + "rewards/pad": 0.125, + "step": 965 + }, + { + "completion_length": 257.84375, + "epoch": 0.3078393881453155, + "grad_norm": 7.706878185272217, + "kl": 0.07763671875, + "learning_rate": 6.921606118546845e-07, + "loss": 0.0031, + "reward": 1.347168207168579, + "reward_std": 0.07446466386318207, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3471682071685791, + "step": 966 + }, + { + "completion_length": 212.40625, + "epoch": 0.3081580624601657, + "grad_norm": 10.741114616394043, + "kl": 0.10791015625, + "learning_rate": 6.918419375398342e-07, + "loss": 0.0043, + "reward": 1.5717949867248535, + "reward_std": 0.10261975973844528, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5717950463294983, + "step": 967 + }, + { + "completion_length": 264.390625, + "epoch": 0.3084767367750159, + "grad_norm": 13.138110160827637, + "kl": 0.10791015625, + "learning_rate": 6.91523263224984e-07, + "loss": 0.0043, + "reward": 1.5748289823532104, + "reward_std": 0.1230107843875885, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5904540419578552, + "rewards/pad": 0.0, + "step": 968 + }, + { + "completion_length": 164.9375, + "epoch": 0.30879541108986613, + "grad_norm": 15.702935218811035, + "kl": 0.10498046875, + "learning_rate": 6.912045889101338e-07, + "loss": 0.0042, + "reward": 1.4226768016815186, + "reward_std": 0.09258827567100525, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31330177187919617, + "rewards/pad": 0.109375, + "step": 969 + }, + { + "completion_length": 234.34375, + "epoch": 0.30911408540471635, + "grad_norm": 7.595775604248047, + "kl": 0.09130859375, + "learning_rate": 6.908859145952836e-07, + "loss": 0.0037, + "reward": 1.6368181705474854, + "reward_std": 0.13625429570674896, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5274431109428406, + "rewards/pad": 0.109375, + "step": 970 + }, + { + "completion_length": 213.921875, + "epoch": 0.30943275971956663, + "grad_norm": 12.487693786621094, + "kl": 0.1376953125, + "learning_rate": 6.905672402804333e-07, + "loss": 0.0055, + "reward": 1.4605860710144043, + "reward_std": 0.07868099212646484, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4605861306190491, + "rewards/pad": 0.0, + "step": 971 + }, + { + "completion_length": 219.09375, + "epoch": 0.30975143403441685, + "grad_norm": 7.416385173797607, + "kl": 0.11962890625, + "learning_rate": 6.902485659655831e-07, + "loss": 0.0048, + "reward": 1.4688482284545898, + "reward_std": 0.09033405780792236, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46884816884994507, + "rewards/pad": 0.0, + "step": 972 + }, + { + "completion_length": 356.0, + "epoch": 0.31007010834926707, + "grad_norm": 28.166015625, + "kl": 0.06982421875, + "learning_rate": 6.899298916507329e-07, + "loss": 0.0028, + "reward": 1.463951826095581, + "reward_std": 0.14015373587608337, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47957685589790344, + "rewards/pad": 0.0, + "step": 973 + }, + { + "completion_length": 314.265625, + "epoch": 0.3103887826641173, + "grad_norm": 6.10435676574707, + "kl": 0.060302734375, + "learning_rate": 6.896112173358827e-07, + "loss": 0.0024, + "reward": 1.4725921154022217, + "reward_std": 0.076809361577034, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3475921154022217, + "step": 974 + }, + { + "completion_length": 303.375, + "epoch": 0.3107074569789675, + "grad_norm": 17.244062423706055, + "kl": 0.080078125, + "learning_rate": 6.892925430210324e-07, + "loss": 0.0032, + "reward": 1.5629791021347046, + "reward_std": 0.10295584797859192, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4536040127277374, + "step": 975 + }, + { + "completion_length": 313.078125, + "epoch": 0.31102613129381773, + "grad_norm": 6.946135997772217, + "kl": 0.062255859375, + "learning_rate": 6.889738687061822e-07, + "loss": 0.0025, + "reward": 1.5047985315322876, + "reward_std": 0.18012316524982452, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.41104856133461, + "step": 976 + }, + { + "completion_length": 163.3125, + "epoch": 0.31134480560866795, + "grad_norm": 21.414690017700195, + "kl": 0.09423828125, + "learning_rate": 6.88655194391332e-07, + "loss": 0.0038, + "reward": 1.8146766424179077, + "reward_std": 0.1765768826007843, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5021766424179077, + "step": 977 + }, + { + "completion_length": 248.625, + "epoch": 0.31166347992351817, + "grad_norm": 23.901901245117188, + "kl": 0.07568359375, + "learning_rate": 6.883365200764818e-07, + "loss": 0.003, + "reward": 1.4768123626708984, + "reward_std": 0.11282205581665039, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4924373924732208, + "rewards/pad": 0.0, + "step": 978 + }, + { + "completion_length": 333.125, + "epoch": 0.3119821542383684, + "grad_norm": 5.673859119415283, + "kl": 0.05712890625, + "learning_rate": 6.880178457616315e-07, + "loss": 0.0023, + "reward": 1.4660868644714355, + "reward_std": 0.08494298905134201, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4817117750644684, + "step": 979 + }, + { + "completion_length": 258.796875, + "epoch": 0.3123008285532186, + "grad_norm": 12.043933868408203, + "kl": 0.07958984375, + "learning_rate": 6.876991714467814e-07, + "loss": 0.0032, + "reward": 1.6999037265777588, + "reward_std": 0.17478086054325104, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.496778666973114, + "step": 980 + }, + { + "completion_length": 331.9375, + "epoch": 0.31261950286806883, + "grad_norm": 4.478605270385742, + "kl": 0.05419921875, + "learning_rate": 6.873804971319312e-07, + "loss": 0.0022, + "reward": 1.4835875034332275, + "reward_std": 0.1486971229314804, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3898375630378723, + "step": 981 + }, + { + "completion_length": 365.6875, + "epoch": 0.31293817718291905, + "grad_norm": 7.0540313720703125, + "kl": 0.0703125, + "learning_rate": 6.87061822817081e-07, + "loss": 0.0028, + "reward": 1.3310976028442383, + "reward_std": 0.100373774766922, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34672248363494873, + "step": 982 + }, + { + "completion_length": 296.3125, + "epoch": 0.3132568514977693, + "grad_norm": 43.66001892089844, + "kl": 0.087890625, + "learning_rate": 6.867431485022307e-07, + "loss": 0.0035, + "reward": 1.3896299600601196, + "reward_std": 0.1810145378112793, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.327129989862442, + "rewards/pad": 0.078125, + "step": 983 + }, + { + "completion_length": 221.3125, + "epoch": 0.3135755258126195, + "grad_norm": 7.561755657196045, + "kl": 0.07958984375, + "learning_rate": 6.864244741873805e-07, + "loss": 0.0032, + "reward": 1.6374634504318237, + "reward_std": 0.13763760030269623, + "rewards/answer_reward": 0.1875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44996345043182373, + "step": 984 + }, + { + "completion_length": 334.921875, + "epoch": 0.3138942001274697, + "grad_norm": 9.691668510437012, + "kl": 0.076171875, + "learning_rate": 6.861057998725303e-07, + "loss": 0.003, + "reward": 1.5845732688903809, + "reward_std": 0.1470044106245041, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5064482688903809, + "step": 985 + }, + { + "completion_length": 273.375, + "epoch": 0.31421287444231993, + "grad_norm": 7.891676902770996, + "kl": 0.07177734375, + "learning_rate": 6.8578712555768e-07, + "loss": 0.0029, + "reward": 1.649653434753418, + "reward_std": 0.20933878421783447, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.47777846455574036, + "step": 986 + }, + { + "completion_length": 300.1875, + "epoch": 0.31453154875717015, + "grad_norm": 8.156953811645508, + "kl": 0.07373046875, + "learning_rate": 6.854684512428298e-07, + "loss": 0.003, + "reward": 1.531054973602295, + "reward_std": 0.07905551791191101, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5310550332069397, + "rewards/pad": 0.0, + "step": 987 + }, + { + "completion_length": 386.78125, + "epoch": 0.3148502230720204, + "grad_norm": 8.51009750366211, + "kl": 0.05078125, + "learning_rate": 6.851497769279796e-07, + "loss": 0.002, + "reward": 1.456154704093933, + "reward_std": 0.05731338635087013, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4561547040939331, + "rewards/pad": 0.0, + "step": 988 + }, + { + "completion_length": 310.0625, + "epoch": 0.3151688973868706, + "grad_norm": 9.364920616149902, + "kl": 0.078125, + "learning_rate": 6.848311026131294e-07, + "loss": 0.0031, + "reward": 1.444016933441162, + "reward_std": 0.11831048130989075, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4440169036388397, + "step": 989 + }, + { + "completion_length": 266.671875, + "epoch": 0.3154875717017208, + "grad_norm": 9.522704124450684, + "kl": 0.08251953125, + "learning_rate": 6.845124282982791e-07, + "loss": 0.0033, + "reward": 1.562047004699707, + "reward_std": 0.1287516951560974, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45267191529273987, + "step": 990 + }, + { + "completion_length": 273.453125, + "epoch": 0.3158062460165711, + "grad_norm": 11.734932899475098, + "kl": 0.1044921875, + "learning_rate": 6.841937539834289e-07, + "loss": 0.0042, + "reward": 1.616694450378418, + "reward_std": 0.14030656218528748, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4135693609714508, + "rewards/pad": 0.203125, + "step": 991 + }, + { + "completion_length": 195.390625, + "epoch": 0.3161249203314213, + "grad_norm": 8.491342544555664, + "kl": 0.09716796875, + "learning_rate": 6.838750796685787e-07, + "loss": 0.0039, + "reward": 1.4952812194824219, + "reward_std": 0.1000450998544693, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4952811598777771, + "step": 992 + }, + { + "completion_length": 257.5625, + "epoch": 0.31644359464627153, + "grad_norm": 6.126900672912598, + "kl": 0.0673828125, + "learning_rate": 6.835564053537285e-07, + "loss": 0.0027, + "reward": 1.375550389289856, + "reward_std": 0.06355856359004974, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37555044889450073, + "step": 993 + }, + { + "completion_length": 296.40625, + "epoch": 0.31676226896112175, + "grad_norm": 6.934712886810303, + "kl": 0.0537109375, + "learning_rate": 6.832377310388782e-07, + "loss": 0.0021, + "reward": 1.6266264915466309, + "reward_std": 0.18362689018249512, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5016264915466309, + "step": 994 + }, + { + "completion_length": 314.453125, + "epoch": 0.31708094327597197, + "grad_norm": 8.887853622436523, + "kl": 0.06298828125, + "learning_rate": 6.82919056724028e-07, + "loss": 0.0025, + "reward": 1.5279200077056885, + "reward_std": 0.06337783485651016, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4029200077056885, + "rewards/pad": 0.125, + "step": 995 + }, + { + "completion_length": 367.015625, + "epoch": 0.3173996175908222, + "grad_norm": 12.307510375976562, + "kl": 0.05712890625, + "learning_rate": 6.826003824091778e-07, + "loss": 0.0023, + "reward": 1.352768898010254, + "reward_std": 0.10241373628377914, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3683939576148987, + "step": 996 + }, + { + "completion_length": 228.46875, + "epoch": 0.3177182919056724, + "grad_norm": 30.698068618774414, + "kl": 0.08544921875, + "learning_rate": 6.822817080943276e-07, + "loss": 0.0034, + "reward": 1.551741123199463, + "reward_std": 0.1621362864971161, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5048661828041077, + "step": 997 + }, + { + "completion_length": 181.734375, + "epoch": 0.31803696622052263, + "grad_norm": 11.052029609680176, + "kl": 0.09765625, + "learning_rate": 6.819630337794773e-07, + "loss": 0.0039, + "reward": 1.4496850967407227, + "reward_std": 0.08132076263427734, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44968509674072266, + "step": 998 + }, + { + "completion_length": 226.171875, + "epoch": 0.31835564053537285, + "grad_norm": 7.181490898132324, + "kl": 0.091796875, + "learning_rate": 6.816443594646272e-07, + "loss": 0.0037, + "reward": 1.499758243560791, + "reward_std": 0.09113484621047974, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5153831243515015, + "rewards/pad": 0.0, + "step": 999 + }, + { + "completion_length": 305.421875, + "epoch": 0.3186743148502231, + "grad_norm": 6.9231977462768555, + "kl": 0.0693359375, + "learning_rate": 6.81325685149777e-07, + "loss": 0.0028, + "reward": 1.4802541732788086, + "reward_std": 0.10031703859567642, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3865041732788086, + "step": 1000 + }, + { + "completion_length": 341.59375, + "epoch": 0.3189929891650733, + "grad_norm": 6.3926520347595215, + "kl": 0.047119140625, + "learning_rate": 6.810070108349267e-07, + "loss": 0.0019, + "reward": 1.6489269733428955, + "reward_std": 0.08554818481206894, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4145520329475403, + "step": 1001 + }, + { + "completion_length": 300.484375, + "epoch": 0.3193116634799235, + "grad_norm": 10.810160636901855, + "kl": 0.1005859375, + "learning_rate": 6.806883365200764e-07, + "loss": 0.004, + "reward": 1.564328908920288, + "reward_std": 0.13377408683300018, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4549539089202881, + "rewards/pad": 0.125, + "step": 1002 + }, + { + "completion_length": 297.421875, + "epoch": 0.31963033779477373, + "grad_norm": 6.0769243240356445, + "kl": 0.0810546875, + "learning_rate": 6.803696622052262e-07, + "loss": 0.0032, + "reward": 1.5560057163238525, + "reward_std": 0.06804891675710678, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43100565671920776, + "step": 1003 + }, + { + "completion_length": 156.453125, + "epoch": 0.31994901210962395, + "grad_norm": 38.39896011352539, + "kl": 0.138671875, + "learning_rate": 6.80050987890376e-07, + "loss": 0.0056, + "reward": 1.532203197479248, + "reward_std": 0.11442001163959503, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.532203197479248, + "rewards/pad": 0.0, + "step": 1004 + }, + { + "completion_length": 117.546875, + "epoch": 0.3202676864244742, + "grad_norm": 7.369303226470947, + "kl": 0.11962890625, + "learning_rate": 6.797323135755258e-07, + "loss": 0.0048, + "reward": 1.6599302291870117, + "reward_std": 0.10183878988027573, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40993016958236694, + "rewards/pad": 0.25, + "step": 1005 + }, + { + "completion_length": 138.515625, + "epoch": 0.3205863607393244, + "grad_norm": 14.212640762329102, + "kl": 0.11376953125, + "learning_rate": 6.794136392606755e-07, + "loss": 0.0046, + "reward": 1.5369373559951782, + "reward_std": 0.1619691550731659, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.427562415599823, + "rewards/pad": 0.125, + "step": 1006 + }, + { + "completion_length": 266.234375, + "epoch": 0.3209050350541746, + "grad_norm": 13.497116088867188, + "kl": 0.06787109375, + "learning_rate": 6.790949649458253e-07, + "loss": 0.0027, + "reward": 1.3564127683639526, + "reward_std": 0.028475552797317505, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.35641270875930786, + "step": 1007 + }, + { + "completion_length": 250.4375, + "epoch": 0.32122370936902483, + "grad_norm": 15.16408634185791, + "kl": 0.06640625, + "learning_rate": 6.787762906309751e-07, + "loss": 0.0026, + "reward": 1.6797528266906738, + "reward_std": 0.14224869012832642, + "rewards/pad": 0.359375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3516278862953186, + "step": 1008 + }, + { + "completion_length": 304.734375, + "epoch": 0.32154238368387505, + "grad_norm": 8.194258689880371, + "kl": 0.06787109375, + "learning_rate": 6.784576163161249e-07, + "loss": 0.0027, + "reward": 1.4798452854156494, + "reward_std": 0.08323159068822861, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4798452854156494, + "step": 1009 + }, + { + "completion_length": 147.609375, + "epoch": 0.3218610579987253, + "grad_norm": 27.742565155029297, + "kl": 0.09423828125, + "learning_rate": 6.781389420012746e-07, + "loss": 0.0038, + "reward": 2.128603935241699, + "reward_std": 0.14139896631240845, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5036039352416992, + "rewards/pad": 0.625, + "step": 1010 + }, + { + "completion_length": 166.9375, + "epoch": 0.32217973231357555, + "grad_norm": 14.250349044799805, + "kl": 0.09521484375, + "learning_rate": 6.778202676864244e-07, + "loss": 0.0038, + "reward": 1.4786567687988281, + "reward_std": 0.13321171700954437, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.24428173899650574, + "step": 1011 + }, + { + "completion_length": 306.46875, + "epoch": 0.32249840662842577, + "grad_norm": 4.766030788421631, + "kl": 0.06982421875, + "learning_rate": 6.775015933715742e-07, + "loss": 0.0028, + "reward": 1.446626901626587, + "reward_std": 0.09482300281524658, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4622519016265869, + "step": 1012 + }, + { + "completion_length": 207.09375, + "epoch": 0.322817080943276, + "grad_norm": 5.062275409698486, + "kl": 0.11083984375, + "learning_rate": 6.77182919056724e-07, + "loss": 0.0044, + "reward": 1.72430419921875, + "reward_std": 0.10817757248878479, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6149291396141052, + "step": 1013 + }, + { + "completion_length": 279.78125, + "epoch": 0.3231357552581262, + "grad_norm": 72.79349517822266, + "kl": 0.072265625, + "learning_rate": 6.768642447418737e-07, + "loss": 0.0029, + "reward": 1.5395193099975586, + "reward_std": 0.22496607899665833, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44576939940452576, + "step": 1014 + }, + { + "completion_length": 222.890625, + "epoch": 0.32345442957297643, + "grad_norm": 8.13277816772461, + "kl": 0.07373046875, + "learning_rate": 6.765455704270235e-07, + "loss": 0.003, + "reward": 1.5967485904693604, + "reward_std": 0.10184575617313385, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3623736798763275, + "step": 1015 + }, + { + "completion_length": 203.046875, + "epoch": 0.32377310388782665, + "grad_norm": 7.447783946990967, + "kl": 0.1044921875, + "learning_rate": 6.762268961121733e-07, + "loss": 0.0042, + "reward": 1.6282625198364258, + "reward_std": 0.11571632325649261, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5345126390457153, + "step": 1016 + }, + { + "completion_length": 128.625, + "epoch": 0.3240917782026769, + "grad_norm": 7.846916198730469, + "kl": 0.1005859375, + "learning_rate": 6.75908221797323e-07, + "loss": 0.004, + "reward": 1.5380330085754395, + "reward_std": 0.10079097002744675, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4286580979824066, + "rewards/pad": 0.109375, + "step": 1017 + }, + { + "completion_length": 328.453125, + "epoch": 0.3244104525175271, + "grad_norm": 6.111870288848877, + "kl": 0.0546875, + "learning_rate": 6.755895474824729e-07, + "loss": 0.0022, + "reward": 1.4796979427337646, + "reward_std": 0.13149893283843994, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4484480023384094, + "step": 1018 + }, + { + "completion_length": 253.84375, + "epoch": 0.3247291268323773, + "grad_norm": 30.818262100219727, + "kl": 0.0693359375, + "learning_rate": 6.752708731676227e-07, + "loss": 0.0028, + "reward": 1.561851978302002, + "reward_std": 0.11902665346860886, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45247694849967957, + "rewards/pad": 0.125, + "step": 1019 + }, + { + "completion_length": 249.4375, + "epoch": 0.32504780114722753, + "grad_norm": 16.430784225463867, + "kl": 0.0732421875, + "learning_rate": 6.749521988527725e-07, + "loss": 0.0029, + "reward": 1.5126055479049683, + "reward_std": 0.12142893671989441, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48135554790496826, + "step": 1020 + }, + { + "completion_length": 231.203125, + "epoch": 0.32536647546207775, + "grad_norm": 11.677145957946777, + "kl": 0.08154296875, + "learning_rate": 6.746335245379222e-07, + "loss": 0.0033, + "reward": 1.5022270679473877, + "reward_std": 0.0498536080121994, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37722718715667725, + "rewards/pad": 0.125, + "step": 1021 + }, + { + "completion_length": 262.21875, + "epoch": 0.325685149776928, + "grad_norm": 6.868616580963135, + "kl": 0.060302734375, + "learning_rate": 6.74314850223072e-07, + "loss": 0.0024, + "reward": 1.6476733684539795, + "reward_std": 0.11569195985794067, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4914233982563019, + "step": 1022 + }, + { + "completion_length": 272.5, + "epoch": 0.3260038240917782, + "grad_norm": 5.200622081756592, + "kl": 0.08544921875, + "learning_rate": 6.739961759082218e-07, + "loss": 0.0034, + "reward": 1.463226556777954, + "reward_std": 0.042159367352724075, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33822667598724365, + "step": 1023 + }, + { + "completion_length": 138.390625, + "epoch": 0.3263224984066284, + "grad_norm": 22.599626541137695, + "kl": 0.1357421875, + "learning_rate": 6.736775015933716e-07, + "loss": 0.0054, + "reward": 1.4473992586135864, + "reward_std": 0.08425348997116089, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4473992586135864, + "rewards/pad": 0.0, + "step": 1024 + }, + { + "completion_length": 181.8125, + "epoch": 0.32664117272147863, + "grad_norm": 10.378974914550781, + "kl": 0.10205078125, + "learning_rate": 6.733588272785213e-07, + "loss": 0.0041, + "reward": 1.6889574527740479, + "reward_std": 0.11436554789543152, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5795823335647583, + "rewards/pad": 0.109375, + "step": 1025 + }, + { + "completion_length": 274.765625, + "epoch": 0.32695984703632885, + "grad_norm": 10.433825492858887, + "kl": 0.06396484375, + "learning_rate": 6.730401529636711e-07, + "loss": 0.0026, + "reward": 1.571969747543335, + "reward_std": 0.132668137550354, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3219696879386902, + "step": 1026 + }, + { + "completion_length": 295.328125, + "epoch": 0.3272785213511791, + "grad_norm": 16.413532257080078, + "kl": 0.0673828125, + "learning_rate": 6.727214786488209e-07, + "loss": 0.0027, + "reward": 1.5735794305801392, + "reward_std": 0.17031529545783997, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.3860793709754944, + "step": 1027 + }, + { + "completion_length": 197.0, + "epoch": 0.3275971956660293, + "grad_norm": 81.36949157714844, + "kl": 0.08349609375, + "learning_rate": 6.724028043339707e-07, + "loss": 0.0033, + "reward": 1.575305700302124, + "reward_std": 0.14447948336601257, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3409307599067688, + "step": 1028 + }, + { + "completion_length": 226.46875, + "epoch": 0.3279158699808795, + "grad_norm": 11.29088020324707, + "kl": 0.0859375, + "learning_rate": 6.720841300191204e-07, + "loss": 0.0034, + "reward": 1.5066144466400146, + "reward_std": 0.07048650085926056, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5066144466400146, + "step": 1029 + }, + { + "completion_length": 150.375, + "epoch": 0.3282345442957298, + "grad_norm": 13.777542114257812, + "kl": 0.09228515625, + "learning_rate": 6.717654557042702e-07, + "loss": 0.0037, + "reward": 1.2772927284240723, + "reward_std": 0.1390942633152008, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.29291781783103943, + "step": 1030 + }, + { + "completion_length": 179.109375, + "epoch": 0.32855321861058, + "grad_norm": 15.101614952087402, + "kl": 0.09912109375, + "learning_rate": 6.7144678138942e-07, + "loss": 0.004, + "reward": 1.6715672016143799, + "reward_std": 0.12946265935897827, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5778171420097351, + "rewards/pad": 0.125, + "step": 1031 + }, + { + "completion_length": 270.25, + "epoch": 0.32887189292543023, + "grad_norm": 7.602063179016113, + "kl": 0.07763671875, + "learning_rate": 6.711281070745698e-07, + "loss": 0.0031, + "reward": 1.615061640739441, + "reward_std": 0.11350873112678528, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5213116407394409, + "step": 1032 + }, + { + "completion_length": 228.609375, + "epoch": 0.32919056724028045, + "grad_norm": 6.101695537567139, + "kl": 0.0791015625, + "learning_rate": 6.708094327597195e-07, + "loss": 0.0032, + "reward": 1.5489016771316528, + "reward_std": 0.14920702576637268, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5176516771316528, + "rewards/pad": 0.046875, + "step": 1033 + }, + { + "completion_length": 189.640625, + "epoch": 0.32950924155513067, + "grad_norm": 7.619702339172363, + "kl": 0.095703125, + "learning_rate": 6.704907584448693e-07, + "loss": 0.0038, + "reward": 1.3966412544250488, + "reward_std": 0.08044132590293884, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39664119482040405, + "rewards/pad": 0.0, + "step": 1034 + }, + { + "completion_length": 234.75, + "epoch": 0.3298279158699809, + "grad_norm": 16.621103286743164, + "kl": 0.08935546875, + "learning_rate": 6.701720841300191e-07, + "loss": 0.0036, + "reward": 1.644860863685608, + "reward_std": 0.20091207325458527, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5198608636856079, + "rewards/pad": 0.140625, + "step": 1035 + }, + { + "completion_length": 176.734375, + "epoch": 0.3301465901848311, + "grad_norm": 19.77828598022461, + "kl": 0.08935546875, + "learning_rate": 6.69853409815169e-07, + "loss": 0.0036, + "reward": 1.7387616634368896, + "reward_std": 0.18001246452331543, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5825115442276001, + "step": 1036 + }, + { + "completion_length": 139.5, + "epoch": 0.33046526449968133, + "grad_norm": 155.10440063476562, + "kl": 0.123046875, + "learning_rate": 6.695347355003187e-07, + "loss": 0.0049, + "reward": 1.554897665977478, + "reward_std": 0.13476990163326263, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4923976957798004, + "step": 1037 + }, + { + "completion_length": 306.921875, + "epoch": 0.33078393881453155, + "grad_norm": 6.137125492095947, + "kl": 0.08251953125, + "learning_rate": 6.692160611854685e-07, + "loss": 0.0033, + "reward": 1.5144294500350952, + "reward_std": 0.04760780930519104, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5144294500350952, + "step": 1038 + }, + { + "completion_length": 249.40625, + "epoch": 0.3311026131293818, + "grad_norm": 5.570843696594238, + "kl": 0.0751953125, + "learning_rate": 6.688973868706183e-07, + "loss": 0.003, + "reward": 1.6632411479949951, + "reward_std": 0.13550686836242676, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41324126720428467, + "step": 1039 + }, + { + "completion_length": 315.546875, + "epoch": 0.331421287444232, + "grad_norm": 114.8282699584961, + "kl": 0.064453125, + "learning_rate": 6.68578712555768e-07, + "loss": 0.0026, + "reward": 1.3185205459594727, + "reward_std": 0.06591153144836426, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.33414554595947266, + "step": 1040 + }, + { + "completion_length": 144.296875, + "epoch": 0.3317399617590822, + "grad_norm": 19.302204132080078, + "kl": 0.1201171875, + "learning_rate": 6.682600382409177e-07, + "loss": 0.0048, + "reward": 1.7855418920516968, + "reward_std": 0.09190037101507187, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4105418920516968, + "rewards/pad": 0.375, + "step": 1041 + }, + { + "completion_length": 193.359375, + "epoch": 0.33205863607393243, + "grad_norm": 12.114130973815918, + "kl": 0.1005859375, + "learning_rate": 6.679413639260675e-07, + "loss": 0.004, + "reward": 1.4832127094268799, + "reward_std": 0.12668836116790771, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48321273922920227, + "rewards/pad": 0.0, + "step": 1042 + }, + { + "completion_length": 97.15625, + "epoch": 0.33237731038878265, + "grad_norm": 34.39638900756836, + "kl": 0.1279296875, + "learning_rate": 6.676226896112173e-07, + "loss": 0.0051, + "reward": 1.7472283840179443, + "reward_std": 0.10076180100440979, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6222284436225891, + "rewards/pad": 0.125, + "step": 1043 + }, + { + "completion_length": 251.078125, + "epoch": 0.3326959847036329, + "grad_norm": 6.926640510559082, + "kl": 0.10791015625, + "learning_rate": 6.67304015296367e-07, + "loss": 0.0043, + "reward": 1.4593260288238525, + "reward_std": 0.12262436747550964, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45932596921920776, + "step": 1044 + }, + { + "completion_length": 210.65625, + "epoch": 0.3330146590184831, + "grad_norm": 17.774791717529297, + "kl": 0.0986328125, + "learning_rate": 6.669853409815168e-07, + "loss": 0.0039, + "reward": 1.5273455381393433, + "reward_std": 0.10245544463396072, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.527345597743988, + "rewards/pad": 0.0, + "step": 1045 + }, + { + "completion_length": 325.578125, + "epoch": 0.3333333333333333, + "grad_norm": 4.738499641418457, + "kl": 0.056640625, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0023, + "reward": 1.5215575695037842, + "reward_std": 0.07813851535320282, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5371826887130737, + "rewards/pad": 0.0, + "step": 1046 + }, + { + "completion_length": 152.828125, + "epoch": 0.33365200764818354, + "grad_norm": 9.14372730255127, + "kl": 0.11376953125, + "learning_rate": 6.663479923518164e-07, + "loss": 0.0045, + "reward": 1.4708912372589111, + "reward_std": 0.188470259308815, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.48651620745658875, + "rewards/pad": 0.0, + "step": 1047 + }, + { + "completion_length": 110.484375, + "epoch": 0.33397068196303376, + "grad_norm": 9.19238567352295, + "kl": 0.1328125, + "learning_rate": 6.660293180369661e-07, + "loss": 0.0053, + "reward": 1.639833688735962, + "reward_std": 0.10873033106327057, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4054586887359619, + "step": 1048 + }, + { + "completion_length": 317.78125, + "epoch": 0.334289356277884, + "grad_norm": 139.2799072265625, + "kl": 0.1337890625, + "learning_rate": 6.657106437221159e-07, + "loss": 0.0053, + "reward": 1.6599500179290771, + "reward_std": 0.126263827085495, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44119998812675476, + "step": 1049 + }, + { + "completion_length": 299.421875, + "epoch": 0.33460803059273425, + "grad_norm": 7.264336585998535, + "kl": 0.0830078125, + "learning_rate": 6.653919694072657e-07, + "loss": 0.0033, + "reward": 1.4243035316467285, + "reward_std": 0.1417325735092163, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3930535614490509, + "step": 1050 + }, + { + "completion_length": 243.484375, + "epoch": 0.33492670490758447, + "grad_norm": 27.143455505371094, + "kl": 0.0703125, + "learning_rate": 6.650732950924155e-07, + "loss": 0.0028, + "reward": 1.8336070775985718, + "reward_std": 0.07981158792972565, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5836069583892822, + "step": 1051 + }, + { + "completion_length": 283.359375, + "epoch": 0.3352453792224347, + "grad_norm": 8.567912101745605, + "kl": 0.0693359375, + "learning_rate": 6.647546207775652e-07, + "loss": 0.0028, + "reward": 1.3368821144104004, + "reward_std": 0.04559015482664108, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3368821144104004, + "step": 1052 + }, + { + "completion_length": 209.0, + "epoch": 0.3355640535372849, + "grad_norm": 18.560606002807617, + "kl": 0.10400390625, + "learning_rate": 6.64435946462715e-07, + "loss": 0.0042, + "reward": 1.6890558004379272, + "reward_std": 0.09991727769374847, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6890558004379272, + "step": 1053 + }, + { + "completion_length": 410.75, + "epoch": 0.33588272785213513, + "grad_norm": 16.412952423095703, + "kl": 0.04931640625, + "learning_rate": 6.641172721478648e-07, + "loss": 0.002, + "reward": 1.4114445447921753, + "reward_std": 0.09891023486852646, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3645695149898529, + "rewards/pad": 0.046875, + "step": 1054 + }, + { + "completion_length": 262.953125, + "epoch": 0.33620140216698535, + "grad_norm": 9.055387496948242, + "kl": 0.083984375, + "learning_rate": 6.637985978330147e-07, + "loss": 0.0034, + "reward": 1.5122671127319336, + "reward_std": 0.10402856022119522, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5122671723365784, + "step": 1055 + }, + { + "completion_length": 208.296875, + "epoch": 0.3365200764818356, + "grad_norm": 10.077004432678223, + "kl": 0.0810546875, + "learning_rate": 6.634799235181644e-07, + "loss": 0.0032, + "reward": 1.6834617853164673, + "reward_std": 0.10777908563613892, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4490867853164673, + "rewards/pad": 0.234375, + "step": 1056 + }, + { + "completion_length": 161.75, + "epoch": 0.3368387507966858, + "grad_norm": 12.025467872619629, + "kl": 0.0986328125, + "learning_rate": 6.631612492033142e-07, + "loss": 0.004, + "reward": 1.5560848712921143, + "reward_std": 0.19594892859458923, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46233493089675903, + "rewards/pad": 0.09375, + "step": 1057 + }, + { + "completion_length": 338.34375, + "epoch": 0.337157425111536, + "grad_norm": 11.654243469238281, + "kl": 0.0537109375, + "learning_rate": 6.62842574888464e-07, + "loss": 0.0021, + "reward": 1.517212152481079, + "reward_std": 0.06006244942545891, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5172120928764343, + "step": 1058 + }, + { + "completion_length": 357.046875, + "epoch": 0.33747609942638623, + "grad_norm": 10.765448570251465, + "kl": 0.06787109375, + "learning_rate": 6.625239005736138e-07, + "loss": 0.0027, + "reward": 1.3917007446289062, + "reward_std": 0.11147765070199966, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4073258936405182, + "step": 1059 + }, + { + "completion_length": 162.109375, + "epoch": 0.33779477374123645, + "grad_norm": 16.117137908935547, + "kl": 0.10302734375, + "learning_rate": 6.622052262587635e-07, + "loss": 0.0041, + "reward": 1.499964714050293, + "reward_std": 0.23960351943969727, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37496474385261536, + "rewards/pad": 0.140625, + "step": 1060 + }, + { + "completion_length": 258.34375, + "epoch": 0.3381134480560867, + "grad_norm": 7.214158058166504, + "kl": 0.08984375, + "learning_rate": 6.618865519439133e-07, + "loss": 0.0036, + "reward": 1.595639705657959, + "reward_std": 0.09060221910476685, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.470639705657959, + "step": 1061 + }, + { + "completion_length": 353.171875, + "epoch": 0.3384321223709369, + "grad_norm": 8.844647407531738, + "kl": 0.0498046875, + "learning_rate": 6.615678776290631e-07, + "loss": 0.002, + "reward": 1.5343294143676758, + "reward_std": 0.1305486261844635, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.29995444416999817, + "step": 1062 + }, + { + "completion_length": 342.921875, + "epoch": 0.3387507966857871, + "grad_norm": 10.557599067687988, + "kl": 0.0908203125, + "learning_rate": 6.612492033142129e-07, + "loss": 0.0036, + "reward": 1.603943109512329, + "reward_std": 0.08996868133544922, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4789431691169739, + "rewards/pad": 0.125, + "step": 1063 + }, + { + "completion_length": 219.890625, + "epoch": 0.33906947100063733, + "grad_norm": 10.76680850982666, + "kl": 0.09521484375, + "learning_rate": 6.609305289993626e-07, + "loss": 0.0038, + "reward": 1.5738226175308228, + "reward_std": 0.22833140194416046, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.38632258772850037, + "step": 1064 + }, + { + "completion_length": 187.28125, + "epoch": 0.33938814531548755, + "grad_norm": 8.700113296508789, + "kl": 0.1181640625, + "learning_rate": 6.606118546845124e-07, + "loss": 0.0047, + "reward": 1.413580060005188, + "reward_std": 0.11447126418352127, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.413580060005188, + "step": 1065 + }, + { + "completion_length": 266.84375, + "epoch": 0.3397068196303378, + "grad_norm": 19.892236709594727, + "kl": 0.07861328125, + "learning_rate": 6.602931803696622e-07, + "loss": 0.0031, + "reward": 1.4972413778305054, + "reward_std": 0.15969689190387726, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.34099140763282776, + "step": 1066 + }, + { + "completion_length": 318.78125, + "epoch": 0.340025493945188, + "grad_norm": 6.460472583770752, + "kl": 0.0849609375, + "learning_rate": 6.59974506054812e-07, + "loss": 0.0034, + "reward": 1.3890538215637207, + "reward_std": 0.1033376008272171, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4046788811683655, + "step": 1067 + }, + { + "completion_length": 224.34375, + "epoch": 0.3403441682600382, + "grad_norm": 10.880087852478027, + "kl": 0.09423828125, + "learning_rate": 6.596558317399617e-07, + "loss": 0.0038, + "reward": 1.6705816984176636, + "reward_std": 0.17944398522377014, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5612067580223083, + "rewards/pad": 0.125, + "step": 1068 + }, + { + "completion_length": 202.859375, + "epoch": 0.3406628425748885, + "grad_norm": 12.304518699645996, + "kl": 0.119140625, + "learning_rate": 6.593371574251115e-07, + "loss": 0.0048, + "reward": 1.6697089672088623, + "reward_std": 0.10854683816432953, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6697089076042175, + "rewards/pad": 0.0, + "step": 1069 + }, + { + "completion_length": 233.8125, + "epoch": 0.3409815168897387, + "grad_norm": 15.415122985839844, + "kl": 0.0986328125, + "learning_rate": 6.590184831102613e-07, + "loss": 0.0039, + "reward": 1.6353888511657715, + "reward_std": 0.23096291720867157, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43226397037506104, + "rewards/pad": 0.21875, + "step": 1070 + }, + { + "completion_length": 235.84375, + "epoch": 0.34130019120458893, + "grad_norm": 20.07423973083496, + "kl": 0.09765625, + "learning_rate": 6.586998087954111e-07, + "loss": 0.0039, + "reward": 1.5326218605041504, + "reward_std": 0.10475602746009827, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43887192010879517, + "step": 1071 + }, + { + "completion_length": 360.171875, + "epoch": 0.34161886551943915, + "grad_norm": 18.752527236938477, + "kl": 0.0576171875, + "learning_rate": 6.583811344805608e-07, + "loss": 0.0023, + "reward": 1.5809378623962402, + "reward_std": 0.11181546002626419, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4715629518032074, + "rewards/pad": 0.125, + "step": 1072 + }, + { + "completion_length": 268.5, + "epoch": 0.3419375398342894, + "grad_norm": 7.56311559677124, + "kl": 0.0712890625, + "learning_rate": 6.580624601657106e-07, + "loss": 0.0029, + "reward": 1.630232572555542, + "reward_std": 0.1588975340127945, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42710763216018677, + "step": 1073 + }, + { + "completion_length": 208.84375, + "epoch": 0.3422562141491396, + "grad_norm": 47.703773498535156, + "kl": 0.09033203125, + "learning_rate": 6.577437858508605e-07, + "loss": 0.0036, + "reward": 1.799774408340454, + "reward_std": 0.13436491787433624, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5653994679450989, + "step": 1074 + }, + { + "completion_length": 368.671875, + "epoch": 0.3425748884639898, + "grad_norm": 8.87077808380127, + "kl": 0.06494140625, + "learning_rate": 6.574251115360103e-07, + "loss": 0.0026, + "reward": 1.5052409172058105, + "reward_std": 0.1388981193304062, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5208659768104553, + "step": 1075 + }, + { + "completion_length": 194.671875, + "epoch": 0.34289356277884003, + "grad_norm": 8.328705787658691, + "kl": 0.126953125, + "learning_rate": 6.5710643722116e-07, + "loss": 0.0051, + "reward": 1.571947693824768, + "reward_std": 0.10731147229671478, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5719476938247681, + "rewards/pad": 0.0, + "step": 1076 + }, + { + "completion_length": 171.859375, + "epoch": 0.34321223709369025, + "grad_norm": 34.82987594604492, + "kl": 0.0986328125, + "learning_rate": 6.567877629063098e-07, + "loss": 0.0039, + "reward": 1.6688724756240845, + "reward_std": 0.17240235209465027, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5126224756240845, + "step": 1077 + }, + { + "completion_length": 262.25, + "epoch": 0.3435309114085405, + "grad_norm": 13.668285369873047, + "kl": 0.0712890625, + "learning_rate": 6.564690885914596e-07, + "loss": 0.0029, + "reward": 1.568549633026123, + "reward_std": 0.2681657671928406, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.5216747522354126, + "step": 1078 + }, + { + "completion_length": 150.703125, + "epoch": 0.3438495857233907, + "grad_norm": 23.212413787841797, + "kl": 0.1142578125, + "learning_rate": 6.561504142766092e-07, + "loss": 0.0046, + "reward": 1.6253793239593506, + "reward_std": 0.109074667096138, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6253793239593506, + "rewards/pad": 0.0, + "step": 1079 + }, + { + "completion_length": 169.578125, + "epoch": 0.3441682600382409, + "grad_norm": 12.365942001342773, + "kl": 0.10546875, + "learning_rate": 6.55831739961759e-07, + "loss": 0.0042, + "reward": 1.4926584959030151, + "reward_std": 0.1042180210351944, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38328349590301514, + "rewards/pad": 0.109375, + "step": 1080 + }, + { + "completion_length": 335.625, + "epoch": 0.34448693435309113, + "grad_norm": 11.104180335998535, + "kl": 0.072265625, + "learning_rate": 6.555130656469088e-07, + "loss": 0.0029, + "reward": 1.6401329040527344, + "reward_std": 0.12838345766067505, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4995078444480896, + "step": 1081 + }, + { + "completion_length": 284.8125, + "epoch": 0.34480560866794135, + "grad_norm": 11.32220458984375, + "kl": 0.0791015625, + "learning_rate": 6.551943913320586e-07, + "loss": 0.0032, + "reward": 1.522026777267456, + "reward_std": 0.1750384420156479, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.42827683687210083, + "step": 1082 + }, + { + "completion_length": 226.515625, + "epoch": 0.3451242829827916, + "grad_norm": 3.8591694831848145, + "kl": 0.1064453125, + "learning_rate": 6.548757170172083e-07, + "loss": 0.0043, + "reward": 1.390260934829712, + "reward_std": 0.1007055938243866, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.2808860242366791, + "rewards/pad": 0.125, + "step": 1083 + }, + { + "completion_length": 168.015625, + "epoch": 0.3454429572976418, + "grad_norm": 29.858274459838867, + "kl": 0.09765625, + "learning_rate": 6.545570427023581e-07, + "loss": 0.0039, + "reward": 1.3682701587677002, + "reward_std": 0.14364153146743774, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.38389521837234497, + "rewards/pad": 0.0, + "step": 1084 + }, + { + "completion_length": 322.296875, + "epoch": 0.345761631612492, + "grad_norm": 6.255033493041992, + "kl": 0.08203125, + "learning_rate": 6.542383683875079e-07, + "loss": 0.0033, + "reward": 1.3874914646148682, + "reward_std": 0.14402329921722412, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.40311652421951294, + "step": 1085 + }, + { + "completion_length": 211.0, + "epoch": 0.34608030592734224, + "grad_norm": 11.35192584991455, + "kl": 0.12890625, + "learning_rate": 6.539196940726577e-07, + "loss": 0.0052, + "reward": 1.5528533458709717, + "reward_std": 0.10497722029685974, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5528532862663269, + "step": 1086 + }, + { + "completion_length": 242.859375, + "epoch": 0.34639898024219246, + "grad_norm": 10.361661911010742, + "kl": 0.08837890625, + "learning_rate": 6.536010197578074e-07, + "loss": 0.0035, + "reward": 1.5939083099365234, + "reward_std": 0.22381728887557983, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45328325033187866, + "rewards/pad": 0.15625, + "step": 1087 + }, + { + "completion_length": 267.234375, + "epoch": 0.3467176545570427, + "grad_norm": 12.8701753616333, + "kl": 0.08447265625, + "learning_rate": 6.532823454429572e-07, + "loss": 0.0034, + "reward": 1.414771318435669, + "reward_std": 0.16493813693523407, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43039631843566895, + "rewards/pad": 0.0, + "step": 1088 + }, + { + "completion_length": 280.21875, + "epoch": 0.34703632887189295, + "grad_norm": 12.832812309265137, + "kl": 0.1015625, + "learning_rate": 6.52963671128107e-07, + "loss": 0.0041, + "reward": 1.3166179656982422, + "reward_std": 0.10120934247970581, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.30099302530288696, + "step": 1089 + }, + { + "completion_length": 390.125, + "epoch": 0.34735500318674317, + "grad_norm": 10.07620906829834, + "kl": 0.044677734375, + "learning_rate": 6.526449968132568e-07, + "loss": 0.0018, + "reward": 1.6574950218200684, + "reward_std": 0.0385076180100441, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40749502182006836, + "step": 1090 + }, + { + "completion_length": 331.1875, + "epoch": 0.3476736775015934, + "grad_norm": 12.087833404541016, + "kl": 0.06298828125, + "learning_rate": 6.523263224984065e-07, + "loss": 0.0025, + "reward": 1.497642159461975, + "reward_std": 0.1338558793067932, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5132672190666199, + "rewards/pad": 0.0, + "step": 1091 + }, + { + "completion_length": 305.421875, + "epoch": 0.3479923518164436, + "grad_norm": 5.817469596862793, + "kl": 0.0908203125, + "learning_rate": 6.520076481835563e-07, + "loss": 0.0036, + "reward": 1.504326581954956, + "reward_std": 0.04959062859416008, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.504326581954956, + "rewards/pad": 0.0, + "step": 1092 + }, + { + "completion_length": 410.53125, + "epoch": 0.34831102613129383, + "grad_norm": 7.265234470367432, + "kl": 0.0517578125, + "learning_rate": 6.516889738687062e-07, + "loss": 0.0021, + "reward": 1.4915004968643188, + "reward_std": 0.118180051445961, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.39775052666664124, + "step": 1093 + }, + { + "completion_length": 277.375, + "epoch": 0.34862970044614405, + "grad_norm": 4.923130512237549, + "kl": 0.07763671875, + "learning_rate": 6.51370299553856e-07, + "loss": 0.0031, + "reward": 1.48738694190979, + "reward_std": 0.14238008856773376, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3936370015144348, + "step": 1094 + }, + { + "completion_length": 300.453125, + "epoch": 0.3489483747609943, + "grad_norm": 5.92523193359375, + "kl": 0.0830078125, + "learning_rate": 6.510516252390057e-07, + "loss": 0.0033, + "reward": 1.5825649499893188, + "reward_std": 0.1137721836566925, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.47318994998931885, + "step": 1095 + }, + { + "completion_length": 289.015625, + "epoch": 0.3492670490758445, + "grad_norm": 40.995445251464844, + "kl": 0.07568359375, + "learning_rate": 6.507329509241555e-07, + "loss": 0.003, + "reward": 1.5625817775726318, + "reward_std": 0.061178095638751984, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43758174777030945, + "step": 1096 + }, + { + "completion_length": 306.546875, + "epoch": 0.3495857233906947, + "grad_norm": 9.393498420715332, + "kl": 0.0791015625, + "learning_rate": 6.504142766093053e-07, + "loss": 0.0032, + "reward": 1.4845378398895264, + "reward_std": 0.09015172719955444, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4845377206802368, + "rewards/pad": 0.0, + "step": 1097 + }, + { + "completion_length": 341.359375, + "epoch": 0.34990439770554493, + "grad_norm": 19.660934448242188, + "kl": 0.06689453125, + "learning_rate": 6.500956022944551e-07, + "loss": 0.0027, + "reward": 1.3632309436798096, + "reward_std": 0.1405061036348343, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37885600328445435, + "step": 1098 + }, + { + "completion_length": 215.171875, + "epoch": 0.35022307202039515, + "grad_norm": 6.55889892578125, + "kl": 0.09912109375, + "learning_rate": 6.497769279796048e-07, + "loss": 0.004, + "reward": 1.5062105655670166, + "reward_std": 0.13409946858882904, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5062106251716614, + "step": 1099 + }, + { + "completion_length": 414.453125, + "epoch": 0.3505417463352454, + "grad_norm": 5.437408924102783, + "kl": 0.060791015625, + "learning_rate": 6.494582536647546e-07, + "loss": 0.0024, + "reward": 1.4838175773620605, + "reward_std": 0.04797791689634323, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48381760716438293, + "step": 1100 + }, + { + "completion_length": 451.140625, + "epoch": 0.3508604206500956, + "grad_norm": 7.171418190002441, + "kl": 0.06005859375, + "learning_rate": 6.491395793499044e-07, + "loss": 0.0024, + "reward": 1.4617466926574707, + "reward_std": 0.042779937386512756, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4617466330528259, + "step": 1101 + }, + { + "completion_length": 185.0, + "epoch": 0.3511790949649458, + "grad_norm": 8.356879234313965, + "kl": 0.11669921875, + "learning_rate": 6.488209050350542e-07, + "loss": 0.0047, + "reward": 1.632096767425537, + "reward_std": 0.16088390350341797, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41334670782089233, + "step": 1102 + }, + { + "completion_length": 311.703125, + "epoch": 0.35149776927979604, + "grad_norm": 9.635887145996094, + "kl": 0.08251953125, + "learning_rate": 6.485022307202039e-07, + "loss": 0.0033, + "reward": 1.5167649984359741, + "reward_std": 0.08570042997598648, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5167650580406189, + "rewards/pad": 0.0, + "step": 1103 + }, + { + "completion_length": 422.015625, + "epoch": 0.35181644359464626, + "grad_norm": 16.782541275024414, + "kl": 0.04931640625, + "learning_rate": 6.481835564053537e-07, + "loss": 0.002, + "reward": 1.4156150817871094, + "reward_std": 0.0803535133600235, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3062400221824646, + "step": 1104 + }, + { + "completion_length": 340.796875, + "epoch": 0.3521351179094965, + "grad_norm": 13.343717575073242, + "kl": 0.134765625, + "learning_rate": 6.478648820905035e-07, + "loss": 0.0054, + "reward": 1.3510937690734863, + "reward_std": 0.2080368846654892, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.41359373927116394, + "step": 1105 + }, + { + "completion_length": 413.28125, + "epoch": 0.3524537922243467, + "grad_norm": 14.27628231048584, + "kl": 0.0693359375, + "learning_rate": 6.475462077756533e-07, + "loss": 0.0028, + "reward": 1.4851425886154175, + "reward_std": 0.14510664343833923, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5163924694061279, + "step": 1106 + }, + { + "completion_length": 186.953125, + "epoch": 0.3527724665391969, + "grad_norm": 58.4865608215332, + "kl": 0.10546875, + "learning_rate": 6.47227533460803e-07, + "loss": 0.0042, + "reward": 1.7755894660949707, + "reward_std": 0.18116974830627441, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4162144660949707, + "rewards/pad": 0.375, + "step": 1107 + }, + { + "completion_length": 299.9375, + "epoch": 0.35309114085404714, + "grad_norm": 6.266265392303467, + "kl": 0.068359375, + "learning_rate": 6.469088591459528e-07, + "loss": 0.0027, + "reward": 1.6438732147216797, + "reward_std": 0.3781220614910126, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.3313731849193573, + "step": 1108 + }, + { + "completion_length": 228.34375, + "epoch": 0.3534098151688974, + "grad_norm": 7.116986274719238, + "kl": 0.0927734375, + "learning_rate": 6.465901848311026e-07, + "loss": 0.0037, + "reward": 1.5064152479171753, + "reward_std": 0.04102984443306923, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5064153075218201, + "rewards/pad": 0.0, + "step": 1109 + }, + { + "completion_length": 240.015625, + "epoch": 0.35372848948374763, + "grad_norm": 24.907663345336914, + "kl": 0.0859375, + "learning_rate": 6.462715105162523e-07, + "loss": 0.0034, + "reward": 1.4360811710357666, + "reward_std": 0.17298558354377747, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4517062306404114, + "step": 1110 + }, + { + "completion_length": 298.5, + "epoch": 0.35404716379859785, + "grad_norm": 39.48649215698242, + "kl": 0.109375, + "learning_rate": 6.459528362014021e-07, + "loss": 0.0044, + "reward": 1.4850733280181885, + "reward_std": 0.10610349476337433, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3756982386112213, + "rewards/pad": 0.109375, + "step": 1111 + }, + { + "completion_length": 384.453125, + "epoch": 0.3543658381134481, + "grad_norm": 11.466015815734863, + "kl": 0.06201171875, + "learning_rate": 6.45634161886552e-07, + "loss": 0.0025, + "reward": 1.6032086610794067, + "reward_std": 0.18208816647529602, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.38445860147476196, + "step": 1112 + }, + { + "completion_length": 240.921875, + "epoch": 0.3546845124282983, + "grad_norm": 13.035429954528809, + "kl": 0.08984375, + "learning_rate": 6.453154875717018e-07, + "loss": 0.0036, + "reward": 1.5618797540664673, + "reward_std": 0.1373620331287384, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.43687981367111206, + "step": 1113 + }, + { + "completion_length": 292.515625, + "epoch": 0.3550031867431485, + "grad_norm": 99.24024200439453, + "kl": 0.0810546875, + "learning_rate": 6.449968132568515e-07, + "loss": 0.0032, + "reward": 1.5708576440811157, + "reward_std": 0.11023964732885361, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.586482584476471, + "rewards/pad": 0.0, + "step": 1114 + }, + { + "completion_length": 226.4375, + "epoch": 0.35532186105799873, + "grad_norm": 49.051387786865234, + "kl": 0.1064453125, + "learning_rate": 6.446781389420013e-07, + "loss": 0.0043, + "reward": 1.342435359954834, + "reward_std": 0.14247508347034454, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34243538975715637, + "rewards/pad": 0.015625, + "step": 1115 + }, + { + "completion_length": 195.265625, + "epoch": 0.35564053537284895, + "grad_norm": 21.15662956237793, + "kl": 0.091796875, + "learning_rate": 6.443594646271511e-07, + "loss": 0.0037, + "reward": 1.7528045177459717, + "reward_std": 0.24256056547164917, + "rewards/answer_reward": 0.328125, + "rewards/format_reward_gqa": 0.953125, + "rewards/iou_glue_reward": 0.4715544283390045, + "step": 1116 + }, + { + "completion_length": 260.078125, + "epoch": 0.3559592096876992, + "grad_norm": 7.468287944793701, + "kl": 0.08154296875, + "learning_rate": 6.440407903123009e-07, + "loss": 0.0033, + "reward": 1.5833702087402344, + "reward_std": 0.2094673067331314, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4739951491355896, + "step": 1117 + }, + { + "completion_length": 219.46875, + "epoch": 0.3562778840025494, + "grad_norm": 15.786979675292969, + "kl": 0.10693359375, + "learning_rate": 6.437221159974505e-07, + "loss": 0.0043, + "reward": 1.3431097269058228, + "reward_std": 0.216970756649971, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.35873472690582275, + "step": 1118 + }, + { + "completion_length": 393.140625, + "epoch": 0.3565965583173996, + "grad_norm": 8.422698974609375, + "kl": 0.07373046875, + "learning_rate": 6.434034416826003e-07, + "loss": 0.003, + "reward": 1.5210447311401367, + "reward_std": 0.0758834108710289, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5210448503494263, + "step": 1119 + }, + { + "completion_length": 266.890625, + "epoch": 0.35691523263224983, + "grad_norm": 19.675073623657227, + "kl": 0.0888671875, + "learning_rate": 6.430847673677501e-07, + "loss": 0.0036, + "reward": 1.4346909523010254, + "reward_std": 0.15363526344299316, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4503158926963806, + "rewards/pad": 0.0, + "step": 1120 + }, + { + "completion_length": 204.578125, + "epoch": 0.35723390694710005, + "grad_norm": 15.127815246582031, + "kl": 0.0869140625, + "learning_rate": 6.427660930528999e-07, + "loss": 0.0035, + "reward": 1.678032398223877, + "reward_std": 0.17766845226287842, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3342824876308441, + "rewards/pad": 0.34375, + "step": 1121 + }, + { + "completion_length": 208.203125, + "epoch": 0.3575525812619503, + "grad_norm": 14.863090515136719, + "kl": 0.1064453125, + "learning_rate": 6.424474187380496e-07, + "loss": 0.0043, + "reward": 1.639469861984253, + "reward_std": 0.14503180980682373, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5144699215888977, + "rewards/pad": 0.125, + "step": 1122 + }, + { + "completion_length": 290.828125, + "epoch": 0.3578712555768005, + "grad_norm": 12.484613418579102, + "kl": 0.099609375, + "learning_rate": 6.421287444231994e-07, + "loss": 0.004, + "reward": 1.412679672241211, + "reward_std": 0.0679410845041275, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41267964243888855, + "rewards/pad": 0.0, + "step": 1123 + }, + { + "completion_length": 260.8125, + "epoch": 0.3581899298916507, + "grad_norm": 4.807104110717773, + "kl": 0.142578125, + "learning_rate": 6.418100701083492e-07, + "loss": 0.0057, + "reward": 1.489712119102478, + "reward_std": 0.05640888959169388, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4897121489048004, + "rewards/pad": 0.0, + "step": 1124 + }, + { + "completion_length": 400.609375, + "epoch": 0.35850860420650094, + "grad_norm": 10.27554702758789, + "kl": 0.068359375, + "learning_rate": 6.41491395793499e-07, + "loss": 0.0027, + "reward": 1.5083768367767334, + "reward_std": 0.06959781795740128, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38337668776512146, + "rewards/pad": 0.125, + "step": 1125 + }, + { + "completion_length": 227.84375, + "epoch": 0.35882727852135116, + "grad_norm": 8.963057518005371, + "kl": 0.109375, + "learning_rate": 6.411727214786487e-07, + "loss": 0.0044, + "reward": 1.6768724918365479, + "reward_std": 0.12050767987966537, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44249746203422546, + "rewards/pad": 0.234375, + "step": 1126 + }, + { + "completion_length": 224.09375, + "epoch": 0.3591459528362014, + "grad_norm": 12.289810180664062, + "kl": 0.10009765625, + "learning_rate": 6.408540471637985e-07, + "loss": 0.004, + "reward": 1.565162181854248, + "reward_std": 0.10991180688142776, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.33078712224960327, + "step": 1127 + }, + { + "completion_length": 421.34375, + "epoch": 0.35946462715105165, + "grad_norm": 4.199387073516846, + "kl": 0.044677734375, + "learning_rate": 6.405353728489483e-07, + "loss": 0.0018, + "reward": 1.6045417785644531, + "reward_std": 0.06083228066563606, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4795418679714203, + "step": 1128 + }, + { + "completion_length": 217.96875, + "epoch": 0.3597833014659019, + "grad_norm": 13.499602317810059, + "kl": 0.09326171875, + "learning_rate": 6.402166985340981e-07, + "loss": 0.0037, + "reward": 1.736114263534546, + "reward_std": 0.14537036418914795, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.6267393827438354, + "step": 1129 + }, + { + "completion_length": 290.921875, + "epoch": 0.3601019757807521, + "grad_norm": 20.29520034790039, + "kl": 0.07373046875, + "learning_rate": 6.398980242192478e-07, + "loss": 0.0029, + "reward": 1.541394829750061, + "reward_std": 0.13326993584632874, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43201982975006104, + "rewards/pad": 0.125, + "step": 1130 + }, + { + "completion_length": 358.734375, + "epoch": 0.3604206500956023, + "grad_norm": 6.83867883682251, + "kl": 0.07421875, + "learning_rate": 6.395793499043977e-07, + "loss": 0.003, + "reward": 1.656518816947937, + "reward_std": 0.09795132279396057, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42214375734329224, + "rewards/pad": 0.25, + "step": 1131 + }, + { + "completion_length": 255.9375, + "epoch": 0.36073932441045253, + "grad_norm": 9.172416687011719, + "kl": 0.09228515625, + "learning_rate": 6.392606755895475e-07, + "loss": 0.0037, + "reward": 1.353350281715393, + "reward_std": 0.10851083695888519, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.36897528171539307, + "rewards/pad": 0.0, + "step": 1132 + }, + { + "completion_length": 253.890625, + "epoch": 0.36105799872530275, + "grad_norm": 10.658964157104492, + "kl": 0.07763671875, + "learning_rate": 6.389420012746973e-07, + "loss": 0.0031, + "reward": 1.5653178691864014, + "reward_std": 0.16297033429145813, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.471567839384079, + "step": 1133 + }, + { + "completion_length": 167.703125, + "epoch": 0.361376673040153, + "grad_norm": 11.124542236328125, + "kl": 0.1044921875, + "learning_rate": 6.38623326959847e-07, + "loss": 0.0042, + "reward": 1.7909257411956787, + "reward_std": 0.1615280658006668, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5565507411956787, + "step": 1134 + }, + { + "completion_length": 308.296875, + "epoch": 0.3616953473550032, + "grad_norm": 8.000903129577637, + "kl": 0.07666015625, + "learning_rate": 6.383046526449968e-07, + "loss": 0.0031, + "reward": 1.5040984153747559, + "reward_std": 0.08258339762687683, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5197234153747559, + "rewards/pad": 0.0, + "step": 1135 + }, + { + "completion_length": 110.34375, + "epoch": 0.3620140216698534, + "grad_norm": 23.70810890197754, + "kl": 0.1259765625, + "learning_rate": 6.379859783301466e-07, + "loss": 0.0051, + "reward": 1.719245433807373, + "reward_std": 0.2565840482711792, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.594245433807373, + "rewards/pad": 0.140625, + "step": 1136 + }, + { + "completion_length": 305.5625, + "epoch": 0.36233269598470363, + "grad_norm": 9.16686725616455, + "kl": 0.09521484375, + "learning_rate": 6.376673040152964e-07, + "loss": 0.0038, + "reward": 1.5536150932312012, + "reward_std": 0.08969675004482269, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5536150336265564, + "rewards/pad": 0.0, + "step": 1137 + }, + { + "completion_length": 154.703125, + "epoch": 0.36265137029955385, + "grad_norm": 36.27172088623047, + "kl": 0.11962890625, + "learning_rate": 6.373486297004461e-07, + "loss": 0.0048, + "reward": 1.5816446542739868, + "reward_std": 0.2370244562625885, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3785196840763092, + "rewards/pad": 0.21875, + "step": 1138 + }, + { + "completion_length": 368.28125, + "epoch": 0.3629700446144041, + "grad_norm": 9.339393615722656, + "kl": 0.0810546875, + "learning_rate": 6.370299553855959e-07, + "loss": 0.0032, + "reward": 1.4603204727172852, + "reward_std": 0.15351173281669617, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.49157047271728516, + "step": 1139 + }, + { + "completion_length": 211.109375, + "epoch": 0.3632887189292543, + "grad_norm": 10.896927833557129, + "kl": 0.10595703125, + "learning_rate": 6.367112810707457e-07, + "loss": 0.0042, + "reward": 1.655792474746704, + "reward_std": 0.12964951992034912, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3745424747467041, + "rewards/pad": 0.28125, + "step": 1140 + }, + { + "completion_length": 322.25, + "epoch": 0.3636073932441045, + "grad_norm": 7.350358486175537, + "kl": 0.083984375, + "learning_rate": 6.363926067558954e-07, + "loss": 0.0034, + "reward": 1.4856479167938232, + "reward_std": 0.0888536125421524, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48564794659614563, + "step": 1141 + }, + { + "completion_length": 148.03125, + "epoch": 0.36392606755895474, + "grad_norm": 18.382137298583984, + "kl": 0.1318359375, + "learning_rate": 6.360739324410452e-07, + "loss": 0.0053, + "reward": 1.7307488918304443, + "reward_std": 0.13341474533081055, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6213740110397339, + "rewards/pad": 0.109375, + "step": 1142 + }, + { + "completion_length": 216.484375, + "epoch": 0.36424474187380496, + "grad_norm": 9.965367317199707, + "kl": 0.08642578125, + "learning_rate": 6.35755258126195e-07, + "loss": 0.0034, + "reward": 1.7189104557037354, + "reward_std": 0.22512134909629822, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34391042590141296, + "rewards/pad": 0.390625, + "step": 1143 + }, + { + "completion_length": 205.03125, + "epoch": 0.3645634161886552, + "grad_norm": 12.729777336120605, + "kl": 0.11767578125, + "learning_rate": 6.354365838113448e-07, + "loss": 0.0047, + "reward": 1.4384092092514038, + "reward_std": 0.15935580432415009, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4540341794490814, + "rewards/pad": 0.0, + "step": 1144 + }, + { + "completion_length": 180.859375, + "epoch": 0.3648820905035054, + "grad_norm": 16.469371795654297, + "kl": 0.107421875, + "learning_rate": 6.351179094964945e-07, + "loss": 0.0043, + "reward": 1.4881618022918701, + "reward_std": 0.14285990595817566, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5037867426872253, + "rewards/pad": 0.0, + "step": 1145 + }, + { + "completion_length": 212.1875, + "epoch": 0.3652007648183556, + "grad_norm": 15.975790023803711, + "kl": 0.1171875, + "learning_rate": 6.347992351816443e-07, + "loss": 0.0047, + "reward": 1.677734613418579, + "reward_std": 0.11066339164972305, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5683596134185791, + "rewards/pad": 0.125, + "step": 1146 + }, + { + "completion_length": 321.84375, + "epoch": 0.36551943913320584, + "grad_norm": 7.0683369636535645, + "kl": 0.07470703125, + "learning_rate": 6.344805608667941e-07, + "loss": 0.003, + "reward": 1.570380449295044, + "reward_std": 0.18008087575435638, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.49225541949272156, + "step": 1147 + }, + { + "completion_length": 272.15625, + "epoch": 0.3658381134480561, + "grad_norm": 20.573633193969727, + "kl": 0.10205078125, + "learning_rate": 6.34161886551944e-07, + "loss": 0.0041, + "reward": 1.6857969760894775, + "reward_std": 0.1163695752620697, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5764220356941223, + "step": 1148 + }, + { + "completion_length": 276.328125, + "epoch": 0.36615678776290633, + "grad_norm": 13.387776374816895, + "kl": 0.083984375, + "learning_rate": 6.338432122370936e-07, + "loss": 0.0034, + "reward": 1.8507922887802124, + "reward_std": 0.15579496324062347, + "rewards/pad": 0.3125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5382922887802124, + "step": 1149 + }, + { + "completion_length": 300.03125, + "epoch": 0.36647546207775655, + "grad_norm": 10.872426986694336, + "kl": 0.07763671875, + "learning_rate": 6.335245379222435e-07, + "loss": 0.0031, + "reward": 1.4849570989608765, + "reward_std": 0.11124669015407562, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.37558209896087646, + "step": 1150 + }, + { + "completion_length": 148.578125, + "epoch": 0.3667941363926068, + "grad_norm": 12.177960395812988, + "kl": 0.134765625, + "learning_rate": 6.332058636073933e-07, + "loss": 0.0054, + "reward": 1.610792636871338, + "reward_std": 0.15256120264530182, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5014176964759827, + "rewards/pad": 0.125, + "step": 1151 + }, + { + "completion_length": 315.015625, + "epoch": 0.367112810707457, + "grad_norm": 5.850987911224365, + "kl": 0.0751953125, + "learning_rate": 6.328871892925431e-07, + "loss": 0.003, + "reward": 1.6577093601226807, + "reward_std": 0.04746227711439133, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40770936012268066, + "step": 1152 + }, + { + "completion_length": 322.078125, + "epoch": 0.3674314850223072, + "grad_norm": 12.73082447052002, + "kl": 0.08251953125, + "learning_rate": 6.325685149776928e-07, + "loss": 0.0033, + "reward": 1.3450021743774414, + "reward_std": 0.12023515999317169, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.36062711477279663, + "rewards/pad": 0.0, + "step": 1153 + }, + { + "completion_length": 214.46875, + "epoch": 0.36775015933715743, + "grad_norm": 20.796520233154297, + "kl": 0.10400390625, + "learning_rate": 6.322498406628426e-07, + "loss": 0.0042, + "reward": 1.5687785148620605, + "reward_std": 0.09632916003465652, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5687785148620605, + "step": 1154 + }, + { + "completion_length": 232.375, + "epoch": 0.36806883365200765, + "grad_norm": 9.744770050048828, + "kl": 0.1044921875, + "learning_rate": 6.319311663479924e-07, + "loss": 0.0042, + "reward": 1.6491217613220215, + "reward_std": 0.11243297159671783, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6491218209266663, + "rewards/pad": 0.0, + "step": 1155 + }, + { + "completion_length": 274.15625, + "epoch": 0.3683875079668579, + "grad_norm": 6.4882941246032715, + "kl": 0.0810546875, + "learning_rate": 6.316124920331422e-07, + "loss": 0.0032, + "reward": 1.8337061405181885, + "reward_std": 0.05185786634683609, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4587061405181885, + "step": 1156 + }, + { + "completion_length": 258.109375, + "epoch": 0.3687061822817081, + "grad_norm": 7.578658580780029, + "kl": 0.10400390625, + "learning_rate": 6.312938177182919e-07, + "loss": 0.0042, + "reward": 1.5650343894958496, + "reward_std": 0.1916087567806244, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.5962843894958496, + "step": 1157 + }, + { + "completion_length": 146.390625, + "epoch": 0.3690248565965583, + "grad_norm": 9.791213035583496, + "kl": 0.1318359375, + "learning_rate": 6.309751434034416e-07, + "loss": 0.0053, + "reward": 1.6399765014648438, + "reward_std": 0.12476305663585663, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6399766206741333, + "rewards/pad": 0.0, + "step": 1158 + }, + { + "completion_length": 354.1875, + "epoch": 0.36934353091140854, + "grad_norm": 7.83534049987793, + "kl": 0.0869140625, + "learning_rate": 6.306564690885914e-07, + "loss": 0.0035, + "reward": 1.4030874967575073, + "reward_std": 0.04663660749793053, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4030875563621521, + "step": 1159 + }, + { + "completion_length": 207.25, + "epoch": 0.36966220522625876, + "grad_norm": 6.344560146331787, + "kl": 0.1259765625, + "learning_rate": 6.303377947737412e-07, + "loss": 0.005, + "reward": 1.7244865894317627, + "reward_std": 0.12499900907278061, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.5057364702224731, + "step": 1160 + }, + { + "completion_length": 215.921875, + "epoch": 0.369980879541109, + "grad_norm": 9.869667053222656, + "kl": 0.11474609375, + "learning_rate": 6.300191204588909e-07, + "loss": 0.0046, + "reward": 1.5617637634277344, + "reward_std": 0.1978577971458435, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46801382303237915, + "rewards/pad": 0.109375, + "step": 1161 + }, + { + "completion_length": 308.4375, + "epoch": 0.3702995538559592, + "grad_norm": 7.597326755523682, + "kl": 0.08544921875, + "learning_rate": 6.297004461440407e-07, + "loss": 0.0034, + "reward": 1.5469553470611572, + "reward_std": 0.10091152042150497, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5625803470611572, + "rewards/pad": 0.0, + "step": 1162 + }, + { + "completion_length": 261.265625, + "epoch": 0.3706182281708094, + "grad_norm": 7.743888854980469, + "kl": 0.1064453125, + "learning_rate": 6.293817718291905e-07, + "loss": 0.0043, + "reward": 1.524780511856079, + "reward_std": 0.1921815574169159, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4779054522514343, + "rewards/pad": 0.0625, + "step": 1163 + }, + { + "completion_length": 361.046875, + "epoch": 0.37093690248565964, + "grad_norm": 4.7014617919921875, + "kl": 0.07568359375, + "learning_rate": 6.290630975143403e-07, + "loss": 0.003, + "reward": 1.4223251342773438, + "reward_std": 0.048480890691280365, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42232510447502136, + "step": 1164 + }, + { + "completion_length": 212.40625, + "epoch": 0.37125557680050986, + "grad_norm": 6.080229759216309, + "kl": 0.091796875, + "learning_rate": 6.2874442319949e-07, + "loss": 0.0037, + "reward": 1.5960010290145874, + "reward_std": 0.11411029100418091, + "rewards/answer_reward": 0.34375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.252250999212265, + "step": 1165 + }, + { + "completion_length": 353.0625, + "epoch": 0.3715742511153601, + "grad_norm": 12.403491973876953, + "kl": 0.06787109375, + "learning_rate": 6.284257488846398e-07, + "loss": 0.0027, + "reward": 1.3574970960617065, + "reward_std": 0.08834861218929291, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37312212586402893, + "rewards/pad": 0.0, + "step": 1166 + }, + { + "completion_length": 251.390625, + "epoch": 0.37189292543021035, + "grad_norm": 15.094289779663086, + "kl": 0.10595703125, + "learning_rate": 6.281070745697896e-07, + "loss": 0.0042, + "reward": 1.5393974781036377, + "reward_std": 0.14277897775173187, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5550224781036377, + "rewards/pad": 0.0, + "step": 1167 + }, + { + "completion_length": 219.515625, + "epoch": 0.3722115997450606, + "grad_norm": 10.170633316040039, + "kl": 0.11669921875, + "learning_rate": 6.277884002549393e-07, + "loss": 0.0047, + "reward": 1.4170877933502197, + "reward_std": 0.11784126609563828, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4327127933502197, + "step": 1168 + }, + { + "completion_length": 200.75, + "epoch": 0.3725302740599108, + "grad_norm": 14.906290054321289, + "kl": 0.1171875, + "learning_rate": 6.274697259400892e-07, + "loss": 0.0047, + "reward": 1.5319913625717163, + "reward_std": 0.11990997195243835, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4226164221763611, + "step": 1169 + }, + { + "completion_length": 267.609375, + "epoch": 0.372848948374761, + "grad_norm": 13.22391128540039, + "kl": 0.08251953125, + "learning_rate": 6.27151051625239e-07, + "loss": 0.0033, + "reward": 1.6729364395141602, + "reward_std": 0.1706743836402893, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5010614395141602, + "rewards/pad": 0.171875, + "step": 1170 + }, + { + "completion_length": 158.421875, + "epoch": 0.37316762268961123, + "grad_norm": 29.263126373291016, + "kl": 0.1357421875, + "learning_rate": 6.268323773103888e-07, + "loss": 0.0054, + "reward": 1.6222381591796875, + "reward_std": 0.1717788130044937, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6066131591796875, + "rewards/pad": 0.03125, + "step": 1171 + }, + { + "completion_length": 332.09375, + "epoch": 0.37348629700446145, + "grad_norm": 6.8517985343933105, + "kl": 0.06982421875, + "learning_rate": 6.265137029955385e-07, + "loss": 0.0028, + "reward": 1.7126264572143555, + "reward_std": 0.175262451171875, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.556376576423645, + "step": 1172 + }, + { + "completion_length": 221.40625, + "epoch": 0.3738049713193117, + "grad_norm": 9.87380599975586, + "kl": 0.1337890625, + "learning_rate": 6.261950286806883e-07, + "loss": 0.0053, + "reward": 1.3858520984649658, + "reward_std": 0.07210617512464523, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38585203886032104, + "rewards/pad": 0.0, + "step": 1173 + }, + { + "completion_length": 336.1875, + "epoch": 0.3741236456341619, + "grad_norm": 15.57335090637207, + "kl": 0.08837890625, + "learning_rate": 6.258763543658381e-07, + "loss": 0.0035, + "reward": 1.4978127479553223, + "reward_std": 0.18304958939552307, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4665627181529999, + "step": 1174 + }, + { + "completion_length": 386.296875, + "epoch": 0.3744423199490121, + "grad_norm": 6.890348434448242, + "kl": 0.078125, + "learning_rate": 6.255576800509879e-07, + "loss": 0.0031, + "reward": 1.4197403192520142, + "reward_std": 0.19700422883033752, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.32599037885665894, + "step": 1175 + }, + { + "completion_length": 226.4375, + "epoch": 0.37476099426386233, + "grad_norm": 7.337767124176025, + "kl": 0.1376953125, + "learning_rate": 6.252390057361376e-07, + "loss": 0.0055, + "reward": 1.4902740716934204, + "reward_std": 0.08827096223831177, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5058990120887756, + "step": 1176 + }, + { + "completion_length": 207.796875, + "epoch": 0.37507966857871256, + "grad_norm": 35.47562789916992, + "kl": 0.140625, + "learning_rate": 6.249203314212874e-07, + "loss": 0.0056, + "reward": 1.5208287239074707, + "reward_std": 0.1746503859758377, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5208288431167603, + "step": 1177 + }, + { + "completion_length": 177.046875, + "epoch": 0.3753983428935628, + "grad_norm": 8.447120666503906, + "kl": 0.1435546875, + "learning_rate": 6.246016571064372e-07, + "loss": 0.0058, + "reward": 1.6760151386260986, + "reward_std": 0.11283519864082336, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5510150790214539, + "rewards/pad": 0.125, + "step": 1178 + }, + { + "completion_length": 372.421875, + "epoch": 0.375717017208413, + "grad_norm": 8.715481758117676, + "kl": 0.099609375, + "learning_rate": 6.24282982791587e-07, + "loss": 0.004, + "reward": 1.5441935062408447, + "reward_std": 0.08572202920913696, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5598185658454895, + "step": 1179 + }, + { + "completion_length": 358.90625, + "epoch": 0.3760356915232632, + "grad_norm": 15.392694473266602, + "kl": 0.10986328125, + "learning_rate": 6.239643084767367e-07, + "loss": 0.0044, + "reward": 1.3878960609436035, + "reward_std": 0.08293549716472626, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4035210609436035, + "step": 1180 + }, + { + "completion_length": 277.671875, + "epoch": 0.37635436583811344, + "grad_norm": 188.2808380126953, + "kl": 0.130859375, + "learning_rate": 6.236456341618865e-07, + "loss": 0.0053, + "reward": 1.5231683254241943, + "reward_std": 0.1508413404226303, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5387933254241943, + "step": 1181 + }, + { + "completion_length": 415.734375, + "epoch": 0.37667304015296366, + "grad_norm": 10.059452056884766, + "kl": 0.0732421875, + "learning_rate": 6.233269598470363e-07, + "loss": 0.0029, + "reward": 1.50213623046875, + "reward_std": 0.04022324085235596, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37713623046875, + "step": 1182 + }, + { + "completion_length": 385.234375, + "epoch": 0.3769917144678139, + "grad_norm": 4.8774285316467285, + "kl": 0.08056640625, + "learning_rate": 6.230082855321861e-07, + "loss": 0.0032, + "reward": 1.5162938833236694, + "reward_std": 0.06330114603042603, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5162937641143799, + "step": 1183 + }, + { + "completion_length": 274.0625, + "epoch": 0.3773103887826641, + "grad_norm": 9.35615062713623, + "kl": 0.099609375, + "learning_rate": 6.226896112173358e-07, + "loss": 0.004, + "reward": 1.544995903968811, + "reward_std": 0.11332038044929504, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43562090396881104, + "step": 1184 + }, + { + "completion_length": 275.453125, + "epoch": 0.3776290630975143, + "grad_norm": 11.464639663696289, + "kl": 0.11328125, + "learning_rate": 6.223709369024856e-07, + "loss": 0.0045, + "reward": 1.4839555025100708, + "reward_std": 0.16924186050891876, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4995805025100708, + "rewards/pad": 0.0, + "step": 1185 + }, + { + "completion_length": 320.609375, + "epoch": 0.37794773741236454, + "grad_norm": 8.494017601013184, + "kl": 0.07470703125, + "learning_rate": 6.220522625876354e-07, + "loss": 0.003, + "reward": 1.5268003940582275, + "reward_std": 0.07187595218420029, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.38617539405822754, + "step": 1186 + }, + { + "completion_length": 152.828125, + "epoch": 0.3782664117272148, + "grad_norm": 11.815348625183105, + "kl": 0.138671875, + "learning_rate": 6.217335882727853e-07, + "loss": 0.0055, + "reward": 1.6774251461029053, + "reward_std": 0.10706285387277603, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.55242520570755, + "rewards/pad": 0.125, + "step": 1187 + }, + { + "completion_length": 235.515625, + "epoch": 0.37858508604206503, + "grad_norm": 13.886299133300781, + "kl": 0.12451171875, + "learning_rate": 6.21414913957935e-07, + "loss": 0.005, + "reward": 1.4992268085479736, + "reward_std": 0.1577257215976715, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4523519277572632, + "step": 1188 + }, + { + "completion_length": 262.796875, + "epoch": 0.37890376035691525, + "grad_norm": 7.124457359313965, + "kl": 0.087890625, + "learning_rate": 6.210962396430848e-07, + "loss": 0.0035, + "reward": 1.6487696170806885, + "reward_std": 0.11338422447443008, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4456444978713989, + "rewards/pad": 0.203125, + "step": 1189 + }, + { + "completion_length": 329.03125, + "epoch": 0.3792224346717655, + "grad_norm": 7.236481666564941, + "kl": 0.126953125, + "learning_rate": 6.207775653282346e-07, + "loss": 0.0051, + "reward": 1.5534617900848389, + "reward_std": 0.05858501046895981, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5534616708755493, + "step": 1190 + }, + { + "completion_length": 324.421875, + "epoch": 0.3795411089866157, + "grad_norm": 11.887310028076172, + "kl": 0.095703125, + "learning_rate": 6.204588910133844e-07, + "loss": 0.0038, + "reward": 1.5425376892089844, + "reward_std": 0.09648825228214264, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3081625998020172, + "rewards/pad": 0.25, + "step": 1191 + }, + { + "completion_length": 102.515625, + "epoch": 0.3798597833014659, + "grad_norm": 11.819649696350098, + "kl": 0.185546875, + "learning_rate": 6.201402166985341e-07, + "loss": 0.0074, + "reward": 1.5959168672561646, + "reward_std": 0.12162573635578156, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5959168672561646, + "rewards/pad": 0.0, + "step": 1192 + }, + { + "completion_length": 342.609375, + "epoch": 0.38017845761631613, + "grad_norm": 12.393624305725098, + "kl": 0.07470703125, + "learning_rate": 6.198215423836839e-07, + "loss": 0.003, + "reward": 1.4370999336242676, + "reward_std": 0.09190460294485092, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4370998442173004, + "step": 1193 + }, + { + "completion_length": 262.828125, + "epoch": 0.38049713193116635, + "grad_norm": 7.728225231170654, + "kl": 0.11767578125, + "learning_rate": 6.195028680688337e-07, + "loss": 0.0047, + "reward": 1.4948556423187256, + "reward_std": 0.07122111320495605, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3698556125164032, + "step": 1194 + }, + { + "completion_length": 217.4375, + "epoch": 0.3808158062460166, + "grad_norm": 9.003069877624512, + "kl": 0.142578125, + "learning_rate": 6.191841937539835e-07, + "loss": 0.0057, + "reward": 1.6949548721313477, + "reward_std": 0.19778338074684143, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5699548721313477, + "rewards/pad": 0.140625, + "step": 1195 + }, + { + "completion_length": 205.515625, + "epoch": 0.3811344805608668, + "grad_norm": 10.357717514038086, + "kl": 0.1279296875, + "learning_rate": 6.188655194391332e-07, + "loss": 0.0051, + "reward": 1.5979564189910889, + "reward_std": 0.1371481865644455, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6135815382003784, + "rewards/pad": 0.0, + "step": 1196 + }, + { + "completion_length": 144.28125, + "epoch": 0.381453154875717, + "grad_norm": 25.7012939453125, + "kl": 0.15234375, + "learning_rate": 6.185468451242829e-07, + "loss": 0.0061, + "reward": 1.5692253112792969, + "reward_std": 0.08479062467813492, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4442253112792969, + "rewards/pad": 0.125, + "step": 1197 + }, + { + "completion_length": 235.46875, + "epoch": 0.38177182919056724, + "grad_norm": 9.451396942138672, + "kl": 0.1240234375, + "learning_rate": 6.182281708094327e-07, + "loss": 0.005, + "reward": 1.5140479803085327, + "reward_std": 0.047571178525686264, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5140478610992432, + "step": 1198 + }, + { + "completion_length": 328.328125, + "epoch": 0.38209050350541746, + "grad_norm": 6.12078332901001, + "kl": 0.06396484375, + "learning_rate": 6.179094964945824e-07, + "loss": 0.0026, + "reward": 1.356693983078003, + "reward_std": 0.11052098125219345, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.23169396817684174, + "step": 1199 + }, + { + "completion_length": 233.75, + "epoch": 0.3824091778202677, + "grad_norm": 13.038230895996094, + "kl": 0.10693359375, + "learning_rate": 6.175908221797322e-07, + "loss": 0.0043, + "reward": 1.5692347288131714, + "reward_std": 0.107730433344841, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4286096692085266, + "rewards/pad": 0.140625, + "step": 1200 + }, + { + "completion_length": 378.09375, + "epoch": 0.3827278521351179, + "grad_norm": 15.127769470214844, + "kl": 0.0654296875, + "learning_rate": 6.17272147864882e-07, + "loss": 0.0026, + "reward": 1.5065699815750122, + "reward_std": 0.14783266186714172, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4440700113773346, + "step": 1201 + }, + { + "completion_length": 332.796875, + "epoch": 0.3830465264499681, + "grad_norm": 7.491520881652832, + "kl": 0.083984375, + "learning_rate": 6.169534735500318e-07, + "loss": 0.0034, + "reward": 1.448569893836975, + "reward_std": 0.13246291875839233, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3548198640346527, + "step": 1202 + }, + { + "completion_length": 346.9375, + "epoch": 0.38336520076481834, + "grad_norm": 9.888702392578125, + "kl": 0.08984375, + "learning_rate": 6.166347992351815e-07, + "loss": 0.0036, + "reward": 1.416809320449829, + "reward_std": 0.20294572412967682, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.4636843800544739, + "step": 1203 + }, + { + "completion_length": 200.375, + "epoch": 0.38368387507966856, + "grad_norm": 14.056575775146484, + "kl": 0.126953125, + "learning_rate": 6.163161249203313e-07, + "loss": 0.0051, + "reward": 1.5999902486801147, + "reward_std": 0.06742437183856964, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47499027848243713, + "rewards/pad": 0.125, + "step": 1204 + }, + { + "completion_length": 347.796875, + "epoch": 0.3840025493945188, + "grad_norm": 7.76228141784668, + "kl": 0.0732421875, + "learning_rate": 6.159974506054811e-07, + "loss": 0.0029, + "reward": 1.6015219688415527, + "reward_std": 0.1911095827817917, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5077719688415527, + "step": 1205 + }, + { + "completion_length": 294.546875, + "epoch": 0.384321223709369, + "grad_norm": 15.17544937133789, + "kl": 0.103515625, + "learning_rate": 6.15678776290631e-07, + "loss": 0.0041, + "reward": 1.4424169063568115, + "reward_std": 0.07422250509262085, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4424169063568115, + "step": 1206 + }, + { + "completion_length": 97.765625, + "epoch": 0.3846398980242193, + "grad_norm": 11.784152030944824, + "kl": 0.16015625, + "learning_rate": 6.153601019757807e-07, + "loss": 0.0064, + "reward": 1.7995710372924805, + "reward_std": 0.1624748706817627, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.45582109689712524, + "step": 1207 + }, + { + "completion_length": 192.078125, + "epoch": 0.3849585723390695, + "grad_norm": 19.06814956665039, + "kl": 0.1416015625, + "learning_rate": 6.150414276609305e-07, + "loss": 0.0056, + "reward": 1.4531996250152588, + "reward_std": 0.13103961944580078, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.406324565410614, + "rewards/pad": 0.046875, + "step": 1208 + }, + { + "completion_length": 281.359375, + "epoch": 0.3852772466539197, + "grad_norm": 9.539800643920898, + "kl": 0.0908203125, + "learning_rate": 6.147227533460803e-07, + "loss": 0.0036, + "reward": 1.3577734231948853, + "reward_std": 0.2437555491924286, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.3577733635902405, + "rewards/pad": 0.046875, + "step": 1209 + }, + { + "completion_length": 295.640625, + "epoch": 0.38559592096876993, + "grad_norm": 7.663999557495117, + "kl": 0.0869140625, + "learning_rate": 6.144040790312301e-07, + "loss": 0.0035, + "reward": 1.4731587171554565, + "reward_std": 0.11033116281032562, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5044087171554565, + "step": 1210 + }, + { + "completion_length": 198.0625, + "epoch": 0.38591459528362015, + "grad_norm": 12.535527229309082, + "kl": 0.1357421875, + "learning_rate": 6.140854047163798e-07, + "loss": 0.0054, + "reward": 1.341176152229309, + "reward_std": 0.06295371055603027, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3411761522293091, + "step": 1211 + }, + { + "completion_length": 309.921875, + "epoch": 0.3862332695984704, + "grad_norm": 6.9051103591918945, + "kl": 0.08984375, + "learning_rate": 6.137667304015296e-07, + "loss": 0.0036, + "reward": 1.4991049766540527, + "reward_std": 0.04289538785815239, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49910494685173035, + "rewards/pad": 0.0, + "step": 1212 + }, + { + "completion_length": 203.25, + "epoch": 0.3865519439133206, + "grad_norm": 15.828720092773438, + "kl": 0.1494140625, + "learning_rate": 6.134480560866794e-07, + "loss": 0.006, + "reward": 1.5709058046340942, + "reward_std": 0.15641742944717407, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46153074502944946, + "rewards/pad": 0.109375, + "step": 1213 + }, + { + "completion_length": 192.6875, + "epoch": 0.3868706182281708, + "grad_norm": 10.661425590515137, + "kl": 0.119140625, + "learning_rate": 6.131293817718292e-07, + "loss": 0.0048, + "reward": 1.7105867862701416, + "reward_std": 0.18071383237838745, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6324617862701416, + "rewards/pad": 0.09375, + "step": 1214 + }, + { + "completion_length": 303.71875, + "epoch": 0.38718929254302104, + "grad_norm": 11.888273239135742, + "kl": 0.10205078125, + "learning_rate": 6.128107074569789e-07, + "loss": 0.0041, + "reward": 1.4614834785461426, + "reward_std": 0.15480071306228638, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4302334785461426, + "step": 1215 + }, + { + "completion_length": 373.265625, + "epoch": 0.38750796685787126, + "grad_norm": 6.659944534301758, + "kl": 0.0654296875, + "learning_rate": 6.124920331421287e-07, + "loss": 0.0026, + "reward": 1.3879508972167969, + "reward_std": 0.03459502011537552, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38795098662376404, + "rewards/pad": 0.0, + "step": 1216 + }, + { + "completion_length": 281.328125, + "epoch": 0.3878266411727215, + "grad_norm": 18.89515495300293, + "kl": 0.0869140625, + "learning_rate": 6.121733588272785e-07, + "loss": 0.0035, + "reward": 1.3299328088760376, + "reward_std": 0.08711449801921844, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3299327492713928, + "step": 1217 + }, + { + "completion_length": 273.03125, + "epoch": 0.3881453154875717, + "grad_norm": 36.573307037353516, + "kl": 0.10107421875, + "learning_rate": 6.118546845124283e-07, + "loss": 0.004, + "reward": 1.700791835784912, + "reward_std": 0.0859699547290802, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5914169549942017, + "rewards/pad": 0.109375, + "step": 1218 + }, + { + "completion_length": 284.71875, + "epoch": 0.3884639898024219, + "grad_norm": 7.265768527984619, + "kl": 0.11083984375, + "learning_rate": 6.11536010197578e-07, + "loss": 0.0044, + "reward": 1.5254764556884766, + "reward_std": 0.04545580968260765, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5254765152931213, + "step": 1219 + }, + { + "completion_length": 408.3125, + "epoch": 0.38878266411727214, + "grad_norm": 3.9388315677642822, + "kl": 0.0654296875, + "learning_rate": 6.112173358827278e-07, + "loss": 0.0026, + "reward": 1.3649344444274902, + "reward_std": 0.04178182780742645, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.364934504032135, + "step": 1220 + }, + { + "completion_length": 349.359375, + "epoch": 0.38910133843212236, + "grad_norm": 5.0478973388671875, + "kl": 0.07080078125, + "learning_rate": 6.108986615678776e-07, + "loss": 0.0028, + "reward": 1.4779832363128662, + "reward_std": 0.0777759924530983, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3686083257198334, + "step": 1221 + }, + { + "completion_length": 308.421875, + "epoch": 0.3894200127469726, + "grad_norm": 6.850950717926025, + "kl": 0.09375, + "learning_rate": 6.105799872530274e-07, + "loss": 0.0038, + "reward": 1.4696778059005737, + "reward_std": 0.06025514006614685, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46967771649360657, + "rewards/pad": 0.0, + "step": 1222 + }, + { + "completion_length": 224.59375, + "epoch": 0.3897386870618228, + "grad_norm": 9.790903091430664, + "kl": 0.095703125, + "learning_rate": 6.102613129381771e-07, + "loss": 0.0038, + "reward": 1.7676475048065186, + "reward_std": 0.19071504473686218, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.45514750480651855, + "step": 1223 + }, + { + "completion_length": 252.84375, + "epoch": 0.390057361376673, + "grad_norm": 15.577993392944336, + "kl": 0.09814453125, + "learning_rate": 6.09942638623327e-07, + "loss": 0.0039, + "reward": 1.5877152681350708, + "reward_std": 0.13476546108722687, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3377152681350708, + "step": 1224 + }, + { + "completion_length": 228.5625, + "epoch": 0.39037603569152324, + "grad_norm": 10.700752258300781, + "kl": 0.11669921875, + "learning_rate": 6.096239643084768e-07, + "loss": 0.0047, + "reward": 1.5201473236083984, + "reward_std": 0.09742455929517746, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5201473832130432, + "rewards/pad": 0.0, + "step": 1225 + }, + { + "completion_length": 255.1875, + "epoch": 0.3906947100063735, + "grad_norm": 24.338665008544922, + "kl": 0.09814453125, + "learning_rate": 6.093052899936266e-07, + "loss": 0.0039, + "reward": 1.564429759979248, + "reward_std": 0.15615159273147583, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47067975997924805, + "rewards/pad": 0.109375, + "step": 1226 + }, + { + "completion_length": 262.765625, + "epoch": 0.39101338432122373, + "grad_norm": 17.822742462158203, + "kl": 0.09814453125, + "learning_rate": 6.089866156787763e-07, + "loss": 0.0039, + "reward": 1.5323125123977661, + "reward_std": 0.09317489713430405, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5323124527931213, + "step": 1227 + }, + { + "completion_length": 167.21875, + "epoch": 0.39133205863607395, + "grad_norm": 18.925622940063477, + "kl": 0.1201171875, + "learning_rate": 6.086679413639261e-07, + "loss": 0.0048, + "reward": 1.6316561698913574, + "reward_std": 0.15108171105384827, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41290608048439026, + "step": 1228 + }, + { + "completion_length": 367.5, + "epoch": 0.3916507329509242, + "grad_norm": 5.661565780639648, + "kl": 0.0732421875, + "learning_rate": 6.083492670490759e-07, + "loss": 0.0029, + "reward": 1.4048430919647217, + "reward_std": 0.11436806619167328, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.29546812176704407, + "step": 1229 + }, + { + "completion_length": 165.53125, + "epoch": 0.3919694072657744, + "grad_norm": 11.724607467651367, + "kl": 0.146484375, + "learning_rate": 6.080305927342257e-07, + "loss": 0.0059, + "reward": 1.4511432647705078, + "reward_std": 0.08234697580337524, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.451143354177475, + "rewards/pad": 0.0, + "step": 1230 + }, + { + "completion_length": 284.28125, + "epoch": 0.3922880815806246, + "grad_norm": 7.128300666809082, + "kl": 0.07763671875, + "learning_rate": 6.077119184193754e-07, + "loss": 0.0031, + "reward": 1.6145113706588745, + "reward_std": 0.10974004864692688, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5051363110542297, + "step": 1231 + }, + { + "completion_length": 295.9375, + "epoch": 0.39260675589547483, + "grad_norm": 8.443071365356445, + "kl": 0.08984375, + "learning_rate": 6.073932441045252e-07, + "loss": 0.0036, + "reward": 1.4106472730636597, + "reward_std": 0.16361010074615479, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4418972134590149, + "rewards/pad": 0.0, + "step": 1232 + }, + { + "completion_length": 194.109375, + "epoch": 0.39292543021032506, + "grad_norm": 13.870003700256348, + "kl": 0.1279296875, + "learning_rate": 6.07074569789675e-07, + "loss": 0.0051, + "reward": 1.374561071395874, + "reward_std": 0.06559236347675323, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.374561071395874, + "rewards/pad": 0.0, + "step": 1233 + }, + { + "completion_length": 285.28125, + "epoch": 0.3932441045251753, + "grad_norm": 11.3886137008667, + "kl": 0.07421875, + "learning_rate": 6.067558954748247e-07, + "loss": 0.003, + "reward": 1.4970085620880127, + "reward_std": 0.15465247631072998, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3563835024833679, + "step": 1234 + }, + { + "completion_length": 303.234375, + "epoch": 0.3935627788400255, + "grad_norm": 6.8807148933410645, + "kl": 0.0830078125, + "learning_rate": 6.064372211599745e-07, + "loss": 0.0033, + "reward": 1.4436665773391724, + "reward_std": 0.08115865290164948, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44366663694381714, + "step": 1235 + }, + { + "completion_length": 360.5, + "epoch": 0.3938814531548757, + "grad_norm": 162.79302978515625, + "kl": 0.060546875, + "learning_rate": 6.061185468451242e-07, + "loss": 0.0024, + "reward": 1.3667831420898438, + "reward_std": 0.01843656599521637, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.24178317189216614, + "step": 1236 + }, + { + "completion_length": 240.015625, + "epoch": 0.39420012746972594, + "grad_norm": 8.241400718688965, + "kl": 0.0927734375, + "learning_rate": 6.05799872530274e-07, + "loss": 0.0037, + "reward": 1.536606788635254, + "reward_std": 0.16440913081169128, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48973172903060913, + "step": 1237 + }, + { + "completion_length": 215.53125, + "epoch": 0.39451880178457616, + "grad_norm": 37.798824310302734, + "kl": 0.1005859375, + "learning_rate": 6.054811982154237e-07, + "loss": 0.004, + "reward": 1.6488937139511108, + "reward_std": 0.1047549918293953, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5238937139511108, + "step": 1238 + }, + { + "completion_length": 248.921875, + "epoch": 0.3948374760994264, + "grad_norm": 19.88370132446289, + "kl": 0.12451171875, + "learning_rate": 6.051625239005735e-07, + "loss": 0.005, + "reward": 1.4146902561187744, + "reward_std": 0.1598891019821167, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3678152859210968, + "rewards/pad": 0.0625, + "step": 1239 + }, + { + "completion_length": 303.359375, + "epoch": 0.3951561504142766, + "grad_norm": 18.375629425048828, + "kl": 0.087890625, + "learning_rate": 6.048438495857233e-07, + "loss": 0.0035, + "reward": 1.5469294786453247, + "reward_std": 0.06787046790122986, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5469294190406799, + "rewards/pad": 0.0, + "step": 1240 + }, + { + "completion_length": 195.78125, + "epoch": 0.3954748247291268, + "grad_norm": 13.132492065429688, + "kl": 0.1123046875, + "learning_rate": 6.045251752708731e-07, + "loss": 0.0045, + "reward": 1.5397614240646362, + "reward_std": 0.14754366874694824, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.44601136445999146, + "rewards/pad": 0.125, + "step": 1241 + }, + { + "completion_length": 203.59375, + "epoch": 0.39579349904397704, + "grad_norm": 16.14130210876465, + "kl": 0.09033203125, + "learning_rate": 6.042065009560228e-07, + "loss": 0.0036, + "reward": 1.5089311599731445, + "reward_std": 0.13677260279655457, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.383931040763855, + "step": 1242 + }, + { + "completion_length": 341.375, + "epoch": 0.39611217335882726, + "grad_norm": 7.483922004699707, + "kl": 0.0654296875, + "learning_rate": 6.038878266411726e-07, + "loss": 0.0026, + "reward": 1.6305183172225952, + "reward_std": 0.04471452906727791, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6305183172225952, + "rewards/pad": 0.0, + "step": 1243 + }, + { + "completion_length": 385.09375, + "epoch": 0.3964308476736775, + "grad_norm": 4.253243446350098, + "kl": 0.058837890625, + "learning_rate": 6.035691523263225e-07, + "loss": 0.0024, + "reward": 1.4404851198196411, + "reward_std": 0.09122411906719208, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3936101198196411, + "step": 1244 + }, + { + "completion_length": 324.625, + "epoch": 0.3967495219885277, + "grad_norm": 12.449899673461914, + "kl": 0.09130859375, + "learning_rate": 6.032504780114723e-07, + "loss": 0.0036, + "reward": 1.3082168102264404, + "reward_std": 0.1398642361164093, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3238418400287628, + "step": 1245 + }, + { + "completion_length": 205.71875, + "epoch": 0.397068196303378, + "grad_norm": 5.796684741973877, + "kl": 0.09912109375, + "learning_rate": 6.02931803696622e-07, + "loss": 0.004, + "reward": 1.5298190116882324, + "reward_std": 0.06253817677497864, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4048190116882324, + "rewards/pad": 0.125, + "step": 1246 + }, + { + "completion_length": 254.40625, + "epoch": 0.3973868706182282, + "grad_norm": 8.309307098388672, + "kl": 0.1572265625, + "learning_rate": 6.026131293817718e-07, + "loss": 0.0063, + "reward": 1.5799627304077148, + "reward_std": 0.06893587857484818, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4549628496170044, + "step": 1247 + }, + { + "completion_length": 227.40625, + "epoch": 0.3977055449330784, + "grad_norm": 15.03865909576416, + "kl": 0.10302734375, + "learning_rate": 6.022944550669216e-07, + "loss": 0.0041, + "reward": 1.6695342063903809, + "reward_std": 0.12210407853126526, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5445340871810913, + "rewards/pad": 0.125, + "step": 1248 + }, + { + "completion_length": 195.484375, + "epoch": 0.39802421924792863, + "grad_norm": 12.202496528625488, + "kl": 0.09912109375, + "learning_rate": 6.019757807520714e-07, + "loss": 0.004, + "reward": 1.6603246927261353, + "reward_std": 0.17108845710754395, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5196996927261353, + "step": 1249 + }, + { + "completion_length": 404.640625, + "epoch": 0.39834289356277885, + "grad_norm": 4.665548324584961, + "kl": 0.055419921875, + "learning_rate": 6.016571064372211e-07, + "loss": 0.0022, + "reward": 1.3473976850509644, + "reward_std": 0.03374429792165756, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34739768505096436, + "step": 1250 + }, + { + "completion_length": 306.078125, + "epoch": 0.3986615678776291, + "grad_norm": 11.606754302978516, + "kl": 0.07763671875, + "learning_rate": 6.013384321223709e-07, + "loss": 0.0031, + "reward": 1.5133811235427856, + "reward_std": 0.029662977904081345, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5133810639381409, + "rewards/pad": 0.0, + "step": 1251 + }, + { + "completion_length": 312.015625, + "epoch": 0.3989802421924793, + "grad_norm": 6.143039226531982, + "kl": 0.08203125, + "learning_rate": 6.010197578075207e-07, + "loss": 0.0033, + "reward": 1.588003158569336, + "reward_std": 0.06406011432409286, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5880030989646912, + "step": 1252 + }, + { + "completion_length": 233.0, + "epoch": 0.3992989165073295, + "grad_norm": 87.68891143798828, + "kl": 0.41796875, + "learning_rate": 6.007010834926705e-07, + "loss": 0.0168, + "reward": 1.5894889831542969, + "reward_std": 0.13853877782821655, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4801139235496521, + "step": 1253 + }, + { + "completion_length": 368.859375, + "epoch": 0.39961759082217974, + "grad_norm": 5.574906826019287, + "kl": 0.049072265625, + "learning_rate": 6.003824091778202e-07, + "loss": 0.002, + "reward": 1.417471170425415, + "reward_std": 0.07500630617141724, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43309611082077026, + "step": 1254 + }, + { + "completion_length": 430.4375, + "epoch": 0.39993626513702996, + "grad_norm": 4.724660873413086, + "kl": 0.042236328125, + "learning_rate": 6.0006373486297e-07, + "loss": 0.0017, + "reward": 1.5073049068450928, + "reward_std": 0.042998362332582474, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38230496644973755, + "step": 1255 + }, + { + "completion_length": 270.03125, + "epoch": 0.4002549394518802, + "grad_norm": 10.774860382080078, + "kl": 0.09423828125, + "learning_rate": 5.997450605481198e-07, + "loss": 0.0038, + "reward": 1.4578254222869873, + "reward_std": 0.06299932301044464, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4578254222869873, + "step": 1256 + }, + { + "completion_length": 343.953125, + "epoch": 0.4005736137667304, + "grad_norm": 19.947551727294922, + "kl": 0.05908203125, + "learning_rate": 5.994263862332696e-07, + "loss": 0.0024, + "reward": 1.423647403717041, + "reward_std": 0.10177726298570633, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36114731431007385, + "step": 1257 + }, + { + "completion_length": 260.625, + "epoch": 0.4008922880815806, + "grad_norm": 13.201912879943848, + "kl": 0.0927734375, + "learning_rate": 5.991077119184193e-07, + "loss": 0.0037, + "reward": 1.6356565952301025, + "reward_std": 0.0843108594417572, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5106565952301025, + "step": 1258 + }, + { + "completion_length": 246.203125, + "epoch": 0.40121096239643084, + "grad_norm": 10.583910942077637, + "kl": 0.11572265625, + "learning_rate": 5.987890376035691e-07, + "loss": 0.0046, + "reward": 1.322936773300171, + "reward_std": 0.09079517424106598, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3229368031024933, + "rewards/pad": 0.0, + "step": 1259 + }, + { + "completion_length": 252.765625, + "epoch": 0.40152963671128106, + "grad_norm": 14.975433349609375, + "kl": 0.08544921875, + "learning_rate": 5.984703632887189e-07, + "loss": 0.0034, + "reward": 1.4862103462219238, + "reward_std": 0.04628577083349228, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48621028661727905, + "rewards/pad": 0.0, + "step": 1260 + }, + { + "completion_length": 273.09375, + "epoch": 0.4018483110261313, + "grad_norm": 9.01754379272461, + "kl": 0.07568359375, + "learning_rate": 5.981516889738687e-07, + "loss": 0.003, + "reward": 1.4627516269683838, + "reward_std": 0.18093456327915192, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3533766269683838, + "step": 1261 + }, + { + "completion_length": 236.515625, + "epoch": 0.4021669853409815, + "grad_norm": 10.139057159423828, + "kl": 0.080078125, + "learning_rate": 5.978330146590184e-07, + "loss": 0.0032, + "reward": 1.6219439506530762, + "reward_std": 0.14522536098957062, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37194395065307617, + "step": 1262 + }, + { + "completion_length": 364.71875, + "epoch": 0.4024856596558317, + "grad_norm": 23.10379981994629, + "kl": 0.06689453125, + "learning_rate": 5.975143403441683e-07, + "loss": 0.0027, + "reward": 1.3567779064178467, + "reward_std": 0.0712767094373703, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.35677778720855713, + "step": 1263 + }, + { + "completion_length": 208.328125, + "epoch": 0.40280433397068194, + "grad_norm": 18.585952758789062, + "kl": 0.10546875, + "learning_rate": 5.971956660293181e-07, + "loss": 0.0042, + "reward": 1.637681245803833, + "reward_std": 0.1087605282664299, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.528306245803833, + "rewards/pad": 0.125, + "step": 1264 + }, + { + "completion_length": 301.5, + "epoch": 0.40312300828553216, + "grad_norm": 8.328240394592285, + "kl": 0.09130859375, + "learning_rate": 5.968769917144678e-07, + "loss": 0.0037, + "reward": 1.3431166410446167, + "reward_std": 0.1047779768705368, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3587416112422943, + "step": 1265 + }, + { + "completion_length": 238.015625, + "epoch": 0.40344168260038243, + "grad_norm": 9.131882667541504, + "kl": 0.0966796875, + "learning_rate": 5.965583173996176e-07, + "loss": 0.0039, + "reward": 1.5039496421813965, + "reward_std": 0.0670909658074379, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5039497017860413, + "step": 1266 + }, + { + "completion_length": 262.109375, + "epoch": 0.40376035691523265, + "grad_norm": 9.783851623535156, + "kl": 0.083984375, + "learning_rate": 5.962396430847674e-07, + "loss": 0.0034, + "reward": 1.7425625324249268, + "reward_std": 0.08505283296108246, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49256250262260437, + "rewards/pad": 0.25, + "step": 1267 + }, + { + "completion_length": 196.40625, + "epoch": 0.4040790312300829, + "grad_norm": 17.94754409790039, + "kl": 0.10888671875, + "learning_rate": 5.959209687699172e-07, + "loss": 0.0044, + "reward": 1.6453616619110107, + "reward_std": 0.0707654058933258, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5203617215156555, + "step": 1268 + }, + { + "completion_length": 204.0, + "epoch": 0.4043977055449331, + "grad_norm": 8.018806457519531, + "kl": 0.09912109375, + "learning_rate": 5.956022944550669e-07, + "loss": 0.004, + "reward": 1.431675910949707, + "reward_std": 0.05108753964304924, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.30667582154273987, + "rewards/pad": 0.125, + "step": 1269 + }, + { + "completion_length": 338.890625, + "epoch": 0.4047163798597833, + "grad_norm": 7.1575703620910645, + "kl": 0.06640625, + "learning_rate": 5.952836201402167e-07, + "loss": 0.0027, + "reward": 1.3842601776123047, + "reward_std": 0.03410865738987923, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3842601478099823, + "rewards/pad": 0.0, + "step": 1270 + }, + { + "completion_length": 289.5, + "epoch": 0.40503505417463354, + "grad_norm": 8.842584609985352, + "kl": 0.08544921875, + "learning_rate": 5.949649458253665e-07, + "loss": 0.0034, + "reward": 1.524345874786377, + "reward_std": 0.05037612468004227, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5243459939956665, + "step": 1271 + }, + { + "completion_length": 138.84375, + "epoch": 0.40535372848948376, + "grad_norm": 12.332841873168945, + "kl": 0.130859375, + "learning_rate": 5.946462715105163e-07, + "loss": 0.0053, + "reward": 1.454542875289917, + "reward_std": 0.11645975708961487, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45454293489456177, + "rewards/pad": 0.0, + "step": 1272 + }, + { + "completion_length": 91.953125, + "epoch": 0.405672402804334, + "grad_norm": 11.617120742797852, + "kl": 0.138671875, + "learning_rate": 5.94327597195666e-07, + "loss": 0.0055, + "reward": 1.7734625339508057, + "reward_std": 0.0727016031742096, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6484625339508057, + "rewards/pad": 0.125, + "step": 1273 + }, + { + "completion_length": 254.703125, + "epoch": 0.4059910771191842, + "grad_norm": 18.43532943725586, + "kl": 0.08837890625, + "learning_rate": 5.940089228808158e-07, + "loss": 0.0035, + "reward": 1.6537601947784424, + "reward_std": 0.12687700986862183, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5912603139877319, + "rewards/pad": 0.0625, + "step": 1274 + }, + { + "completion_length": 222.0, + "epoch": 0.4063097514340344, + "grad_norm": 10.08658504486084, + "kl": 0.10498046875, + "learning_rate": 5.936902485659655e-07, + "loss": 0.0042, + "reward": 1.674185872077942, + "reward_std": 0.1543225795030594, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5648109316825867, + "step": 1275 + }, + { + "completion_length": 333.1875, + "epoch": 0.40662842574888464, + "grad_norm": 19.096281051635742, + "kl": 0.078125, + "learning_rate": 5.933715742511153e-07, + "loss": 0.0031, + "reward": 1.3640334606170654, + "reward_std": 0.04004434123635292, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36403337121009827, + "step": 1276 + }, + { + "completion_length": 152.875, + "epoch": 0.40694710006373486, + "grad_norm": 8.920004844665527, + "kl": 0.12353515625, + "learning_rate": 5.93052899936265e-07, + "loss": 0.0049, + "reward": 1.5706473588943481, + "reward_std": 0.10923536121845245, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5706473588943481, + "rewards/pad": 0.0, + "step": 1277 + }, + { + "completion_length": 246.859375, + "epoch": 0.4072657743785851, + "grad_norm": 13.205419540405273, + "kl": 0.09228515625, + "learning_rate": 5.927342256214148e-07, + "loss": 0.0037, + "reward": 1.4794493913650513, + "reward_std": 0.05123839154839516, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47944939136505127, + "rewards/pad": 0.0, + "step": 1278 + }, + { + "completion_length": 253.328125, + "epoch": 0.4075844486934353, + "grad_norm": 6.297164440155029, + "kl": 0.11376953125, + "learning_rate": 5.924155513065646e-07, + "loss": 0.0046, + "reward": 1.3850655555725098, + "reward_std": 0.07302466779947281, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38506558537483215, + "rewards/pad": 0.0, + "step": 1279 + }, + { + "completion_length": 202.90625, + "epoch": 0.4079031230082855, + "grad_norm": 4.877528190612793, + "kl": 0.1123046875, + "learning_rate": 5.920968769917144e-07, + "loss": 0.0045, + "reward": 1.4636024236679077, + "reward_std": 0.10132384300231934, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3386024236679077, + "rewards/pad": 0.125, + "step": 1280 + }, + { + "completion_length": 256.96875, + "epoch": 0.40822179732313574, + "grad_norm": 16.753814697265625, + "kl": 0.0966796875, + "learning_rate": 5.917782026768641e-07, + "loss": 0.0039, + "reward": 1.5623836517333984, + "reward_std": 0.08463288843631744, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5623837113380432, + "rewards/pad": 0.0, + "step": 1281 + }, + { + "completion_length": 140.40625, + "epoch": 0.40854047163798596, + "grad_norm": 8.194445610046387, + "kl": 0.11279296875, + "learning_rate": 5.91459528362014e-07, + "loss": 0.0045, + "reward": 1.5440839529037476, + "reward_std": 0.10215876996517181, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41908398270606995, + "rewards/pad": 0.125, + "step": 1282 + }, + { + "completion_length": 308.765625, + "epoch": 0.4088591459528362, + "grad_norm": 6.8862385749816895, + "kl": 0.0859375, + "learning_rate": 5.911408540471638e-07, + "loss": 0.0035, + "reward": 1.499886155128479, + "reward_std": 0.09754335880279541, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3905111253261566, + "step": 1283 + }, + { + "completion_length": 205.046875, + "epoch": 0.4091778202676864, + "grad_norm": 13.631743431091309, + "kl": 0.09619140625, + "learning_rate": 5.908221797323136e-07, + "loss": 0.0038, + "reward": 1.5217753648757935, + "reward_std": 0.1388101577758789, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5217753648757935, + "rewards/pad": 0.0, + "step": 1284 + }, + { + "completion_length": 339.546875, + "epoch": 0.4094964945825367, + "grad_norm": 5.26681661605835, + "kl": 0.061767578125, + "learning_rate": 5.905035054174633e-07, + "loss": 0.0025, + "reward": 1.5010402202606201, + "reward_std": 0.04901232570409775, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3760402202606201, + "rewards/pad": 0.125, + "step": 1285 + }, + { + "completion_length": 380.78125, + "epoch": 0.4098151688973869, + "grad_norm": 7.436905384063721, + "kl": 0.068359375, + "learning_rate": 5.901848311026131e-07, + "loss": 0.0027, + "reward": 1.6161510944366455, + "reward_std": 0.2011728286743164, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5380261540412903, + "step": 1286 + }, + { + "completion_length": 266.265625, + "epoch": 0.4101338432122371, + "grad_norm": 13.028887748718262, + "kl": 0.08935546875, + "learning_rate": 5.898661567877629e-07, + "loss": 0.0036, + "reward": 1.3868389129638672, + "reward_std": 0.04168622940778732, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3868389427661896, + "rewards/pad": 0.0, + "step": 1287 + }, + { + "completion_length": 374.234375, + "epoch": 0.41045251752708733, + "grad_norm": 8.806180000305176, + "kl": 0.06396484375, + "learning_rate": 5.895474824729127e-07, + "loss": 0.0026, + "reward": 1.3828778266906738, + "reward_std": 0.15111730992794037, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.41412773728370667, + "step": 1288 + }, + { + "completion_length": 309.984375, + "epoch": 0.41077119184193756, + "grad_norm": 5.311010360717773, + "kl": 0.06982421875, + "learning_rate": 5.892288081580624e-07, + "loss": 0.0028, + "reward": 1.6289262771606445, + "reward_std": 0.061325907707214355, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.503926157951355, + "step": 1289 + }, + { + "completion_length": 310.125, + "epoch": 0.4110898661567878, + "grad_norm": 6.758955001831055, + "kl": 0.08203125, + "learning_rate": 5.889101338432122e-07, + "loss": 0.0033, + "reward": 1.640629529953003, + "reward_std": 0.10892556607723236, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48437952995300293, + "step": 1290 + }, + { + "completion_length": 323.109375, + "epoch": 0.411408540471638, + "grad_norm": 4.773209571838379, + "kl": 0.0771484375, + "learning_rate": 5.88591459528362e-07, + "loss": 0.0031, + "reward": 1.4355682134628296, + "reward_std": 0.15114247798919678, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4668181538581848, + "rewards/pad": 0.0, + "step": 1291 + }, + { + "completion_length": 267.65625, + "epoch": 0.4117272147864882, + "grad_norm": 10.178011894226074, + "kl": 0.09326171875, + "learning_rate": 5.882727852135117e-07, + "loss": 0.0037, + "reward": 1.4803059101104736, + "reward_std": 0.05648474767804146, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48030588030815125, + "rewards/pad": 0.0, + "step": 1292 + }, + { + "completion_length": 189.1875, + "epoch": 0.41204588910133844, + "grad_norm": 10.093996047973633, + "kl": 0.095703125, + "learning_rate": 5.879541108986615e-07, + "loss": 0.0038, + "reward": 1.7790437936782837, + "reward_std": 0.15045028924942017, + "rewards/pad": 0.34375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43529391288757324, + "step": 1293 + }, + { + "completion_length": 289.765625, + "epoch": 0.41236456341618866, + "grad_norm": 50.02298355102539, + "kl": 0.10107421875, + "learning_rate": 5.876354365838113e-07, + "loss": 0.004, + "reward": 1.5010088682174683, + "reward_std": 0.13455824553966522, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.516633927822113, + "step": 1294 + }, + { + "completion_length": 246.734375, + "epoch": 0.4126832377310389, + "grad_norm": 36.4998779296875, + "kl": 0.08056640625, + "learning_rate": 5.873167622689611e-07, + "loss": 0.0032, + "reward": 1.3580009937286377, + "reward_std": 0.05124887079000473, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35800105333328247, + "rewards/pad": 0.0, + "step": 1295 + }, + { + "completion_length": 324.390625, + "epoch": 0.4130019120458891, + "grad_norm": 9.309545516967773, + "kl": 0.060791015625, + "learning_rate": 5.869980879541108e-07, + "loss": 0.0024, + "reward": 1.502913475036621, + "reward_std": 0.06350995600223541, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37791338562965393, + "rewards/pad": 0.125, + "step": 1296 + }, + { + "completion_length": 166.875, + "epoch": 0.4133205863607393, + "grad_norm": 10.437187194824219, + "kl": 0.09521484375, + "learning_rate": 5.866794136392606e-07, + "loss": 0.0038, + "reward": 1.823692798614502, + "reward_std": 0.11942209303379059, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49556779861450195, + "rewards/pad": 0.328125, + "step": 1297 + }, + { + "completion_length": 334.578125, + "epoch": 0.41363926067558954, + "grad_norm": 15.77004337310791, + "kl": 0.08251953125, + "learning_rate": 5.863607393244104e-07, + "loss": 0.0033, + "reward": 1.4892622232437134, + "reward_std": 0.08658753335475922, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4892622232437134, + "rewards/pad": 0.0, + "step": 1298 + }, + { + "completion_length": 276.96875, + "epoch": 0.41395793499043976, + "grad_norm": 10.103075981140137, + "kl": 0.09130859375, + "learning_rate": 5.860420650095602e-07, + "loss": 0.0036, + "reward": 1.6745097637176514, + "reward_std": 0.06824688613414764, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5495096445083618, + "step": 1299 + }, + { + "completion_length": 222.203125, + "epoch": 0.41427660930529, + "grad_norm": 13.353841781616211, + "kl": 0.103515625, + "learning_rate": 5.8572339069471e-07, + "loss": 0.0041, + "reward": 1.5671412944793701, + "reward_std": 0.21640679240226746, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.44214126467704773, + "step": 1300 + }, + { + "completion_length": 206.265625, + "epoch": 0.4145952836201402, + "grad_norm": 17.48212242126465, + "kl": 0.11962890625, + "learning_rate": 5.854047163798598e-07, + "loss": 0.0048, + "reward": 1.500386357307434, + "reward_std": 0.14595331251621246, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37538638710975647, + "rewards/pad": 0.125, + "step": 1301 + }, + { + "completion_length": 234.625, + "epoch": 0.4149139579349904, + "grad_norm": 11.124568939208984, + "kl": 0.11572265625, + "learning_rate": 5.850860420650096e-07, + "loss": 0.0046, + "reward": 1.4751906394958496, + "reward_std": 0.11550100892782211, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4751906394958496, + "rewards/pad": 0.0, + "step": 1302 + }, + { + "completion_length": 261.25, + "epoch": 0.41523263224984064, + "grad_norm": 7.58814811706543, + "kl": 0.08642578125, + "learning_rate": 5.847673677501594e-07, + "loss": 0.0035, + "reward": 1.567017912864685, + "reward_std": 0.15186744928359985, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45764294266700745, + "rewards/pad": 0.125, + "step": 1303 + }, + { + "completion_length": 390.40625, + "epoch": 0.41555130656469086, + "grad_norm": 16.442684173583984, + "kl": 0.06640625, + "learning_rate": 5.844486934353091e-07, + "loss": 0.0027, + "reward": 1.398057460784912, + "reward_std": 0.060513533651828766, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3980574607849121, + "step": 1304 + }, + { + "completion_length": 286.890625, + "epoch": 0.41586998087954113, + "grad_norm": 8.115775108337402, + "kl": 0.08935546875, + "learning_rate": 5.841300191204589e-07, + "loss": 0.0036, + "reward": 1.5188145637512207, + "reward_std": 0.05607176199555397, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5188145637512207, + "step": 1305 + }, + { + "completion_length": 213.28125, + "epoch": 0.41618865519439135, + "grad_norm": 10.0141019821167, + "kl": 0.1025390625, + "learning_rate": 5.838113448056087e-07, + "loss": 0.0041, + "reward": 1.6528501510620117, + "reward_std": 0.11340200901031494, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6528501510620117, + "step": 1306 + }, + { + "completion_length": 430.546875, + "epoch": 0.4165073295092416, + "grad_norm": 7.513751983642578, + "kl": 0.051513671875, + "learning_rate": 5.834926704907585e-07, + "loss": 0.0021, + "reward": 1.5538432598114014, + "reward_std": 0.04181980714201927, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5538431406021118, + "step": 1307 + }, + { + "completion_length": 354.734375, + "epoch": 0.4168260038240918, + "grad_norm": 7.730433940887451, + "kl": 0.06005859375, + "learning_rate": 5.831739961759082e-07, + "loss": 0.0024, + "reward": 1.5449497699737549, + "reward_std": 0.10346663743257523, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4668247699737549, + "rewards/pad": 0.078125, + "step": 1308 + }, + { + "completion_length": 284.78125, + "epoch": 0.417144678138942, + "grad_norm": 11.019664764404297, + "kl": 0.0908203125, + "learning_rate": 5.82855321861058e-07, + "loss": 0.0036, + "reward": 1.6366920471191406, + "reward_std": 0.22725805640220642, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4804421067237854, + "step": 1309 + }, + { + "completion_length": 238.125, + "epoch": 0.41746335245379224, + "grad_norm": 28.53036880493164, + "kl": 0.09912109375, + "learning_rate": 5.825366475462078e-07, + "loss": 0.004, + "reward": 1.4334297180175781, + "reward_std": 0.09850458055734634, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40217968821525574, + "step": 1310 + }, + { + "completion_length": 390.1875, + "epoch": 0.41778202676864246, + "grad_norm": 5.816764831542969, + "kl": 0.0634765625, + "learning_rate": 5.822179732313576e-07, + "loss": 0.0025, + "reward": 1.5051462650299072, + "reward_std": 0.16161230206489563, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48952123522758484, + "step": 1311 + }, + { + "completion_length": 234.140625, + "epoch": 0.4181007010834927, + "grad_norm": 10.69921875, + "kl": 0.09814453125, + "learning_rate": 5.818992989165073e-07, + "loss": 0.0039, + "reward": 1.806492805480957, + "reward_std": 0.11196824908256531, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6814928650856018, + "rewards/pad": 0.125, + "step": 1312 + }, + { + "completion_length": 172.796875, + "epoch": 0.4184193753983429, + "grad_norm": 19.093624114990234, + "kl": 0.1064453125, + "learning_rate": 5.815806246016571e-07, + "loss": 0.0043, + "reward": 1.6470550298690796, + "reward_std": 0.13487771153450012, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.428305059671402, + "rewards/pad": 0.21875, + "step": 1313 + }, + { + "completion_length": 474.78125, + "epoch": 0.4187380497131931, + "grad_norm": 10.676046371459961, + "kl": 0.045654296875, + "learning_rate": 5.812619502868069e-07, + "loss": 0.0018, + "reward": 1.5626897811889648, + "reward_std": 0.12818527221679688, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4533146917819977, + "step": 1314 + }, + { + "completion_length": 353.265625, + "epoch": 0.41905672402804334, + "grad_norm": 12.62907886505127, + "kl": 0.06494140625, + "learning_rate": 5.809432759719566e-07, + "loss": 0.0026, + "reward": 1.5759479999542236, + "reward_std": 0.11017276346683502, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45094799995422363, + "step": 1315 + }, + { + "completion_length": 274.0625, + "epoch": 0.41937539834289356, + "grad_norm": 19.575292587280273, + "kl": 0.08154296875, + "learning_rate": 5.806246016571063e-07, + "loss": 0.0033, + "reward": 1.4159862995147705, + "reward_std": 0.0481281504034996, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4159863591194153, + "rewards/pad": 0.0, + "step": 1316 + }, + { + "completion_length": 287.0625, + "epoch": 0.4196940726577438, + "grad_norm": 9.858039855957031, + "kl": 0.09423828125, + "learning_rate": 5.803059273422561e-07, + "loss": 0.0038, + "reward": 1.6656700372695923, + "reward_std": 0.142438605427742, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5094200372695923, + "rewards/pad": 0.171875, + "step": 1317 + }, + { + "completion_length": 270.609375, + "epoch": 0.420012746972594, + "grad_norm": 7.314291954040527, + "kl": 0.09423828125, + "learning_rate": 5.799872530274059e-07, + "loss": 0.0038, + "reward": 1.5594706535339355, + "reward_std": 0.07420562207698822, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5594705939292908, + "rewards/pad": 0.0, + "step": 1318 + }, + { + "completion_length": 284.703125, + "epoch": 0.4203314212874442, + "grad_norm": 15.29833698272705, + "kl": 0.0908203125, + "learning_rate": 5.796685787125558e-07, + "loss": 0.0036, + "reward": 1.506531000137329, + "reward_std": 0.04399479925632477, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3815310001373291, + "rewards/pad": 0.125, + "step": 1319 + }, + { + "completion_length": 324.671875, + "epoch": 0.42065009560229444, + "grad_norm": 13.741570472717285, + "kl": 0.064453125, + "learning_rate": 5.793499043977055e-07, + "loss": 0.0026, + "reward": 1.5349571704864502, + "reward_std": 0.18845012784004211, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4724571704864502, + "step": 1320 + }, + { + "completion_length": 165.75, + "epoch": 0.42096876991714466, + "grad_norm": 13.279805183410645, + "kl": 0.1064453125, + "learning_rate": 5.790312300828553e-07, + "loss": 0.0043, + "reward": 1.528536319732666, + "reward_std": 0.11498822271823883, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5285362601280212, + "rewards/pad": 0.0, + "step": 1321 + }, + { + "completion_length": 169.109375, + "epoch": 0.4212874442319949, + "grad_norm": 16.651105880737305, + "kl": 0.10595703125, + "learning_rate": 5.787125557680051e-07, + "loss": 0.0042, + "reward": 1.8210078477859497, + "reward_std": 0.040907394140958786, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5710077881813049, + "step": 1322 + }, + { + "completion_length": 259.84375, + "epoch": 0.4216061185468451, + "grad_norm": 13.567558288574219, + "kl": 0.09375, + "learning_rate": 5.783938814531548e-07, + "loss": 0.0038, + "reward": 1.519647240638733, + "reward_std": 0.151037335395813, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3165222406387329, + "rewards/pad": 0.21875, + "step": 1323 + }, + { + "completion_length": 370.0625, + "epoch": 0.4219247928616954, + "grad_norm": 8.145339965820312, + "kl": 0.06982421875, + "learning_rate": 5.780752071383046e-07, + "loss": 0.0028, + "reward": 1.4528758525848389, + "reward_std": 0.07324043661355972, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4528758227825165, + "rewards/pad": 0.0, + "step": 1324 + }, + { + "completion_length": 208.84375, + "epoch": 0.4222434671765456, + "grad_norm": 33.94546127319336, + "kl": 0.1142578125, + "learning_rate": 5.777565328234544e-07, + "loss": 0.0046, + "reward": 1.355173110961914, + "reward_std": 0.06881211698055267, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35517311096191406, + "rewards/pad": 0.0, + "step": 1325 + }, + { + "completion_length": 168.203125, + "epoch": 0.4225621414913958, + "grad_norm": 9.615571022033691, + "kl": 0.1142578125, + "learning_rate": 5.774378585086042e-07, + "loss": 0.0046, + "reward": 1.8198182582855225, + "reward_std": 0.23303106427192688, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.554193377494812, + "rewards/pad": 0.265625, + "step": 1326 + }, + { + "completion_length": 172.359375, + "epoch": 0.42288081580624604, + "grad_norm": 7.548245429992676, + "kl": 0.1279296875, + "learning_rate": 5.771191841937539e-07, + "loss": 0.0051, + "reward": 1.42877197265625, + "reward_std": 0.07921026647090912, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42877188324928284, + "rewards/pad": 0.0, + "step": 1327 + }, + { + "completion_length": 306.546875, + "epoch": 0.42319949012109626, + "grad_norm": 16.646623611450195, + "kl": 0.07470703125, + "learning_rate": 5.768005098789037e-07, + "loss": 0.003, + "reward": 1.5173559188842773, + "reward_std": 0.07084318995475769, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5173557996749878, + "rewards/pad": 0.0, + "step": 1328 + }, + { + "completion_length": 393.875, + "epoch": 0.4235181644359465, + "grad_norm": 5.775397300720215, + "kl": 0.07666015625, + "learning_rate": 5.764818355640535e-07, + "loss": 0.0031, + "reward": 1.3757777214050293, + "reward_std": 0.08598428219556808, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3914026618003845, + "step": 1329 + }, + { + "completion_length": 197.78125, + "epoch": 0.4238368387507967, + "grad_norm": 40.24833297729492, + "kl": 0.09521484375, + "learning_rate": 5.761631612492033e-07, + "loss": 0.0038, + "reward": 1.6771042346954346, + "reward_std": 0.12677818536758423, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5677293539047241, + "rewards/pad": 0.125, + "step": 1330 + }, + { + "completion_length": 340.328125, + "epoch": 0.4241555130656469, + "grad_norm": 8.642611503601074, + "kl": 0.0849609375, + "learning_rate": 5.75844486934353e-07, + "loss": 0.0034, + "reward": 1.409783124923706, + "reward_std": 0.14267167448997498, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42540812492370605, + "rewards/pad": 0.0, + "step": 1331 + }, + { + "completion_length": 254.890625, + "epoch": 0.42447418738049714, + "grad_norm": 11.313599586486816, + "kl": 0.08056640625, + "learning_rate": 5.755258126195028e-07, + "loss": 0.0032, + "reward": 1.4439899921417236, + "reward_std": 0.055654481053352356, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31898996233940125, + "rewards/pad": 0.125, + "step": 1332 + }, + { + "completion_length": 206.21875, + "epoch": 0.42479286169534736, + "grad_norm": 37.72480010986328, + "kl": 0.12451171875, + "learning_rate": 5.752071383046526e-07, + "loss": 0.005, + "reward": 1.554432988166809, + "reward_std": 0.06340833753347397, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5544329881668091, + "rewards/pad": 0.0, + "step": 1333 + }, + { + "completion_length": 276.078125, + "epoch": 0.4251115360101976, + "grad_norm": 10.532992362976074, + "kl": 0.0859375, + "learning_rate": 5.748884639898024e-07, + "loss": 0.0034, + "reward": 1.5329124927520752, + "reward_std": 0.07358253002166748, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5329123735427856, + "step": 1334 + }, + { + "completion_length": 315.671875, + "epoch": 0.4254302103250478, + "grad_norm": 4.332948207855225, + "kl": 0.07958984375, + "learning_rate": 5.745697896749521e-07, + "loss": 0.0032, + "reward": 1.4253344535827637, + "reward_std": 0.10386821627616882, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.20658442378044128, + "step": 1335 + }, + { + "completion_length": 212.71875, + "epoch": 0.425748884639898, + "grad_norm": 15.159989356994629, + "kl": 0.1123046875, + "learning_rate": 5.742511153601019e-07, + "loss": 0.0045, + "reward": 1.6377997398376465, + "reward_std": 0.14908741414546967, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6534247398376465, + "rewards/pad": 0.0, + "step": 1336 + }, + { + "completion_length": 248.8125, + "epoch": 0.42606755895474824, + "grad_norm": 11.552496910095215, + "kl": 0.09716796875, + "learning_rate": 5.739324410452517e-07, + "loss": 0.0039, + "reward": 1.4865126609802246, + "reward_std": 0.15525057911872864, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.5021377801895142, + "step": 1337 + }, + { + "completion_length": 216.625, + "epoch": 0.42638623326959846, + "grad_norm": 10.185986518859863, + "kl": 0.103515625, + "learning_rate": 5.736137667304016e-07, + "loss": 0.0041, + "reward": 1.363529086112976, + "reward_std": 0.09389464557170868, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37915414571762085, + "rewards/pad": 0.0, + "step": 1338 + }, + { + "completion_length": 301.71875, + "epoch": 0.4267049075844487, + "grad_norm": 4.8207783699035645, + "kl": 0.0625, + "learning_rate": 5.732950924155513e-07, + "loss": 0.0025, + "reward": 1.6868584156036377, + "reward_std": 0.09036528319120407, + "rewards/pad": 0.34375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34310850501060486, + "step": 1339 + }, + { + "completion_length": 109.8125, + "epoch": 0.4270235818992989, + "grad_norm": 11.68543815612793, + "kl": 0.12109375, + "learning_rate": 5.729764181007011e-07, + "loss": 0.0049, + "reward": 1.8774080276489258, + "reward_std": 0.15159153938293457, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.611782968044281, + "rewards/pad": 0.265625, + "step": 1340 + }, + { + "completion_length": 201.4375, + "epoch": 0.4273422562141491, + "grad_norm": 12.874130249023438, + "kl": 0.0986328125, + "learning_rate": 5.726577437858509e-07, + "loss": 0.0039, + "reward": 1.6547398567199707, + "reward_std": 0.12241728603839874, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5766147375106812, + "rewards/pad": 0.078125, + "step": 1341 + }, + { + "completion_length": 214.984375, + "epoch": 0.42766093052899934, + "grad_norm": 9.551128387451172, + "kl": 0.0830078125, + "learning_rate": 5.723390694710007e-07, + "loss": 0.0033, + "reward": 1.462345004081726, + "reward_std": 0.12054663896560669, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4779700040817261, + "step": 1342 + }, + { + "completion_length": 192.515625, + "epoch": 0.42797960484384956, + "grad_norm": 39.426368713378906, + "kl": 0.09375, + "learning_rate": 5.720203951561504e-07, + "loss": 0.0037, + "reward": 1.5890249013900757, + "reward_std": 0.06130727380514145, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4640248715877533, + "step": 1343 + }, + { + "completion_length": 216.390625, + "epoch": 0.42829827915869984, + "grad_norm": 6.589493274688721, + "kl": 0.083984375, + "learning_rate": 5.717017208413002e-07, + "loss": 0.0034, + "reward": 1.6650612354278564, + "reward_std": 0.07460354268550873, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41506117582321167, + "step": 1344 + }, + { + "completion_length": 245.09375, + "epoch": 0.42861695347355006, + "grad_norm": 11.974091529846191, + "kl": 0.11376953125, + "learning_rate": 5.7138304652645e-07, + "loss": 0.0045, + "reward": 1.5561902523040771, + "reward_std": 0.15294793248176575, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5718152523040771, + "step": 1345 + }, + { + "completion_length": 184.59375, + "epoch": 0.4289356277884003, + "grad_norm": 26.964982986450195, + "kl": 0.4609375, + "learning_rate": 5.710643722115998e-07, + "loss": 0.0185, + "reward": 1.5439296960830688, + "reward_std": 0.1391330063343048, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5595547556877136, + "rewards/pad": 0.0, + "step": 1346 + }, + { + "completion_length": 257.4375, + "epoch": 0.4292543021032505, + "grad_norm": 14.968935012817383, + "kl": 0.1484375, + "learning_rate": 5.707456978967495e-07, + "loss": 0.006, + "reward": 1.455926537513733, + "reward_std": 0.14180481433868408, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4715515375137329, + "step": 1347 + }, + { + "completion_length": 301.484375, + "epoch": 0.4295729764181007, + "grad_norm": 31.82322120666504, + "kl": 0.0703125, + "learning_rate": 5.704270235818993e-07, + "loss": 0.0028, + "reward": 1.5665318965911865, + "reward_std": 0.029603153467178345, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4415319263935089, + "step": 1348 + }, + { + "completion_length": 337.578125, + "epoch": 0.42989165073295094, + "grad_norm": 4.555129528045654, + "kl": 0.06640625, + "learning_rate": 5.701083492670491e-07, + "loss": 0.0027, + "reward": 1.360124945640564, + "reward_std": 0.041842639446258545, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36012494564056396, + "step": 1349 + }, + { + "completion_length": 247.921875, + "epoch": 0.43021032504780116, + "grad_norm": 24.405990600585938, + "kl": 0.08447265625, + "learning_rate": 5.697896749521989e-07, + "loss": 0.0034, + "reward": 1.51039719581604, + "reward_std": 0.11081908643245697, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4010222554206848, + "step": 1350 + }, + { + "completion_length": 310.265625, + "epoch": 0.4305289993626514, + "grad_norm": 11.049640655517578, + "kl": 0.08203125, + "learning_rate": 5.694710006373486e-07, + "loss": 0.0033, + "reward": 1.4907029867172241, + "reward_std": 0.04949101805686951, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4907029867172241, + "rewards/pad": 0.0, + "step": 1351 + }, + { + "completion_length": 262.125, + "epoch": 0.4308476736775016, + "grad_norm": 4.974499702453613, + "kl": 0.0947265625, + "learning_rate": 5.691523263224984e-07, + "loss": 0.0038, + "reward": 1.4859580993652344, + "reward_std": 0.07267333567142487, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3609580993652344, + "rewards/pad": 0.125, + "step": 1352 + }, + { + "completion_length": 168.984375, + "epoch": 0.4311663479923518, + "grad_norm": 8.621573448181152, + "kl": 0.1484375, + "learning_rate": 5.688336520076482e-07, + "loss": 0.006, + "reward": 1.674235224723816, + "reward_std": 0.09904008358716965, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6742351651191711, + "rewards/pad": 0.0, + "step": 1353 + }, + { + "completion_length": 196.71875, + "epoch": 0.43148502230720204, + "grad_norm": 10.181594848632812, + "kl": 0.11669921875, + "learning_rate": 5.685149776927978e-07, + "loss": 0.0047, + "reward": 1.6469101905822754, + "reward_std": 0.12676161527633667, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5219101905822754, + "rewards/pad": 0.125, + "step": 1354 + }, + { + "completion_length": 322.140625, + "epoch": 0.43180369662205226, + "grad_norm": 4.490674018859863, + "kl": 0.068359375, + "learning_rate": 5.681963033779476e-07, + "loss": 0.0027, + "reward": 1.5910611152648926, + "reward_std": 0.04071514308452606, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4660611152648926, + "step": 1355 + }, + { + "completion_length": 188.0, + "epoch": 0.4321223709369025, + "grad_norm": 11.657907485961914, + "kl": 0.10888671875, + "learning_rate": 5.678776290630974e-07, + "loss": 0.0043, + "reward": 1.5474567413330078, + "reward_std": 0.06650790572166443, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.547456681728363, + "rewards/pad": 0.0, + "step": 1356 + }, + { + "completion_length": 240.078125, + "epoch": 0.4324410452517527, + "grad_norm": 7.6790618896484375, + "kl": 0.07958984375, + "learning_rate": 5.675589547482473e-07, + "loss": 0.0032, + "reward": 1.6534273624420166, + "reward_std": 0.14597870409488678, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4503023326396942, + "rewards/pad": 0.203125, + "step": 1357 + }, + { + "completion_length": 191.546875, + "epoch": 0.4327597195666029, + "grad_norm": 9.91802978515625, + "kl": 0.1494140625, + "learning_rate": 5.67240280433397e-07, + "loss": 0.006, + "reward": 1.6715829372406006, + "reward_std": 0.22884416580200195, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.499707967042923, + "step": 1358 + }, + { + "completion_length": 284.390625, + "epoch": 0.43307839388145314, + "grad_norm": 9.172077178955078, + "kl": 0.08837890625, + "learning_rate": 5.669216061185468e-07, + "loss": 0.0035, + "reward": 1.5910166501998901, + "reward_std": 0.1787361055612564, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.49726665019989014, + "step": 1359 + }, + { + "completion_length": 363.84375, + "epoch": 0.43339706819630336, + "grad_norm": 6.0271148681640625, + "kl": 0.0693359375, + "learning_rate": 5.666029318036966e-07, + "loss": 0.0028, + "reward": 1.524344563484192, + "reward_std": 0.07330280542373657, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41496962308883667, + "step": 1360 + }, + { + "completion_length": 199.5625, + "epoch": 0.4337157425111536, + "grad_norm": 15.283858299255371, + "kl": 0.1376953125, + "learning_rate": 5.662842574888464e-07, + "loss": 0.0055, + "reward": 1.664130687713623, + "reward_std": 0.10358402132987976, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.539130687713623, + "rewards/pad": 0.125, + "step": 1361 + }, + { + "completion_length": 288.171875, + "epoch": 0.4340344168260038, + "grad_norm": 7.310180187225342, + "kl": 0.0986328125, + "learning_rate": 5.659655831739961e-07, + "loss": 0.0039, + "reward": 1.2990243434906006, + "reward_std": 0.07281455397605896, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3146492838859558, + "rewards/pad": 0.0, + "step": 1362 + }, + { + "completion_length": 230.265625, + "epoch": 0.434353091140854, + "grad_norm": 11.217733383178711, + "kl": 0.08056640625, + "learning_rate": 5.656469088591459e-07, + "loss": 0.0032, + "reward": 1.649712324142456, + "reward_std": 0.075341135263443, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.524712324142456, + "rewards/pad": 0.125, + "step": 1363 + }, + { + "completion_length": 202.203125, + "epoch": 0.4346717654557043, + "grad_norm": 49.92980194091797, + "kl": 0.447265625, + "learning_rate": 5.653282345442957e-07, + "loss": 0.0179, + "reward": 1.4197627305984497, + "reward_std": 0.08784756064414978, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4197627604007721, + "rewards/pad": 0.0, + "step": 1364 + }, + { + "completion_length": 146.1875, + "epoch": 0.4349904397705545, + "grad_norm": 7.490056991577148, + "kl": 0.126953125, + "learning_rate": 5.650095602294455e-07, + "loss": 0.0051, + "reward": 1.607069492340088, + "reward_std": 0.08210156857967377, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4820695221424103, + "rewards/pad": 0.125, + "step": 1365 + }, + { + "completion_length": 245.921875, + "epoch": 0.43530911408540474, + "grad_norm": 16.46823501586914, + "kl": 0.09423828125, + "learning_rate": 5.646908859145952e-07, + "loss": 0.0038, + "reward": 1.568756103515625, + "reward_std": 0.08668509870767593, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5687560439109802, + "rewards/pad": 0.0, + "step": 1366 + }, + { + "completion_length": 236.859375, + "epoch": 0.43562778840025496, + "grad_norm": 4.543325901031494, + "kl": 0.10546875, + "learning_rate": 5.64372211599745e-07, + "loss": 0.0042, + "reward": 1.3611669540405273, + "reward_std": 0.022801637649536133, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3611670136451721, + "rewards/pad": 0.0, + "step": 1367 + }, + { + "completion_length": 253.046875, + "epoch": 0.4359464627151052, + "grad_norm": 37.60184860229492, + "kl": 0.09619140625, + "learning_rate": 5.640535372848948e-07, + "loss": 0.0038, + "reward": 1.5685096979141235, + "reward_std": 0.044750314205884933, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5685096383094788, + "rewards/pad": 0.0, + "step": 1368 + }, + { + "completion_length": 321.796875, + "epoch": 0.4362651370299554, + "grad_norm": 10.251585960388184, + "kl": 0.07373046875, + "learning_rate": 5.637348629700446e-07, + "loss": 0.003, + "reward": 1.5848073959350586, + "reward_std": 0.07786726951599121, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5848073959350586, + "step": 1369 + }, + { + "completion_length": 258.515625, + "epoch": 0.4365838113448056, + "grad_norm": 13.010295867919922, + "kl": 0.11669921875, + "learning_rate": 5.634161886551943e-07, + "loss": 0.0047, + "reward": 1.402256727218628, + "reward_std": 0.06264589726924896, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4022567868232727, + "rewards/pad": 0.0, + "step": 1370 + }, + { + "completion_length": 205.53125, + "epoch": 0.43690248565965584, + "grad_norm": 13.601364135742188, + "kl": 0.09130859375, + "learning_rate": 5.630975143403441e-07, + "loss": 0.0037, + "reward": 1.7684476375579834, + "reward_std": 0.07154097408056259, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5184476971626282, + "step": 1371 + }, + { + "completion_length": 368.734375, + "epoch": 0.43722115997450606, + "grad_norm": 7.428386211395264, + "kl": 0.06982421875, + "learning_rate": 5.627788400254939e-07, + "loss": 0.0028, + "reward": 1.5593843460083008, + "reward_std": 0.140476793050766, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.450009286403656, + "step": 1372 + }, + { + "completion_length": 107.90625, + "epoch": 0.4375398342893563, + "grad_norm": 22.303043365478516, + "kl": 0.12255859375, + "learning_rate": 5.624601657106437e-07, + "loss": 0.0049, + "reward": 1.643021821975708, + "reward_std": 0.2257276475429535, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5648967623710632, + "rewards/pad": 0.078125, + "step": 1373 + }, + { + "completion_length": 149.328125, + "epoch": 0.4378585086042065, + "grad_norm": 16.431352615356445, + "kl": 0.1083984375, + "learning_rate": 5.621414913957934e-07, + "loss": 0.0043, + "reward": 1.3097245693206787, + "reward_std": 0.10590263456106186, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3097245693206787, + "rewards/pad": 0.0, + "step": 1374 + }, + { + "completion_length": 298.609375, + "epoch": 0.4381771829190567, + "grad_norm": 17.105371475219727, + "kl": 0.07470703125, + "learning_rate": 5.618228170809432e-07, + "loss": 0.003, + "reward": 1.5687129497528076, + "reward_std": 0.047115493565797806, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5687129497528076, + "step": 1375 + }, + { + "completion_length": 233.234375, + "epoch": 0.43849585723390694, + "grad_norm": 8.532923698425293, + "kl": 0.11865234375, + "learning_rate": 5.615041427660931e-07, + "loss": 0.0048, + "reward": 1.371286392211914, + "reward_std": 0.15830770134925842, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3244113028049469, + "step": 1376 + }, + { + "completion_length": 247.5625, + "epoch": 0.43881453154875716, + "grad_norm": 10.543306350708008, + "kl": 0.08154296875, + "learning_rate": 5.611854684512429e-07, + "loss": 0.0033, + "reward": 1.3614822626113892, + "reward_std": 0.14806589484214783, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.33023232221603394, + "step": 1377 + }, + { + "completion_length": 110.71875, + "epoch": 0.4391332058636074, + "grad_norm": 27.713855743408203, + "kl": 0.125, + "learning_rate": 5.608667941363926e-07, + "loss": 0.005, + "reward": 1.7164729833602905, + "reward_std": 0.17640821635723114, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4352230131626129, + "rewards/pad": 0.28125, + "step": 1378 + }, + { + "completion_length": 403.421875, + "epoch": 0.4394518801784576, + "grad_norm": 4.868521213531494, + "kl": 0.04248046875, + "learning_rate": 5.605481198215424e-07, + "loss": 0.0017, + "reward": 1.3028991222381592, + "reward_std": 0.027830326929688454, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3028990626335144, + "step": 1379 + }, + { + "completion_length": 206.03125, + "epoch": 0.4397705544933078, + "grad_norm": 12.161081314086914, + "kl": 0.08349609375, + "learning_rate": 5.602294455066922e-07, + "loss": 0.0033, + "reward": 1.6735880374908447, + "reward_std": 0.08616338670253754, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5485880374908447, + "rewards/pad": 0.125, + "step": 1380 + }, + { + "completion_length": 195.390625, + "epoch": 0.44008922880815804, + "grad_norm": 17.812496185302734, + "kl": 0.234375, + "learning_rate": 5.59910771191842e-07, + "loss": 0.0094, + "reward": 1.4883697032928467, + "reward_std": 0.08894871920347214, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34774452447891235, + "step": 1381 + }, + { + "completion_length": 316.25, + "epoch": 0.44040790312300826, + "grad_norm": 9.531152725219727, + "kl": 0.078125, + "learning_rate": 5.595920968769917e-07, + "loss": 0.0031, + "reward": 1.4265236854553223, + "reward_std": 0.04695095121860504, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42652374505996704, + "rewards/pad": 0.0, + "step": 1382 + }, + { + "completion_length": 320.0, + "epoch": 0.44072657743785854, + "grad_norm": 7.157126426696777, + "kl": 0.0751953125, + "learning_rate": 5.592734225621415e-07, + "loss": 0.003, + "reward": 1.3335322141647339, + "reward_std": 0.050246525555849075, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.20853224396705627, + "step": 1383 + }, + { + "completion_length": 213.734375, + "epoch": 0.44104525175270876, + "grad_norm": 11.768050193786621, + "kl": 0.1123046875, + "learning_rate": 5.589547482472913e-07, + "loss": 0.0045, + "reward": 1.51654851436615, + "reward_std": 0.10998847335577011, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4540485143661499, + "step": 1384 + }, + { + "completion_length": 353.890625, + "epoch": 0.441363926067559, + "grad_norm": 7.449849605560303, + "kl": 0.08251953125, + "learning_rate": 5.586360739324411e-07, + "loss": 0.0033, + "reward": 1.274510145187378, + "reward_std": 0.01488026138395071, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2745102047920227, + "step": 1385 + }, + { + "completion_length": 165.375, + "epoch": 0.4416826003824092, + "grad_norm": 23.854345321655273, + "kl": 0.12109375, + "learning_rate": 5.583173996175908e-07, + "loss": 0.0048, + "reward": 1.5486022233963013, + "reward_std": 0.2040618658065796, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5642272233963013, + "step": 1386 + }, + { + "completion_length": 160.5625, + "epoch": 0.4420012746972594, + "grad_norm": 82.924560546875, + "kl": 0.1025390625, + "learning_rate": 5.579987253027406e-07, + "loss": 0.0041, + "reward": 1.7493038177490234, + "reward_std": 0.0807715579867363, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6243036985397339, + "rewards/pad": 0.125, + "step": 1387 + }, + { + "completion_length": 348.265625, + "epoch": 0.44231994901210964, + "grad_norm": 14.425209045410156, + "kl": 0.068359375, + "learning_rate": 5.576800509878904e-07, + "loss": 0.0027, + "reward": 1.4635679721832275, + "reward_std": 0.0678384006023407, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46356791257858276, + "rewards/pad": 0.0, + "step": 1388 + }, + { + "completion_length": 161.71875, + "epoch": 0.44263862332695986, + "grad_norm": 11.94912338256836, + "kl": 0.1083984375, + "learning_rate": 5.573613766730401e-07, + "loss": 0.0043, + "reward": 1.7208974361419678, + "reward_std": 0.09452605247497559, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47089749574661255, + "rewards/pad": 0.25, + "step": 1389 + }, + { + "completion_length": 259.5625, + "epoch": 0.4429572976418101, + "grad_norm": 18.560396194458008, + "kl": 0.07763671875, + "learning_rate": 5.570427023581899e-07, + "loss": 0.0031, + "reward": 1.8574298620224, + "reward_std": 0.1501571536064148, + "rewards/answer_reward": 0.328125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5293048024177551, + "step": 1390 + }, + { + "completion_length": 126.78125, + "epoch": 0.4432759719566603, + "grad_norm": 11.051600456237793, + "kl": 0.140625, + "learning_rate": 5.567240280433397e-07, + "loss": 0.0056, + "reward": 1.5313197374343872, + "reward_std": 0.21828460693359375, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35944464802742004, + "rewards/pad": 0.171875, + "step": 1391 + }, + { + "completion_length": 205.59375, + "epoch": 0.4435946462715105, + "grad_norm": 14.702584266662598, + "kl": 0.1005859375, + "learning_rate": 5.564053537284895e-07, + "loss": 0.004, + "reward": 1.6007747650146484, + "reward_std": 0.1382533311843872, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5851497054100037, + "rewards/pad": 0.015625, + "step": 1392 + }, + { + "completion_length": 213.046875, + "epoch": 0.44391332058636074, + "grad_norm": 11.537291526794434, + "kl": 0.10546875, + "learning_rate": 5.560866794136391e-07, + "loss": 0.0042, + "reward": 1.8047747611999512, + "reward_std": 0.15652026236057281, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6016498804092407, + "step": 1393 + }, + { + "completion_length": 294.125, + "epoch": 0.44423199490121096, + "grad_norm": 14.856083869934082, + "kl": 0.0810546875, + "learning_rate": 5.557680050987889e-07, + "loss": 0.0032, + "reward": 1.5364224910736084, + "reward_std": 0.04746140539646149, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5364224314689636, + "step": 1394 + }, + { + "completion_length": 312.5, + "epoch": 0.4445506692160612, + "grad_norm": 10.558499336242676, + "kl": 0.099609375, + "learning_rate": 5.554493307839388e-07, + "loss": 0.004, + "reward": 1.3890564441680908, + "reward_std": 0.10178478062152863, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4046815037727356, + "step": 1395 + }, + { + "completion_length": 276.171875, + "epoch": 0.4448693435309114, + "grad_norm": 8.246567726135254, + "kl": 0.1337890625, + "learning_rate": 5.551306564690886e-07, + "loss": 0.0053, + "reward": 1.415022611618042, + "reward_std": 0.11157543957233429, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4150225818157196, + "rewards/pad": 0.0, + "step": 1396 + }, + { + "completion_length": 226.796875, + "epoch": 0.4451880178457616, + "grad_norm": 13.543415069580078, + "kl": 0.107421875, + "learning_rate": 5.548119821542383e-07, + "loss": 0.0043, + "reward": 1.5432687997817993, + "reward_std": 0.10068956017494202, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5432687997817993, + "rewards/pad": 0.0, + "step": 1397 + }, + { + "completion_length": 218.734375, + "epoch": 0.44550669216061184, + "grad_norm": 7.59999942779541, + "kl": 0.107421875, + "learning_rate": 5.544933078393881e-07, + "loss": 0.0043, + "reward": 1.5217714309692383, + "reward_std": 0.07551935315132141, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5217715501785278, + "step": 1398 + }, + { + "completion_length": 210.984375, + "epoch": 0.44582536647546206, + "grad_norm": 7.47714376449585, + "kl": 0.12109375, + "learning_rate": 5.541746335245379e-07, + "loss": 0.0048, + "reward": 1.561887264251709, + "reward_std": 0.08502572774887085, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.452512264251709, + "rewards/pad": 0.109375, + "step": 1399 + }, + { + "completion_length": 181.28125, + "epoch": 0.4461440407903123, + "grad_norm": 18.773975372314453, + "kl": 0.11083984375, + "learning_rate": 5.538559592096877e-07, + "loss": 0.0044, + "reward": 1.6833388805389404, + "reward_std": 0.18605932593345642, + "rewards/answer_reward": 0.28125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4177139401435852, + "step": 1400 + }, + { + "completion_length": 204.671875, + "epoch": 0.4464627151051625, + "grad_norm": 7.299523830413818, + "kl": 0.10986328125, + "learning_rate": 5.535372848948374e-07, + "loss": 0.0044, + "reward": 1.4669969081878662, + "reward_std": 0.10377576947212219, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48262184858322144, + "step": 1401 + }, + { + "completion_length": 335.828125, + "epoch": 0.4467813894200127, + "grad_norm": 29.090721130371094, + "kl": 0.08349609375, + "learning_rate": 5.532186105799872e-07, + "loss": 0.0033, + "reward": 1.482242465019226, + "reward_std": 0.05453872308135033, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4822424054145813, + "rewards/pad": 0.0, + "step": 1402 + }, + { + "completion_length": 298.25, + "epoch": 0.447100063734863, + "grad_norm": 11.817734718322754, + "kl": 0.07568359375, + "learning_rate": 5.52899936265137e-07, + "loss": 0.003, + "reward": 1.6147228479385376, + "reward_std": 0.19429726898670197, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3334728479385376, + "rewards/pad": 0.28125, + "step": 1403 + }, + { + "completion_length": 179.03125, + "epoch": 0.4474187380497132, + "grad_norm": 17.679716110229492, + "kl": 0.10302734375, + "learning_rate": 5.525812619502868e-07, + "loss": 0.0041, + "reward": 1.5345510244369507, + "reward_std": 0.16566292941570282, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4408009946346283, + "step": 1404 + }, + { + "completion_length": 226.75, + "epoch": 0.44773741236456344, + "grad_norm": 27.14039421081543, + "kl": 0.10693359375, + "learning_rate": 5.522625876354365e-07, + "loss": 0.0043, + "reward": 1.5934033393859863, + "reward_std": 0.1432866007089615, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48402824997901917, + "step": 1405 + }, + { + "completion_length": 223.625, + "epoch": 0.44805608667941366, + "grad_norm": 5.414300441741943, + "kl": 0.08935546875, + "learning_rate": 5.519439133205863e-07, + "loss": 0.0036, + "reward": 1.5930595397949219, + "reward_std": 0.11963119357824326, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4836845099925995, + "step": 1406 + }, + { + "completion_length": 278.21875, + "epoch": 0.4483747609942639, + "grad_norm": 10.535198211669922, + "kl": 0.09814453125, + "learning_rate": 5.516252390057361e-07, + "loss": 0.0039, + "reward": 1.5685639381408691, + "reward_std": 0.09999606013298035, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5685639381408691, + "rewards/pad": 0.0, + "step": 1407 + }, + { + "completion_length": 319.0625, + "epoch": 0.4486934353091141, + "grad_norm": 7.539035797119141, + "kl": 0.059326171875, + "learning_rate": 5.513065646908859e-07, + "loss": 0.0024, + "reward": 1.7355012893676758, + "reward_std": 0.06021437793970108, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48550137877464294, + "step": 1408 + }, + { + "completion_length": 156.65625, + "epoch": 0.4490121096239643, + "grad_norm": 48.58030700683594, + "kl": 0.0986328125, + "learning_rate": 5.509878903760356e-07, + "loss": 0.0039, + "reward": 1.8095495700836182, + "reward_std": 0.12096580862998962, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5907995700836182, + "rewards/pad": 0.21875, + "step": 1409 + }, + { + "completion_length": 204.453125, + "epoch": 0.44933078393881454, + "grad_norm": 9.988531112670898, + "kl": 0.08544921875, + "learning_rate": 5.506692160611854e-07, + "loss": 0.0034, + "reward": 1.593253493309021, + "reward_std": 0.22704046964645386, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4838784635066986, + "rewards/pad": 0.109375, + "step": 1410 + }, + { + "completion_length": 273.0625, + "epoch": 0.44964945825366476, + "grad_norm": 7.341174125671387, + "kl": 0.0859375, + "learning_rate": 5.503505417463352e-07, + "loss": 0.0034, + "reward": 1.6155000925064087, + "reward_std": 0.08141268044710159, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4905000925064087, + "step": 1411 + }, + { + "completion_length": 343.21875, + "epoch": 0.449968132568515, + "grad_norm": 11.9456148147583, + "kl": 0.0625, + "learning_rate": 5.50031867431485e-07, + "loss": 0.0025, + "reward": 1.387474536895752, + "reward_std": 0.12038862705230713, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3093494772911072, + "step": 1412 + }, + { + "completion_length": 213.390625, + "epoch": 0.4502868068833652, + "grad_norm": 13.47103214263916, + "kl": 0.1044921875, + "learning_rate": 5.497131931166347e-07, + "loss": 0.0042, + "reward": 1.6264615058898926, + "reward_std": 0.14993353188037872, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5170865058898926, + "step": 1413 + }, + { + "completion_length": 219.453125, + "epoch": 0.4506054811982154, + "grad_norm": 9.811759948730469, + "kl": 0.080078125, + "learning_rate": 5.493945188017846e-07, + "loss": 0.0032, + "reward": 1.663762092590332, + "reward_std": 0.21020281314849854, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4293871521949768, + "step": 1414 + }, + { + "completion_length": 415.078125, + "epoch": 0.45092415551306564, + "grad_norm": 3.461106777191162, + "kl": 0.05224609375, + "learning_rate": 5.490758444869344e-07, + "loss": 0.0021, + "reward": 1.4688377380371094, + "reward_std": 0.0422951802611351, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34383776783943176, + "rewards/pad": 0.125, + "step": 1415 + }, + { + "completion_length": 335.234375, + "epoch": 0.45124282982791586, + "grad_norm": 6.391787052154541, + "kl": 0.06982421875, + "learning_rate": 5.487571701720841e-07, + "loss": 0.0028, + "reward": 1.430022120475769, + "reward_std": 0.22225743532180786, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.44564709067344666, + "step": 1416 + }, + { + "completion_length": 371.890625, + "epoch": 0.4515615041427661, + "grad_norm": 10.911314964294434, + "kl": 0.05908203125, + "learning_rate": 5.484384958572339e-07, + "loss": 0.0024, + "reward": 1.6407203674316406, + "reward_std": 0.05482962727546692, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5157203078269958, + "step": 1417 + }, + { + "completion_length": 291.5, + "epoch": 0.4518801784576163, + "grad_norm": 16.564706802368164, + "kl": 0.08642578125, + "learning_rate": 5.481198215423837e-07, + "loss": 0.0035, + "reward": 1.4472219944000244, + "reward_std": 0.06772038340568542, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4472220242023468, + "rewards/pad": 0.0, + "step": 1418 + }, + { + "completion_length": 304.78125, + "epoch": 0.4521988527724665, + "grad_norm": 5.892740249633789, + "kl": 0.07373046875, + "learning_rate": 5.478011472275335e-07, + "loss": 0.0029, + "reward": 1.537081003189087, + "reward_std": 0.06439124047756195, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4120810627937317, + "rewards/pad": 0.125, + "step": 1419 + }, + { + "completion_length": 248.75, + "epoch": 0.45251752708731674, + "grad_norm": 5.977435111999512, + "kl": 0.09765625, + "learning_rate": 5.474824729126832e-07, + "loss": 0.0039, + "reward": 1.4967403411865234, + "reward_std": 0.12663181126117706, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49674031138420105, + "rewards/pad": 0.0, + "step": 1420 + }, + { + "completion_length": 290.265625, + "epoch": 0.45283620140216696, + "grad_norm": 12.139585494995117, + "kl": 0.0810546875, + "learning_rate": 5.47163798597833e-07, + "loss": 0.0032, + "reward": 1.588675618171692, + "reward_std": 0.12400297820568085, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46367567777633667, + "step": 1421 + }, + { + "completion_length": 150.5625, + "epoch": 0.45315487571701724, + "grad_norm": 26.84971809387207, + "kl": 0.10693359375, + "learning_rate": 5.468451242829828e-07, + "loss": 0.0043, + "reward": 1.7052375078201294, + "reward_std": 0.10772273689508438, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7052374482154846, + "rewards/pad": 0.0, + "step": 1422 + }, + { + "completion_length": 286.390625, + "epoch": 0.45347355003186746, + "grad_norm": 16.370525360107422, + "kl": 0.09814453125, + "learning_rate": 5.465264499681326e-07, + "loss": 0.0039, + "reward": 1.5122170448303223, + "reward_std": 0.11250067502260208, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2622169852256775, + "step": 1423 + }, + { + "completion_length": 194.15625, + "epoch": 0.4537922243467177, + "grad_norm": 9.194384574890137, + "kl": 0.09912109375, + "learning_rate": 5.462077756532823e-07, + "loss": 0.004, + "reward": 1.6611933708190918, + "reward_std": 0.06436239928007126, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41119328141212463, + "rewards/pad": 0.25, + "step": 1424 + }, + { + "completion_length": 412.109375, + "epoch": 0.4541108986615679, + "grad_norm": 8.52474594116211, + "kl": 0.06103515625, + "learning_rate": 5.458891013384321e-07, + "loss": 0.0024, + "reward": 1.3822156190872192, + "reward_std": 0.04052000492811203, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38221555948257446, + "step": 1425 + }, + { + "completion_length": 251.15625, + "epoch": 0.4544295729764181, + "grad_norm": 6.234942436218262, + "kl": 0.078125, + "learning_rate": 5.455704270235819e-07, + "loss": 0.0031, + "reward": 1.6243338584899902, + "reward_std": 0.039573561400175095, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49933379888534546, + "step": 1426 + }, + { + "completion_length": 224.28125, + "epoch": 0.45474824729126834, + "grad_norm": 9.928245544433594, + "kl": 0.10400390625, + "learning_rate": 5.452517527087317e-07, + "loss": 0.0042, + "reward": 1.652978539466858, + "reward_std": 0.13306963443756104, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.6686034798622131, + "step": 1427 + }, + { + "completion_length": 262.84375, + "epoch": 0.45506692160611856, + "grad_norm": 13.236774444580078, + "kl": 0.09228515625, + "learning_rate": 5.449330783938814e-07, + "loss": 0.0037, + "reward": 1.5027391910552979, + "reward_std": 0.06213487684726715, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5027391314506531, + "rewards/pad": 0.0, + "step": 1428 + }, + { + "completion_length": 359.8125, + "epoch": 0.4553855959209688, + "grad_norm": 7.582083702087402, + "kl": 0.068359375, + "learning_rate": 5.446144040790312e-07, + "loss": 0.0027, + "reward": 1.538433313369751, + "reward_std": 0.03907448425889015, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.413433313369751, + "rewards/pad": 0.125, + "step": 1429 + }, + { + "completion_length": 214.046875, + "epoch": 0.455704270235819, + "grad_norm": 15.639304161071777, + "kl": 0.087890625, + "learning_rate": 5.44295729764181e-07, + "loss": 0.0035, + "reward": 1.746893286705017, + "reward_std": 0.1430896818637848, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6375182867050171, + "rewards/pad": 0.109375, + "step": 1430 + }, + { + "completion_length": 286.3125, + "epoch": 0.4560229445506692, + "grad_norm": 4.884230136871338, + "kl": 0.07958984375, + "learning_rate": 5.439770554493309e-07, + "loss": 0.0032, + "reward": 1.6326934099197388, + "reward_std": 0.032782621681690216, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5076934099197388, + "step": 1431 + }, + { + "completion_length": 308.984375, + "epoch": 0.45634161886551944, + "grad_norm": 8.61745834350586, + "kl": 0.06640625, + "learning_rate": 5.436583811344804e-07, + "loss": 0.0027, + "reward": 1.4891072511672974, + "reward_std": 0.05865410715341568, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48910731077194214, + "rewards/pad": 0.0, + "step": 1432 + }, + { + "completion_length": 285.5625, + "epoch": 0.45666029318036966, + "grad_norm": 10.135014533996582, + "kl": 0.1123046875, + "learning_rate": 5.433397068196303e-07, + "loss": 0.0045, + "reward": 1.805755853652954, + "reward_std": 0.15974077582359314, + "rewards/answer_reward": 0.484375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3213808536529541, + "step": 1433 + }, + { + "completion_length": 328.390625, + "epoch": 0.4569789674952199, + "grad_norm": 9.830632209777832, + "kl": 0.123046875, + "learning_rate": 5.430210325047801e-07, + "loss": 0.0049, + "reward": 1.503233551979065, + "reward_std": 0.11444251239299774, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5032335519790649, + "step": 1434 + }, + { + "completion_length": 224.59375, + "epoch": 0.4572976418100701, + "grad_norm": 10.255071640014648, + "kl": 0.08642578125, + "learning_rate": 5.427023581899299e-07, + "loss": 0.0035, + "reward": 1.5475623607635498, + "reward_std": 0.04728948324918747, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42256247997283936, + "step": 1435 + }, + { + "completion_length": 272.0625, + "epoch": 0.4576163161249203, + "grad_norm": 9.047293663024902, + "kl": 0.107421875, + "learning_rate": 5.423836838750796e-07, + "loss": 0.0043, + "reward": 1.5126659870147705, + "reward_std": 0.09375756978988647, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5126661062240601, + "step": 1436 + }, + { + "completion_length": 289.34375, + "epoch": 0.45793499043977054, + "grad_norm": 9.918564796447754, + "kl": 0.119140625, + "learning_rate": 5.420650095602294e-07, + "loss": 0.0048, + "reward": 1.3692153692245483, + "reward_std": 0.08862542361021042, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3692152798175812, + "rewards/pad": 0.0, + "step": 1437 + }, + { + "completion_length": 370.71875, + "epoch": 0.45825366475462076, + "grad_norm": 6.384302616119385, + "kl": 0.07177734375, + "learning_rate": 5.417463352453792e-07, + "loss": 0.0029, + "reward": 1.4721190929412842, + "reward_std": 0.08346770703792572, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47211918234825134, + "rewards/pad": 0.0, + "step": 1438 + }, + { + "completion_length": 235.65625, + "epoch": 0.458572339069471, + "grad_norm": 29.600440979003906, + "kl": 0.083984375, + "learning_rate": 5.41427660930529e-07, + "loss": 0.0034, + "reward": 1.87379789352417, + "reward_std": 0.09178641438484192, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6394227743148804, + "rewards/pad": 0.234375, + "step": 1439 + }, + { + "completion_length": 311.25, + "epoch": 0.4588910133843212, + "grad_norm": 11.616978645324707, + "kl": 0.060302734375, + "learning_rate": 5.411089866156787e-07, + "loss": 0.0024, + "reward": 1.6346428394317627, + "reward_std": 0.14542795717716217, + "rewards/pad": 0.3125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.33776795864105225, + "step": 1440 + }, + { + "completion_length": 308.84375, + "epoch": 0.4592096876991714, + "grad_norm": 7.0898942947387695, + "kl": 0.068359375, + "learning_rate": 5.407903123008285e-07, + "loss": 0.0027, + "reward": 1.5286662578582764, + "reward_std": 0.08710530400276184, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4192911386489868, + "rewards/pad": 0.125, + "step": 1441 + }, + { + "completion_length": 146.25, + "epoch": 0.4595283620140217, + "grad_norm": 17.23931884765625, + "kl": 0.119140625, + "learning_rate": 5.404716379859783e-07, + "loss": 0.0048, + "reward": 1.466902256011963, + "reward_std": 0.04941733554005623, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3419022560119629, + "step": 1442 + }, + { + "completion_length": 218.71875, + "epoch": 0.4598470363288719, + "grad_norm": 8.337162017822266, + "kl": 0.09326171875, + "learning_rate": 5.401529636711281e-07, + "loss": 0.0037, + "reward": 1.692416787147522, + "reward_std": 0.09146659076213837, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44241684675216675, + "rewards/pad": 0.25, + "step": 1443 + }, + { + "completion_length": 239.65625, + "epoch": 0.46016571064372214, + "grad_norm": 15.837130546569824, + "kl": 0.08203125, + "learning_rate": 5.398342893562778e-07, + "loss": 0.0033, + "reward": 1.5374739170074463, + "reward_std": 0.06341423094272614, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5374739170074463, + "rewards/pad": 0.0, + "step": 1444 + }, + { + "completion_length": 164.625, + "epoch": 0.46048438495857236, + "grad_norm": 11.028009414672852, + "kl": 0.126953125, + "learning_rate": 5.395156150414276e-07, + "loss": 0.0051, + "reward": 1.506535530090332, + "reward_std": 0.16987767815589905, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.28778553009033203, + "rewards/pad": 0.21875, + "step": 1445 + }, + { + "completion_length": 324.359375, + "epoch": 0.4608030592734226, + "grad_norm": 9.85593318939209, + "kl": 0.07080078125, + "learning_rate": 5.391969407265774e-07, + "loss": 0.0028, + "reward": 1.5697029829025269, + "reward_std": 0.05483391135931015, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44470298290252686, + "rewards/pad": 0.125, + "step": 1446 + }, + { + "completion_length": 321.03125, + "epoch": 0.4611217335882728, + "grad_norm": 10.324029922485352, + "kl": 0.05859375, + "learning_rate": 5.388782664117271e-07, + "loss": 0.0023, + "reward": 1.5770617723464966, + "reward_std": 0.1372072696685791, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4676867425441742, + "step": 1447 + }, + { + "completion_length": 356.53125, + "epoch": 0.461440407903123, + "grad_norm": 5.90736198425293, + "kl": 0.06689453125, + "learning_rate": 5.385595920968769e-07, + "loss": 0.0027, + "reward": 1.6476682424545288, + "reward_std": 0.07913592457771301, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5226683020591736, + "rewards/pad": 0.125, + "step": 1448 + }, + { + "completion_length": 268.75, + "epoch": 0.46175908221797324, + "grad_norm": 24.653217315673828, + "kl": 0.08251953125, + "learning_rate": 5.382409177820267e-07, + "loss": 0.0033, + "reward": 1.5845396518707275, + "reward_std": 0.12653054296970367, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47516465187072754, + "rewards/pad": 0.125, + "step": 1449 + }, + { + "completion_length": 336.8125, + "epoch": 0.46207775653282346, + "grad_norm": 14.810794830322266, + "kl": 0.07177734375, + "learning_rate": 5.379222434671765e-07, + "loss": 0.0029, + "reward": 1.5406912565231323, + "reward_std": 0.21528227627277374, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4469412863254547, + "rewards/pad": 0.125, + "step": 1450 + }, + { + "completion_length": 215.84375, + "epoch": 0.4623964308476737, + "grad_norm": 19.48642349243164, + "kl": 0.10498046875, + "learning_rate": 5.376035691523262e-07, + "loss": 0.0042, + "reward": 1.615641474723816, + "reward_std": 0.14932119846343994, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5062664747238159, + "rewards/pad": 0.125, + "step": 1451 + }, + { + "completion_length": 154.0625, + "epoch": 0.4627151051625239, + "grad_norm": 15.968269348144531, + "kl": 0.1064453125, + "learning_rate": 5.372848948374761e-07, + "loss": 0.0043, + "reward": 1.4699654579162598, + "reward_std": 0.10306274145841599, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4699653387069702, + "rewards/pad": 0.0, + "step": 1452 + }, + { + "completion_length": 304.03125, + "epoch": 0.4630337794773741, + "grad_norm": 6.2782745361328125, + "kl": 0.078125, + "learning_rate": 5.369662205226259e-07, + "loss": 0.0031, + "reward": 1.429635763168335, + "reward_std": 0.03694413974881172, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.30463579297065735, + "step": 1453 + }, + { + "completion_length": 313.109375, + "epoch": 0.46335245379222434, + "grad_norm": 8.989459991455078, + "kl": 0.076171875, + "learning_rate": 5.366475462077757e-07, + "loss": 0.003, + "reward": 1.55964994430542, + "reward_std": 0.07360120117664337, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5596500635147095, + "rewards/pad": 0.0, + "step": 1454 + }, + { + "completion_length": 209.8125, + "epoch": 0.46367112810707456, + "grad_norm": 19.859041213989258, + "kl": 0.1005859375, + "learning_rate": 5.363288718929254e-07, + "loss": 0.004, + "reward": 1.6786119937896729, + "reward_std": 0.0755452886223793, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5536119341850281, + "rewards/pad": 0.125, + "step": 1455 + }, + { + "completion_length": 233.203125, + "epoch": 0.4639898024219248, + "grad_norm": 13.783255577087402, + "kl": 0.095703125, + "learning_rate": 5.360101975780752e-07, + "loss": 0.0038, + "reward": 1.7623509168624878, + "reward_std": 0.10795766860246658, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5123509168624878, + "step": 1456 + }, + { + "completion_length": 231.46875, + "epoch": 0.464308476736775, + "grad_norm": 9.37671184539795, + "kl": 0.09326171875, + "learning_rate": 5.35691523263225e-07, + "loss": 0.0037, + "reward": 1.4515711069107056, + "reward_std": 0.19940702617168427, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.34219613671302795, + "step": 1457 + }, + { + "completion_length": 324.125, + "epoch": 0.4646271510516252, + "grad_norm": 9.546353340148926, + "kl": 0.0673828125, + "learning_rate": 5.353728489483748e-07, + "loss": 0.0027, + "reward": 1.5384440422058105, + "reward_std": 0.10423227399587631, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41344398260116577, + "step": 1458 + }, + { + "completion_length": 232.90625, + "epoch": 0.46494582536647544, + "grad_norm": 11.147302627563477, + "kl": 0.0947265625, + "learning_rate": 5.350541746335245e-07, + "loss": 0.0038, + "reward": 1.5543948411941528, + "reward_std": 0.06055128574371338, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5543947815895081, + "rewards/pad": 0.0, + "step": 1459 + }, + { + "completion_length": 308.25, + "epoch": 0.46526449968132566, + "grad_norm": 12.76003646850586, + "kl": 0.07470703125, + "learning_rate": 5.347355003186743e-07, + "loss": 0.003, + "reward": 1.5120352506637573, + "reward_std": 0.07145994901657104, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5120352506637573, + "step": 1460 + }, + { + "completion_length": 315.890625, + "epoch": 0.4655831739961759, + "grad_norm": 7.023419380187988, + "kl": 0.076171875, + "learning_rate": 5.344168260038241e-07, + "loss": 0.003, + "reward": 1.4683029651641846, + "reward_std": 0.09806257486343384, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.48392796516418457, + "step": 1461 + }, + { + "completion_length": 329.484375, + "epoch": 0.46590184831102616, + "grad_norm": 27.741416931152344, + "kl": 0.07421875, + "learning_rate": 5.340981516889739e-07, + "loss": 0.003, + "reward": 1.455172061920166, + "reward_std": 0.07932863384485245, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.330172061920166, + "rewards/pad": 0.125, + "step": 1462 + }, + { + "completion_length": 196.984375, + "epoch": 0.4662205226258764, + "grad_norm": 7.4176506996154785, + "kl": 0.09423828125, + "learning_rate": 5.337794773741236e-07, + "loss": 0.0038, + "reward": 1.753882646560669, + "reward_std": 0.10934875905513763, + "rewards/answer_reward": 0.34375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41013264656066895, + "step": 1463 + }, + { + "completion_length": 179.03125, + "epoch": 0.4665391969407266, + "grad_norm": 17.609155654907227, + "kl": 0.0986328125, + "learning_rate": 5.334608030592734e-07, + "loss": 0.004, + "reward": 1.5946422815322876, + "reward_std": 0.19063492119312286, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4852672815322876, + "rewards/pad": 0.109375, + "step": 1464 + }, + { + "completion_length": 264.390625, + "epoch": 0.4668578712555768, + "grad_norm": 59.53977584838867, + "kl": 0.09130859375, + "learning_rate": 5.331421287444232e-07, + "loss": 0.0036, + "reward": 1.5025644302368164, + "reward_std": 0.1165265142917633, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5181894898414612, + "rewards/pad": 0.0, + "step": 1465 + }, + { + "completion_length": 244.765625, + "epoch": 0.46717654557042704, + "grad_norm": 32.96870803833008, + "kl": 0.1025390625, + "learning_rate": 5.32823454429573e-07, + "loss": 0.0041, + "reward": 1.5927139520645142, + "reward_std": 0.1837758719921112, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5145889520645142, + "step": 1466 + }, + { + "completion_length": 249.59375, + "epoch": 0.46749521988527726, + "grad_norm": 37.91212463378906, + "kl": 0.10791015625, + "learning_rate": 5.325047801147227e-07, + "loss": 0.0043, + "reward": 1.4021697044372559, + "reward_std": 0.12167386710643768, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.41779476404190063, + "step": 1467 + }, + { + "completion_length": 359.40625, + "epoch": 0.4678138942001275, + "grad_norm": 13.006011962890625, + "kl": 0.061767578125, + "learning_rate": 5.321861057998725e-07, + "loss": 0.0025, + "reward": 1.580627202987671, + "reward_std": 0.20081724226474762, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5181272029876709, + "rewards/pad": 0.09375, + "step": 1468 + }, + { + "completion_length": 257.640625, + "epoch": 0.4681325685149777, + "grad_norm": 22.143430709838867, + "kl": 0.11083984375, + "learning_rate": 5.318674314850224e-07, + "loss": 0.0044, + "reward": 1.6454774141311646, + "reward_std": 0.09021544456481934, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5361024737358093, + "rewards/pad": 0.109375, + "step": 1469 + }, + { + "completion_length": 351.6875, + "epoch": 0.4684512428298279, + "grad_norm": 18.117698669433594, + "kl": 0.0615234375, + "learning_rate": 5.315487571701722e-07, + "loss": 0.0025, + "reward": 1.7443134784698486, + "reward_std": 0.07033313810825348, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.36931341886520386, + "step": 1470 + }, + { + "completion_length": 297.453125, + "epoch": 0.46876991714467814, + "grad_norm": 5.271533966064453, + "kl": 0.0986328125, + "learning_rate": 5.312300828553218e-07, + "loss": 0.0039, + "reward": 1.6119842529296875, + "reward_std": 0.05973343178629875, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6119842529296875, + "rewards/pad": 0.0, + "step": 1471 + }, + { + "completion_length": 235.96875, + "epoch": 0.46908859145952836, + "grad_norm": 9.944218635559082, + "kl": 0.111328125, + "learning_rate": 5.309114085404716e-07, + "loss": 0.0045, + "reward": 1.5511842966079712, + "reward_std": 0.1317683756351471, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4418092370033264, + "rewards/pad": 0.109375, + "step": 1472 + }, + { + "completion_length": 246.828125, + "epoch": 0.4694072657743786, + "grad_norm": 9.702787399291992, + "kl": 0.08837890625, + "learning_rate": 5.305927342256214e-07, + "loss": 0.0035, + "reward": 1.7182908058166504, + "reward_std": 0.1070544645190239, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6245408654212952, + "step": 1473 + }, + { + "completion_length": 401.765625, + "epoch": 0.4697259400892288, + "grad_norm": 7.4017229080200195, + "kl": 0.06396484375, + "learning_rate": 5.302740599107712e-07, + "loss": 0.0026, + "reward": 1.312766671180725, + "reward_std": 0.16063524782657623, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.34401655197143555, + "rewards/pad": 0.0, + "step": 1474 + }, + { + "completion_length": 218.765625, + "epoch": 0.470044614404079, + "grad_norm": 12.235037803649902, + "kl": 0.07958984375, + "learning_rate": 5.299553855959209e-07, + "loss": 0.0032, + "reward": 1.5451993942260742, + "reward_std": 0.12426760792732239, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.451449453830719, + "rewards/pad": 0.109375, + "step": 1475 + }, + { + "completion_length": 251.0625, + "epoch": 0.47036328871892924, + "grad_norm": 13.679093360900879, + "kl": 0.08447265625, + "learning_rate": 5.296367112810707e-07, + "loss": 0.0034, + "reward": 1.7477211952209473, + "reward_std": 0.08329816907644272, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6070961952209473, + "step": 1476 + }, + { + "completion_length": 302.515625, + "epoch": 0.47068196303377946, + "grad_norm": 9.420421600341797, + "kl": 0.0771484375, + "learning_rate": 5.293180369662205e-07, + "loss": 0.0031, + "reward": 1.6046003103256226, + "reward_std": 0.049083903431892395, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.47960028052330017, + "step": 1477 + }, + { + "completion_length": 444.046875, + "epoch": 0.4710006373486297, + "grad_norm": 5.747177600860596, + "kl": 0.064453125, + "learning_rate": 5.289993626513702e-07, + "loss": 0.0026, + "reward": 1.641414761543274, + "reward_std": 0.11404988169670105, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5945398807525635, + "step": 1478 + }, + { + "completion_length": 365.953125, + "epoch": 0.4713193116634799, + "grad_norm": 13.27202033996582, + "kl": 0.08642578125, + "learning_rate": 5.2868068833652e-07, + "loss": 0.0035, + "reward": 1.4590954780578613, + "reward_std": 0.08784317970275879, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4590955078601837, + "step": 1479 + }, + { + "completion_length": 229.6875, + "epoch": 0.4716379859783301, + "grad_norm": 15.232290267944336, + "kl": 0.0986328125, + "learning_rate": 5.283620140216698e-07, + "loss": 0.0039, + "reward": 1.6809430122375488, + "reward_std": 0.17516812682151794, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5715680122375488, + "step": 1480 + }, + { + "completion_length": 225.96875, + "epoch": 0.4719566602931804, + "grad_norm": 12.24894905090332, + "kl": 0.080078125, + "learning_rate": 5.280433397068196e-07, + "loss": 0.0032, + "reward": 1.6003769636154175, + "reward_std": 0.1401960253715515, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.584752082824707, + "step": 1481 + }, + { + "completion_length": 241.15625, + "epoch": 0.4722753346080306, + "grad_norm": 8.365073204040527, + "kl": 0.10888671875, + "learning_rate": 5.277246653919693e-07, + "loss": 0.0044, + "reward": 1.6874208450317383, + "reward_std": 0.09517542272806168, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5624208450317383, + "rewards/pad": 0.125, + "step": 1482 + }, + { + "completion_length": 357.015625, + "epoch": 0.47259400892288084, + "grad_norm": 2.965028762817383, + "kl": 0.078125, + "learning_rate": 5.274059910771191e-07, + "loss": 0.0031, + "reward": 1.4896973371505737, + "reward_std": 0.08937929570674896, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48969733715057373, + "rewards/pad": 0.0, + "step": 1483 + }, + { + "completion_length": 258.78125, + "epoch": 0.47291268323773106, + "grad_norm": 8.647948265075684, + "kl": 0.083984375, + "learning_rate": 5.270873167622689e-07, + "loss": 0.0034, + "reward": 1.5113916397094727, + "reward_std": 0.23726826906204224, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.3082665801048279, + "step": 1484 + }, + { + "completion_length": 277.171875, + "epoch": 0.4732313575525813, + "grad_norm": 10.09041690826416, + "kl": 0.0810546875, + "learning_rate": 5.267686424474187e-07, + "loss": 0.0032, + "reward": 1.7379040718078613, + "reward_std": 0.21050803363323212, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37852901220321655, + "step": 1485 + }, + { + "completion_length": 289.796875, + "epoch": 0.4735500318674315, + "grad_norm": 5.892612934112549, + "kl": 0.095703125, + "learning_rate": 5.264499681325684e-07, + "loss": 0.0038, + "reward": 1.6408710479736328, + "reward_std": 0.156442791223526, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43774598836898804, + "step": 1486 + }, + { + "completion_length": 357.96875, + "epoch": 0.4738687061822817, + "grad_norm": 5.740538597106934, + "kl": 0.06982421875, + "learning_rate": 5.261312938177182e-07, + "loss": 0.0028, + "reward": 1.7562711238861084, + "reward_std": 0.10480953752994537, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5375211238861084, + "rewards/pad": 0.21875, + "step": 1487 + }, + { + "completion_length": 349.109375, + "epoch": 0.47418738049713194, + "grad_norm": 275.7038269042969, + "kl": 0.09228515625, + "learning_rate": 5.25812619502868e-07, + "loss": 0.0037, + "reward": 1.5019524097442627, + "reward_std": 0.06322328746318817, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5019525289535522, + "step": 1488 + }, + { + "completion_length": 228.03125, + "epoch": 0.47450605481198216, + "grad_norm": 13.978322982788086, + "kl": 0.08154296875, + "learning_rate": 5.254939451880179e-07, + "loss": 0.0033, + "reward": 1.7376821041107178, + "reward_std": 0.10818063467741013, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6126821637153625, + "rewards/pad": 0.125, + "step": 1489 + }, + { + "completion_length": 278.046875, + "epoch": 0.4748247291268324, + "grad_norm": 31.306400299072266, + "kl": 0.0791015625, + "learning_rate": 5.251752708731676e-07, + "loss": 0.0032, + "reward": 1.5066144466400146, + "reward_std": 0.052641309797763824, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3816143274307251, + "rewards/pad": 0.125, + "step": 1490 + }, + { + "completion_length": 273.21875, + "epoch": 0.4751434034416826, + "grad_norm": 5.694839954376221, + "kl": 0.09375, + "learning_rate": 5.248565965583174e-07, + "loss": 0.0038, + "reward": 1.6695730686187744, + "reward_std": 0.14564523100852966, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5758230686187744, + "rewards/pad": 0.109375, + "step": 1491 + }, + { + "completion_length": 231.21875, + "epoch": 0.4754620777565328, + "grad_norm": 15.175251960754395, + "kl": 0.107421875, + "learning_rate": 5.245379222434672e-07, + "loss": 0.0043, + "reward": 1.5302845239639282, + "reward_std": 0.21586251258850098, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5615345239639282, + "rewards/pad": 0.0, + "step": 1492 + }, + { + "completion_length": 348.546875, + "epoch": 0.47578075207138304, + "grad_norm": 6.989543914794922, + "kl": 0.0703125, + "learning_rate": 5.24219247928617e-07, + "loss": 0.0028, + "reward": 1.6768548488616943, + "reward_std": 0.11237145215272903, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5674799084663391, + "rewards/pad": 0.125, + "step": 1493 + }, + { + "completion_length": 319.53125, + "epoch": 0.47609942638623326, + "grad_norm": 9.460558891296387, + "kl": 0.09228515625, + "learning_rate": 5.239005736137667e-07, + "loss": 0.0037, + "reward": 1.5112285614013672, + "reward_std": 0.09450381994247437, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.511228621006012, + "step": 1494 + }, + { + "completion_length": 253.765625, + "epoch": 0.4764181007010835, + "grad_norm": 6.729824066162109, + "kl": 0.09912109375, + "learning_rate": 5.235818992989165e-07, + "loss": 0.004, + "reward": 1.6795250177383423, + "reward_std": 0.07208022475242615, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5701500177383423, + "step": 1495 + }, + { + "completion_length": 191.921875, + "epoch": 0.4767367750159337, + "grad_norm": 9.541268348693848, + "kl": 0.109375, + "learning_rate": 5.232632249840663e-07, + "loss": 0.0044, + "reward": 1.529311180114746, + "reward_std": 0.07672906666994095, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5293111801147461, + "rewards/pad": 0.0, + "step": 1496 + }, + { + "completion_length": 327.21875, + "epoch": 0.4770554493307839, + "grad_norm": 6.8143086433410645, + "kl": 0.091796875, + "learning_rate": 5.229445506692161e-07, + "loss": 0.0037, + "reward": 1.588439702987671, + "reward_std": 0.04701199382543564, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5884397029876709, + "rewards/pad": 0.0, + "step": 1497 + }, + { + "completion_length": 174.578125, + "epoch": 0.47737412364563414, + "grad_norm": 13.120146751403809, + "kl": 0.1083984375, + "learning_rate": 5.226258763543658e-07, + "loss": 0.0043, + "reward": 1.6799719333648682, + "reward_std": 0.05143211781978607, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6799719333648682, + "rewards/pad": 0.0, + "step": 1498 + }, + { + "completion_length": 362.4375, + "epoch": 0.47769279796048436, + "grad_norm": 11.334291458129883, + "kl": 0.0908203125, + "learning_rate": 5.223072020395156e-07, + "loss": 0.0036, + "reward": 1.4830890893936157, + "reward_std": 0.046455733478069305, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4830890893936157, + "rewards/pad": 0.0, + "step": 1499 + }, + { + "completion_length": 227.25, + "epoch": 0.4780114722753346, + "grad_norm": 13.76636791229248, + "kl": 0.095703125, + "learning_rate": 5.219885277246654e-07, + "loss": 0.0038, + "reward": 1.5293161869049072, + "reward_std": 0.11343329399824142, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5293163061141968, + "rewards/pad": 0.0, + "step": 1500 + }, + { + "completion_length": 354.734375, + "epoch": 0.47833014659018486, + "grad_norm": 6.618967533111572, + "kl": 0.0830078125, + "learning_rate": 5.216698534098152e-07, + "loss": 0.0033, + "reward": 1.4522448778152466, + "reward_std": 0.22254043817520142, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4522448182106018, + "rewards/pad": 0.03125, + "step": 1501 + }, + { + "completion_length": 322.3125, + "epoch": 0.4786488209050351, + "grad_norm": 30.195968627929688, + "kl": 0.07080078125, + "learning_rate": 5.213511790949649e-07, + "loss": 0.0028, + "reward": 1.5755908489227295, + "reward_std": 0.14302876591682434, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4818408489227295, + "step": 1502 + }, + { + "completion_length": 273.265625, + "epoch": 0.4789674952198853, + "grad_norm": 5.307925701141357, + "kl": 0.087890625, + "learning_rate": 5.210325047801147e-07, + "loss": 0.0035, + "reward": 1.4843538999557495, + "reward_std": 0.2061702311038971, + "rewards/format_reward_tg": 0.9375, + "rewards/iou_timestamp_reward": 0.5468538999557495, + "rewards/pad": 0.0, + "step": 1503 + }, + { + "completion_length": 289.5, + "epoch": 0.4792861695347355, + "grad_norm": 19.900287628173828, + "kl": 0.1220703125, + "learning_rate": 5.207138304652645e-07, + "loss": 0.0049, + "reward": 1.44891357421875, + "reward_std": 0.07818566262722015, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44891366362571716, + "rewards/pad": 0.0, + "step": 1504 + }, + { + "completion_length": 287.671875, + "epoch": 0.47960484384958574, + "grad_norm": 14.69202709197998, + "kl": 0.09814453125, + "learning_rate": 5.203951561504143e-07, + "loss": 0.0039, + "reward": 1.3902406692504883, + "reward_std": 0.07330122590065002, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39024072885513306, + "step": 1505 + }, + { + "completion_length": 332.484375, + "epoch": 0.47992351816443596, + "grad_norm": 39.345054626464844, + "kl": 0.0693359375, + "learning_rate": 5.20076481835564e-07, + "loss": 0.0028, + "reward": 1.4343560934066772, + "reward_std": 0.06460077315568924, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32498109340667725, + "step": 1506 + }, + { + "completion_length": 374.046875, + "epoch": 0.4802421924792862, + "grad_norm": 4.585975646972656, + "kl": 0.09814453125, + "learning_rate": 5.197578075207139e-07, + "loss": 0.0039, + "reward": 1.3336130380630493, + "reward_std": 0.0820106789469719, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34923800826072693, + "step": 1507 + }, + { + "completion_length": 232.140625, + "epoch": 0.4805608667941364, + "grad_norm": 16.264991760253906, + "kl": 0.07861328125, + "learning_rate": 5.194391332058637e-07, + "loss": 0.0031, + "reward": 1.7201967239379883, + "reward_std": 0.08893473446369171, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47019678354263306, + "step": 1508 + }, + { + "completion_length": 288.140625, + "epoch": 0.4808795411089866, + "grad_norm": 33.29106140136719, + "kl": 0.08349609375, + "learning_rate": 5.191204588910135e-07, + "loss": 0.0033, + "reward": 1.4425323009490967, + "reward_std": 0.04038984328508377, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4425322413444519, + "step": 1509 + }, + { + "completion_length": 342.90625, + "epoch": 0.48119821542383684, + "grad_norm": 9.950427055358887, + "kl": 0.0625, + "learning_rate": 5.188017845761632e-07, + "loss": 0.0025, + "reward": 1.8057146072387695, + "reward_std": 0.1636984646320343, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46196460723876953, + "rewards/pad": 0.359375, + "step": 1510 + }, + { + "completion_length": 214.515625, + "epoch": 0.48151688973868706, + "grad_norm": 8.171236991882324, + "kl": 0.09326171875, + "learning_rate": 5.184831102613129e-07, + "loss": 0.0037, + "reward": 1.408177375793457, + "reward_std": 0.05359324812889099, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2831774652004242, + "step": 1511 + }, + { + "completion_length": 258.65625, + "epoch": 0.4818355640535373, + "grad_norm": 9.692242622375488, + "kl": 0.1015625, + "learning_rate": 5.181644359464627e-07, + "loss": 0.0041, + "reward": 1.5358140468597412, + "reward_std": 0.13970480859279633, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4576890170574188, + "step": 1512 + }, + { + "completion_length": 269.0625, + "epoch": 0.4821542383683875, + "grad_norm": 10.316642761230469, + "kl": 0.099609375, + "learning_rate": 5.178457616316124e-07, + "loss": 0.004, + "reward": 1.5783674716949463, + "reward_std": 0.10503272712230682, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4064924716949463, + "step": 1513 + }, + { + "completion_length": 294.5, + "epoch": 0.4824729126832377, + "grad_norm": 26.83660888671875, + "kl": 0.09423828125, + "learning_rate": 5.175270873167622e-07, + "loss": 0.0038, + "reward": 1.566218614578247, + "reward_std": 0.13746222853660583, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5193435549736023, + "step": 1514 + }, + { + "completion_length": 303.984375, + "epoch": 0.48279158699808794, + "grad_norm": 6.991064548492432, + "kl": 0.07861328125, + "learning_rate": 5.17208413001912e-07, + "loss": 0.0031, + "reward": 1.561284065246582, + "reward_std": 0.17649711668491364, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.49878406524658203, + "step": 1515 + }, + { + "completion_length": 395.78125, + "epoch": 0.48311026131293816, + "grad_norm": 7.837182521820068, + "kl": 0.06298828125, + "learning_rate": 5.168897386870618e-07, + "loss": 0.0025, + "reward": 1.5242811441421509, + "reward_std": 0.04641805589199066, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3992811143398285, + "rewards/pad": 0.125, + "step": 1516 + }, + { + "completion_length": 219.359375, + "epoch": 0.4834289356277884, + "grad_norm": 7.788435935974121, + "kl": 0.11572265625, + "learning_rate": 5.165710643722115e-07, + "loss": 0.0046, + "reward": 1.4695744514465332, + "reward_std": 0.11476314067840576, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3601993918418884, + "step": 1517 + }, + { + "completion_length": 222.625, + "epoch": 0.4837476099426386, + "grad_norm": 23.84461784362793, + "kl": 0.10400390625, + "learning_rate": 5.162523900573613e-07, + "loss": 0.0042, + "reward": 1.7240705490112305, + "reward_std": 0.10388059914112091, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5990704298019409, + "rewards/pad": 0.125, + "step": 1518 + }, + { + "completion_length": 170.78125, + "epoch": 0.4840662842574888, + "grad_norm": 15.905930519104004, + "kl": 0.099609375, + "learning_rate": 5.159337157425111e-07, + "loss": 0.004, + "reward": 1.6322693824768066, + "reward_std": 0.10194779187440872, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5228945016860962, + "rewards/pad": 0.125, + "step": 1519 + }, + { + "completion_length": 306.546875, + "epoch": 0.4843849585723391, + "grad_norm": 24.74367332458496, + "kl": 0.08740234375, + "learning_rate": 5.156150414276609e-07, + "loss": 0.0035, + "reward": 1.5158793926239014, + "reward_std": 0.07431940734386444, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39087939262390137, + "rewards/pad": 0.125, + "step": 1520 + }, + { + "completion_length": 284.390625, + "epoch": 0.4847036328871893, + "grad_norm": 11.4489164352417, + "kl": 0.130859375, + "learning_rate": 5.152963671128106e-07, + "loss": 0.0052, + "reward": 1.5866434574127197, + "reward_std": 0.07465530186891556, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4616435170173645, + "step": 1521 + }, + { + "completion_length": 286.921875, + "epoch": 0.48502230720203954, + "grad_norm": 5.68887996673584, + "kl": 0.07421875, + "learning_rate": 5.149776927979604e-07, + "loss": 0.003, + "reward": 1.4670679569244385, + "reward_std": 0.11055351793766022, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3576929569244385, + "rewards/pad": 0.125, + "step": 1522 + }, + { + "completion_length": 331.75, + "epoch": 0.48534098151688976, + "grad_norm": 4.68539571762085, + "kl": 0.0673828125, + "learning_rate": 5.146590184831102e-07, + "loss": 0.0027, + "reward": 1.4538731575012207, + "reward_std": 0.06285285204648972, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3288731276988983, + "step": 1523 + }, + { + "completion_length": 333.78125, + "epoch": 0.48565965583174, + "grad_norm": 8.88425350189209, + "kl": 0.095703125, + "learning_rate": 5.1434034416826e-07, + "loss": 0.0038, + "reward": 1.3173884153366089, + "reward_std": 0.04951098561286926, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3173884153366089, + "step": 1524 + }, + { + "completion_length": 424.8125, + "epoch": 0.4859783301465902, + "grad_norm": 5.125090599060059, + "kl": 0.05224609375, + "learning_rate": 5.140216698534097e-07, + "loss": 0.0021, + "reward": 1.418252944946289, + "reward_std": 0.06199708208441734, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43387794494628906, + "step": 1525 + }, + { + "completion_length": 233.015625, + "epoch": 0.4862970044614404, + "grad_norm": 10.070802688598633, + "kl": 0.08642578125, + "learning_rate": 5.137029955385595e-07, + "loss": 0.0035, + "reward": 1.6166040897369385, + "reward_std": 0.168365940451622, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3197290897369385, + "step": 1526 + }, + { + "completion_length": 239.21875, + "epoch": 0.48661567877629064, + "grad_norm": 23.041690826416016, + "kl": 0.099609375, + "learning_rate": 5.133843212237094e-07, + "loss": 0.004, + "reward": 1.650726556777954, + "reward_std": 0.11059662699699402, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5257264971733093, + "rewards/pad": 0.125, + "step": 1527 + }, + { + "completion_length": 126.6875, + "epoch": 0.48693435309114086, + "grad_norm": 12.056344985961914, + "kl": 0.1142578125, + "learning_rate": 5.130656469088592e-07, + "loss": 0.0046, + "reward": 1.7706551551818848, + "reward_std": 0.12458785623311996, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.52065509557724, + "step": 1528 + }, + { + "completion_length": 180.921875, + "epoch": 0.4872530274059911, + "grad_norm": 5.845155715942383, + "kl": 0.11328125, + "learning_rate": 5.127469725940089e-07, + "loss": 0.0045, + "reward": 1.4621466398239136, + "reward_std": 0.09760545194149017, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46214666962623596, + "rewards/pad": 0.0, + "step": 1529 + }, + { + "completion_length": 183.96875, + "epoch": 0.4875717017208413, + "grad_norm": 18.358531951904297, + "kl": 0.09814453125, + "learning_rate": 5.124282982791587e-07, + "loss": 0.0039, + "reward": 1.92790687084198, + "reward_std": 0.19410569965839386, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6154068112373352, + "rewards/pad": 0.328125, + "step": 1530 + }, + { + "completion_length": 244.03125, + "epoch": 0.4878903760356915, + "grad_norm": 12.977493286132812, + "kl": 0.0830078125, + "learning_rate": 5.121096239643085e-07, + "loss": 0.0033, + "reward": 1.7430362701416016, + "reward_std": 0.12946908175945282, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5711612105369568, + "rewards/pad": 0.171875, + "step": 1531 + }, + { + "completion_length": 219.734375, + "epoch": 0.48820905035054174, + "grad_norm": 8.308573722839355, + "kl": 0.10546875, + "learning_rate": 5.117909496494583e-07, + "loss": 0.0042, + "reward": 1.6283482313156128, + "reward_std": 0.08693157136440277, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6283482313156128, + "rewards/pad": 0.0, + "step": 1532 + }, + { + "completion_length": 243.03125, + "epoch": 0.48852772466539196, + "grad_norm": 12.113231658935547, + "kl": 0.10205078125, + "learning_rate": 5.11472275334608e-07, + "loss": 0.0041, + "reward": 1.3734705448150635, + "reward_std": 0.07666440308094025, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3734704852104187, + "rewards/pad": 0.0, + "step": 1533 + }, + { + "completion_length": 221.328125, + "epoch": 0.4888463989802422, + "grad_norm": 15.715192794799805, + "kl": 0.1025390625, + "learning_rate": 5.111536010197578e-07, + "loss": 0.0041, + "reward": 1.5486443042755127, + "reward_std": 0.14197894930839539, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4236442446708679, + "rewards/pad": 0.125, + "step": 1534 + }, + { + "completion_length": 329.125, + "epoch": 0.4891650732950924, + "grad_norm": 18.260419845581055, + "kl": 0.06591796875, + "learning_rate": 5.108349267049076e-07, + "loss": 0.0026, + "reward": 1.6931008100509644, + "reward_std": 0.04175805300474167, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44310078024864197, + "step": 1535 + }, + { + "completion_length": 384.28125, + "epoch": 0.4894837476099426, + "grad_norm": 3.997661590576172, + "kl": 0.06640625, + "learning_rate": 5.105162523900574e-07, + "loss": 0.0027, + "reward": 1.495345115661621, + "reward_std": 0.1974174678325653, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.5422199964523315, + "rewards/pad": 0.0, + "step": 1536 + }, + { + "completion_length": 328.578125, + "epoch": 0.48980242192479284, + "grad_norm": 6.118681907653809, + "kl": 0.07763671875, + "learning_rate": 5.101975780752071e-07, + "loss": 0.0031, + "reward": 1.459146499633789, + "reward_std": 0.11266092956066132, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47477149963378906, + "step": 1537 + }, + { + "completion_length": 221.484375, + "epoch": 0.49012109623964306, + "grad_norm": 32.38083267211914, + "kl": 0.10107421875, + "learning_rate": 5.098789037603569e-07, + "loss": 0.0041, + "reward": 1.5192437171936035, + "reward_std": 0.10911397635936737, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.40986865758895874, + "step": 1538 + }, + { + "completion_length": 275.5625, + "epoch": 0.4904397705544933, + "grad_norm": 25.792707443237305, + "kl": 0.11376953125, + "learning_rate": 5.095602294455067e-07, + "loss": 0.0045, + "reward": 1.474266767501831, + "reward_std": 0.04674249142408371, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47426676750183105, + "step": 1539 + }, + { + "completion_length": 173.75, + "epoch": 0.49075844486934356, + "grad_norm": 15.709648132324219, + "kl": 0.1318359375, + "learning_rate": 5.092415551306564e-07, + "loss": 0.0053, + "reward": 1.5435810089111328, + "reward_std": 0.09169697761535645, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4342060983181, + "rewards/pad": 0.109375, + "step": 1540 + }, + { + "completion_length": 279.578125, + "epoch": 0.4910771191841938, + "grad_norm": 8.689448356628418, + "kl": 0.0791015625, + "learning_rate": 5.089228808158062e-07, + "loss": 0.0032, + "reward": 1.5602798461914062, + "reward_std": 0.16906121373176575, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.40402984619140625, + "step": 1541 + }, + { + "completion_length": 340.546875, + "epoch": 0.491395793499044, + "grad_norm": 15.626315116882324, + "kl": 0.07373046875, + "learning_rate": 5.08604206500956e-07, + "loss": 0.0029, + "reward": 1.3878772258758545, + "reward_std": 0.05019243061542511, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3878771960735321, + "step": 1542 + }, + { + "completion_length": 210.53125, + "epoch": 0.4917144678138942, + "grad_norm": 28.460586547851562, + "kl": 0.10791015625, + "learning_rate": 5.082855321861058e-07, + "loss": 0.0043, + "reward": 1.3582227230072021, + "reward_std": 0.05405221879482269, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3582227826118469, + "rewards/pad": 0.0, + "step": 1543 + }, + { + "completion_length": 231.71875, + "epoch": 0.49203314212874444, + "grad_norm": 52.28375244140625, + "kl": 0.08984375, + "learning_rate": 5.079668578712555e-07, + "loss": 0.0036, + "reward": 1.8166325092315674, + "reward_std": 0.22572393715381622, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6291325688362122, + "rewards/pad": 0.203125, + "step": 1544 + }, + { + "completion_length": 159.40625, + "epoch": 0.49235181644359466, + "grad_norm": 7.311517715454102, + "kl": 0.099609375, + "learning_rate": 5.076481835564054e-07, + "loss": 0.004, + "reward": 1.329869270324707, + "reward_std": 0.09352267533540726, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3298693001270294, + "step": 1545 + }, + { + "completion_length": 223.578125, + "epoch": 0.4926704907584449, + "grad_norm": 14.767241477966309, + "kl": 0.09765625, + "learning_rate": 5.073295092415552e-07, + "loss": 0.0039, + "reward": 1.7060375213623047, + "reward_std": 0.12457602471113205, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6122875213623047, + "step": 1546 + }, + { + "completion_length": 273.546875, + "epoch": 0.4929891650732951, + "grad_norm": 9.330548286437988, + "kl": 0.12890625, + "learning_rate": 5.07010834926705e-07, + "loss": 0.0051, + "reward": 1.4542279243469238, + "reward_std": 0.13715635240077972, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4698530435562134, + "step": 1547 + }, + { + "completion_length": 257.03125, + "epoch": 0.4933078393881453, + "grad_norm": 25.97972869873047, + "kl": 0.099609375, + "learning_rate": 5.066921606118547e-07, + "loss": 0.004, + "reward": 1.437532663345337, + "reward_std": 0.11327797174453735, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45315760374069214, + "rewards/pad": 0.0, + "step": 1548 + }, + { + "completion_length": 238.40625, + "epoch": 0.49362651370299554, + "grad_norm": 6.355741024017334, + "kl": 0.07080078125, + "learning_rate": 5.063734862970045e-07, + "loss": 0.0028, + "reward": 1.877377986907959, + "reward_std": 0.11931359767913818, + "rewards/answer_reward": 0.484375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3930028975009918, + "step": 1549 + }, + { + "completion_length": 372.40625, + "epoch": 0.49394518801784576, + "grad_norm": 9.344226837158203, + "kl": 0.078125, + "learning_rate": 5.060548119821542e-07, + "loss": 0.0031, + "reward": 1.4983468055725098, + "reward_std": 0.14175045490264893, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5139718055725098, + "step": 1550 + }, + { + "completion_length": 255.0625, + "epoch": 0.494263862332696, + "grad_norm": 22.564973831176758, + "kl": 0.1015625, + "learning_rate": 5.05736137667304e-07, + "loss": 0.0041, + "reward": 1.4562467336654663, + "reward_std": 0.16175200045108795, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4874967336654663, + "step": 1551 + }, + { + "completion_length": 288.609375, + "epoch": 0.4945825366475462, + "grad_norm": 6.467219352722168, + "kl": 0.08349609375, + "learning_rate": 5.054174633524537e-07, + "loss": 0.0033, + "reward": 1.5488793849945068, + "reward_std": 0.06914226710796356, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5488794445991516, + "step": 1552 + }, + { + "completion_length": 267.984375, + "epoch": 0.4949012109623964, + "grad_norm": 6.855160236358643, + "kl": 0.0830078125, + "learning_rate": 5.050987890376035e-07, + "loss": 0.0033, + "reward": 1.58836030960083, + "reward_std": 0.06652949750423431, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4633602797985077, + "step": 1553 + }, + { + "completion_length": 228.734375, + "epoch": 0.49521988527724664, + "grad_norm": 19.235485076904297, + "kl": 0.10498046875, + "learning_rate": 5.047801147227533e-07, + "loss": 0.0042, + "reward": 1.5286242961883545, + "reward_std": 0.11664712429046631, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.2942492365837097, + "step": 1554 + }, + { + "completion_length": 280.9375, + "epoch": 0.49553855959209686, + "grad_norm": 10.115842819213867, + "kl": 0.09423828125, + "learning_rate": 5.044614404079031e-07, + "loss": 0.0038, + "reward": 1.548297643661499, + "reward_std": 0.11133519560098648, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5639227032661438, + "rewards/pad": 0.0, + "step": 1555 + }, + { + "completion_length": 226.234375, + "epoch": 0.4958572339069471, + "grad_norm": 22.632034301757812, + "kl": 0.10986328125, + "learning_rate": 5.041427660930528e-07, + "loss": 0.0044, + "reward": 1.5520951747894287, + "reward_std": 0.11214704811573029, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5520952343940735, + "rewards/pad": 0.0, + "step": 1556 + }, + { + "completion_length": 387.8125, + "epoch": 0.4961759082217973, + "grad_norm": 4.392566204071045, + "kl": 0.07080078125, + "learning_rate": 5.038240917782026e-07, + "loss": 0.0028, + "reward": 1.5486336946487427, + "reward_std": 0.04238799214363098, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5486336946487427, + "rewards/pad": 0.0, + "step": 1557 + }, + { + "completion_length": 266.71875, + "epoch": 0.4964945825366475, + "grad_norm": 10.745803833007812, + "kl": 0.0947265625, + "learning_rate": 5.035054174633524e-07, + "loss": 0.0038, + "reward": 1.51939058303833, + "reward_std": 0.10485051572322845, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5350156426429749, + "rewards/pad": 0.0, + "step": 1558 + }, + { + "completion_length": 287.21875, + "epoch": 0.49681325685149774, + "grad_norm": 9.209283828735352, + "kl": 0.08935546875, + "learning_rate": 5.031867431485022e-07, + "loss": 0.0036, + "reward": 1.5379369258880615, + "reward_std": 0.28919923305511475, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.953125, + "rewards/iou_glue_reward": 0.4598119258880615, + "step": 1559 + }, + { + "completion_length": 279.828125, + "epoch": 0.497131931166348, + "grad_norm": 9.263772964477539, + "kl": 0.0927734375, + "learning_rate": 5.028680688336519e-07, + "loss": 0.0037, + "reward": 1.4220798015594482, + "reward_std": 0.10979984700679779, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4377047121524811, + "rewards/pad": 0.0, + "step": 1560 + }, + { + "completion_length": 182.3125, + "epoch": 0.49745060548119824, + "grad_norm": 14.146515846252441, + "kl": 0.11572265625, + "learning_rate": 5.025493945188017e-07, + "loss": 0.0046, + "reward": 1.6618396043777466, + "reward_std": 0.09788843989372253, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5368396043777466, + "rewards/pad": 0.125, + "step": 1561 + }, + { + "completion_length": 191.421875, + "epoch": 0.49776927979604846, + "grad_norm": 53.63630676269531, + "kl": 0.09521484375, + "learning_rate": 5.022307202039515e-07, + "loss": 0.0038, + "reward": 1.9397457838058472, + "reward_std": 0.10246553272008896, + "rewards/answer_reward": 0.5, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.43974587321281433, + "step": 1562 + }, + { + "completion_length": 250.953125, + "epoch": 0.4980879541108987, + "grad_norm": 6.227761745452881, + "kl": 0.09521484375, + "learning_rate": 5.019120458891013e-07, + "loss": 0.0038, + "reward": 1.4951188564300537, + "reward_std": 0.14482280611991882, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5263688564300537, + "step": 1563 + }, + { + "completion_length": 416.875, + "epoch": 0.4984066284257489, + "grad_norm": 5.780639171600342, + "kl": 0.059814453125, + "learning_rate": 5.01593371574251e-07, + "loss": 0.0024, + "reward": 1.532064437866211, + "reward_std": 0.03820374235510826, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5320644378662109, + "step": 1564 + }, + { + "completion_length": 221.484375, + "epoch": 0.4987253027405991, + "grad_norm": 15.810646057128906, + "kl": 0.1015625, + "learning_rate": 5.012746972594009e-07, + "loss": 0.0041, + "reward": 1.7277759313583374, + "reward_std": 0.0686957985162735, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6027759313583374, + "rewards/pad": 0.125, + "step": 1565 + }, + { + "completion_length": 297.9375, + "epoch": 0.49904397705544934, + "grad_norm": 8.922152519226074, + "kl": 0.08642578125, + "learning_rate": 5.009560229445507e-07, + "loss": 0.0035, + "reward": 1.2540583610534668, + "reward_std": 0.31059730052948, + "rewards/format_reward_tg": 0.921875, + "rewards/iou_timestamp_reward": 0.316558301448822, + "rewards/pad": 0.015625, + "step": 1566 + }, + { + "completion_length": 220.203125, + "epoch": 0.49936265137029956, + "grad_norm": 17.810441970825195, + "kl": 0.09228515625, + "learning_rate": 5.006373486297005e-07, + "loss": 0.0037, + "reward": 1.598185420036316, + "reward_std": 0.1599069982767105, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3638104498386383, + "step": 1567 + }, + { + "completion_length": 271.53125, + "epoch": 0.4996813256851498, + "grad_norm": 5.525754928588867, + "kl": 0.080078125, + "learning_rate": 5.003186743148502e-07, + "loss": 0.0032, + "reward": 1.5842467546463013, + "reward_std": 0.12647908926010132, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5998716950416565, + "step": 1568 + }, + { + "completion_length": 277.890625, + "epoch": 0.5, + "grad_norm": 31.218156814575195, + "kl": 0.0859375, + "learning_rate": 5e-07, + "loss": 0.0034, + "reward": 1.6523618698120117, + "reward_std": 0.19241999089717865, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5586118102073669, + "step": 1569 + }, + { + "completion_length": 268.0625, + "epoch": 0.5003186743148502, + "grad_norm": 5.818955898284912, + "kl": 0.08984375, + "learning_rate": 4.996813256851498e-07, + "loss": 0.0036, + "reward": 1.4741897583007812, + "reward_std": 0.21641533076763153, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.5210647583007812, + "rewards/pad": 0.0, + "step": 1570 + }, + { + "completion_length": 230.578125, + "epoch": 0.5006373486297004, + "grad_norm": 8.602928161621094, + "kl": 0.09765625, + "learning_rate": 4.993626513702995e-07, + "loss": 0.0039, + "reward": 1.467624545097351, + "reward_std": 0.217521071434021, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.32699960470199585, + "rewards/pad": 0.15625, + "step": 1571 + }, + { + "completion_length": 345.484375, + "epoch": 0.5009560229445507, + "grad_norm": 5.404134750366211, + "kl": 0.07763671875, + "learning_rate": 4.990439770554493e-07, + "loss": 0.0031, + "reward": 1.6908576488494873, + "reward_std": 0.07052527368068695, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5658575892448425, + "rewards/pad": 0.125, + "step": 1572 + }, + { + "completion_length": 289.703125, + "epoch": 0.5012746972594009, + "grad_norm": 11.262680053710938, + "kl": 0.0986328125, + "learning_rate": 4.987253027405991e-07, + "loss": 0.004, + "reward": 1.4238890409469604, + "reward_std": 0.13102203607559204, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.39263904094696045, + "step": 1573 + }, + { + "completion_length": 204.421875, + "epoch": 0.5015933715742511, + "grad_norm": 16.025836944580078, + "kl": 0.11279296875, + "learning_rate": 4.984066284257489e-07, + "loss": 0.0045, + "reward": 1.6819837093353271, + "reward_std": 0.11338447034358978, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5726087093353271, + "rewards/pad": 0.109375, + "step": 1574 + }, + { + "completion_length": 283.59375, + "epoch": 0.5019120458891013, + "grad_norm": 7.817051887512207, + "kl": 0.0986328125, + "learning_rate": 4.980879541108986e-07, + "loss": 0.0039, + "reward": 1.5237798690795898, + "reward_std": 0.0966147929430008, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4144047498703003, + "step": 1575 + }, + { + "completion_length": 272.65625, + "epoch": 0.5022307202039515, + "grad_norm": 10.048602104187012, + "kl": 0.083984375, + "learning_rate": 4.977692797960484e-07, + "loss": 0.0034, + "reward": 1.7033123970031738, + "reward_std": 0.09860232472419739, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45331239700317383, + "step": 1576 + }, + { + "completion_length": 201.5625, + "epoch": 0.5025493945188018, + "grad_norm": 15.853896141052246, + "kl": 0.10400390625, + "learning_rate": 4.974506054811982e-07, + "loss": 0.0042, + "reward": 1.5546913146972656, + "reward_std": 0.1912136971950531, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5703162550926208, + "step": 1577 + }, + { + "completion_length": 317.140625, + "epoch": 0.502868068833652, + "grad_norm": 6.66535758972168, + "kl": 0.07763671875, + "learning_rate": 4.97131931166348e-07, + "loss": 0.0031, + "reward": 1.323586106300354, + "reward_std": 0.2054770141839981, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.37046104669570923, + "step": 1578 + }, + { + "completion_length": 285.765625, + "epoch": 0.5031867431485022, + "grad_norm": 11.269193649291992, + "kl": 0.09912109375, + "learning_rate": 4.968132568514977e-07, + "loss": 0.004, + "reward": 1.6706132888793945, + "reward_std": 0.14314395189285278, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5612383484840393, + "step": 1579 + }, + { + "completion_length": 269.6875, + "epoch": 0.5035054174633524, + "grad_norm": 11.386666297912598, + "kl": 0.09228515625, + "learning_rate": 4.964945825366475e-07, + "loss": 0.0037, + "reward": 1.4375762939453125, + "reward_std": 0.15949112176895142, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3438262939453125, + "step": 1580 + }, + { + "completion_length": 359.984375, + "epoch": 0.5038240917782026, + "grad_norm": 13.61768913269043, + "kl": 0.0654296875, + "learning_rate": 4.961759082217972e-07, + "loss": 0.0026, + "reward": 1.4767695665359497, + "reward_std": 0.23325678706169128, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.5236445069313049, + "step": 1581 + }, + { + "completion_length": 218.921875, + "epoch": 0.5041427660930529, + "grad_norm": 8.900588035583496, + "kl": 0.09814453125, + "learning_rate": 4.95857233906947e-07, + "loss": 0.0039, + "reward": 1.6065030097961426, + "reward_std": 0.06527124345302582, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.606502890586853, + "step": 1582 + }, + { + "completion_length": 216.53125, + "epoch": 0.5044614404079031, + "grad_norm": 8.777047157287598, + "kl": 0.1337890625, + "learning_rate": 4.955385595920969e-07, + "loss": 0.0053, + "reward": 1.6224644184112549, + "reward_std": 0.06472902745008469, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49746444821357727, + "step": 1583 + }, + { + "completion_length": 398.28125, + "epoch": 0.5047801147227533, + "grad_norm": 5.128695011138916, + "kl": 0.05078125, + "learning_rate": 4.952198852772467e-07, + "loss": 0.002, + "reward": 1.6772174835205078, + "reward_std": 0.13002997636795044, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4584675431251526, + "step": 1584 + }, + { + "completion_length": 224.328125, + "epoch": 0.5050987890376035, + "grad_norm": 8.237434387207031, + "kl": 0.083984375, + "learning_rate": 4.949012109623964e-07, + "loss": 0.0034, + "reward": 1.6073098182678223, + "reward_std": 0.10495173186063766, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.13855993747711182, + "rewards/pad": 0.46875, + "step": 1585 + }, + { + "completion_length": 253.96875, + "epoch": 0.5054174633524537, + "grad_norm": 7.674633026123047, + "kl": 0.1083984375, + "learning_rate": 4.945825366475462e-07, + "loss": 0.0043, + "reward": 1.4445209503173828, + "reward_std": 0.16878455877304077, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4601460099220276, + "rewards/pad": 0.0, + "step": 1586 + }, + { + "completion_length": 205.28125, + "epoch": 0.505736137667304, + "grad_norm": 5.523696422576904, + "kl": 0.09326171875, + "learning_rate": 4.94263862332696e-07, + "loss": 0.0037, + "reward": 1.7708675861358643, + "reward_std": 0.0980464294552803, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.645867645740509, + "step": 1587 + }, + { + "completion_length": 387.375, + "epoch": 0.5060548119821542, + "grad_norm": 8.26123332977295, + "kl": 0.0625, + "learning_rate": 4.939451880178458e-07, + "loss": 0.0025, + "reward": 1.524539828300476, + "reward_std": 0.12007015943527222, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4932897686958313, + "step": 1588 + }, + { + "completion_length": 383.890625, + "epoch": 0.5063734862970045, + "grad_norm": 5.443143367767334, + "kl": 0.06005859375, + "learning_rate": 4.936265137029955e-07, + "loss": 0.0024, + "reward": 1.693671464920044, + "reward_std": 0.04399287700653076, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44367146492004395, + "step": 1589 + }, + { + "completion_length": 339.703125, + "epoch": 0.5066921606118547, + "grad_norm": 3.934993028640747, + "kl": 0.08251953125, + "learning_rate": 4.933078393881453e-07, + "loss": 0.0033, + "reward": 1.6259467601776123, + "reward_std": 0.13442574441432953, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5165717005729675, + "step": 1590 + }, + { + "completion_length": 164.15625, + "epoch": 0.507010834926705, + "grad_norm": 15.19997787475586, + "kl": 0.1259765625, + "learning_rate": 4.929891650732951e-07, + "loss": 0.0051, + "reward": 1.5974699258804321, + "reward_std": 0.1301935464143753, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5037198662757874, + "step": 1591 + }, + { + "completion_length": 229.828125, + "epoch": 0.5073295092415552, + "grad_norm": 6.299907684326172, + "kl": 0.1103515625, + "learning_rate": 4.926704907584449e-07, + "loss": 0.0044, + "reward": 1.4970672130584717, + "reward_std": 0.12784096598625183, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3876921534538269, + "step": 1592 + }, + { + "completion_length": 341.140625, + "epoch": 0.5076481835564054, + "grad_norm": 5.9435343742370605, + "kl": 0.0947265625, + "learning_rate": 4.923518164435946e-07, + "loss": 0.0038, + "reward": 1.5198999643325806, + "reward_std": 0.10769708454608917, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4105249047279358, + "step": 1593 + }, + { + "completion_length": 308.359375, + "epoch": 0.5079668578712556, + "grad_norm": 18.813356399536133, + "kl": 0.060546875, + "learning_rate": 4.920331421287444e-07, + "loss": 0.0024, + "reward": 1.5813418626785278, + "reward_std": 0.1093776598572731, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47196686267852783, + "rewards/pad": 0.125, + "step": 1594 + }, + { + "completion_length": 273.34375, + "epoch": 0.5082855321861058, + "grad_norm": 10.208179473876953, + "kl": 0.08251953125, + "learning_rate": 4.917144678138942e-07, + "loss": 0.0033, + "reward": 1.6122044324874878, + "reward_std": 0.23450268805027008, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.39345434308052063, + "rewards/pad": 0.25, + "step": 1595 + }, + { + "completion_length": 263.703125, + "epoch": 0.5086042065009561, + "grad_norm": 7.797765731811523, + "kl": 0.076171875, + "learning_rate": 4.91395793499044e-07, + "loss": 0.003, + "reward": 1.4443575143814087, + "reward_std": 0.11956124752759933, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4599825143814087, + "rewards/pad": 0.0, + "step": 1596 + }, + { + "completion_length": 394.984375, + "epoch": 0.5089228808158063, + "grad_norm": 11.725752830505371, + "kl": 0.0654296875, + "learning_rate": 4.910771191841937e-07, + "loss": 0.0026, + "reward": 1.3768037557601929, + "reward_std": 0.07617262005805969, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3768037259578705, + "step": 1597 + }, + { + "completion_length": 338.140625, + "epoch": 0.5092415551306565, + "grad_norm": 4.875799655914307, + "kl": 0.099609375, + "learning_rate": 4.907584448693435e-07, + "loss": 0.004, + "reward": 1.4290649890899658, + "reward_std": 0.042309802025556564, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4290649890899658, + "rewards/pad": 0.0, + "step": 1598 + }, + { + "completion_length": 289.796875, + "epoch": 0.5095602294455067, + "grad_norm": 13.580358505249023, + "kl": 0.09033203125, + "learning_rate": 4.904397705544932e-07, + "loss": 0.0036, + "reward": 1.3642786741256714, + "reward_std": 0.09266138076782227, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3799036741256714, + "step": 1599 + }, + { + "completion_length": 292.46875, + "epoch": 0.5098789037603569, + "grad_norm": 21.963088989257812, + "kl": 0.07958984375, + "learning_rate": 4.90121096239643e-07, + "loss": 0.0032, + "reward": 1.4895875453948975, + "reward_std": 0.05440334975719452, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36458754539489746, + "step": 1600 + }, + { + "completion_length": 233.984375, + "epoch": 0.5101975780752072, + "grad_norm": 15.617948532104492, + "kl": 0.0966796875, + "learning_rate": 4.898024219247928e-07, + "loss": 0.0039, + "reward": 1.5592432022094727, + "reward_std": 0.1363193839788437, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4498681426048279, + "rewards/pad": 0.125, + "step": 1601 + }, + { + "completion_length": 168.90625, + "epoch": 0.5105162523900574, + "grad_norm": 10.946663856506348, + "kl": 0.10498046875, + "learning_rate": 4.894837476099425e-07, + "loss": 0.0042, + "reward": 1.5720787048339844, + "reward_std": 0.08019654452800751, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5720787644386292, + "step": 1602 + }, + { + "completion_length": 323.90625, + "epoch": 0.5108349267049076, + "grad_norm": 11.409135818481445, + "kl": 0.07080078125, + "learning_rate": 4.891650732950924e-07, + "loss": 0.0028, + "reward": 1.6570066213607788, + "reward_std": 0.11523165553808212, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5476316809654236, + "rewards/pad": 0.125, + "step": 1603 + }, + { + "completion_length": 323.5, + "epoch": 0.5111536010197578, + "grad_norm": 11.713022232055664, + "kl": 0.07958984375, + "learning_rate": 4.888463989802422e-07, + "loss": 0.0032, + "reward": 1.5533918142318726, + "reward_std": 0.13724485039710999, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5690168142318726, + "step": 1604 + }, + { + "completion_length": 246.78125, + "epoch": 0.511472275334608, + "grad_norm": 11.342105865478516, + "kl": 0.08740234375, + "learning_rate": 4.88527724665392e-07, + "loss": 0.0035, + "reward": 1.4041597843170166, + "reward_std": 0.08910918235778809, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40415966510772705, + "rewards/pad": 0.0, + "step": 1605 + }, + { + "completion_length": 287.859375, + "epoch": 0.5117909496494583, + "grad_norm": 11.058752059936523, + "kl": 0.07568359375, + "learning_rate": 4.882090503505417e-07, + "loss": 0.003, + "reward": 1.6646125316619873, + "reward_std": 0.10868878662586212, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5552375912666321, + "step": 1606 + }, + { + "completion_length": 334.265625, + "epoch": 0.5121096239643085, + "grad_norm": 9.2514066696167, + "kl": 0.058837890625, + "learning_rate": 4.878903760356915e-07, + "loss": 0.0024, + "reward": 1.4564669132232666, + "reward_std": 0.14030423760414124, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3314668834209442, + "step": 1607 + }, + { + "completion_length": 367.28125, + "epoch": 0.5124282982791587, + "grad_norm": 8.841236114501953, + "kl": 0.06494140625, + "learning_rate": 4.875717017208413e-07, + "loss": 0.0026, + "reward": 1.513685703277588, + "reward_std": 0.0567639134824276, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5136858224868774, + "step": 1608 + }, + { + "completion_length": 218.296875, + "epoch": 0.5127469725940089, + "grad_norm": 21.75227928161621, + "kl": 0.09423828125, + "learning_rate": 4.872530274059911e-07, + "loss": 0.0038, + "reward": 1.4949934482574463, + "reward_std": 0.2251434624195099, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4168684482574463, + "step": 1609 + }, + { + "completion_length": 267.765625, + "epoch": 0.5130656469088591, + "grad_norm": 13.175375938415527, + "kl": 0.0849609375, + "learning_rate": 4.869343530911408e-07, + "loss": 0.0034, + "reward": 1.5545045137405396, + "reward_std": 0.055992916226387024, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5545045137405396, + "rewards/pad": 0.0, + "step": 1610 + }, + { + "completion_length": 184.625, + "epoch": 0.5133843212237094, + "grad_norm": 6.863095760345459, + "kl": 0.0986328125, + "learning_rate": 4.866156787762906e-07, + "loss": 0.0039, + "reward": 1.5448052883148193, + "reward_std": 0.0887579694390297, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5448052287101746, + "rewards/pad": 0.0, + "step": 1611 + }, + { + "completion_length": 171.703125, + "epoch": 0.5137029955385596, + "grad_norm": 5.797022342681885, + "kl": 0.1640625, + "learning_rate": 4.862970044614404e-07, + "loss": 0.0066, + "reward": 1.6547784805297852, + "reward_std": 0.12803807854652405, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.420403391122818, + "step": 1612 + }, + { + "completion_length": 229.53125, + "epoch": 0.5140216698534098, + "grad_norm": 4.80680513381958, + "kl": 0.08251953125, + "learning_rate": 4.859783301465902e-07, + "loss": 0.0033, + "reward": 1.9011766910552979, + "reward_std": 0.10118616372346878, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5261766910552979, + "rewards/pad": 0.375, + "step": 1613 + }, + { + "completion_length": 385.375, + "epoch": 0.51434034416826, + "grad_norm": 12.173733711242676, + "kl": 0.06103515625, + "learning_rate": 4.856596558317399e-07, + "loss": 0.0024, + "reward": 1.4999946355819702, + "reward_std": 0.042640700936317444, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4999946355819702, + "rewards/pad": 0.0, + "step": 1614 + }, + { + "completion_length": 324.078125, + "epoch": 0.5146590184831102, + "grad_norm": 4.943187236785889, + "kl": 0.10791015625, + "learning_rate": 4.853409815168897e-07, + "loss": 0.0043, + "reward": 1.5149937868118286, + "reward_std": 0.10412536561489105, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4056188762187958, + "rewards/pad": 0.125, + "step": 1615 + }, + { + "completion_length": 427.796875, + "epoch": 0.5149776927979605, + "grad_norm": 4.6593451499938965, + "kl": 0.05322265625, + "learning_rate": 4.850223072020395e-07, + "loss": 0.0021, + "reward": 1.4270119667053223, + "reward_std": 0.09866784512996674, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4426369071006775, + "step": 1616 + }, + { + "completion_length": 149.375, + "epoch": 0.5152963671128107, + "grad_norm": 18.755388259887695, + "kl": 0.10791015625, + "learning_rate": 4.847036328871893e-07, + "loss": 0.0043, + "reward": 1.5256495475769043, + "reward_std": 0.07746165245771408, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5256494879722595, + "rewards/pad": 0.0, + "step": 1617 + }, + { + "completion_length": 444.65625, + "epoch": 0.5156150414276609, + "grad_norm": 35.75686264038086, + "kl": 0.052978515625, + "learning_rate": 4.84384958572339e-07, + "loss": 0.0021, + "reward": 1.4138743877410889, + "reward_std": 0.1107860878109932, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.3201243281364441, + "step": 1618 + }, + { + "completion_length": 384.34375, + "epoch": 0.5159337157425111, + "grad_norm": 9.533431053161621, + "kl": 0.068359375, + "learning_rate": 4.840662842574888e-07, + "loss": 0.0027, + "reward": 1.4957008361816406, + "reward_std": 0.12621979415416718, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5269508957862854, + "rewards/pad": 0.0, + "step": 1619 + }, + { + "completion_length": 382.53125, + "epoch": 0.5162523900573613, + "grad_norm": 16.5041446685791, + "kl": 0.05908203125, + "learning_rate": 4.837476099426385e-07, + "loss": 0.0024, + "reward": 1.5703582763671875, + "reward_std": 0.11693361401557922, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.585983157157898, + "step": 1620 + }, + { + "completion_length": 335.921875, + "epoch": 0.5165710643722116, + "grad_norm": 16.340742111206055, + "kl": 0.10302734375, + "learning_rate": 4.834289356277884e-07, + "loss": 0.0041, + "reward": 1.368363380432129, + "reward_std": 0.059988975524902344, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3683633506298065, + "step": 1621 + }, + { + "completion_length": 340.953125, + "epoch": 0.5168897386870618, + "grad_norm": 10.794461250305176, + "kl": 0.1845703125, + "learning_rate": 4.831102613129382e-07, + "loss": 0.0074, + "reward": 1.4841980934143066, + "reward_std": 0.10662591457366943, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4841980040073395, + "step": 1622 + }, + { + "completion_length": 302.21875, + "epoch": 0.517208413001912, + "grad_norm": 6.696167469024658, + "kl": 0.0791015625, + "learning_rate": 4.82791586998088e-07, + "loss": 0.0032, + "reward": 1.4154529571533203, + "reward_std": 0.16440361738204956, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.32170307636260986, + "rewards/pad": 0.125, + "step": 1623 + }, + { + "completion_length": 123.1875, + "epoch": 0.5175270873167622, + "grad_norm": 34.49293518066406, + "kl": 0.11181640625, + "learning_rate": 4.824729126832377e-07, + "loss": 0.0045, + "reward": 1.6380038261413574, + "reward_std": 0.12282894551753998, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5130038857460022, + "step": 1624 + }, + { + "completion_length": 113.046875, + "epoch": 0.5178457616316124, + "grad_norm": 68.6580810546875, + "kl": 0.1337890625, + "learning_rate": 4.821542383683875e-07, + "loss": 0.0054, + "reward": 1.6568090915679932, + "reward_std": 0.20202675461769104, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5630591511726379, + "rewards/pad": 0.09375, + "step": 1625 + }, + { + "completion_length": 355.78125, + "epoch": 0.5181644359464627, + "grad_norm": 7.60537576675415, + "kl": 0.07373046875, + "learning_rate": 4.818355640535373e-07, + "loss": 0.0029, + "reward": 1.4764320850372314, + "reward_std": 0.12306798994541168, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36705708503723145, + "step": 1626 + }, + { + "completion_length": 363.03125, + "epoch": 0.5184831102613129, + "grad_norm": 15.738922119140625, + "kl": 0.0810546875, + "learning_rate": 4.815168897386871e-07, + "loss": 0.0032, + "reward": 1.503629446029663, + "reward_std": 0.20149904489517212, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5192545056343079, + "step": 1627 + }, + { + "completion_length": 266.53125, + "epoch": 0.5188017845761632, + "grad_norm": 5.414165496826172, + "kl": 0.1162109375, + "learning_rate": 4.811982154238368e-07, + "loss": 0.0046, + "reward": 1.5420563220977783, + "reward_std": 0.08500739932060242, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4170563519001007, + "rewards/pad": 0.125, + "step": 1628 + }, + { + "completion_length": 229.15625, + "epoch": 0.5191204588910134, + "grad_norm": 8.028788566589355, + "kl": 0.09716796875, + "learning_rate": 4.808795411089866e-07, + "loss": 0.0039, + "reward": 1.7718290090560913, + "reward_std": 0.07292530685663223, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6468290686607361, + "rewards/pad": 0.125, + "step": 1629 + }, + { + "completion_length": 397.921875, + "epoch": 0.5194391332058637, + "grad_norm": 30.3446044921875, + "kl": 0.06640625, + "learning_rate": 4.805608667941364e-07, + "loss": 0.0027, + "reward": 1.5163317918777466, + "reward_std": 0.08876568078994751, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5319567918777466, + "rewards/pad": 0.0, + "step": 1630 + }, + { + "completion_length": 286.65625, + "epoch": 0.5197578075207139, + "grad_norm": 54.19609451293945, + "kl": 0.07275390625, + "learning_rate": 4.802421924792862e-07, + "loss": 0.0029, + "reward": 1.5980496406555176, + "reward_std": 0.17015178501605988, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.45742470026016235, + "step": 1631 + }, + { + "completion_length": 232.015625, + "epoch": 0.5200764818355641, + "grad_norm": 8.370911598205566, + "kl": 0.10302734375, + "learning_rate": 4.799235181644359e-07, + "loss": 0.0041, + "reward": 1.528737187385559, + "reward_std": 0.1446613371372223, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4193621873855591, + "rewards/pad": 0.125, + "step": 1632 + }, + { + "completion_length": 366.9375, + "epoch": 0.5203951561504143, + "grad_norm": 12.012307167053223, + "kl": 0.07958984375, + "learning_rate": 4.796048438495857e-07, + "loss": 0.0032, + "reward": 1.5256717205047607, + "reward_std": 0.07025659084320068, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5256717205047607, + "step": 1633 + }, + { + "completion_length": 235.0625, + "epoch": 0.5207138304652645, + "grad_norm": 40.669578552246094, + "kl": 0.099609375, + "learning_rate": 4.792861695347355e-07, + "loss": 0.004, + "reward": 1.5062785148620605, + "reward_std": 0.14399857819080353, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.39690354466438293, + "step": 1634 + }, + { + "completion_length": 225.34375, + "epoch": 0.5210325047801148, + "grad_norm": 14.326337814331055, + "kl": 0.091796875, + "learning_rate": 4.789674952198852e-07, + "loss": 0.0037, + "reward": 1.7521679401397705, + "reward_std": 0.16071289777755737, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5177929997444153, + "rewards/pad": 0.234375, + "step": 1635 + }, + { + "completion_length": 397.890625, + "epoch": 0.521351179094965, + "grad_norm": 12.720609664916992, + "kl": 0.061279296875, + "learning_rate": 4.78648820905035e-07, + "loss": 0.0025, + "reward": 1.395108699798584, + "reward_std": 0.06852993369102478, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39510878920555115, + "step": 1636 + }, + { + "completion_length": 520.375, + "epoch": 0.5216698534098152, + "grad_norm": 4.460532188415527, + "kl": 0.04052734375, + "learning_rate": 4.783301465901848e-07, + "loss": 0.0016, + "reward": 1.4853415489196777, + "reward_std": 0.03336441516876221, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4853416085243225, + "step": 1637 + }, + { + "completion_length": 204.171875, + "epoch": 0.5219885277246654, + "grad_norm": 25.1767578125, + "kl": 0.09619140625, + "learning_rate": 4.780114722753345e-07, + "loss": 0.0039, + "reward": 1.520871877670288, + "reward_std": 0.05909598991274834, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3958718776702881, + "step": 1638 + }, + { + "completion_length": 330.984375, + "epoch": 0.5223072020395156, + "grad_norm": 4.884491920471191, + "kl": 0.09423828125, + "learning_rate": 4.776927979604843e-07, + "loss": 0.0038, + "reward": 1.6307636499404907, + "reward_std": 0.08659254014492035, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6307636499404907, + "rewards/pad": 0.0, + "step": 1639 + }, + { + "completion_length": 280.90625, + "epoch": 0.5226258763543659, + "grad_norm": 11.95295238494873, + "kl": 0.07861328125, + "learning_rate": 4.773741236456342e-07, + "loss": 0.0032, + "reward": 1.5487451553344727, + "reward_std": 0.11208067834377289, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3143702447414398, + "rewards/pad": 0.25, + "step": 1640 + }, + { + "completion_length": 393.484375, + "epoch": 0.5229445506692161, + "grad_norm": 9.313077926635742, + "kl": 0.06298828125, + "learning_rate": 4.770554493307839e-07, + "loss": 0.0025, + "reward": 1.4793241024017334, + "reward_std": 0.10382667183876038, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.369949072599411, + "rewards/pad": 0.125, + "step": 1641 + }, + { + "completion_length": 271.015625, + "epoch": 0.5232632249840663, + "grad_norm": 11.412222862243652, + "kl": 0.080078125, + "learning_rate": 4.7673677501593366e-07, + "loss": 0.0032, + "reward": 1.6363248825073242, + "reward_std": 0.13290798664093018, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5269498825073242, + "rewards/pad": 0.125, + "step": 1642 + }, + { + "completion_length": 284.734375, + "epoch": 0.5235818992989165, + "grad_norm": 18.375911712646484, + "kl": 0.09375, + "learning_rate": 4.7641810070108347e-07, + "loss": 0.0038, + "reward": 1.5433933734893799, + "reward_std": 0.05979537591338158, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5433933734893799, + "rewards/pad": 0.0, + "step": 1643 + }, + { + "completion_length": 453.34375, + "epoch": 0.5239005736137667, + "grad_norm": 14.870288848876953, + "kl": 0.048828125, + "learning_rate": 4.760994263862332e-07, + "loss": 0.002, + "reward": 1.3990988731384277, + "reward_std": 0.1635962575674057, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.3209739923477173, + "step": 1644 + }, + { + "completion_length": 217.828125, + "epoch": 0.524219247928617, + "grad_norm": 10.764137268066406, + "kl": 0.09912109375, + "learning_rate": 4.7578075207138303e-07, + "loss": 0.004, + "reward": 1.5573375225067139, + "reward_std": 0.05173371732234955, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5573375225067139, + "rewards/pad": 0.0, + "step": 1645 + }, + { + "completion_length": 237.171875, + "epoch": 0.5245379222434672, + "grad_norm": 124.7045669555664, + "kl": 0.0947265625, + "learning_rate": 4.754620777565328e-07, + "loss": 0.0038, + "reward": 1.6781928539276123, + "reward_std": 0.06751322746276855, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5531928539276123, + "step": 1646 + }, + { + "completion_length": 294.71875, + "epoch": 0.5248565965583174, + "grad_norm": 12.334156036376953, + "kl": 0.11083984375, + "learning_rate": 4.751434034416826e-07, + "loss": 0.0044, + "reward": 1.528046727180481, + "reward_std": 0.09726923704147339, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5436716079711914, + "step": 1647 + }, + { + "completion_length": 314.109375, + "epoch": 0.5251752708731676, + "grad_norm": 29.347471237182617, + "kl": 0.07373046875, + "learning_rate": 4.7482472912683235e-07, + "loss": 0.003, + "reward": 1.6609556674957275, + "reward_std": 0.04223080724477768, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5359556674957275, + "rewards/pad": 0.125, + "step": 1648 + }, + { + "completion_length": 163.21875, + "epoch": 0.5254939451880178, + "grad_norm": 6.873000621795654, + "kl": 0.11376953125, + "learning_rate": 4.7450605481198215e-07, + "loss": 0.0045, + "reward": 1.8783053159713745, + "reward_std": 0.11644686013460159, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6439304351806641, + "step": 1649 + }, + { + "completion_length": 287.171875, + "epoch": 0.5258126195028681, + "grad_norm": 11.115781784057617, + "kl": 0.09130859375, + "learning_rate": 4.741873804971319e-07, + "loss": 0.0037, + "reward": 1.7113351821899414, + "reward_std": 0.11662662774324417, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6019601225852966, + "step": 1650 + }, + { + "completion_length": 320.15625, + "epoch": 0.5261312938177183, + "grad_norm": 8.5549898147583, + "kl": 0.08203125, + "learning_rate": 4.738687061822817e-07, + "loss": 0.0033, + "reward": 1.419663667678833, + "reward_std": 0.14064767956733704, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4509136974811554, + "step": 1651 + }, + { + "completion_length": 346.859375, + "epoch": 0.5264499681325685, + "grad_norm": 26.915172576904297, + "kl": 0.07177734375, + "learning_rate": 4.7355003186743147e-07, + "loss": 0.0029, + "reward": 1.6085193157196045, + "reward_std": 0.17502236366271973, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5147693157196045, + "step": 1652 + }, + { + "completion_length": 202.125, + "epoch": 0.5267686424474187, + "grad_norm": 27.095932006835938, + "kl": 0.1201171875, + "learning_rate": 4.732313575525813e-07, + "loss": 0.0048, + "reward": 1.517913579940796, + "reward_std": 0.13210374116897583, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40853869915008545, + "rewards/pad": 0.125, + "step": 1653 + }, + { + "completion_length": 302.140625, + "epoch": 0.5270873167622689, + "grad_norm": 11.207296371459961, + "kl": 0.08203125, + "learning_rate": 4.7291268323773103e-07, + "loss": 0.0033, + "reward": 1.5230052471160889, + "reward_std": 0.14820341765880585, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.2886302173137665, + "step": 1654 + }, + { + "completion_length": 158.53125, + "epoch": 0.5274059910771192, + "grad_norm": 9.38110065460205, + "kl": 0.16796875, + "learning_rate": 4.7259400892288084e-07, + "loss": 0.0067, + "reward": 1.4777312278747559, + "reward_std": 0.10828878730535507, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4777311682701111, + "rewards/pad": 0.0, + "step": 1655 + }, + { + "completion_length": 231.46875, + "epoch": 0.5277246653919694, + "grad_norm": 94.59187316894531, + "kl": 0.10986328125, + "learning_rate": 4.722753346080306e-07, + "loss": 0.0044, + "reward": 1.632595419883728, + "reward_std": 0.11153492331504822, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.585720419883728, + "rewards/pad": 0.046875, + "step": 1656 + }, + { + "completion_length": 251.0625, + "epoch": 0.5280433397068196, + "grad_norm": 11.970016479492188, + "kl": 0.0849609375, + "learning_rate": 4.719566602931804e-07, + "loss": 0.0034, + "reward": 1.5200518369674683, + "reward_std": 0.06651220470666885, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.520051896572113, + "step": 1657 + }, + { + "completion_length": 290.203125, + "epoch": 0.5283620140216698, + "grad_norm": 5.539214134216309, + "kl": 0.07861328125, + "learning_rate": 4.716379859783301e-07, + "loss": 0.0032, + "reward": 1.2751659154891968, + "reward_std": 0.10172433406114578, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.29079097509384155, + "step": 1658 + }, + { + "completion_length": 320.609375, + "epoch": 0.52868068833652, + "grad_norm": 14.202303886413574, + "kl": 0.087890625, + "learning_rate": 4.713193116634799e-07, + "loss": 0.0035, + "reward": 1.5370453596115112, + "reward_std": 0.089061439037323, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4276703894138336, + "rewards/pad": 0.109375, + "step": 1659 + }, + { + "completion_length": 290.359375, + "epoch": 0.5289993626513703, + "grad_norm": 7.756679534912109, + "kl": 0.09130859375, + "learning_rate": 4.7100063734862966e-07, + "loss": 0.0037, + "reward": 1.3637516498565674, + "reward_std": 0.10193685442209244, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36375173926353455, + "step": 1660 + }, + { + "completion_length": 230.8125, + "epoch": 0.5293180369662205, + "grad_norm": 12.022960662841797, + "kl": 0.09912109375, + "learning_rate": 4.706819630337794e-07, + "loss": 0.004, + "reward": 1.688589096069336, + "reward_std": 0.09793002158403397, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43858909606933594, + "rewards/pad": 0.25, + "step": 1661 + }, + { + "completion_length": 269.140625, + "epoch": 0.5296367112810707, + "grad_norm": 9.252043724060059, + "kl": 0.09033203125, + "learning_rate": 4.703632887189292e-07, + "loss": 0.0036, + "reward": 1.3499665260314941, + "reward_std": 0.031937532126903534, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34996655583381653, + "rewards/pad": 0.0, + "step": 1662 + }, + { + "completion_length": 317.265625, + "epoch": 0.5299553855959209, + "grad_norm": 7.217067241668701, + "kl": 0.10595703125, + "learning_rate": 4.70044614404079e-07, + "loss": 0.0042, + "reward": 1.4520938396453857, + "reward_std": 0.13367217779159546, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4677189290523529, + "step": 1663 + }, + { + "completion_length": 293.28125, + "epoch": 0.5302740599107711, + "grad_norm": 8.803386688232422, + "kl": 0.09423828125, + "learning_rate": 4.697259400892288e-07, + "loss": 0.0038, + "reward": 1.5019474029541016, + "reward_std": 0.054961420595645905, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5019474029541016, + "rewards/pad": 0.0, + "step": 1664 + }, + { + "completion_length": 327.90625, + "epoch": 0.5305927342256214, + "grad_norm": 5.697347640991211, + "kl": 0.1064453125, + "learning_rate": 4.6940726577437853e-07, + "loss": 0.0043, + "reward": 1.683156967163086, + "reward_std": 0.15118834376335144, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5581569671630859, + "step": 1665 + }, + { + "completion_length": 316.8125, + "epoch": 0.5309114085404716, + "grad_norm": 3.3667261600494385, + "kl": 0.072265625, + "learning_rate": 4.6908859145952834e-07, + "loss": 0.0029, + "reward": 1.341244101524353, + "reward_std": 0.04248841851949692, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3412441611289978, + "step": 1666 + }, + { + "completion_length": 276.625, + "epoch": 0.5312300828553218, + "grad_norm": 9.462784767150879, + "kl": 0.09228515625, + "learning_rate": 4.687699171446781e-07, + "loss": 0.0037, + "reward": 1.3527380228042603, + "reward_std": 0.060129985213279724, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35273808240890503, + "rewards/pad": 0.0, + "step": 1667 + }, + { + "completion_length": 299.484375, + "epoch": 0.5315487571701721, + "grad_norm": 6.0697150230407715, + "kl": 0.0869140625, + "learning_rate": 4.684512428298279e-07, + "loss": 0.0035, + "reward": 1.400463581085205, + "reward_std": 0.08055169880390167, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4160885810852051, + "rewards/pad": 0.0, + "step": 1668 + }, + { + "completion_length": 231.203125, + "epoch": 0.5318674314850224, + "grad_norm": 13.91861629486084, + "kl": 0.10595703125, + "learning_rate": 4.6813256851497766e-07, + "loss": 0.0042, + "reward": 1.6042506694793701, + "reward_std": 0.06380566954612732, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6042506694793701, + "rewards/pad": 0.0, + "step": 1669 + }, + { + "completion_length": 266.671875, + "epoch": 0.5321861057998726, + "grad_norm": 13.381567001342773, + "kl": 0.0947265625, + "learning_rate": 4.6781389420012746e-07, + "loss": 0.0038, + "reward": 1.5504193305969238, + "reward_std": 0.14849324524402618, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45666933059692383, + "step": 1670 + }, + { + "completion_length": 378.53125, + "epoch": 0.5325047801147228, + "grad_norm": 10.656835556030273, + "kl": 0.0712890625, + "learning_rate": 4.674952198852772e-07, + "loss": 0.0029, + "reward": 1.446120262145996, + "reward_std": 0.11353107541799545, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46174532175064087, + "step": 1671 + }, + { + "completion_length": 262.234375, + "epoch": 0.532823454429573, + "grad_norm": 20.61830711364746, + "kl": 0.1005859375, + "learning_rate": 4.67176545570427e-07, + "loss": 0.004, + "reward": 1.6212286949157715, + "reward_std": 0.062337301671504974, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6212287545204163, + "step": 1672 + }, + { + "completion_length": 228.296875, + "epoch": 0.5331421287444232, + "grad_norm": 9.58569622039795, + "kl": 0.10009765625, + "learning_rate": 4.668578712555768e-07, + "loss": 0.004, + "reward": 1.542300820350647, + "reward_std": 0.08927084505558014, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5423007607460022, + "rewards/pad": 0.0, + "step": 1673 + }, + { + "completion_length": 243.328125, + "epoch": 0.5334608030592735, + "grad_norm": 8.100432395935059, + "kl": 0.0732421875, + "learning_rate": 4.665391969407266e-07, + "loss": 0.0029, + "reward": 1.8803434371948242, + "reward_std": 0.10601752996444702, + "rewards/answer_reward": 0.484375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.39596837759017944, + "step": 1674 + }, + { + "completion_length": 287.1875, + "epoch": 0.5337794773741237, + "grad_norm": 7.46446418762207, + "kl": 0.11083984375, + "learning_rate": 4.6622052262587634e-07, + "loss": 0.0044, + "reward": 1.512256383895874, + "reward_std": 0.12309861183166504, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5122564435005188, + "rewards/pad": 0.0, + "step": 1675 + }, + { + "completion_length": 304.25, + "epoch": 0.5340981516889739, + "grad_norm": 12.087656021118164, + "kl": 0.12060546875, + "learning_rate": 4.6590184831102615e-07, + "loss": 0.0048, + "reward": 1.4665218591690063, + "reward_std": 0.08359487354755402, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46652188897132874, + "rewards/pad": 0.0, + "step": 1676 + }, + { + "completion_length": 333.96875, + "epoch": 0.5344168260038241, + "grad_norm": 5.124676704406738, + "kl": 0.0712890625, + "learning_rate": 4.6558317399617585e-07, + "loss": 0.0029, + "reward": 1.5634325742721558, + "reward_std": 0.04829826205968857, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.438432514667511, + "rewards/pad": 0.125, + "step": 1677 + }, + { + "completion_length": 297.390625, + "epoch": 0.5347355003186743, + "grad_norm": 33.74581527709961, + "kl": 0.064453125, + "learning_rate": 4.6526449968132566e-07, + "loss": 0.0026, + "reward": 1.5150209665298462, + "reward_std": 0.07572810351848602, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3900209963321686, + "step": 1678 + }, + { + "completion_length": 392.421875, + "epoch": 0.5350541746335246, + "grad_norm": 5.853975772857666, + "kl": 0.05517578125, + "learning_rate": 4.649458253664754e-07, + "loss": 0.0022, + "reward": 1.466147780418396, + "reward_std": 0.1339925229549408, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4817728102207184, + "step": 1679 + }, + { + "completion_length": 263.0625, + "epoch": 0.5353728489483748, + "grad_norm": 29.1920108795166, + "kl": 0.09814453125, + "learning_rate": 4.646271510516252e-07, + "loss": 0.0039, + "reward": 1.4998505115509033, + "reward_std": 0.0841817855834961, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4998505711555481, + "rewards/pad": 0.0, + "step": 1680 + }, + { + "completion_length": 299.109375, + "epoch": 0.535691523263225, + "grad_norm": 9.483532905578613, + "kl": 0.08056640625, + "learning_rate": 4.6430847673677497e-07, + "loss": 0.0032, + "reward": 1.566691279411316, + "reward_std": 0.11589021980762482, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5823163390159607, + "rewards/pad": 0.0, + "step": 1681 + }, + { + "completion_length": 213.34375, + "epoch": 0.5360101975780752, + "grad_norm": 21.55391502380371, + "kl": 0.11083984375, + "learning_rate": 4.639898024219248e-07, + "loss": 0.0044, + "reward": 1.4185980558395386, + "reward_std": 0.08017615228891373, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4185979962348938, + "rewards/pad": 0.0, + "step": 1682 + }, + { + "completion_length": 247.609375, + "epoch": 0.5363288718929254, + "grad_norm": 7.853524208068848, + "kl": 0.078125, + "learning_rate": 4.6367112810707453e-07, + "loss": 0.0031, + "reward": 1.6442821025848389, + "reward_std": 0.08468535542488098, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6442821025848389, + "step": 1683 + }, + { + "completion_length": 116.953125, + "epoch": 0.5366475462077757, + "grad_norm": 15.663179397583008, + "kl": 0.1103515625, + "learning_rate": 4.6335245379222434e-07, + "loss": 0.0044, + "reward": 1.6685657501220703, + "reward_std": 0.12723290920257568, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5435657501220703, + "rewards/pad": 0.125, + "step": 1684 + }, + { + "completion_length": 374.3125, + "epoch": 0.5369662205226259, + "grad_norm": 6.08555269241333, + "kl": 0.05908203125, + "learning_rate": 4.630337794773741e-07, + "loss": 0.0024, + "reward": 1.594588041305542, + "reward_std": 0.08949033915996552, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.344588041305542, + "step": 1685 + }, + { + "completion_length": 211.9375, + "epoch": 0.5372848948374761, + "grad_norm": 6.44908332824707, + "kl": 0.10009765625, + "learning_rate": 4.627151051625239e-07, + "loss": 0.004, + "reward": 1.6699178218841553, + "reward_std": 0.10696688294410706, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5449178814888, + "rewards/pad": 0.125, + "step": 1686 + }, + { + "completion_length": 265.53125, + "epoch": 0.5376035691523263, + "grad_norm": 12.869318008422852, + "kl": 0.0830078125, + "learning_rate": 4.6239643084767365e-07, + "loss": 0.0033, + "reward": 1.3589563369750977, + "reward_std": 0.10501168668270111, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3745812475681305, + "step": 1687 + }, + { + "completion_length": 222.53125, + "epoch": 0.5379222434671765, + "grad_norm": 8.010903358459473, + "kl": 0.10693359375, + "learning_rate": 4.6207775653282346e-07, + "loss": 0.0043, + "reward": 1.8639793395996094, + "reward_std": 0.08176255226135254, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.7389793992042542, + "step": 1688 + }, + { + "completion_length": 320.578125, + "epoch": 0.5382409177820268, + "grad_norm": 22.186399459838867, + "kl": 0.08447265625, + "learning_rate": 4.617590822179732e-07, + "loss": 0.0034, + "reward": 1.3118007183074951, + "reward_std": 0.05238881707191467, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.31180059909820557, + "step": 1689 + }, + { + "completion_length": 249.984375, + "epoch": 0.538559592096877, + "grad_norm": 15.33934211730957, + "kl": 0.1826171875, + "learning_rate": 4.61440407903123e-07, + "loss": 0.0073, + "reward": 1.4747562408447266, + "reward_std": 0.12676838040351868, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4903812110424042, + "rewards/pad": 0.0, + "step": 1690 + }, + { + "completion_length": 254.5625, + "epoch": 0.5388782664117272, + "grad_norm": 16.812482833862305, + "kl": 0.1591796875, + "learning_rate": 4.611217335882728e-07, + "loss": 0.0064, + "reward": 1.4752761125564575, + "reward_std": 0.11461857706308365, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4752761423587799, + "rewards/pad": 0.0, + "step": 1691 + }, + { + "completion_length": 221.171875, + "epoch": 0.5391969407265774, + "grad_norm": 10.983870506286621, + "kl": 0.09765625, + "learning_rate": 4.608030592734226e-07, + "loss": 0.0039, + "reward": 1.8936082124710083, + "reward_std": 0.12493997067213058, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6436082124710083, + "step": 1692 + }, + { + "completion_length": 209.359375, + "epoch": 0.5395156150414276, + "grad_norm": 14.340348243713379, + "kl": 0.0927734375, + "learning_rate": 4.6048438495857234e-07, + "loss": 0.0037, + "reward": 1.4879741668701172, + "reward_std": 0.07082471251487732, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48797428607940674, + "rewards/pad": 0.0, + "step": 1693 + }, + { + "completion_length": 257.390625, + "epoch": 0.5398342893562779, + "grad_norm": 103.97561645507812, + "kl": 0.09375, + "learning_rate": 4.601657106437221e-07, + "loss": 0.0038, + "reward": 1.7300111055374146, + "reward_std": 0.06880944967269897, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6050111055374146, + "rewards/pad": 0.125, + "step": 1694 + }, + { + "completion_length": 346.125, + "epoch": 0.5401529636711281, + "grad_norm": 20.965011596679688, + "kl": 0.0751953125, + "learning_rate": 4.598470363288719e-07, + "loss": 0.003, + "reward": 1.4433112144470215, + "reward_std": 0.03581881895661354, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44331109523773193, + "rewards/pad": 0.0, + "step": 1695 + }, + { + "completion_length": 268.53125, + "epoch": 0.5404716379859783, + "grad_norm": 13.639444351196289, + "kl": 0.08984375, + "learning_rate": 4.5952836201402165e-07, + "loss": 0.0036, + "reward": 1.5070383548736572, + "reward_std": 0.16513481736183167, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.2882883548736572, + "rewards/pad": 0.25, + "step": 1696 + }, + { + "completion_length": 260.453125, + "epoch": 0.5407903123008285, + "grad_norm": 14.117385864257812, + "kl": 0.080078125, + "learning_rate": 4.592096876991714e-07, + "loss": 0.0032, + "reward": 1.620898723602295, + "reward_std": 0.12866953015327454, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.417773574590683, + "rewards/pad": 0.203125, + "step": 1697 + }, + { + "completion_length": 265.515625, + "epoch": 0.5411089866156787, + "grad_norm": 6.444183349609375, + "kl": 0.1181640625, + "learning_rate": 4.5889101338432116e-07, + "loss": 0.0047, + "reward": 1.491443395614624, + "reward_std": 0.06407006084918976, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49144336581230164, + "step": 1698 + }, + { + "completion_length": 247.890625, + "epoch": 0.541427660930529, + "grad_norm": 10.815384864807129, + "kl": 0.10400390625, + "learning_rate": 4.5857233906947097e-07, + "loss": 0.0042, + "reward": 1.6102548837661743, + "reward_std": 0.15115009248256683, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6258799433708191, + "step": 1699 + }, + { + "completion_length": 306.15625, + "epoch": 0.5417463352453792, + "grad_norm": 16.33805274963379, + "kl": 0.076171875, + "learning_rate": 4.582536647546207e-07, + "loss": 0.003, + "reward": 1.3535027503967285, + "reward_std": 0.08354859054088593, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.24412773549556732, + "step": 1700 + }, + { + "completion_length": 162.90625, + "epoch": 0.5420650095602294, + "grad_norm": 9.865619659423828, + "kl": 0.150390625, + "learning_rate": 4.5793499043977053e-07, + "loss": 0.006, + "reward": 1.7482905387878418, + "reward_std": 0.13975197076797485, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.529540479183197, + "rewards/pad": 0.234375, + "step": 1701 + }, + { + "completion_length": 157.546875, + "epoch": 0.5423836838750796, + "grad_norm": 12.057559967041016, + "kl": 0.0947265625, + "learning_rate": 4.576163161249203e-07, + "loss": 0.0038, + "reward": 1.8558648824691772, + "reward_std": 0.1299760341644287, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48086488246917725, + "step": 1702 + }, + { + "completion_length": 249.71875, + "epoch": 0.5427023581899298, + "grad_norm": 18.16450309753418, + "kl": 0.07080078125, + "learning_rate": 4.572976418100701e-07, + "loss": 0.0028, + "reward": 1.7190160751342773, + "reward_std": 0.08569124341011047, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4690161347389221, + "step": 1703 + }, + { + "completion_length": 307.59375, + "epoch": 0.5430210325047801, + "grad_norm": 6.907443523406982, + "kl": 0.08984375, + "learning_rate": 4.5697896749521984e-07, + "loss": 0.0036, + "reward": 1.523721694946289, + "reward_std": 0.12501972913742065, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5237216353416443, + "rewards/pad": 0.0, + "step": 1704 + }, + { + "completion_length": 254.9375, + "epoch": 0.5433397068196303, + "grad_norm": 9.634760856628418, + "kl": 0.08447265625, + "learning_rate": 4.5666029318036965e-07, + "loss": 0.0034, + "reward": 1.637330412864685, + "reward_std": 0.09492364525794983, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5279552936553955, + "rewards/pad": 0.125, + "step": 1705 + }, + { + "completion_length": 387.03125, + "epoch": 0.5436583811344805, + "grad_norm": 8.362167358398438, + "kl": 0.052490234375, + "learning_rate": 4.563416188655194e-07, + "loss": 0.0021, + "reward": 1.4359925985336304, + "reward_std": 0.04619568586349487, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43599265813827515, + "rewards/pad": 0.0, + "step": 1706 + }, + { + "completion_length": 306.765625, + "epoch": 0.5439770554493308, + "grad_norm": 8.562478065490723, + "kl": 0.08154296875, + "learning_rate": 4.560229445506692e-07, + "loss": 0.0033, + "reward": 1.3705644607543945, + "reward_std": 0.07248996943235397, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3861893117427826, + "rewards/pad": 0.0, + "step": 1707 + }, + { + "completion_length": 308.296875, + "epoch": 0.5442957297641811, + "grad_norm": 10.349902153015137, + "kl": 0.1083984375, + "learning_rate": 4.5570427023581896e-07, + "loss": 0.0043, + "reward": 1.5466220378875732, + "reward_std": 0.049023132771253586, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5466219782829285, + "rewards/pad": 0.0, + "step": 1708 + }, + { + "completion_length": 155.03125, + "epoch": 0.5446144040790313, + "grad_norm": 17.148969650268555, + "kl": 0.1318359375, + "learning_rate": 4.5538559592096877e-07, + "loss": 0.0053, + "reward": 1.6405951976776123, + "reward_std": 0.14014357328414917, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5155952572822571, + "rewards/pad": 0.125, + "step": 1709 + }, + { + "completion_length": 306.78125, + "epoch": 0.5449330783938815, + "grad_norm": 19.449230194091797, + "kl": 0.07421875, + "learning_rate": 4.550669216061185e-07, + "loss": 0.003, + "reward": 1.7386236190795898, + "reward_std": 0.06590403616428375, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48862361907958984, + "rewards/pad": 0.25, + "step": 1710 + }, + { + "completion_length": 266.96875, + "epoch": 0.5452517527087317, + "grad_norm": 8.654415130615234, + "kl": 0.0830078125, + "learning_rate": 4.5474824729126833e-07, + "loss": 0.0033, + "reward": 1.7879807949066162, + "reward_std": 0.07044827193021774, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5379809141159058, + "step": 1711 + }, + { + "completion_length": 189.875, + "epoch": 0.5455704270235819, + "grad_norm": 10.486869812011719, + "kl": 0.1044921875, + "learning_rate": 4.544295729764181e-07, + "loss": 0.0042, + "reward": 1.7235859632492065, + "reward_std": 0.09536559879779816, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5985859632492065, + "rewards/pad": 0.125, + "step": 1712 + }, + { + "completion_length": 199.296875, + "epoch": 0.5458891013384322, + "grad_norm": 7.5744123458862305, + "kl": 0.10302734375, + "learning_rate": 4.541108986615679e-07, + "loss": 0.0041, + "reward": 1.4725016355514526, + "reward_std": 0.09483238309621811, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.363126665353775, + "rewards/pad": 0.125, + "step": 1713 + }, + { + "completion_length": 267.671875, + "epoch": 0.5462077756532824, + "grad_norm": 67.24296569824219, + "kl": 0.1552734375, + "learning_rate": 4.5379222434671765e-07, + "loss": 0.0062, + "reward": 1.489917516708374, + "reward_std": 0.055628370493650436, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4899175763130188, + "rewards/pad": 0.0, + "step": 1714 + }, + { + "completion_length": 262.203125, + "epoch": 0.5465264499681326, + "grad_norm": 7.187012195587158, + "kl": 0.08447265625, + "learning_rate": 4.5347355003186745e-07, + "loss": 0.0034, + "reward": 1.5411200523376465, + "reward_std": 0.05903906747698784, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5411199927330017, + "step": 1715 + }, + { + "completion_length": 235.75, + "epoch": 0.5468451242829828, + "grad_norm": 24.71690559387207, + "kl": 0.0771484375, + "learning_rate": 4.531548757170172e-07, + "loss": 0.0031, + "reward": 1.7826646566390991, + "reward_std": 0.12565046548843384, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5326645970344543, + "rewards/pad": 0.25, + "step": 1716 + }, + { + "completion_length": 183.4375, + "epoch": 0.547163798597833, + "grad_norm": 9.081765174865723, + "kl": 0.1015625, + "learning_rate": 4.5283620140216696e-07, + "loss": 0.0041, + "reward": 1.3880410194396973, + "reward_std": 0.02689719945192337, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3880411386489868, + "step": 1717 + }, + { + "completion_length": 210.640625, + "epoch": 0.5474824729126833, + "grad_norm": 7.534451007843018, + "kl": 0.0927734375, + "learning_rate": 4.525175270873167e-07, + "loss": 0.0037, + "reward": 1.4608581066131592, + "reward_std": 0.10575767606496811, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.35148316621780396, + "rewards/pad": 0.125, + "step": 1718 + }, + { + "completion_length": 325.765625, + "epoch": 0.5478011472275335, + "grad_norm": 12.512798309326172, + "kl": 0.0703125, + "learning_rate": 4.521988527724665e-07, + "loss": 0.0028, + "reward": 1.5565497875213623, + "reward_std": 0.15284176170825958, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3534247875213623, + "step": 1719 + }, + { + "completion_length": 358.3125, + "epoch": 0.5481198215423837, + "grad_norm": 5.614782333374023, + "kl": 0.0712890625, + "learning_rate": 4.518801784576163e-07, + "loss": 0.0029, + "reward": 1.4160782098770142, + "reward_std": 0.14821010828018188, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.44732820987701416, + "rewards/pad": 0.0, + "step": 1720 + }, + { + "completion_length": 258.953125, + "epoch": 0.5484384958572339, + "grad_norm": 11.684313774108887, + "kl": 0.07177734375, + "learning_rate": 4.515615041427661e-07, + "loss": 0.0029, + "reward": 1.5642802715301514, + "reward_std": 0.10995316505432129, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.45490533113479614, + "step": 1721 + }, + { + "completion_length": 247.78125, + "epoch": 0.5487571701720841, + "grad_norm": 54.16145706176758, + "kl": 0.08447265625, + "learning_rate": 4.5124282982791584e-07, + "loss": 0.0034, + "reward": 1.5749287605285645, + "reward_std": 0.09817046672105789, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44992879033088684, + "rewards/pad": 0.125, + "step": 1722 + }, + { + "completion_length": 263.578125, + "epoch": 0.5490758444869344, + "grad_norm": 12.113851547241211, + "kl": 0.1845703125, + "learning_rate": 4.509241555130656e-07, + "loss": 0.0074, + "reward": 1.623182773590088, + "reward_std": 0.05416783317923546, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49818283319473267, + "step": 1723 + }, + { + "completion_length": 214.21875, + "epoch": 0.5493945188017846, + "grad_norm": 5.456802845001221, + "kl": 0.0810546875, + "learning_rate": 4.506054811982154e-07, + "loss": 0.0032, + "reward": 1.621408224105835, + "reward_std": 0.0672256350517273, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49640828371047974, + "step": 1724 + }, + { + "completion_length": 195.09375, + "epoch": 0.5497131931166348, + "grad_norm": 13.368043899536133, + "kl": 0.11279296875, + "learning_rate": 4.5028680688336515e-07, + "loss": 0.0045, + "reward": 1.6206947565078735, + "reward_std": 0.11809299886226654, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.636319637298584, + "rewards/pad": 0.0, + "step": 1725 + }, + { + "completion_length": 192.515625, + "epoch": 0.550031867431485, + "grad_norm": 15.275333404541016, + "kl": 0.1025390625, + "learning_rate": 4.4996813256851496e-07, + "loss": 0.0041, + "reward": 1.5296623706817627, + "reward_std": 0.07286571711301804, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4046623706817627, + "step": 1726 + }, + { + "completion_length": 295.0, + "epoch": 0.5503505417463352, + "grad_norm": 10.07372760772705, + "kl": 0.07421875, + "learning_rate": 4.496494582536647e-07, + "loss": 0.003, + "reward": 1.5608832836151123, + "reward_std": 0.1302630454301834, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4515083432197571, + "step": 1727 + }, + { + "completion_length": 221.890625, + "epoch": 0.5506692160611855, + "grad_norm": 18.7092227935791, + "kl": 0.08642578125, + "learning_rate": 4.493307839388145e-07, + "loss": 0.0035, + "reward": 1.7104350328445435, + "reward_std": 0.07333327829837799, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46043500304222107, + "rewards/pad": 0.25, + "step": 1728 + }, + { + "completion_length": 279.703125, + "epoch": 0.5509878903760357, + "grad_norm": 9.025415420532227, + "kl": 0.09033203125, + "learning_rate": 4.490121096239643e-07, + "loss": 0.0036, + "reward": 1.6692687273025513, + "reward_std": 0.07121677696704865, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6692686080932617, + "rewards/pad": 0.0, + "step": 1729 + }, + { + "completion_length": 155.78125, + "epoch": 0.5513065646908859, + "grad_norm": 29.62628746032715, + "kl": 0.10986328125, + "learning_rate": 4.486934353091141e-07, + "loss": 0.0044, + "reward": 1.5663038492202759, + "reward_std": 0.1585964560508728, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5819288492202759, + "rewards/pad": 0.0, + "step": 1730 + }, + { + "completion_length": 141.46875, + "epoch": 0.5516252390057361, + "grad_norm": 20.791046142578125, + "kl": 0.1220703125, + "learning_rate": 4.4837476099426384e-07, + "loss": 0.0049, + "reward": 1.6928167343139648, + "reward_std": 0.05160044878721237, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6928167343139648, + "rewards/pad": 0.0, + "step": 1731 + }, + { + "completion_length": 316.265625, + "epoch": 0.5519439133205863, + "grad_norm": 8.608355522155762, + "kl": 0.08984375, + "learning_rate": 4.4805608667941364e-07, + "loss": 0.0036, + "reward": 1.4499294757843018, + "reward_std": 0.07365255802869797, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.449929416179657, + "rewards/pad": 0.0, + "step": 1732 + }, + { + "completion_length": 260.3125, + "epoch": 0.5522625876354366, + "grad_norm": 11.485649108886719, + "kl": 0.087890625, + "learning_rate": 4.477374123645634e-07, + "loss": 0.0035, + "reward": 1.5133521556854248, + "reward_std": 0.045910757035017014, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5133520364761353, + "rewards/pad": 0.0, + "step": 1733 + }, + { + "completion_length": 331.859375, + "epoch": 0.5525812619502868, + "grad_norm": 7.711301803588867, + "kl": 0.0771484375, + "learning_rate": 4.474187380497132e-07, + "loss": 0.0031, + "reward": 1.431697130203247, + "reward_std": 0.07672516256570816, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43169713020324707, + "rewards/pad": 0.0, + "step": 1734 + }, + { + "completion_length": 141.65625, + "epoch": 0.552899936265137, + "grad_norm": 10.256484985351562, + "kl": 0.10791015625, + "learning_rate": 4.4710006373486296e-07, + "loss": 0.0043, + "reward": 1.7603859901428223, + "reward_std": 0.08859126269817352, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.510386049747467, + "step": 1735 + }, + { + "completion_length": 270.296875, + "epoch": 0.5532186105799872, + "grad_norm": 32.26211929321289, + "kl": 0.0791015625, + "learning_rate": 4.467813894200127e-07, + "loss": 0.0032, + "reward": 1.6208715438842773, + "reward_std": 0.0996231734752655, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.38649648427963257, + "step": 1736 + }, + { + "completion_length": 211.734375, + "epoch": 0.5535372848948374, + "grad_norm": 7.994901180267334, + "kl": 0.10595703125, + "learning_rate": 4.4646271510516247e-07, + "loss": 0.0042, + "reward": 1.7499022483825684, + "reward_std": 0.09591798484325409, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49990230798721313, + "rewards/pad": 0.25, + "step": 1737 + }, + { + "completion_length": 201.34375, + "epoch": 0.5538559592096877, + "grad_norm": 7.6049628257751465, + "kl": 0.099609375, + "learning_rate": 4.4614404079031227e-07, + "loss": 0.004, + "reward": 1.4028677940368652, + "reward_std": 0.12666383385658264, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3872428238391876, + "step": 1738 + }, + { + "completion_length": 259.28125, + "epoch": 0.5541746335245379, + "grad_norm": 7.623438358306885, + "kl": 0.1005859375, + "learning_rate": 4.4582536647546203e-07, + "loss": 0.004, + "reward": 1.636760950088501, + "reward_std": 0.12068548798561096, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.652385950088501, + "step": 1739 + }, + { + "completion_length": 293.875, + "epoch": 0.5544933078393881, + "grad_norm": 137.72137451171875, + "kl": 0.0810546875, + "learning_rate": 4.4550669216061183e-07, + "loss": 0.0032, + "reward": 1.3920228481292725, + "reward_std": 0.1445450484752655, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4076477885246277, + "rewards/pad": 0.0, + "step": 1740 + }, + { + "completion_length": 150.578125, + "epoch": 0.5548119821542383, + "grad_norm": 16.826099395751953, + "kl": 0.115234375, + "learning_rate": 4.451880178457616e-07, + "loss": 0.0046, + "reward": 1.7300306558609009, + "reward_std": 0.09280145913362503, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6050306558609009, + "step": 1741 + }, + { + "completion_length": 315.109375, + "epoch": 0.5551306564690885, + "grad_norm": 5.719409465789795, + "kl": 0.0703125, + "learning_rate": 4.448693435309114e-07, + "loss": 0.0028, + "reward": 1.417891025543213, + "reward_std": 0.05220051109790802, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4178909659385681, + "step": 1742 + }, + { + "completion_length": 105.234375, + "epoch": 0.5554493307839388, + "grad_norm": 23.73946762084961, + "kl": 0.109375, + "learning_rate": 4.4455066921606115e-07, + "loss": 0.0044, + "reward": 1.599583387374878, + "reward_std": 0.0803811177611351, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47458338737487793, + "rewards/pad": 0.125, + "step": 1743 + }, + { + "completion_length": 199.203125, + "epoch": 0.555768005098789, + "grad_norm": 13.549830436706543, + "kl": 0.1162109375, + "learning_rate": 4.4423199490121096e-07, + "loss": 0.0046, + "reward": 1.552984595298767, + "reward_std": 0.1160770133137703, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4279846251010895, + "step": 1744 + }, + { + "completion_length": 147.515625, + "epoch": 0.5560866794136392, + "grad_norm": 22.141952514648438, + "kl": 0.1259765625, + "learning_rate": 4.439133205863607e-07, + "loss": 0.005, + "reward": 1.5688796043395996, + "reward_std": 0.10179469734430313, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5688797235488892, + "rewards/pad": 0.0, + "step": 1745 + }, + { + "completion_length": 203.09375, + "epoch": 0.5564053537284895, + "grad_norm": 13.100513458251953, + "kl": 0.103515625, + "learning_rate": 4.435946462715105e-07, + "loss": 0.0042, + "reward": 1.5506198406219482, + "reward_std": 0.11961662769317627, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.550619900226593, + "step": 1746 + }, + { + "completion_length": 330.734375, + "epoch": 0.5567240280433398, + "grad_norm": 5.324476718902588, + "kl": 0.06884765625, + "learning_rate": 4.4327597195666027e-07, + "loss": 0.0028, + "reward": 1.4494538307189941, + "reward_std": 0.13746029138565063, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34007877111434937, + "step": 1747 + }, + { + "completion_length": 210.171875, + "epoch": 0.55704270235819, + "grad_norm": 22.65328598022461, + "kl": 0.09375, + "learning_rate": 4.429572976418101e-07, + "loss": 0.0037, + "reward": 1.7961393594741821, + "reward_std": 0.08024761080741882, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4211394190788269, + "rewards/pad": 0.375, + "step": 1748 + }, + { + "completion_length": 206.53125, + "epoch": 0.5573613766730402, + "grad_norm": 46.93434143066406, + "kl": 0.119140625, + "learning_rate": 4.4263862332695983e-07, + "loss": 0.0048, + "reward": 1.575434684753418, + "reward_std": 0.10544592142105103, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.575434684753418, + "step": 1749 + }, + { + "completion_length": 166.828125, + "epoch": 0.5576800509878904, + "grad_norm": 9.36785888671875, + "kl": 0.09033203125, + "learning_rate": 4.4231994901210964e-07, + "loss": 0.0036, + "reward": 1.5459160804748535, + "reward_std": 0.08506603538990021, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.29591605067253113, + "step": 1750 + }, + { + "completion_length": 225.109375, + "epoch": 0.5579987253027406, + "grad_norm": 17.232309341430664, + "kl": 0.1181640625, + "learning_rate": 4.420012746972594e-07, + "loss": 0.0047, + "reward": 1.61324143409729, + "reward_std": 0.16581512987613678, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5038663744926453, + "rewards/pad": 0.125, + "step": 1751 + }, + { + "completion_length": 331.640625, + "epoch": 0.5583173996175909, + "grad_norm": 7.584392070770264, + "kl": 0.0693359375, + "learning_rate": 4.416826003824092e-07, + "loss": 0.0028, + "reward": 1.5683870315551758, + "reward_std": 0.18173089623451233, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.490261971950531, + "step": 1752 + }, + { + "completion_length": 406.953125, + "epoch": 0.5586360739324411, + "grad_norm": 11.046274185180664, + "kl": 0.0810546875, + "learning_rate": 4.4136392606755895e-07, + "loss": 0.0032, + "reward": 1.5528819561004639, + "reward_std": 0.06767392158508301, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42788198590278625, + "rewards/pad": 0.125, + "step": 1753 + }, + { + "completion_length": 217.9375, + "epoch": 0.5589547482472913, + "grad_norm": 13.496262550354004, + "kl": 0.0927734375, + "learning_rate": 4.4104525175270876e-07, + "loss": 0.0037, + "reward": 1.387298345565796, + "reward_std": 0.0605352446436882, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3872983157634735, + "rewards/pad": 0.0, + "step": 1754 + }, + { + "completion_length": 334.15625, + "epoch": 0.5592734225621415, + "grad_norm": 38.26420211791992, + "kl": 0.10302734375, + "learning_rate": 4.407265774378585e-07, + "loss": 0.0041, + "reward": 1.5812796354293823, + "reward_std": 0.10023447871208191, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5812796950340271, + "rewards/pad": 0.0, + "step": 1755 + }, + { + "completion_length": 267.375, + "epoch": 0.5595920968769917, + "grad_norm": 17.166568756103516, + "kl": 0.09521484375, + "learning_rate": 4.404079031230082e-07, + "loss": 0.0038, + "reward": 1.739503026008606, + "reward_std": 0.08325989544391632, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6301279664039612, + "rewards/pad": 0.109375, + "step": 1756 + }, + { + "completion_length": 272.671875, + "epoch": 0.559910771191842, + "grad_norm": 8.726228713989258, + "kl": 0.08154296875, + "learning_rate": 4.40089228808158e-07, + "loss": 0.0033, + "reward": 1.4452790021896362, + "reward_std": 0.10831916332244873, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.429654061794281, + "rewards/pad": 0.015625, + "step": 1757 + }, + { + "completion_length": 276.875, + "epoch": 0.5602294455066922, + "grad_norm": 12.648775100708008, + "kl": 0.06591796875, + "learning_rate": 4.397705544933078e-07, + "loss": 0.0026, + "reward": 1.4636542797088623, + "reward_std": 0.26348677277565, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3230292499065399, + "step": 1758 + }, + { + "completion_length": 279.109375, + "epoch": 0.5605481198215424, + "grad_norm": 7.031533718109131, + "kl": 0.09033203125, + "learning_rate": 4.394518801784576e-07, + "loss": 0.0036, + "reward": 1.6584200859069824, + "reward_std": 0.10096648335456848, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5334200859069824, + "rewards/pad": 0.125, + "step": 1759 + }, + { + "completion_length": 105.8125, + "epoch": 0.5608667941363926, + "grad_norm": 18.536157608032227, + "kl": 0.11767578125, + "learning_rate": 4.3913320586360734e-07, + "loss": 0.0047, + "reward": 1.5089656114578247, + "reward_std": 0.07973577827215195, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3839656114578247, + "rewards/pad": 0.125, + "step": 1760 + }, + { + "completion_length": 217.3125, + "epoch": 0.5611854684512428, + "grad_norm": 9.231001853942871, + "kl": 0.10107421875, + "learning_rate": 4.3881453154875715e-07, + "loss": 0.004, + "reward": 1.5179157257080078, + "reward_std": 0.1328127682209015, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42416560649871826, + "rewards/pad": 0.109375, + "step": 1761 + }, + { + "completion_length": 197.875, + "epoch": 0.5615041427660931, + "grad_norm": 8.673807144165039, + "kl": 0.09375, + "learning_rate": 4.384958572339069e-07, + "loss": 0.0038, + "reward": 1.443040370941162, + "reward_std": 0.1960524320602417, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.38054025173187256, + "rewards/pad": 0.078125, + "step": 1762 + }, + { + "completion_length": 255.921875, + "epoch": 0.5618228170809433, + "grad_norm": 11.084033012390137, + "kl": 0.08154296875, + "learning_rate": 4.381771829190567e-07, + "loss": 0.0033, + "reward": 1.4985325336456299, + "reward_std": 0.10977034270763397, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4985325038433075, + "rewards/pad": 0.0, + "step": 1763 + }, + { + "completion_length": 154.3125, + "epoch": 0.5621414913957935, + "grad_norm": 16.184070587158203, + "kl": 0.10595703125, + "learning_rate": 4.3785850860420646e-07, + "loss": 0.0042, + "reward": 1.5756832361221313, + "reward_std": 0.0854271948337555, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.45068323612213135, + "step": 1764 + }, + { + "completion_length": 284.046875, + "epoch": 0.5624601657106437, + "grad_norm": 22.491928100585938, + "kl": 0.076171875, + "learning_rate": 4.3753983428935627e-07, + "loss": 0.0031, + "reward": 1.7509132623672485, + "reward_std": 0.12830114364624023, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5321633219718933, + "step": 1765 + }, + { + "completion_length": 295.640625, + "epoch": 0.5627788400254939, + "grad_norm": 9.5074462890625, + "kl": 0.07373046875, + "learning_rate": 4.37221159974506e-07, + "loss": 0.003, + "reward": 1.5447643995285034, + "reward_std": 0.0879368782043457, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5447643995285034, + "rewards/pad": 0.0, + "step": 1766 + }, + { + "completion_length": 255.265625, + "epoch": 0.5630975143403442, + "grad_norm": 7.456171035766602, + "kl": 0.099609375, + "learning_rate": 4.3690248565965583e-07, + "loss": 0.004, + "reward": 1.75307297706604, + "reward_std": 0.09735266864299774, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5030728578567505, + "rewards/pad": 0.25, + "step": 1767 + }, + { + "completion_length": 280.421875, + "epoch": 0.5634161886551944, + "grad_norm": 24.25010108947754, + "kl": 0.076171875, + "learning_rate": 4.365838113448056e-07, + "loss": 0.0031, + "reward": 1.6167089939117432, + "reward_std": 0.0686415284872055, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36670899391174316, + "rewards/pad": 0.25, + "step": 1768 + }, + { + "completion_length": 247.296875, + "epoch": 0.5637348629700446, + "grad_norm": 13.49270248413086, + "kl": 0.0693359375, + "learning_rate": 4.362651370299554e-07, + "loss": 0.0028, + "reward": 1.6864056587219238, + "reward_std": 0.1645689159631729, + "rewards/answer_reward": 0.296875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.38953065872192383, + "step": 1769 + }, + { + "completion_length": 339.109375, + "epoch": 0.5640535372848948, + "grad_norm": 21.020076751708984, + "kl": 0.080078125, + "learning_rate": 4.3594646271510514e-07, + "loss": 0.0032, + "reward": 1.3722314834594727, + "reward_std": 0.15145091712474823, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.38785648345947266, + "step": 1770 + }, + { + "completion_length": 236.453125, + "epoch": 0.564372211599745, + "grad_norm": 19.411277770996094, + "kl": 0.0947265625, + "learning_rate": 4.3562778840025495e-07, + "loss": 0.0038, + "reward": 1.4857559204101562, + "reward_std": 0.11996018886566162, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36075592041015625, + "rewards/pad": 0.125, + "step": 1771 + }, + { + "completion_length": 354.15625, + "epoch": 0.5646908859145953, + "grad_norm": 8.80309009552002, + "kl": 0.05126953125, + "learning_rate": 4.353091140854047e-07, + "loss": 0.0021, + "reward": 1.5560991764068604, + "reward_std": 0.10942699015140533, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44672417640686035, + "step": 1772 + }, + { + "completion_length": 349.953125, + "epoch": 0.5650095602294455, + "grad_norm": 7.837812900543213, + "kl": 0.06298828125, + "learning_rate": 4.349904397705545e-07, + "loss": 0.0025, + "reward": 1.5426127910614014, + "reward_std": 0.04760386049747467, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41761288046836853, + "rewards/pad": 0.125, + "step": 1773 + }, + { + "completion_length": 233.96875, + "epoch": 0.5653282345442957, + "grad_norm": 9.303118705749512, + "kl": 0.1025390625, + "learning_rate": 4.3467176545570427e-07, + "loss": 0.0041, + "reward": 1.7672696113586426, + "reward_std": 0.09113365411758423, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.642269492149353, + "rewards/pad": 0.125, + "step": 1774 + }, + { + "completion_length": 272.28125, + "epoch": 0.5656469088591459, + "grad_norm": 9.206984519958496, + "kl": 0.076171875, + "learning_rate": 4.34353091140854e-07, + "loss": 0.003, + "reward": 1.6474246978759766, + "reward_std": 0.07027721405029297, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5224246978759766, + "step": 1775 + }, + { + "completion_length": 212.359375, + "epoch": 0.5659655831739961, + "grad_norm": 5.96937894821167, + "kl": 0.138671875, + "learning_rate": 4.340344168260038e-07, + "loss": 0.0056, + "reward": 1.5690696239471436, + "reward_std": 0.10176509618759155, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5690696239471436, + "rewards/pad": 0.0, + "step": 1776 + }, + { + "completion_length": 139.25, + "epoch": 0.5662842574888464, + "grad_norm": 9.70727825164795, + "kl": 0.10400390625, + "learning_rate": 4.337157425111536e-07, + "loss": 0.0042, + "reward": 1.7603824138641357, + "reward_std": 0.05401366576552391, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7603825330734253, + "rewards/pad": 0.0, + "step": 1777 + }, + { + "completion_length": 272.46875, + "epoch": 0.5666029318036966, + "grad_norm": 56.87026596069336, + "kl": 0.078125, + "learning_rate": 4.3339706819630333e-07, + "loss": 0.0031, + "reward": 1.4824305772781372, + "reward_std": 0.0942038744688034, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46680548787117004, + "rewards/pad": 0.015625, + "step": 1778 + }, + { + "completion_length": 198.09375, + "epoch": 0.5669216061185468, + "grad_norm": 7.219006061553955, + "kl": 0.11474609375, + "learning_rate": 4.3307839388145314e-07, + "loss": 0.0046, + "reward": 1.5830705165863037, + "reward_std": 0.14112816751003265, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4580705165863037, + "rewards/pad": 0.125, + "step": 1779 + }, + { + "completion_length": 311.171875, + "epoch": 0.567240280433397, + "grad_norm": 52.35779571533203, + "kl": 0.0830078125, + "learning_rate": 4.327597195666029e-07, + "loss": 0.0033, + "reward": 1.5457806587219238, + "reward_std": 0.07235310226678848, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42078062891960144, + "rewards/pad": 0.125, + "step": 1780 + }, + { + "completion_length": 175.4375, + "epoch": 0.5675589547482472, + "grad_norm": 8.508112907409668, + "kl": 0.103515625, + "learning_rate": 4.324410452517527e-07, + "loss": 0.0041, + "reward": 1.7337126731872559, + "reward_std": 0.12397350370883942, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5149626731872559, + "step": 1781 + }, + { + "completion_length": 265.078125, + "epoch": 0.5678776290630975, + "grad_norm": 11.246129989624023, + "kl": 0.08544921875, + "learning_rate": 4.3212237093690246e-07, + "loss": 0.0034, + "reward": 1.5527467727661133, + "reward_std": 0.0398021899163723, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4277467727661133, + "rewards/pad": 0.125, + "step": 1782 + }, + { + "completion_length": 412.265625, + "epoch": 0.5681963033779477, + "grad_norm": 4.389920711517334, + "kl": 0.04931640625, + "learning_rate": 4.3180369662205226e-07, + "loss": 0.002, + "reward": 1.6300678253173828, + "reward_std": 0.0944734513759613, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.39569270610809326, + "step": 1783 + }, + { + "completion_length": 245.546875, + "epoch": 0.5685149776927979, + "grad_norm": 18.353273391723633, + "kl": 0.11181640625, + "learning_rate": 4.31485022307202e-07, + "loss": 0.0045, + "reward": 1.658485770225525, + "reward_std": 0.12130609154701233, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5334858298301697, + "step": 1784 + }, + { + "completion_length": 215.734375, + "epoch": 0.5688336520076482, + "grad_norm": 9.696588516235352, + "kl": 0.10400390625, + "learning_rate": 4.3116634799235177e-07, + "loss": 0.0042, + "reward": 1.6243197917938232, + "reward_std": 0.17657245695590973, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5461949110031128, + "rewards/pad": 0.09375, + "step": 1785 + }, + { + "completion_length": 294.421875, + "epoch": 0.5691523263224985, + "grad_norm": 5.507444858551025, + "kl": 0.10107421875, + "learning_rate": 4.308476736775016e-07, + "loss": 0.0041, + "reward": 1.4492971897125244, + "reward_std": 0.15551123023033142, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4805472195148468, + "step": 1786 + }, + { + "completion_length": 303.421875, + "epoch": 0.5694710006373487, + "grad_norm": 5.179250240325928, + "kl": 0.083984375, + "learning_rate": 4.3052899936265133e-07, + "loss": 0.0034, + "reward": 1.6566708087921143, + "reward_std": 0.09154780209064484, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5316708087921143, + "step": 1787 + }, + { + "completion_length": 208.765625, + "epoch": 0.5697896749521989, + "grad_norm": 12.69097900390625, + "kl": 0.08447265625, + "learning_rate": 4.3021032504780114e-07, + "loss": 0.0034, + "reward": 1.6494231224060059, + "reward_std": 0.09254056215286255, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6494231224060059, + "rewards/pad": 0.0, + "step": 1788 + }, + { + "completion_length": 460.21875, + "epoch": 0.5701083492670491, + "grad_norm": 2.9629247188568115, + "kl": 0.051025390625, + "learning_rate": 4.298916507329509e-07, + "loss": 0.002, + "reward": 1.5042239427566528, + "reward_std": 0.03330201655626297, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37922394275665283, + "step": 1789 + }, + { + "completion_length": 217.03125, + "epoch": 0.5704270235818993, + "grad_norm": 9.85554027557373, + "kl": 0.09765625, + "learning_rate": 4.295729764181007e-07, + "loss": 0.0039, + "reward": 1.6437140703201294, + "reward_std": 0.09522366523742676, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5187140703201294, + "step": 1790 + }, + { + "completion_length": 144.40625, + "epoch": 0.5707456978967496, + "grad_norm": 19.401777267456055, + "kl": 0.216796875, + "learning_rate": 4.2925430210325045e-07, + "loss": 0.0087, + "reward": 1.585303783416748, + "reward_std": 0.1496884524822235, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.600928783416748, + "rewards/pad": 0.0, + "step": 1791 + }, + { + "completion_length": 267.890625, + "epoch": 0.5710643722115998, + "grad_norm": 9.033243179321289, + "kl": 0.0927734375, + "learning_rate": 4.2893562778840026e-07, + "loss": 0.0037, + "reward": 1.5676125288009644, + "reward_std": 0.10527728497982025, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5832374691963196, + "rewards/pad": 0.0, + "step": 1792 + }, + { + "completion_length": 275.578125, + "epoch": 0.57138304652645, + "grad_norm": 51.64488220214844, + "kl": 0.0849609375, + "learning_rate": 4.2861695347355e-07, + "loss": 0.0034, + "reward": 1.3531391620635986, + "reward_std": 0.0575980469584465, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35313910245895386, + "rewards/pad": 0.0, + "step": 1793 + }, + { + "completion_length": 220.46875, + "epoch": 0.5717017208413002, + "grad_norm": 20.05837059020996, + "kl": 0.10791015625, + "learning_rate": 4.282982791586998e-07, + "loss": 0.0043, + "reward": 1.5053462982177734, + "reward_std": 0.05837986245751381, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5053463578224182, + "rewards/pad": 0.0, + "step": 1794 + }, + { + "completion_length": 222.203125, + "epoch": 0.5720203951561504, + "grad_norm": 16.75486183166504, + "kl": 0.107421875, + "learning_rate": 4.279796048438495e-07, + "loss": 0.0043, + "reward": 1.4838123321533203, + "reward_std": 0.10905701667070389, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4994373917579651, + "step": 1795 + }, + { + "completion_length": 141.515625, + "epoch": 0.5723390694710007, + "grad_norm": 13.598223686218262, + "kl": 0.1494140625, + "learning_rate": 4.2766093052899933e-07, + "loss": 0.0059, + "reward": 1.5000587701797485, + "reward_std": 0.07967067509889603, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5000587701797485, + "rewards/pad": 0.0, + "step": 1796 + }, + { + "completion_length": 311.8125, + "epoch": 0.5726577437858509, + "grad_norm": 15.51604175567627, + "kl": 0.08203125, + "learning_rate": 4.273422562141491e-07, + "loss": 0.0033, + "reward": 1.5402470827102661, + "reward_std": 0.12160883098840714, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4308720827102661, + "rewards/pad": 0.125, + "step": 1797 + }, + { + "completion_length": 267.734375, + "epoch": 0.5729764181007011, + "grad_norm": 16.46525001525879, + "kl": 0.083984375, + "learning_rate": 4.270235818992989e-07, + "loss": 0.0034, + "reward": 1.640853762626648, + "reward_std": 0.05473409965634346, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6408537030220032, + "rewards/pad": 0.0, + "step": 1798 + }, + { + "completion_length": 256.234375, + "epoch": 0.5732950924155513, + "grad_norm": 27.168062210083008, + "kl": 0.099609375, + "learning_rate": 4.2670490758444865e-07, + "loss": 0.004, + "reward": 1.5514403581619263, + "reward_std": 0.11094315350055695, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5045653581619263, + "rewards/pad": 0.046875, + "step": 1799 + }, + { + "completion_length": 225.265625, + "epoch": 0.5736137667304015, + "grad_norm": 18.198102951049805, + "kl": 0.11669921875, + "learning_rate": 4.2638623326959845e-07, + "loss": 0.0047, + "reward": 1.4886753559112549, + "reward_std": 0.11343041062355042, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5043004155158997, + "rewards/pad": 0.0, + "step": 1800 + }, + { + "completion_length": 397.796875, + "epoch": 0.5739324410452518, + "grad_norm": 11.939931869506836, + "kl": 0.052978515625, + "learning_rate": 4.260675589547482e-07, + "loss": 0.0021, + "reward": 1.3815760612487793, + "reward_std": 0.0875547006726265, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3972010016441345, + "step": 1801 + }, + { + "completion_length": 262.390625, + "epoch": 0.574251115360102, + "grad_norm": 31.43267250061035, + "kl": 0.0791015625, + "learning_rate": 4.25748884639898e-07, + "loss": 0.0032, + "reward": 1.619990587234497, + "reward_std": 0.1106300875544548, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43249058723449707, + "step": 1802 + }, + { + "completion_length": 98.484375, + "epoch": 0.5745697896749522, + "grad_norm": 8.432551383972168, + "kl": 0.1357421875, + "learning_rate": 4.2543021032504777e-07, + "loss": 0.0054, + "reward": 1.5922178030014038, + "reward_std": 0.10581833869218826, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5922178030014038, + "rewards/pad": 0.0, + "step": 1803 + }, + { + "completion_length": 214.65625, + "epoch": 0.5748884639898024, + "grad_norm": 15.14476203918457, + "kl": 0.1064453125, + "learning_rate": 4.251115360101976e-07, + "loss": 0.0043, + "reward": 1.5079833269119263, + "reward_std": 0.18713060021400452, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3673582673072815, + "rewards/pad": 0.15625, + "step": 1804 + }, + { + "completion_length": 244.984375, + "epoch": 0.5752071383046526, + "grad_norm": 31.882089614868164, + "kl": 0.244140625, + "learning_rate": 4.2479286169534733e-07, + "loss": 0.0098, + "reward": 1.6286325454711914, + "reward_std": 0.10635349899530411, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.519257664680481, + "rewards/pad": 0.125, + "step": 1805 + }, + { + "completion_length": 282.375, + "epoch": 0.5755258126195029, + "grad_norm": 15.163368225097656, + "kl": 0.07470703125, + "learning_rate": 4.2447418738049714e-07, + "loss": 0.003, + "reward": 1.6580278873443604, + "reward_std": 0.17273136973381042, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4236530363559723, + "step": 1806 + }, + { + "completion_length": 275.171875, + "epoch": 0.5758444869343531, + "grad_norm": 9.724393844604492, + "kl": 0.0966796875, + "learning_rate": 4.241555130656469e-07, + "loss": 0.0039, + "reward": 1.5033131837844849, + "reward_std": 0.13871508836746216, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5189381837844849, + "rewards/pad": 0.0, + "step": 1807 + }, + { + "completion_length": 247.203125, + "epoch": 0.5761631612492033, + "grad_norm": 15.421486854553223, + "kl": 0.2265625, + "learning_rate": 4.238368387507967e-07, + "loss": 0.009, + "reward": 1.5908052921295166, + "reward_std": 0.16774819791316986, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3876802325248718, + "rewards/pad": 0.21875, + "step": 1808 + }, + { + "completion_length": 381.65625, + "epoch": 0.5764818355640535, + "grad_norm": 4.462833404541016, + "kl": 0.08154296875, + "learning_rate": 4.2351816443594645e-07, + "loss": 0.0033, + "reward": 1.5021185874938965, + "reward_std": 0.07689405232667923, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5021185278892517, + "rewards/pad": 0.0, + "step": 1809 + }, + { + "completion_length": 247.03125, + "epoch": 0.5768005098789037, + "grad_norm": 7.219703674316406, + "kl": 0.12060546875, + "learning_rate": 4.2319949012109626e-07, + "loss": 0.0048, + "reward": 1.5395386219024658, + "reward_std": 0.07849196344614029, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.539538562297821, + "rewards/pad": 0.0, + "step": 1810 + }, + { + "completion_length": 306.890625, + "epoch": 0.577119184193754, + "grad_norm": 8.923142433166504, + "kl": 0.08544921875, + "learning_rate": 4.22880815806246e-07, + "loss": 0.0034, + "reward": 1.4586997032165527, + "reward_std": 0.04761248081922531, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4586997330188751, + "rewards/pad": 0.0, + "step": 1811 + }, + { + "completion_length": 204.171875, + "epoch": 0.5774378585086042, + "grad_norm": 6.285178184509277, + "kl": 0.1181640625, + "learning_rate": 4.225621414913958e-07, + "loss": 0.0047, + "reward": 1.7298730611801147, + "reward_std": 0.09434516727924347, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7298730611801147, + "rewards/pad": 0.0, + "step": 1812 + }, + { + "completion_length": 156.25, + "epoch": 0.5777565328234544, + "grad_norm": 14.29250431060791, + "kl": 0.115234375, + "learning_rate": 4.2224346717654557e-07, + "loss": 0.0046, + "reward": 1.761228322982788, + "reward_std": 0.07904788106679916, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6362283229827881, + "rewards/pad": 0.125, + "step": 1813 + }, + { + "completion_length": 260.34375, + "epoch": 0.5780752071383046, + "grad_norm": 15.537161827087402, + "kl": 0.10205078125, + "learning_rate": 4.219247928616954e-07, + "loss": 0.0041, + "reward": 1.5445449352264404, + "reward_std": 0.1415569931268692, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.43516993522644043, + "step": 1814 + }, + { + "completion_length": 289.515625, + "epoch": 0.5783938814531548, + "grad_norm": 9.268240928649902, + "kl": 0.07421875, + "learning_rate": 4.216061185468451e-07, + "loss": 0.003, + "reward": 1.4811573028564453, + "reward_std": 0.05475914850831032, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48115718364715576, + "rewards/pad": 0.0, + "step": 1815 + }, + { + "completion_length": 235.4375, + "epoch": 0.5787125557680051, + "grad_norm": 12.025527954101562, + "kl": 0.09521484375, + "learning_rate": 4.2128744423199483e-07, + "loss": 0.0038, + "reward": 1.5596461296081543, + "reward_std": 0.09580011665821075, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4346460700035095, + "step": 1816 + }, + { + "completion_length": 287.515625, + "epoch": 0.5790312300828553, + "grad_norm": 6.452342987060547, + "kl": 0.07568359375, + "learning_rate": 4.2096876991714464e-07, + "loss": 0.003, + "reward": 1.7021329402923584, + "reward_std": 0.05965135246515274, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45213305950164795, + "rewards/pad": 0.25, + "step": 1817 + }, + { + "completion_length": 238.75, + "epoch": 0.5793499043977055, + "grad_norm": 28.46439552307129, + "kl": 0.1552734375, + "learning_rate": 4.206500956022944e-07, + "loss": 0.0062, + "reward": 1.6458709239959717, + "reward_std": 0.08807423710823059, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6458709239959717, + "rewards/pad": 0.0, + "step": 1818 + }, + { + "completion_length": 230.328125, + "epoch": 0.5796685787125557, + "grad_norm": 41.31342315673828, + "kl": 0.26171875, + "learning_rate": 4.203314212874442e-07, + "loss": 0.0104, + "reward": 1.5134882926940918, + "reward_std": 0.09538456797599792, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3884884715080261, + "rewards/pad": 0.125, + "step": 1819 + }, + { + "completion_length": 187.453125, + "epoch": 0.579987253027406, + "grad_norm": 12.76840877532959, + "kl": 0.11083984375, + "learning_rate": 4.2001274697259396e-07, + "loss": 0.0044, + "reward": 1.51198410987854, + "reward_std": 0.10140029340982437, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.52760910987854, + "rewards/pad": 0.0, + "step": 1820 + }, + { + "completion_length": 275.921875, + "epoch": 0.5803059273422562, + "grad_norm": 20.936594009399414, + "kl": 0.07861328125, + "learning_rate": 4.1969407265774376e-07, + "loss": 0.0031, + "reward": 1.5541658401489258, + "reward_std": 0.1162695437669754, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.31979086995124817, + "step": 1821 + }, + { + "completion_length": 423.9375, + "epoch": 0.5806246016571064, + "grad_norm": 5.021227836608887, + "kl": 0.04296875, + "learning_rate": 4.193753983428935e-07, + "loss": 0.0017, + "reward": 1.3449110984802246, + "reward_std": 0.09172512590885162, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.36053603887557983, + "step": 1822 + }, + { + "completion_length": 233.15625, + "epoch": 0.5809432759719566, + "grad_norm": 12.918827056884766, + "kl": 0.09228515625, + "learning_rate": 4.190567240280433e-07, + "loss": 0.0037, + "reward": 1.6571413278579712, + "reward_std": 0.10717593878507614, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.532141387462616, + "step": 1823 + }, + { + "completion_length": 250.203125, + "epoch": 0.5812619502868069, + "grad_norm": 26.820283889770508, + "kl": 0.095703125, + "learning_rate": 4.187380497131931e-07, + "loss": 0.0038, + "reward": 1.663448691368103, + "reward_std": 0.18507176637649536, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5540737509727478, + "step": 1824 + }, + { + "completion_length": 248.46875, + "epoch": 0.5815806246016572, + "grad_norm": 12.499021530151367, + "kl": 0.09228515625, + "learning_rate": 4.184193753983429e-07, + "loss": 0.0037, + "reward": 1.4607033729553223, + "reward_std": 0.15350082516670227, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.30445337295532227, + "rewards/pad": 0.15625, + "step": 1825 + }, + { + "completion_length": 185.390625, + "epoch": 0.5818992989165074, + "grad_norm": 14.832054138183594, + "kl": 0.1357421875, + "learning_rate": 4.1810070108349264e-07, + "loss": 0.0054, + "reward": 1.7510137557983398, + "reward_std": 0.11052236706018448, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6260136961936951, + "rewards/pad": 0.125, + "step": 1826 + }, + { + "completion_length": 237.546875, + "epoch": 0.5822179732313576, + "grad_norm": 6.5988335609436035, + "kl": 0.0908203125, + "learning_rate": 4.1778202676864245e-07, + "loss": 0.0036, + "reward": 1.6146430969238281, + "reward_std": 0.05205903202295303, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4896431565284729, + "rewards/pad": 0.125, + "step": 1827 + }, + { + "completion_length": 201.65625, + "epoch": 0.5825366475462078, + "grad_norm": 13.932860374450684, + "kl": 0.09716796875, + "learning_rate": 4.174633524537922e-07, + "loss": 0.0039, + "reward": 1.6670434474945068, + "reward_std": 0.09753619134426117, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5576685667037964, + "rewards/pad": 0.109375, + "step": 1828 + }, + { + "completion_length": 275.890625, + "epoch": 0.582855321861058, + "grad_norm": 6.20548677444458, + "kl": 0.08203125, + "learning_rate": 4.17144678138942e-07, + "loss": 0.0033, + "reward": 1.488236427307129, + "reward_std": 0.05150597542524338, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4882364869117737, + "rewards/pad": 0.0, + "step": 1829 + }, + { + "completion_length": 187.609375, + "epoch": 0.5831739961759083, + "grad_norm": 9.856452941894531, + "kl": 0.111328125, + "learning_rate": 4.1682600382409176e-07, + "loss": 0.0045, + "reward": 1.603241205215454, + "reward_std": 0.11433658003807068, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6032410860061646, + "step": 1830 + }, + { + "completion_length": 208.890625, + "epoch": 0.5834926704907585, + "grad_norm": 8.903815269470215, + "kl": 0.126953125, + "learning_rate": 4.1650732950924157e-07, + "loss": 0.0051, + "reward": 1.6346218585968018, + "reward_std": 0.12258712202310562, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.650246798992157, + "rewards/pad": 0.0, + "step": 1831 + }, + { + "completion_length": 252.90625, + "epoch": 0.5838113448056087, + "grad_norm": 6.527516841888428, + "kl": 0.080078125, + "learning_rate": 4.161886551943913e-07, + "loss": 0.0032, + "reward": 1.6859986782073975, + "reward_std": 0.058990031480789185, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43599867820739746, + "step": 1832 + }, + { + "completion_length": 368.953125, + "epoch": 0.5841300191204589, + "grad_norm": 24.48158073425293, + "kl": 0.06396484375, + "learning_rate": 4.1586998087954113e-07, + "loss": 0.0026, + "reward": 1.3795561790466309, + "reward_std": 0.06779491156339645, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37955614924430847, + "step": 1833 + }, + { + "completion_length": 282.96875, + "epoch": 0.5844486934353091, + "grad_norm": 10.014896392822266, + "kl": 0.0654296875, + "learning_rate": 4.1555130656469083e-07, + "loss": 0.0026, + "reward": 1.4391634464263916, + "reward_std": 0.09068316221237183, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4391633868217468, + "rewards/pad": 0.0, + "step": 1834 + }, + { + "completion_length": 388.375, + "epoch": 0.5847673677501594, + "grad_norm": 10.559937477111816, + "kl": 0.059814453125, + "learning_rate": 4.1523263224984064e-07, + "loss": 0.0024, + "reward": 1.4498090744018555, + "reward_std": 0.030296865850687027, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44980907440185547, + "step": 1835 + }, + { + "completion_length": 330.359375, + "epoch": 0.5850860420650096, + "grad_norm": 5.408528804779053, + "kl": 0.0556640625, + "learning_rate": 4.149139579349904e-07, + "loss": 0.0022, + "reward": 1.639885663986206, + "reward_std": 0.06358183920383453, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5148857831954956, + "step": 1836 + }, + { + "completion_length": 249.515625, + "epoch": 0.5854047163798598, + "grad_norm": 15.045970916748047, + "kl": 0.091796875, + "learning_rate": 4.145952836201402e-07, + "loss": 0.0037, + "reward": 1.5363945960998535, + "reward_std": 0.14407403767108917, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5520195960998535, + "rewards/pad": 0.0, + "step": 1837 + }, + { + "completion_length": 302.1875, + "epoch": 0.58572339069471, + "grad_norm": 12.751809120178223, + "kl": 0.10986328125, + "learning_rate": 4.1427660930528995e-07, + "loss": 0.0044, + "reward": 1.50230872631073, + "reward_std": 0.04466398060321808, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37730872631073, + "step": 1838 + }, + { + "completion_length": 202.234375, + "epoch": 0.5860420650095602, + "grad_norm": 9.511930465698242, + "kl": 0.11865234375, + "learning_rate": 4.1395793499043976e-07, + "loss": 0.0048, + "reward": 1.65510892868042, + "reward_std": 0.06541073322296143, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6551089286804199, + "step": 1839 + }, + { + "completion_length": 236.03125, + "epoch": 0.5863607393244105, + "grad_norm": 16.85823631286621, + "kl": 0.107421875, + "learning_rate": 4.136392606755895e-07, + "loss": 0.0043, + "reward": 1.6708178520202637, + "reward_std": 0.12455782294273376, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5770677924156189, + "rewards/pad": 0.125, + "step": 1840 + }, + { + "completion_length": 302.78125, + "epoch": 0.5866794136392607, + "grad_norm": 11.518096923828125, + "kl": 0.08447265625, + "learning_rate": 4.133205863607393e-07, + "loss": 0.0034, + "reward": 1.3582675457000732, + "reward_std": 0.11208176612854004, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35826757550239563, + "rewards/pad": 0.0, + "step": 1841 + }, + { + "completion_length": 243.609375, + "epoch": 0.5869980879541109, + "grad_norm": 4.969910144805908, + "kl": 0.0849609375, + "learning_rate": 4.130019120458891e-07, + "loss": 0.0034, + "reward": 1.4788875579833984, + "reward_std": 0.1530902087688446, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.38513755798339844, + "step": 1842 + }, + { + "completion_length": 281.15625, + "epoch": 0.5873167622689611, + "grad_norm": 25.539875030517578, + "kl": 0.08544921875, + "learning_rate": 4.126832377310389e-07, + "loss": 0.0034, + "reward": 1.5895493030548096, + "reward_std": 0.09465925395488739, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5895493626594543, + "rewards/pad": 0.0, + "step": 1843 + }, + { + "completion_length": 232.171875, + "epoch": 0.5876354365838113, + "grad_norm": 16.575340270996094, + "kl": 0.08349609375, + "learning_rate": 4.1236456341618864e-07, + "loss": 0.0033, + "reward": 1.4185080528259277, + "reward_std": 0.20313654839992523, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4341329336166382, + "rewards/pad": 0.015625, + "step": 1844 + }, + { + "completion_length": 199.8125, + "epoch": 0.5879541108986616, + "grad_norm": 16.62732696533203, + "kl": 0.1171875, + "learning_rate": 4.1204588910133844e-07, + "loss": 0.0047, + "reward": 1.666072130203247, + "reward_std": 0.08273019641637802, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5410721898078918, + "step": 1845 + }, + { + "completion_length": 316.03125, + "epoch": 0.5882727852135118, + "grad_norm": 7.145381927490234, + "kl": 0.0693359375, + "learning_rate": 4.117272147864882e-07, + "loss": 0.0028, + "reward": 1.601731777191162, + "reward_std": 0.047059301286935806, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47673168778419495, + "step": 1846 + }, + { + "completion_length": 235.078125, + "epoch": 0.588591459528362, + "grad_norm": 13.02017879486084, + "kl": 0.08203125, + "learning_rate": 4.1140854047163795e-07, + "loss": 0.0033, + "reward": 1.4461121559143066, + "reward_std": 0.08059521019458771, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3211122155189514, + "step": 1847 + }, + { + "completion_length": 190.203125, + "epoch": 0.5889101338432122, + "grad_norm": 7.518156051635742, + "kl": 0.0927734375, + "learning_rate": 4.1108986615678776e-07, + "loss": 0.0037, + "reward": 1.5900521278381348, + "reward_std": 0.06868287175893784, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46505212783813477, + "step": 1848 + }, + { + "completion_length": 313.59375, + "epoch": 0.5892288081580624, + "grad_norm": 6.491325855255127, + "kl": 0.07568359375, + "learning_rate": 4.107711918419375e-07, + "loss": 0.003, + "reward": 1.5029609203338623, + "reward_std": 0.04590877890586853, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37796100974082947, + "step": 1849 + }, + { + "completion_length": 295.8125, + "epoch": 0.5895474824729127, + "grad_norm": 11.342720985412598, + "kl": 0.07763671875, + "learning_rate": 4.104525175270873e-07, + "loss": 0.0031, + "reward": 1.5426108837127686, + "reward_std": 0.06933946907520294, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.29261088371276855, + "step": 1850 + }, + { + "completion_length": 248.25, + "epoch": 0.5898661567877629, + "grad_norm": 8.002965927124023, + "kl": 0.09033203125, + "learning_rate": 4.1013384321223707e-07, + "loss": 0.0036, + "reward": 1.4224798679351807, + "reward_std": 0.09290251135826111, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42247986793518066, + "rewards/pad": 0.0, + "step": 1851 + }, + { + "completion_length": 225.25, + "epoch": 0.5901848311026131, + "grad_norm": 27.69028091430664, + "kl": 0.06982421875, + "learning_rate": 4.098151688973869e-07, + "loss": 0.0028, + "reward": 1.8487733602523804, + "reward_std": 0.17965924739837646, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48939836025238037, + "step": 1852 + }, + { + "completion_length": 288.5, + "epoch": 0.5905035054174633, + "grad_norm": 9.17288589477539, + "kl": 0.0869140625, + "learning_rate": 4.0949649458253663e-07, + "loss": 0.0035, + "reward": 1.4427770376205444, + "reward_std": 0.0336136668920517, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4427770972251892, + "rewards/pad": 0.0, + "step": 1853 + }, + { + "completion_length": 241.296875, + "epoch": 0.5908221797323135, + "grad_norm": 8.414031028747559, + "kl": 0.09228515625, + "learning_rate": 4.091778202676864e-07, + "loss": 0.0037, + "reward": 1.7099772691726685, + "reward_std": 0.07219240069389343, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5849772691726685, + "rewards/pad": 0.125, + "step": 1854 + }, + { + "completion_length": 145.625, + "epoch": 0.5911408540471638, + "grad_norm": 8.692977905273438, + "kl": 0.1005859375, + "learning_rate": 4.0885914595283614e-07, + "loss": 0.004, + "reward": 1.5729535818099976, + "reward_std": 0.13664300739765167, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4323284924030304, + "rewards/pad": 0.15625, + "step": 1855 + }, + { + "completion_length": 259.796875, + "epoch": 0.591459528362014, + "grad_norm": 9.177119255065918, + "kl": 0.0859375, + "learning_rate": 4.0854047163798595e-07, + "loss": 0.0034, + "reward": 1.6044448614120483, + "reward_std": 0.09958793222904205, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.49506986141204834, + "step": 1856 + }, + { + "completion_length": 221.984375, + "epoch": 0.5917782026768642, + "grad_norm": 9.245564460754395, + "kl": 0.0947265625, + "learning_rate": 4.082217973231357e-07, + "loss": 0.0038, + "reward": 1.5397629737854004, + "reward_std": 0.12672016024589539, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4303880035877228, + "step": 1857 + }, + { + "completion_length": 201.09375, + "epoch": 0.5920968769917144, + "grad_norm": 11.010499000549316, + "kl": 0.10009765625, + "learning_rate": 4.079031230082855e-07, + "loss": 0.004, + "reward": 1.8728845119476318, + "reward_std": 0.09205446392297745, + "rewards/answer_reward": 0.265625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6072594523429871, + "step": 1858 + }, + { + "completion_length": 208.859375, + "epoch": 0.5924155513065646, + "grad_norm": 97.43705749511719, + "kl": 0.12890625, + "learning_rate": 4.0758444869343526e-07, + "loss": 0.0052, + "reward": 1.571459174156189, + "reward_std": 0.10175631195306778, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.571459174156189, + "step": 1859 + }, + { + "completion_length": 255.28125, + "epoch": 0.5927342256214149, + "grad_norm": 7.342701435089111, + "kl": 0.08203125, + "learning_rate": 4.0726577437858507e-07, + "loss": 0.0033, + "reward": 1.6075087785720825, + "reward_std": 0.0989253893494606, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4981337785720825, + "rewards/pad": 0.109375, + "step": 1860 + }, + { + "completion_length": 144.578125, + "epoch": 0.5930528999362651, + "grad_norm": 13.898255348205566, + "kl": 0.12255859375, + "learning_rate": 4.069471000637348e-07, + "loss": 0.0049, + "reward": 1.6271339654922485, + "reward_std": 0.0684891790151596, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5021339654922485, + "rewards/pad": 0.125, + "step": 1861 + }, + { + "completion_length": 325.75, + "epoch": 0.5933715742511153, + "grad_norm": 8.965490341186523, + "kl": 0.06591796875, + "learning_rate": 4.0662842574888463e-07, + "loss": 0.0026, + "reward": 1.6634330749511719, + "reward_std": 0.09572862088680267, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5384331941604614, + "rewards/pad": 0.125, + "step": 1862 + }, + { + "completion_length": 293.0, + "epoch": 0.5936902485659655, + "grad_norm": 6.07974910736084, + "kl": 0.1279296875, + "learning_rate": 4.063097514340344e-07, + "loss": 0.0051, + "reward": 1.4634603261947632, + "reward_std": 0.05717071518301964, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46346038579940796, + "step": 1863 + }, + { + "completion_length": 103.890625, + "epoch": 0.5940089228808159, + "grad_norm": 9.67024040222168, + "kl": 0.130859375, + "learning_rate": 4.059910771191842e-07, + "loss": 0.0052, + "reward": 1.7005484104156494, + "reward_std": 0.09798851609230042, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5755484700202942, + "rewards/pad": 0.125, + "step": 1864 + }, + { + "completion_length": 259.671875, + "epoch": 0.5943275971956661, + "grad_norm": 15.222319602966309, + "kl": 0.0888671875, + "learning_rate": 4.0567240280433395e-07, + "loss": 0.0036, + "reward": 1.5262441635131836, + "reward_std": 0.13098961114883423, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.30749407410621643, + "step": 1865 + }, + { + "completion_length": 300.15625, + "epoch": 0.5946462715105163, + "grad_norm": 7.883068561553955, + "kl": 0.09521484375, + "learning_rate": 4.0535372848948375e-07, + "loss": 0.0038, + "reward": 1.6139923334121704, + "reward_std": 0.06202385574579239, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6139923334121704, + "step": 1866 + }, + { + "completion_length": 188.03125, + "epoch": 0.5949649458253665, + "grad_norm": 11.199960708618164, + "kl": 0.11181640625, + "learning_rate": 4.050350541746335e-07, + "loss": 0.0045, + "reward": 1.4685306549072266, + "reward_std": 0.07563184201717377, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4685305953025818, + "step": 1867 + }, + { + "completion_length": 317.65625, + "epoch": 0.5952836201402167, + "grad_norm": 13.962400436401367, + "kl": 0.06298828125, + "learning_rate": 4.047163798597833e-07, + "loss": 0.0025, + "reward": 1.5315433740615845, + "reward_std": 0.07140391319990158, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3909183442592621, + "rewards/pad": 0.140625, + "step": 1868 + }, + { + "completion_length": 167.90625, + "epoch": 0.595602294455067, + "grad_norm": 12.310955047607422, + "kl": 0.10302734375, + "learning_rate": 4.0439770554493307e-07, + "loss": 0.0041, + "reward": 1.6179437637329102, + "reward_std": 0.07316721975803375, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3679437041282654, + "step": 1869 + }, + { + "completion_length": 307.828125, + "epoch": 0.5959209687699172, + "grad_norm": 4.975736141204834, + "kl": 0.0791015625, + "learning_rate": 4.040790312300829e-07, + "loss": 0.0032, + "reward": 1.3248085975646973, + "reward_std": 0.03344433754682541, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3248084485530853, + "step": 1870 + }, + { + "completion_length": 211.75, + "epoch": 0.5962396430847674, + "grad_norm": 9.022954940795898, + "kl": 0.0849609375, + "learning_rate": 4.0376035691523263e-07, + "loss": 0.0034, + "reward": 1.4232125282287598, + "reward_std": 0.07294245809316635, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.423212468624115, + "step": 1871 + }, + { + "completion_length": 297.09375, + "epoch": 0.5965583173996176, + "grad_norm": 19.87626075744629, + "kl": 0.0751953125, + "learning_rate": 4.0344168260038244e-07, + "loss": 0.003, + "reward": 1.5290082693099976, + "reward_std": 0.08635507524013519, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40400823950767517, + "rewards/pad": 0.125, + "step": 1872 + }, + { + "completion_length": 364.375, + "epoch": 0.5968769917144678, + "grad_norm": 8.226881980895996, + "kl": 0.087890625, + "learning_rate": 4.0312300828553214e-07, + "loss": 0.0035, + "reward": 1.4459209442138672, + "reward_std": 0.1533610224723816, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.47717100381851196, + "step": 1873 + }, + { + "completion_length": 150.03125, + "epoch": 0.5971956660293181, + "grad_norm": 16.274169921875, + "kl": 0.126953125, + "learning_rate": 4.0280433397068195e-07, + "loss": 0.0051, + "reward": 1.5381252765655518, + "reward_std": 0.10919656604528427, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41312527656555176, + "rewards/pad": 0.125, + "step": 1874 + }, + { + "completion_length": 202.6875, + "epoch": 0.5975143403441683, + "grad_norm": 9.773507118225098, + "kl": 0.10888671875, + "learning_rate": 4.024856596558317e-07, + "loss": 0.0044, + "reward": 1.7387768030166626, + "reward_std": 0.11466390639543533, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6137768030166626, + "step": 1875 + }, + { + "completion_length": 304.234375, + "epoch": 0.5978330146590185, + "grad_norm": 7.625086784362793, + "kl": 0.07470703125, + "learning_rate": 4.0216698534098145e-07, + "loss": 0.003, + "reward": 1.4400463104248047, + "reward_std": 0.08653664588928223, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4556713104248047, + "rewards/pad": 0.0, + "step": 1876 + }, + { + "completion_length": 151.234375, + "epoch": 0.5981516889738687, + "grad_norm": 14.312983512878418, + "kl": 0.10693359375, + "learning_rate": 4.0184831102613126e-07, + "loss": 0.0043, + "reward": 1.6709468364715576, + "reward_std": 0.07913589477539062, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.42094680666923523, + "step": 1877 + }, + { + "completion_length": 207.3125, + "epoch": 0.5984703632887189, + "grad_norm": 7.628540515899658, + "kl": 0.1044921875, + "learning_rate": 4.01529636711281e-07, + "loss": 0.0042, + "reward": 1.4818041324615479, + "reward_std": 0.09821852296590805, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48180416226387024, + "rewards/pad": 0.0, + "step": 1878 + }, + { + "completion_length": 140.609375, + "epoch": 0.5987890376035692, + "grad_norm": 10.626641273498535, + "kl": 0.1982421875, + "learning_rate": 4.012109623964308e-07, + "loss": 0.0079, + "reward": 1.6922886371612549, + "reward_std": 0.13950839638710022, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.7079135179519653, + "rewards/pad": 0.0, + "step": 1879 + }, + { + "completion_length": 294.15625, + "epoch": 0.5991077119184194, + "grad_norm": 25.593908309936523, + "kl": 0.07861328125, + "learning_rate": 4.008922880815806e-07, + "loss": 0.0031, + "reward": 1.4315825700759888, + "reward_std": 0.1311132311820984, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44720759987831116, + "rewards/pad": 0.0, + "step": 1880 + }, + { + "completion_length": 200.328125, + "epoch": 0.5994263862332696, + "grad_norm": 24.06759262084961, + "kl": 0.11279296875, + "learning_rate": 4.005736137667304e-07, + "loss": 0.0045, + "reward": 1.5628618001937866, + "reward_std": 0.09779863059520721, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5628618001937866, + "step": 1881 + }, + { + "completion_length": 243.4375, + "epoch": 0.5997450605481198, + "grad_norm": 5.812214374542236, + "kl": 0.0986328125, + "learning_rate": 4.0025493945188014e-07, + "loss": 0.0039, + "reward": 1.586617112159729, + "reward_std": 0.13455936312675476, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.602242112159729, + "rewards/pad": 0.0, + "step": 1882 + }, + { + "completion_length": 212.484375, + "epoch": 0.60006373486297, + "grad_norm": 11.938089370727539, + "kl": 0.09423828125, + "learning_rate": 3.9993626513702994e-07, + "loss": 0.0038, + "reward": 1.4001901149749756, + "reward_std": 0.054095618426799774, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.2751900553703308, + "rewards/pad": 0.125, + "step": 1883 + }, + { + "completion_length": 267.765625, + "epoch": 0.6003824091778203, + "grad_norm": 22.59400177001953, + "kl": 0.07666015625, + "learning_rate": 3.996175908221797e-07, + "loss": 0.0031, + "reward": 1.3393089771270752, + "reward_std": 0.05717310309410095, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.33930888772010803, + "step": 1884 + }, + { + "completion_length": 341.578125, + "epoch": 0.6007010834926705, + "grad_norm": 21.02568244934082, + "kl": 0.06787109375, + "learning_rate": 3.992989165073295e-07, + "loss": 0.0027, + "reward": 1.4882479906082153, + "reward_std": 0.1340409815311432, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5038730502128601, + "rewards/pad": 0.0, + "step": 1885 + }, + { + "completion_length": 181.578125, + "epoch": 0.6010197578075207, + "grad_norm": 10.017570495605469, + "kl": 0.1298828125, + "learning_rate": 3.9898024219247926e-07, + "loss": 0.0052, + "reward": 1.3889895677566528, + "reward_std": 0.07805732637643814, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38898953795433044, + "rewards/pad": 0.0, + "step": 1886 + }, + { + "completion_length": 305.75, + "epoch": 0.6013384321223709, + "grad_norm": 9.133565902709961, + "kl": 0.10693359375, + "learning_rate": 3.9866156787762907e-07, + "loss": 0.0043, + "reward": 1.6056489944458008, + "reward_std": 0.08965301513671875, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48064911365509033, + "step": 1887 + }, + { + "completion_length": 155.5, + "epoch": 0.6016571064372211, + "grad_norm": 9.378506660461426, + "kl": 0.12255859375, + "learning_rate": 3.983428935627788e-07, + "loss": 0.0049, + "reward": 1.60020112991333, + "reward_std": 0.11282531917095184, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4908260703086853, + "step": 1888 + }, + { + "completion_length": 257.46875, + "epoch": 0.6019757807520714, + "grad_norm": 8.208368301391602, + "kl": 0.06982421875, + "learning_rate": 3.980242192479286e-07, + "loss": 0.0028, + "reward": 1.8248472213745117, + "reward_std": 0.061266396194696426, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4498470425605774, + "step": 1889 + }, + { + "completion_length": 162.140625, + "epoch": 0.6022944550669216, + "grad_norm": 13.764080047607422, + "kl": 0.11181640625, + "learning_rate": 3.977055449330784e-07, + "loss": 0.0045, + "reward": 1.4256174564361572, + "reward_std": 0.08565089106559753, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.425617516040802, + "rewards/pad": 0.0, + "step": 1890 + }, + { + "completion_length": 287.6875, + "epoch": 0.6026131293817718, + "grad_norm": 20.85105323791504, + "kl": 0.0869140625, + "learning_rate": 3.973868706182282e-07, + "loss": 0.0035, + "reward": 1.375194787979126, + "reward_std": 0.17042842507362366, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.359569787979126, + "step": 1891 + }, + { + "completion_length": 296.953125, + "epoch": 0.602931803696622, + "grad_norm": 15.295384407043457, + "kl": 0.0771484375, + "learning_rate": 3.9706819630337794e-07, + "loss": 0.0031, + "reward": 1.5884289741516113, + "reward_std": 0.09348006546497345, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4634290039539337, + "step": 1892 + }, + { + "completion_length": 157.0625, + "epoch": 0.6032504780114722, + "grad_norm": 48.732383728027344, + "kl": 0.10888671875, + "learning_rate": 3.967495219885277e-07, + "loss": 0.0044, + "reward": 1.6936684846878052, + "reward_std": 0.09765303134918213, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.56866854429245, + "step": 1893 + }, + { + "completion_length": 155.9375, + "epoch": 0.6035691523263225, + "grad_norm": 9.036518096923828, + "kl": 0.0966796875, + "learning_rate": 3.9643084767367745e-07, + "loss": 0.0039, + "reward": 1.5201268196105957, + "reward_std": 0.1085783839225769, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4107518196105957, + "rewards/pad": 0.125, + "step": 1894 + }, + { + "completion_length": 423.0625, + "epoch": 0.6038878266411727, + "grad_norm": 6.770268440246582, + "kl": 0.058837890625, + "learning_rate": 3.9611217335882726e-07, + "loss": 0.0023, + "reward": 1.5838446617126465, + "reward_std": 0.11189889162778854, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4744696021080017, + "step": 1895 + }, + { + "completion_length": 243.03125, + "epoch": 0.6042065009560229, + "grad_norm": 17.663448333740234, + "kl": 0.1171875, + "learning_rate": 3.95793499043977e-07, + "loss": 0.0047, + "reward": 1.535946011543274, + "reward_std": 0.06312038749456406, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5359460115432739, + "rewards/pad": 0.0, + "step": 1896 + }, + { + "completion_length": 149.265625, + "epoch": 0.6045251752708731, + "grad_norm": 25.105934143066406, + "kl": 0.11279296875, + "learning_rate": 3.954748247291268e-07, + "loss": 0.0045, + "reward": 1.7034320831298828, + "reward_std": 0.17257440090179443, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5471821427345276, + "step": 1897 + }, + { + "completion_length": 237.578125, + "epoch": 0.6048438495857233, + "grad_norm": 47.23228073120117, + "kl": 0.076171875, + "learning_rate": 3.9515615041427657e-07, + "loss": 0.003, + "reward": 1.3625296354293823, + "reward_std": 0.05569884181022644, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3625296950340271, + "rewards/pad": 0.0, + "step": 1898 + }, + { + "completion_length": 181.34375, + "epoch": 0.6051625239005736, + "grad_norm": 24.187870025634766, + "kl": 0.11572265625, + "learning_rate": 3.948374760994264e-07, + "loss": 0.0046, + "reward": 1.5649663209915161, + "reward_std": 0.14966291189193726, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4087163209915161, + "rewards/pad": 0.15625, + "step": 1899 + }, + { + "completion_length": 260.171875, + "epoch": 0.6054811982154238, + "grad_norm": 13.711686134338379, + "kl": 0.08251953125, + "learning_rate": 3.9451880178457613e-07, + "loss": 0.0033, + "reward": 1.4515687227249146, + "reward_std": 0.03165704011917114, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45156872272491455, + "rewards/pad": 0.0, + "step": 1900 + }, + { + "completion_length": 283.40625, + "epoch": 0.605799872530274, + "grad_norm": 10.902450561523438, + "kl": 0.07763671875, + "learning_rate": 3.9420012746972594e-07, + "loss": 0.0031, + "reward": 1.4659996032714844, + "reward_std": 0.07243816554546356, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4659996032714844, + "rewards/pad": 0.0, + "step": 1901 + }, + { + "completion_length": 131.78125, + "epoch": 0.6061185468451242, + "grad_norm": 14.222344398498535, + "kl": 0.119140625, + "learning_rate": 3.938814531548757e-07, + "loss": 0.0048, + "reward": 1.5577516555786133, + "reward_std": 0.09155458956956863, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43275171518325806, + "rewards/pad": 0.125, + "step": 1902 + }, + { + "completion_length": 165.375, + "epoch": 0.6064372211599746, + "grad_norm": 42.20652770996094, + "kl": 0.10693359375, + "learning_rate": 3.935627788400255e-07, + "loss": 0.0043, + "reward": 1.5626838207244873, + "reward_std": 0.14869019389152527, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4845588803291321, + "rewards/pad": 0.078125, + "step": 1903 + }, + { + "completion_length": 341.890625, + "epoch": 0.6067558954748248, + "grad_norm": 15.570034980773926, + "kl": 0.06201171875, + "learning_rate": 3.9324410452517525e-07, + "loss": 0.0025, + "reward": 1.4920532703399658, + "reward_std": 0.11405228078365326, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5076783299446106, + "rewards/pad": 0.0, + "step": 1904 + }, + { + "completion_length": 250.53125, + "epoch": 0.607074569789675, + "grad_norm": 6.26237678527832, + "kl": 0.0830078125, + "learning_rate": 3.9292543021032506e-07, + "loss": 0.0033, + "reward": 1.6416082382202148, + "reward_std": 0.15905556082725525, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.42285817861557007, + "step": 1905 + }, + { + "completion_length": 206.390625, + "epoch": 0.6073932441045252, + "grad_norm": 15.79101276397705, + "kl": 0.0947265625, + "learning_rate": 3.926067558954748e-07, + "loss": 0.0038, + "reward": 1.5966687202453613, + "reward_std": 0.13306990265846252, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47166866064071655, + "rewards/pad": 0.125, + "step": 1906 + }, + { + "completion_length": 155.0, + "epoch": 0.6077119184193754, + "grad_norm": 16.358394622802734, + "kl": 0.11279296875, + "learning_rate": 3.922880815806246e-07, + "loss": 0.0045, + "reward": 1.724739909172058, + "reward_std": 0.11366938054561615, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4747399091720581, + "rewards/pad": 0.25, + "step": 1907 + }, + { + "completion_length": 253.15625, + "epoch": 0.6080305927342257, + "grad_norm": 28.876209259033203, + "kl": 0.07763671875, + "learning_rate": 3.919694072657744e-07, + "loss": 0.0031, + "reward": 1.5843945741653442, + "reward_std": 0.1402738243341446, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.490644633769989, + "step": 1908 + }, + { + "completion_length": 265.671875, + "epoch": 0.6083492670490759, + "grad_norm": 30.23357391357422, + "kl": 0.08935546875, + "learning_rate": 3.9165073295092413e-07, + "loss": 0.0036, + "reward": 1.5527620315551758, + "reward_std": 0.10428045690059662, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.31838709115982056, + "step": 1909 + }, + { + "completion_length": 185.734375, + "epoch": 0.6086679413639261, + "grad_norm": 12.290230751037598, + "kl": 0.10205078125, + "learning_rate": 3.9133205863607394e-07, + "loss": 0.0041, + "reward": 1.577617883682251, + "reward_std": 0.08793454617261887, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.577617883682251, + "rewards/pad": 0.0, + "step": 1910 + }, + { + "completion_length": 255.359375, + "epoch": 0.6089866156787763, + "grad_norm": 18.367904663085938, + "kl": 0.10205078125, + "learning_rate": 3.910133843212237e-07, + "loss": 0.0041, + "reward": 1.344871997833252, + "reward_std": 0.13884970545768738, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.37612199783325195, + "step": 1911 + }, + { + "completion_length": 287.296875, + "epoch": 0.6093052899936265, + "grad_norm": 11.594067573547363, + "kl": 0.07568359375, + "learning_rate": 3.906947100063735e-07, + "loss": 0.003, + "reward": 1.5959113836288452, + "reward_std": 0.07247813045978546, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4709113836288452, + "step": 1912 + }, + { + "completion_length": 258.34375, + "epoch": 0.6096239643084768, + "grad_norm": 6.591706275939941, + "kl": 0.07080078125, + "learning_rate": 3.903760356915232e-07, + "loss": 0.0028, + "reward": 1.65617036819458, + "reward_std": 0.05984325334429741, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5311704277992249, + "rewards/pad": 0.125, + "step": 1913 + }, + { + "completion_length": 198.046875, + "epoch": 0.609942638623327, + "grad_norm": 10.927465438842773, + "kl": 0.09326171875, + "learning_rate": 3.90057361376673e-07, + "loss": 0.0037, + "reward": 1.6630644798278809, + "reward_std": 0.0670454129576683, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6630644798278809, + "rewards/pad": 0.0, + "step": 1914 + }, + { + "completion_length": 253.65625, + "epoch": 0.6102613129381772, + "grad_norm": 16.31411361694336, + "kl": 0.1025390625, + "learning_rate": 3.8973868706182276e-07, + "loss": 0.0041, + "reward": 1.5014548301696777, + "reward_std": 0.10171373188495636, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5170798897743225, + "step": 1915 + }, + { + "completion_length": 253.40625, + "epoch": 0.6105799872530274, + "grad_norm": 6.231085777282715, + "kl": 0.103515625, + "learning_rate": 3.8942001274697257e-07, + "loss": 0.0041, + "reward": 1.4525947570800781, + "reward_std": 0.06625127792358398, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45259469747543335, + "rewards/pad": 0.0, + "step": 1916 + }, + { + "completion_length": 216.78125, + "epoch": 0.6108986615678776, + "grad_norm": 10.79334831237793, + "kl": 0.08740234375, + "learning_rate": 3.891013384321223e-07, + "loss": 0.0035, + "reward": 1.8049113750457764, + "reward_std": 0.08902693539857864, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6799114346504211, + "rewards/pad": 0.125, + "step": 1917 + }, + { + "completion_length": 283.734375, + "epoch": 0.6112173358827279, + "grad_norm": 9.601179122924805, + "kl": 0.08203125, + "learning_rate": 3.8878266411727213e-07, + "loss": 0.0033, + "reward": 1.535913348197937, + "reward_std": 0.0846046507358551, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.426538348197937, + "step": 1918 + }, + { + "completion_length": 321.453125, + "epoch": 0.6115360101975781, + "grad_norm": 13.384991645812988, + "kl": 0.07275390625, + "learning_rate": 3.884639898024219e-07, + "loss": 0.0029, + "reward": 1.493588924407959, + "reward_std": 0.03804008662700653, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49358880519866943, + "step": 1919 + }, + { + "completion_length": 184.3125, + "epoch": 0.6118546845124283, + "grad_norm": 13.001421928405762, + "kl": 0.10009765625, + "learning_rate": 3.881453154875717e-07, + "loss": 0.004, + "reward": 1.6277797222137451, + "reward_std": 0.18806394934654236, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5496547818183899, + "step": 1920 + }, + { + "completion_length": 330.90625, + "epoch": 0.6121733588272785, + "grad_norm": 10.868631362915039, + "kl": 0.05859375, + "learning_rate": 3.8782664117272144e-07, + "loss": 0.0023, + "reward": 1.4844664335250854, + "reward_std": 0.058168746531009674, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3594663739204407, + "step": 1921 + }, + { + "completion_length": 247.5625, + "epoch": 0.6124920331421287, + "grad_norm": 13.617033004760742, + "kl": 0.1484375, + "learning_rate": 3.8750796685787125e-07, + "loss": 0.0059, + "reward": 1.4870398044586182, + "reward_std": 0.12303633987903595, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.3932897746562958, + "rewards/pad": 0.125, + "step": 1922 + }, + { + "completion_length": 196.28125, + "epoch": 0.612810707456979, + "grad_norm": 17.17580795288086, + "kl": 0.083984375, + "learning_rate": 3.87189292543021e-07, + "loss": 0.0033, + "reward": 1.5076944828033447, + "reward_std": 0.12078005820512772, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5233194828033447, + "step": 1923 + }, + { + "completion_length": 139.265625, + "epoch": 0.6131293817718292, + "grad_norm": 6.496094703674316, + "kl": 0.1162109375, + "learning_rate": 3.868706182281708e-07, + "loss": 0.0046, + "reward": 1.4235994815826416, + "reward_std": 0.053943369537591934, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4235994815826416, + "rewards/pad": 0.0, + "step": 1924 + }, + { + "completion_length": 288.4375, + "epoch": 0.6134480560866794, + "grad_norm": 29.971853256225586, + "kl": 0.08203125, + "learning_rate": 3.8655194391332057e-07, + "loss": 0.0033, + "reward": 1.4659730195999146, + "reward_std": 0.05683894455432892, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4659729599952698, + "rewards/pad": 0.0, + "step": 1925 + }, + { + "completion_length": 183.3125, + "epoch": 0.6137667304015296, + "grad_norm": 13.859814643859863, + "kl": 0.10205078125, + "learning_rate": 3.8623326959847037e-07, + "loss": 0.0041, + "reward": 1.5560696125030518, + "reward_std": 0.10703746974468231, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.571694552898407, + "rewards/pad": 0.0, + "step": 1926 + }, + { + "completion_length": 272.53125, + "epoch": 0.6140854047163798, + "grad_norm": 18.45200538635254, + "kl": 0.0830078125, + "learning_rate": 3.8591459528362013e-07, + "loss": 0.0033, + "reward": 1.680010199546814, + "reward_std": 0.11167501658201218, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39876025915145874, + "rewards/pad": 0.28125, + "step": 1927 + }, + { + "completion_length": 214.59375, + "epoch": 0.6144040790312301, + "grad_norm": 21.060997009277344, + "kl": 0.0888671875, + "learning_rate": 3.8559592096876993e-07, + "loss": 0.0035, + "reward": 1.52708101272583, + "reward_std": 0.1346757560968399, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3708308935165405, + "rewards/pad": 0.15625, + "step": 1928 + }, + { + "completion_length": 322.53125, + "epoch": 0.6147227533460803, + "grad_norm": 16.372791290283203, + "kl": 0.07666015625, + "learning_rate": 3.852772466539197e-07, + "loss": 0.0031, + "reward": 1.5177533626556396, + "reward_std": 0.0657130628824234, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5177534818649292, + "step": 1929 + }, + { + "completion_length": 199.5625, + "epoch": 0.6150414276609305, + "grad_norm": 23.437456130981445, + "kl": 0.09423828125, + "learning_rate": 3.849585723390695e-07, + "loss": 0.0038, + "reward": 1.479830265045166, + "reward_std": 0.10783857107162476, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47983020544052124, + "rewards/pad": 0.0, + "step": 1930 + }, + { + "completion_length": 253.46875, + "epoch": 0.6153601019757807, + "grad_norm": 8.0709867477417, + "kl": 0.09521484375, + "learning_rate": 3.8463989802421925e-07, + "loss": 0.0038, + "reward": 1.6359091997146606, + "reward_std": 0.054778601974248886, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5109091997146606, + "step": 1931 + }, + { + "completion_length": 330.0625, + "epoch": 0.615678776290631, + "grad_norm": 5.908551216125488, + "kl": 0.06884765625, + "learning_rate": 3.84321223709369e-07, + "loss": 0.0028, + "reward": 1.397073745727539, + "reward_std": 0.03605101630091667, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3970736861228943, + "step": 1932 + }, + { + "completion_length": 313.6875, + "epoch": 0.6159974506054812, + "grad_norm": 5.9190897941589355, + "kl": 0.08740234375, + "learning_rate": 3.8400254939451876e-07, + "loss": 0.0035, + "reward": 1.4032435417175293, + "reward_std": 0.11116233468055725, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4032435417175293, + "step": 1933 + }, + { + "completion_length": 244.328125, + "epoch": 0.6163161249203314, + "grad_norm": 8.51202392578125, + "kl": 0.103515625, + "learning_rate": 3.8368387507966856e-07, + "loss": 0.0041, + "reward": 1.6362656354904175, + "reward_std": 0.08885425329208374, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6362656950950623, + "step": 1934 + }, + { + "completion_length": 209.75, + "epoch": 0.6166347992351816, + "grad_norm": 14.257871627807617, + "kl": 0.10986328125, + "learning_rate": 3.833652007648183e-07, + "loss": 0.0044, + "reward": 1.492276668548584, + "reward_std": 0.08433869481086731, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36727672815322876, + "step": 1935 + }, + { + "completion_length": 203.90625, + "epoch": 0.6169534735500318, + "grad_norm": 15.767688751220703, + "kl": 0.1005859375, + "learning_rate": 3.830465264499681e-07, + "loss": 0.004, + "reward": 1.728574514389038, + "reward_std": 0.1251630187034607, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6191996335983276, + "rewards/pad": 0.125, + "step": 1936 + }, + { + "completion_length": 218.09375, + "epoch": 0.617272147864882, + "grad_norm": 11.859417915344238, + "kl": 0.1240234375, + "learning_rate": 3.827278521351179e-07, + "loss": 0.005, + "reward": 1.3472200632095337, + "reward_std": 0.050142768770456314, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3472200930118561, + "step": 1937 + }, + { + "completion_length": 273.96875, + "epoch": 0.6175908221797323, + "grad_norm": 9.644183158874512, + "kl": 0.09521484375, + "learning_rate": 3.8240917782026763e-07, + "loss": 0.0038, + "reward": 1.8013083934783936, + "reward_std": 0.06755261868238449, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.551308274269104, + "rewards/pad": 0.25, + "step": 1938 + }, + { + "completion_length": 314.0625, + "epoch": 0.6179094964945825, + "grad_norm": 5.385295391082764, + "kl": 0.07177734375, + "learning_rate": 3.8209050350541744e-07, + "loss": 0.0029, + "reward": 1.4403769969940186, + "reward_std": 0.08441875874996185, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45600199699401855, + "step": 1939 + }, + { + "completion_length": 283.09375, + "epoch": 0.6182281708094327, + "grad_norm": 16.010988235473633, + "kl": 0.09033203125, + "learning_rate": 3.817718291905672e-07, + "loss": 0.0036, + "reward": 1.4347320795059204, + "reward_std": 0.05241646617650986, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4347321391105652, + "step": 1940 + }, + { + "completion_length": 159.171875, + "epoch": 0.6185468451242829, + "grad_norm": 12.74837875366211, + "kl": 0.11279296875, + "learning_rate": 3.81453154875717e-07, + "loss": 0.0045, + "reward": 1.7130329608917236, + "reward_std": 0.09416225552558899, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5880329012870789, + "step": 1941 + }, + { + "completion_length": 246.609375, + "epoch": 0.6188655194391333, + "grad_norm": 10.222420692443848, + "kl": 0.0869140625, + "learning_rate": 3.8113448056086675e-07, + "loss": 0.0035, + "reward": 1.6772123575210571, + "reward_std": 0.0816313698887825, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4428373873233795, + "rewards/pad": 0.234375, + "step": 1942 + }, + { + "completion_length": 103.359375, + "epoch": 0.6191841937539835, + "grad_norm": 8.743064880371094, + "kl": 0.140625, + "learning_rate": 3.8081580624601656e-07, + "loss": 0.0056, + "reward": 1.548558235168457, + "reward_std": 0.12006276845932007, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5485581755638123, + "rewards/pad": 0.0, + "step": 1943 + }, + { + "completion_length": 334.34375, + "epoch": 0.6195028680688337, + "grad_norm": 11.024765968322754, + "kl": 0.1640625, + "learning_rate": 3.804971319311663e-07, + "loss": 0.0066, + "reward": 1.461504340171814, + "reward_std": 0.19224317371845245, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4927542805671692, + "rewards/pad": 0.0, + "step": 1944 + }, + { + "completion_length": 194.65625, + "epoch": 0.6198215423836839, + "grad_norm": 13.036958694458008, + "kl": 0.1455078125, + "learning_rate": 3.801784576163161e-07, + "loss": 0.0058, + "reward": 1.6175357103347778, + "reward_std": 0.11850807815790176, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5081607103347778, + "step": 1945 + }, + { + "completion_length": 337.9375, + "epoch": 0.6201402166985341, + "grad_norm": 6.2965264320373535, + "kl": 0.0693359375, + "learning_rate": 3.798597833014659e-07, + "loss": 0.0028, + "reward": 1.675209641456604, + "reward_std": 0.045450374484062195, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.675209641456604, + "step": 1946 + }, + { + "completion_length": 209.671875, + "epoch": 0.6204588910133844, + "grad_norm": 10.050835609436035, + "kl": 0.095703125, + "learning_rate": 3.795411089866157e-07, + "loss": 0.0038, + "reward": 1.769952416419983, + "reward_std": 0.13773831725120544, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5355774760246277, + "step": 1947 + }, + { + "completion_length": 296.0, + "epoch": 0.6207775653282346, + "grad_norm": 8.063725471496582, + "kl": 0.0869140625, + "learning_rate": 3.7922243467176544e-07, + "loss": 0.0035, + "reward": 1.4973119497299194, + "reward_std": 0.1238342821598053, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3879369795322418, + "rewards/pad": 0.125, + "step": 1948 + }, + { + "completion_length": 237.25, + "epoch": 0.6210962396430848, + "grad_norm": 11.939338684082031, + "kl": 0.08740234375, + "learning_rate": 3.7890376035691524e-07, + "loss": 0.0035, + "reward": 1.6560659408569336, + "reward_std": 0.05877237021923065, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5310658812522888, + "step": 1949 + }, + { + "completion_length": 338.9375, + "epoch": 0.621414913957935, + "grad_norm": 9.682485580444336, + "kl": 0.06787109375, + "learning_rate": 3.78585086042065e-07, + "loss": 0.0027, + "reward": 1.4578567743301392, + "reward_std": 0.10045262426137924, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34848177433013916, + "rewards/pad": 0.125, + "step": 1950 + }, + { + "completion_length": 244.734375, + "epoch": 0.6217335882727852, + "grad_norm": 31.29572296142578, + "kl": 0.0908203125, + "learning_rate": 3.782664117272148e-07, + "loss": 0.0036, + "reward": 1.561888337135315, + "reward_std": 0.04258840158581734, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5618883371353149, + "rewards/pad": 0.0, + "step": 1951 + }, + { + "completion_length": 234.125, + "epoch": 0.6220522625876355, + "grad_norm": 10.641196250915527, + "kl": 0.1513671875, + "learning_rate": 3.779477374123645e-07, + "loss": 0.0061, + "reward": 1.4462183713912964, + "reward_std": 0.15027868747711182, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.414968341588974, + "rewards/pad": 0.03125, + "step": 1952 + }, + { + "completion_length": 203.34375, + "epoch": 0.6223709369024857, + "grad_norm": 15.608134269714355, + "kl": 0.0966796875, + "learning_rate": 3.776290630975143e-07, + "loss": 0.0039, + "reward": 1.7943435907363892, + "reward_std": 0.09442506730556488, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6693435311317444, + "step": 1953 + }, + { + "completion_length": 238.421875, + "epoch": 0.6226896112173359, + "grad_norm": 38.01367950439453, + "kl": 0.0927734375, + "learning_rate": 3.7731038878266407e-07, + "loss": 0.0037, + "reward": 1.5012284517288208, + "reward_std": 0.10308201611042023, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4231035113334656, + "step": 1954 + }, + { + "completion_length": 200.796875, + "epoch": 0.6230082855321861, + "grad_norm": 8.269294738769531, + "kl": 0.07861328125, + "learning_rate": 3.769917144678139e-07, + "loss": 0.0032, + "reward": 1.824209451675415, + "reward_std": 0.12756913900375366, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.589834451675415, + "step": 1955 + }, + { + "completion_length": 212.171875, + "epoch": 0.6233269598470363, + "grad_norm": 17.209590911865234, + "kl": 0.10107421875, + "learning_rate": 3.7667304015296363e-07, + "loss": 0.004, + "reward": 1.6338493824005127, + "reward_std": 0.06915850192308426, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6338493824005127, + "step": 1956 + }, + { + "completion_length": 275.015625, + "epoch": 0.6236456341618866, + "grad_norm": 6.851185321807861, + "kl": 0.07373046875, + "learning_rate": 3.7635436583811344e-07, + "loss": 0.0029, + "reward": 1.5390303134918213, + "reward_std": 0.0869000107049942, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2890303134918213, + "step": 1957 + }, + { + "completion_length": 155.125, + "epoch": 0.6239643084767368, + "grad_norm": 11.542649269104004, + "kl": 0.1201171875, + "learning_rate": 3.760356915232632e-07, + "loss": 0.0048, + "reward": 1.5345096588134766, + "reward_std": 0.05417778715491295, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5345095992088318, + "rewards/pad": 0.0, + "step": 1958 + }, + { + "completion_length": 207.5625, + "epoch": 0.624282982791587, + "grad_norm": 6.008335590362549, + "kl": 0.0966796875, + "learning_rate": 3.75717017208413e-07, + "loss": 0.0039, + "reward": 1.4731647968292236, + "reward_std": 0.14521664381027222, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39503979682922363, + "rewards/pad": 0.078125, + "step": 1959 + }, + { + "completion_length": 252.5625, + "epoch": 0.6246016571064372, + "grad_norm": 10.692580223083496, + "kl": 0.076171875, + "learning_rate": 3.7539834289356275e-07, + "loss": 0.003, + "reward": 1.6467187404632568, + "reward_std": 0.11047782748937607, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44359374046325684, + "rewards/pad": 0.203125, + "step": 1960 + }, + { + "completion_length": 359.296875, + "epoch": 0.6249203314212874, + "grad_norm": 4.429586887359619, + "kl": 0.05029296875, + "learning_rate": 3.7507966857871256e-07, + "loss": 0.002, + "reward": 1.327553629875183, + "reward_std": 0.09251774847507477, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3431786596775055, + "rewards/pad": 0.0, + "step": 1961 + }, + { + "completion_length": 294.515625, + "epoch": 0.6252390057361377, + "grad_norm": 12.322518348693848, + "kl": 0.0712890625, + "learning_rate": 3.747609942638623e-07, + "loss": 0.0029, + "reward": 1.5070735216140747, + "reward_std": 0.1569286286830902, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5226985216140747, + "step": 1962 + }, + { + "completion_length": 208.21875, + "epoch": 0.6255576800509879, + "grad_norm": 24.573041915893555, + "kl": 0.08837890625, + "learning_rate": 3.744423199490121e-07, + "loss": 0.0035, + "reward": 1.5982944965362549, + "reward_std": 0.14915892481803894, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5045445561408997, + "step": 1963 + }, + { + "completion_length": 275.953125, + "epoch": 0.6258763543658381, + "grad_norm": 11.793495178222656, + "kl": 0.08837890625, + "learning_rate": 3.7412364563416187e-07, + "loss": 0.0035, + "reward": 1.4760801792144775, + "reward_std": 0.041609302163124084, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3510802984237671, + "step": 1964 + }, + { + "completion_length": 218.25, + "epoch": 0.6261950286806883, + "grad_norm": 13.615195274353027, + "kl": 0.11669921875, + "learning_rate": 3.738049713193117e-07, + "loss": 0.0047, + "reward": 1.6353042125701904, + "reward_std": 0.09656595438718796, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5103042125701904, + "step": 1965 + }, + { + "completion_length": 289.375, + "epoch": 0.6265137029955385, + "grad_norm": 5.561672210693359, + "kl": 0.08056640625, + "learning_rate": 3.7348629700446143e-07, + "loss": 0.0032, + "reward": 1.458938717842102, + "reward_std": 0.08781175315380096, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4589385986328125, + "step": 1966 + }, + { + "completion_length": 220.921875, + "epoch": 0.6268323773103888, + "grad_norm": 8.867315292358398, + "kl": 0.11572265625, + "learning_rate": 3.7316762268961124e-07, + "loss": 0.0046, + "reward": 1.5917067527770996, + "reward_std": 0.08042184263467789, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5917068123817444, + "rewards/pad": 0.0, + "step": 1967 + }, + { + "completion_length": 218.890625, + "epoch": 0.627151051625239, + "grad_norm": 14.069080352783203, + "kl": 0.07763671875, + "learning_rate": 3.72848948374761e-07, + "loss": 0.0031, + "reward": 1.7388994693756104, + "reward_std": 0.09234826266765594, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4888995289802551, + "rewards/pad": 0.25, + "step": 1968 + }, + { + "completion_length": 312.96875, + "epoch": 0.6274697259400892, + "grad_norm": 7.027201175689697, + "kl": 0.12060546875, + "learning_rate": 3.725302740599108e-07, + "loss": 0.0048, + "reward": 1.5714778900146484, + "reward_std": 0.14681343734264374, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46210283041000366, + "rewards/pad": 0.125, + "step": 1969 + }, + { + "completion_length": 245.671875, + "epoch": 0.6277884002549394, + "grad_norm": 7.419850826263428, + "kl": 0.068359375, + "learning_rate": 3.7221159974506056e-07, + "loss": 0.0027, + "reward": 1.7859978675842285, + "reward_std": 0.0766286849975586, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6609978675842285, + "rewards/pad": 0.125, + "step": 1970 + }, + { + "completion_length": 163.28125, + "epoch": 0.6281070745697896, + "grad_norm": 54.53138732910156, + "kl": 0.10888671875, + "learning_rate": 3.718929254302103e-07, + "loss": 0.0044, + "reward": 1.4890398979187012, + "reward_std": 0.11502894014120102, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3796647787094116, + "rewards/pad": 0.125, + "step": 1971 + }, + { + "completion_length": 350.671875, + "epoch": 0.6284257488846399, + "grad_norm": 6.750097274780273, + "kl": 0.062255859375, + "learning_rate": 3.7157425111536006e-07, + "loss": 0.0025, + "reward": 1.429856300354004, + "reward_std": 0.0958351194858551, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44548124074935913, + "step": 1972 + }, + { + "completion_length": 216.578125, + "epoch": 0.6287444231994901, + "grad_norm": 7.220076084136963, + "kl": 0.07763671875, + "learning_rate": 3.712555768005098e-07, + "loss": 0.0031, + "reward": 1.8001503944396973, + "reward_std": 0.09091947972774506, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4251503646373749, + "rewards/pad": 0.375, + "step": 1973 + }, + { + "completion_length": 227.6875, + "epoch": 0.6290630975143403, + "grad_norm": 5.458504676818848, + "kl": 0.10693359375, + "learning_rate": 3.709369024856596e-07, + "loss": 0.0043, + "reward": 1.4395153522491455, + "reward_std": 0.08207175880670547, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43951529264450073, + "rewards/pad": 0.0, + "step": 1974 + }, + { + "completion_length": 151.265625, + "epoch": 0.6293817718291905, + "grad_norm": 13.328695297241211, + "kl": 0.1220703125, + "learning_rate": 3.706182281708094e-07, + "loss": 0.0049, + "reward": 1.6766632795333862, + "reward_std": 0.09797574579715729, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.676663339138031, + "rewards/pad": 0.0, + "step": 1975 + }, + { + "completion_length": 231.625, + "epoch": 0.6297004461440407, + "grad_norm": 10.131150245666504, + "kl": 0.1328125, + "learning_rate": 3.702995538559592e-07, + "loss": 0.0053, + "reward": 1.8385590314865112, + "reward_std": 0.1282750517129898, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.47918403148651123, + "step": 1976 + }, + { + "completion_length": 307.25, + "epoch": 0.630019120458891, + "grad_norm": 7.387079238891602, + "kl": 0.0673828125, + "learning_rate": 3.6998087954110894e-07, + "loss": 0.0027, + "reward": 1.5480023622512817, + "reward_std": 0.04613717645406723, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.42300236225128174, + "step": 1977 + }, + { + "completion_length": 164.6875, + "epoch": 0.6303377947737412, + "grad_norm": 16.791282653808594, + "kl": 0.1416015625, + "learning_rate": 3.6966220522625875e-07, + "loss": 0.0057, + "reward": 1.5016502141952515, + "reward_std": 0.08761771768331528, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5016502737998962, + "step": 1978 + }, + { + "completion_length": 218.96875, + "epoch": 0.6306564690885914, + "grad_norm": 72.0509033203125, + "kl": 0.07958984375, + "learning_rate": 3.693435309114085e-07, + "loss": 0.0032, + "reward": 1.9234390258789062, + "reward_std": 0.22262617945671082, + "rewards/pad": 0.265625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6734391450881958, + "step": 1979 + }, + { + "completion_length": 337.03125, + "epoch": 0.6309751434034416, + "grad_norm": 13.148605346679688, + "kl": 0.07275390625, + "learning_rate": 3.690248565965583e-07, + "loss": 0.0029, + "reward": 1.4018738269805908, + "reward_std": 0.17414557933807373, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4331238567829132, + "rewards/pad": 0.0, + "step": 1980 + }, + { + "completion_length": 182.375, + "epoch": 0.631293817718292, + "grad_norm": 7.003880977630615, + "kl": 0.0908203125, + "learning_rate": 3.6870618228170806e-07, + "loss": 0.0036, + "reward": 1.6010947227478027, + "reward_std": 0.11983469128608704, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.2573447823524475, + "step": 1981 + }, + { + "completion_length": 270.84375, + "epoch": 0.6316124920331422, + "grad_norm": 5.914927005767822, + "kl": 0.07958984375, + "learning_rate": 3.6838750796685787e-07, + "loss": 0.0032, + "reward": 1.6006536483764648, + "reward_std": 0.11984487622976303, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49127864837646484, + "rewards/pad": 0.125, + "step": 1982 + }, + { + "completion_length": 222.578125, + "epoch": 0.6319311663479924, + "grad_norm": 6.8138747215271, + "kl": 0.10888671875, + "learning_rate": 3.680688336520076e-07, + "loss": 0.0043, + "reward": 1.5949468612670898, + "reward_std": 0.08582202345132828, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5949469208717346, + "rewards/pad": 0.0, + "step": 1983 + }, + { + "completion_length": 243.03125, + "epoch": 0.6322498406628426, + "grad_norm": 12.201217651367188, + "kl": 0.078125, + "learning_rate": 3.6775015933715743e-07, + "loss": 0.0031, + "reward": 1.477726936340332, + "reward_std": 0.1586262583732605, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49335187673568726, + "rewards/pad": 0.0, + "step": 1984 + }, + { + "completion_length": 279.671875, + "epoch": 0.6325685149776928, + "grad_norm": 14.337632179260254, + "kl": 0.0859375, + "learning_rate": 3.674314850223072e-07, + "loss": 0.0034, + "reward": 1.4611310958862305, + "reward_std": 0.07045906037092209, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46113121509552, + "step": 1985 + }, + { + "completion_length": 313.609375, + "epoch": 0.6328871892925431, + "grad_norm": 28.399967193603516, + "kl": 0.08154296875, + "learning_rate": 3.67112810707457e-07, + "loss": 0.0033, + "reward": 1.359786868095398, + "reward_std": 0.09078215062618256, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.35978686809539795, + "step": 1986 + }, + { + "completion_length": 152.953125, + "epoch": 0.6332058636073933, + "grad_norm": 21.749130249023438, + "kl": 0.1005859375, + "learning_rate": 3.6679413639260674e-07, + "loss": 0.004, + "reward": 1.3360384702682495, + "reward_std": 0.047141361981630325, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3360384404659271, + "rewards/pad": 0.0, + "step": 1987 + }, + { + "completion_length": 294.984375, + "epoch": 0.6335245379222435, + "grad_norm": 12.006050109863281, + "kl": 0.07421875, + "learning_rate": 3.6647546207775655e-07, + "loss": 0.003, + "reward": 1.4102833271026611, + "reward_std": 0.2350875735282898, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.2696583569049835, + "step": 1988 + }, + { + "completion_length": 259.015625, + "epoch": 0.6338432122370937, + "grad_norm": 7.643406867980957, + "kl": 0.1044921875, + "learning_rate": 3.661567877629063e-07, + "loss": 0.0042, + "reward": 1.5448561906814575, + "reward_std": 0.09191156923770905, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4354812204837799, + "step": 1989 + }, + { + "completion_length": 110.65625, + "epoch": 0.6341618865519439, + "grad_norm": 11.28622817993164, + "kl": 0.1533203125, + "learning_rate": 3.658381134480561e-07, + "loss": 0.0061, + "reward": 1.6380500793457031, + "reward_std": 0.18105697631835938, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5286750793457031, + "rewards/pad": 0.125, + "step": 1990 + }, + { + "completion_length": 173.828125, + "epoch": 0.6344805608667942, + "grad_norm": 6.512990951538086, + "kl": 0.099609375, + "learning_rate": 3.655194391332058e-07, + "loss": 0.004, + "reward": 1.6445505619049072, + "reward_std": 0.054843753576278687, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5195505619049072, + "rewards/pad": 0.125, + "step": 1991 + }, + { + "completion_length": 306.859375, + "epoch": 0.6347992351816444, + "grad_norm": 10.722943305969238, + "kl": 0.076171875, + "learning_rate": 3.652007648183556e-07, + "loss": 0.003, + "reward": 1.3924834728240967, + "reward_std": 0.09814517199993134, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.28310853242874146, + "step": 1992 + }, + { + "completion_length": 194.40625, + "epoch": 0.6351179094964946, + "grad_norm": 7.510833740234375, + "kl": 0.1328125, + "learning_rate": 3.648820905035054e-07, + "loss": 0.0053, + "reward": 1.677827000617981, + "reward_std": 0.06774307787418365, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.677827000617981, + "rewards/pad": 0.0, + "step": 1993 + }, + { + "completion_length": 275.46875, + "epoch": 0.6354365838113448, + "grad_norm": 35.3467903137207, + "kl": 0.08544921875, + "learning_rate": 3.645634161886552e-07, + "loss": 0.0034, + "reward": 1.5682491064071655, + "reward_std": 0.1375354379415512, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4588741362094879, + "step": 1994 + }, + { + "completion_length": 217.609375, + "epoch": 0.635755258126195, + "grad_norm": 8.333303451538086, + "kl": 0.09375, + "learning_rate": 3.6424474187380494e-07, + "loss": 0.0038, + "reward": 1.6891238689422607, + "reward_std": 0.07034970819950104, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.564123809337616, + "rewards/pad": 0.125, + "step": 1995 + }, + { + "completion_length": 230.8125, + "epoch": 0.6360739324410453, + "grad_norm": 9.261439323425293, + "kl": 0.10546875, + "learning_rate": 3.6392606755895474e-07, + "loss": 0.0042, + "reward": 1.5057934522628784, + "reward_std": 0.10754223167896271, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5057935118675232, + "rewards/pad": 0.0, + "step": 1996 + }, + { + "completion_length": 332.25, + "epoch": 0.6363926067558955, + "grad_norm": 5.428790092468262, + "kl": 0.07177734375, + "learning_rate": 3.636073932441045e-07, + "loss": 0.0029, + "reward": 1.567945122718811, + "reward_std": 0.06191801652312279, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.567945122718811, + "step": 1997 + }, + { + "completion_length": 323.0, + "epoch": 0.6367112810707457, + "grad_norm": 29.642379760742188, + "kl": 0.109375, + "learning_rate": 3.632887189292543e-07, + "loss": 0.0044, + "reward": 1.7203319072723389, + "reward_std": 0.07314572483301163, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4703318774700165, + "step": 1998 + }, + { + "completion_length": 223.3125, + "epoch": 0.6370299553855959, + "grad_norm": 9.734238624572754, + "kl": 0.10986328125, + "learning_rate": 3.6297004461440406e-07, + "loss": 0.0044, + "reward": 1.5392111539840698, + "reward_std": 0.11153466254472733, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5392111539840698, + "rewards/pad": 0.0, + "step": 1999 + }, + { + "completion_length": 267.5, + "epoch": 0.6373486297004461, + "grad_norm": 12.07893180847168, + "kl": 0.09326171875, + "learning_rate": 3.626513702995538e-07, + "loss": 0.0037, + "reward": 1.5638694763183594, + "reward_std": 0.09723970293998718, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43886953592300415, + "rewards/pad": 0.125, + "step": 2000 + }, + { + "completion_length": 313.0625, + "epoch": 0.6376673040152964, + "grad_norm": 5.8718414306640625, + "kl": 0.07373046875, + "learning_rate": 3.623326959847036e-07, + "loss": 0.0029, + "reward": 1.4742928743362427, + "reward_std": 0.05352408438920975, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47429290413856506, + "step": 2001 + }, + { + "completion_length": 314.875, + "epoch": 0.6379859783301466, + "grad_norm": 72.57586669921875, + "kl": 0.08740234375, + "learning_rate": 3.6201402166985337e-07, + "loss": 0.0035, + "reward": 1.5000325441360474, + "reward_std": 0.06527876853942871, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5000325441360474, + "rewards/pad": 0.0, + "step": 2002 + }, + { + "completion_length": 199.53125, + "epoch": 0.6383046526449968, + "grad_norm": 10.587024688720703, + "kl": 0.1474609375, + "learning_rate": 3.616953473550032e-07, + "loss": 0.0059, + "reward": 1.5684709548950195, + "reward_std": 0.12079137563705444, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31847089529037476, + "rewards/pad": 0.25, + "step": 2003 + }, + { + "completion_length": 318.40625, + "epoch": 0.638623326959847, + "grad_norm": 4.737164497375488, + "kl": 0.0712890625, + "learning_rate": 3.6137667304015293e-07, + "loss": 0.0028, + "reward": 1.5824774503707886, + "reward_std": 0.09609906375408173, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45747750997543335, + "step": 2004 + }, + { + "completion_length": 286.640625, + "epoch": 0.6389420012746972, + "grad_norm": 6.416104793548584, + "kl": 0.09912109375, + "learning_rate": 3.6105799872530274e-07, + "loss": 0.004, + "reward": 1.5904369354248047, + "reward_std": 0.12192674726247787, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6060618758201599, + "step": 2005 + }, + { + "completion_length": 360.890625, + "epoch": 0.6392606755895475, + "grad_norm": 12.107686042785645, + "kl": 0.06884765625, + "learning_rate": 3.607393244104525e-07, + "loss": 0.0027, + "reward": 1.4735157489776611, + "reward_std": 0.10779435932636261, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4891408085823059, + "step": 2006 + }, + { + "completion_length": 263.96875, + "epoch": 0.6395793499043977, + "grad_norm": 11.123519897460938, + "kl": 0.09423828125, + "learning_rate": 3.604206500956023e-07, + "loss": 0.0038, + "reward": 1.5251493453979492, + "reward_std": 0.04218164086341858, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5251492857933044, + "step": 2007 + }, + { + "completion_length": 291.46875, + "epoch": 0.6398980242192479, + "grad_norm": 8.76659107208252, + "kl": 0.08056640625, + "learning_rate": 3.6010197578075206e-07, + "loss": 0.0032, + "reward": 1.5642863512039185, + "reward_std": 0.11715750396251678, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45491135120391846, + "rewards/pad": 0.125, + "step": 2008 + }, + { + "completion_length": 358.78125, + "epoch": 0.6402166985340981, + "grad_norm": 6.300568580627441, + "kl": 0.06787109375, + "learning_rate": 3.5978330146590186e-07, + "loss": 0.0027, + "reward": 1.526787281036377, + "reward_std": 0.19017034769058228, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4017874002456665, + "step": 2009 + }, + { + "completion_length": 296.671875, + "epoch": 0.6405353728489483, + "grad_norm": 7.317353248596191, + "kl": 0.08251953125, + "learning_rate": 3.594646271510516e-07, + "loss": 0.0033, + "reward": 1.5332716703414917, + "reward_std": 0.09363551437854767, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5332717299461365, + "rewards/pad": 0.0, + "step": 2010 + }, + { + "completion_length": 178.515625, + "epoch": 0.6408540471637986, + "grad_norm": 38.86016845703125, + "kl": 0.1005859375, + "learning_rate": 3.5914595283620137e-07, + "loss": 0.004, + "reward": 1.6575322151184082, + "reward_std": 0.04223339259624481, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4075321555137634, + "step": 2011 + }, + { + "completion_length": 235.984375, + "epoch": 0.6411727214786488, + "grad_norm": 24.452747344970703, + "kl": 0.0869140625, + "learning_rate": 3.588272785213511e-07, + "loss": 0.0035, + "reward": 1.6948299407958984, + "reward_std": 0.15844303369522095, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5229548811912537, + "step": 2012 + }, + { + "completion_length": 246.625, + "epoch": 0.641491395793499, + "grad_norm": 17.739437103271484, + "kl": 0.09228515625, + "learning_rate": 3.5850860420650093e-07, + "loss": 0.0037, + "reward": 1.663374423980713, + "reward_std": 0.10271087288856506, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6789993643760681, + "rewards/pad": 0.0, + "step": 2013 + }, + { + "completion_length": 275.296875, + "epoch": 0.6418100701083492, + "grad_norm": 10.146001815795898, + "kl": 0.0869140625, + "learning_rate": 3.581899298916507e-07, + "loss": 0.0035, + "reward": 1.8067395687103271, + "reward_std": 0.08705069869756699, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5723645687103271, + "step": 2014 + }, + { + "completion_length": 331.625, + "epoch": 0.6421287444231994, + "grad_norm": 7.079098701477051, + "kl": 0.0693359375, + "learning_rate": 3.578712555768005e-07, + "loss": 0.0028, + "reward": 1.4350299835205078, + "reward_std": 0.1440443992614746, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45065492391586304, + "rewards/pad": 0.0, + "step": 2015 + }, + { + "completion_length": 279.125, + "epoch": 0.6424474187380497, + "grad_norm": 6.421546459197998, + "kl": 0.07763671875, + "learning_rate": 3.5755258126195025e-07, + "loss": 0.0031, + "reward": 1.558060646057129, + "reward_std": 0.07677186280488968, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4330606460571289, + "rewards/pad": 0.125, + "step": 2016 + }, + { + "completion_length": 269.84375, + "epoch": 0.6427660930528999, + "grad_norm": 17.23097801208496, + "kl": 0.09326171875, + "learning_rate": 3.5723390694710005e-07, + "loss": 0.0037, + "reward": 1.6398184299468994, + "reward_std": 0.07573511451482773, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5148184299468994, + "rewards/pad": 0.125, + "step": 2017 + }, + { + "completion_length": 314.546875, + "epoch": 0.6430847673677501, + "grad_norm": 7.973112106323242, + "kl": 0.07080078125, + "learning_rate": 3.569152326322498e-07, + "loss": 0.0028, + "reward": 1.4390485286712646, + "reward_std": 0.03923648223280907, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3140484690666199, + "rewards/pad": 0.125, + "step": 2018 + }, + { + "completion_length": 232.8125, + "epoch": 0.6434034416826003, + "grad_norm": 7.682769298553467, + "kl": 0.08203125, + "learning_rate": 3.565965583173996e-07, + "loss": 0.0033, + "reward": 1.6641254425048828, + "reward_std": 0.09908575564622879, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42975035309791565, + "rewards/pad": 0.25, + "step": 2019 + }, + { + "completion_length": 201.984375, + "epoch": 0.6437221159974506, + "grad_norm": 99.12747192382812, + "kl": 0.1005859375, + "learning_rate": 3.5627788400254937e-07, + "loss": 0.004, + "reward": 1.4531182050704956, + "reward_std": 0.15648740530014038, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4531181752681732, + "step": 2020 + }, + { + "completion_length": 271.8125, + "epoch": 0.6440407903123009, + "grad_norm": 9.223620414733887, + "kl": 0.07421875, + "learning_rate": 3.559592096876992e-07, + "loss": 0.003, + "reward": 1.5330138206481934, + "reward_std": 0.06132769584655762, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40801379084587097, + "rewards/pad": 0.125, + "step": 2021 + }, + { + "completion_length": 284.625, + "epoch": 0.6443594646271511, + "grad_norm": 7.2176079750061035, + "kl": 0.07763671875, + "learning_rate": 3.5564053537284893e-07, + "loss": 0.0031, + "reward": 1.4767729043960571, + "reward_std": 0.06308045983314514, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47677284479141235, + "rewards/pad": 0.0, + "step": 2022 + }, + { + "completion_length": 310.28125, + "epoch": 0.6446781389420013, + "grad_norm": 11.201861381530762, + "kl": 0.1455078125, + "learning_rate": 3.5532186105799874e-07, + "loss": 0.0058, + "reward": 1.4627937078475952, + "reward_std": 0.1969190090894699, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3534187078475952, + "step": 2023 + }, + { + "completion_length": 293.828125, + "epoch": 0.6449968132568515, + "grad_norm": 4.923059940338135, + "kl": 0.0791015625, + "learning_rate": 3.550031867431485e-07, + "loss": 0.0032, + "reward": 1.4457471370697021, + "reward_std": 0.053632702678442, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44574716687202454, + "step": 2024 + }, + { + "completion_length": 272.671875, + "epoch": 0.6453154875717018, + "grad_norm": 6.5525360107421875, + "kl": 0.0830078125, + "learning_rate": 3.546845124282983e-07, + "loss": 0.0033, + "reward": 1.4404683113098145, + "reward_std": 0.10515347123146057, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45609331130981445, + "step": 2025 + }, + { + "completion_length": 238.734375, + "epoch": 0.645634161886552, + "grad_norm": 11.033935546875, + "kl": 0.107421875, + "learning_rate": 3.5436583811344805e-07, + "loss": 0.0043, + "reward": 1.7628023624420166, + "reward_std": 0.21090787649154663, + "rewards/pad": 0.3125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4659273624420166, + "step": 2026 + }, + { + "completion_length": 183.1875, + "epoch": 0.6459528362014022, + "grad_norm": 8.789874076843262, + "kl": 0.099609375, + "learning_rate": 3.5404716379859786e-07, + "loss": 0.004, + "reward": 1.8745747804641724, + "reward_std": 0.08702093362808228, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4995748698711395, + "step": 2027 + }, + { + "completion_length": 457.5, + "epoch": 0.6462715105162524, + "grad_norm": 8.265464782714844, + "kl": 0.05615234375, + "learning_rate": 3.537284894837476e-07, + "loss": 0.0022, + "reward": 1.4592204093933105, + "reward_std": 0.0653558000922203, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45922043919563293, + "step": 2028 + }, + { + "completion_length": 259.765625, + "epoch": 0.6465901848311026, + "grad_norm": 14.02660083770752, + "kl": 0.08740234375, + "learning_rate": 3.534098151688974e-07, + "loss": 0.0035, + "reward": 1.4943106174468994, + "reward_std": 0.09233476966619492, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4943106174468994, + "rewards/pad": 0.0, + "step": 2029 + }, + { + "completion_length": 321.1875, + "epoch": 0.6469088591459529, + "grad_norm": 99.69380187988281, + "kl": 0.08251953125, + "learning_rate": 3.530911408540471e-07, + "loss": 0.0033, + "reward": 1.5039708614349365, + "reward_std": 0.16264577209949493, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5195959210395813, + "rewards/pad": 0.0, + "step": 2030 + }, + { + "completion_length": 274.03125, + "epoch": 0.6472275334608031, + "grad_norm": 8.511700630187988, + "kl": 0.07470703125, + "learning_rate": 3.527724665391969e-07, + "loss": 0.003, + "reward": 1.6471614837646484, + "reward_std": 0.11210751533508301, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5377864241600037, + "rewards/pad": 0.125, + "step": 2031 + }, + { + "completion_length": 223.65625, + "epoch": 0.6475462077756533, + "grad_norm": 22.564176559448242, + "kl": 0.08251953125, + "learning_rate": 3.524537922243467e-07, + "loss": 0.0033, + "reward": 1.6479836702346802, + "reward_std": 0.1305704116821289, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47610872983932495, + "step": 2032 + }, + { + "completion_length": 311.484375, + "epoch": 0.6478648820905035, + "grad_norm": 13.289634704589844, + "kl": 0.08154296875, + "learning_rate": 3.5213511790949644e-07, + "loss": 0.0033, + "reward": 1.714613914489746, + "reward_std": 0.13245388865470886, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.6208638548851013, + "step": 2033 + }, + { + "completion_length": 210.953125, + "epoch": 0.6481835564053537, + "grad_norm": 18.591171264648438, + "kl": 0.1015625, + "learning_rate": 3.5181644359464624e-07, + "loss": 0.0041, + "reward": 1.7396042346954346, + "reward_std": 0.07380440831184387, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6146041750907898, + "rewards/pad": 0.125, + "step": 2034 + }, + { + "completion_length": 348.328125, + "epoch": 0.648502230720204, + "grad_norm": 7.386063575744629, + "kl": 0.06494140625, + "learning_rate": 3.51497769279796e-07, + "loss": 0.0026, + "reward": 1.6278009414672852, + "reward_std": 0.05366513878107071, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.37780094146728516, + "rewards/pad": 0.25, + "step": 2035 + }, + { + "completion_length": 221.359375, + "epoch": 0.6488209050350542, + "grad_norm": 15.518906593322754, + "kl": 0.0927734375, + "learning_rate": 3.511790949649458e-07, + "loss": 0.0037, + "reward": 1.6019225120544434, + "reward_std": 0.07224541902542114, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6019225120544434, + "rewards/pad": 0.0, + "step": 2036 + }, + { + "completion_length": 193.90625, + "epoch": 0.6491395793499044, + "grad_norm": 22.68230628967285, + "kl": 0.08056640625, + "learning_rate": 3.5086042065009556e-07, + "loss": 0.0032, + "reward": 1.8990789651870728, + "reward_std": 0.08138591051101685, + "rewards/answer_reward": 0.5, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.39907896518707275, + "step": 2037 + }, + { + "completion_length": 316.15625, + "epoch": 0.6494582536647546, + "grad_norm": 15.482177734375, + "kl": 0.0693359375, + "learning_rate": 3.5054174633524537e-07, + "loss": 0.0028, + "reward": 1.4039890766143799, + "reward_std": 0.06646262854337692, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4039890170097351, + "step": 2038 + }, + { + "completion_length": 350.6875, + "epoch": 0.6497769279796048, + "grad_norm": 9.26961898803711, + "kl": 0.06201171875, + "learning_rate": 3.502230720203951e-07, + "loss": 0.0025, + "reward": 1.6099941730499268, + "reward_std": 0.061674658209085464, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6099941730499268, + "step": 2039 + }, + { + "completion_length": 407.15625, + "epoch": 0.6500956022944551, + "grad_norm": 4.745171546936035, + "kl": 0.0732421875, + "learning_rate": 3.499043977055449e-07, + "loss": 0.0029, + "reward": 1.4271552562713623, + "reward_std": 0.12896493077278137, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4427802562713623, + "rewards/pad": 0.0, + "step": 2040 + }, + { + "completion_length": 223.984375, + "epoch": 0.6504142766093053, + "grad_norm": 13.607216835021973, + "kl": 0.0712890625, + "learning_rate": 3.495857233906947e-07, + "loss": 0.0029, + "reward": 1.6901793479919434, + "reward_std": 0.07591713964939117, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44017940759658813, + "rewards/pad": 0.25, + "step": 2041 + }, + { + "completion_length": 311.078125, + "epoch": 0.6507329509241555, + "grad_norm": 7.090723514556885, + "kl": 0.06787109375, + "learning_rate": 3.492670490758445e-07, + "loss": 0.0027, + "reward": 1.6069159507751465, + "reward_std": 0.21890851855278015, + "rewards/pad": 0.34375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.29441598057746887, + "step": 2042 + }, + { + "completion_length": 345.390625, + "epoch": 0.6510516252390057, + "grad_norm": 26.19813346862793, + "kl": 0.06396484375, + "learning_rate": 3.4894837476099424e-07, + "loss": 0.0026, + "reward": 1.6184431314468384, + "reward_std": 0.10701720416545868, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35281819105148315, + "rewards/pad": 0.265625, + "step": 2043 + }, + { + "completion_length": 291.828125, + "epoch": 0.651370299553856, + "grad_norm": 10.5043306350708, + "kl": 0.07470703125, + "learning_rate": 3.4862970044614405e-07, + "loss": 0.003, + "reward": 1.4766829013824463, + "reward_std": 0.07539892196655273, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.35168278217315674, + "rewards/pad": 0.125, + "step": 2044 + }, + { + "completion_length": 198.4375, + "epoch": 0.6516889738687062, + "grad_norm": 7.1742658615112305, + "kl": 0.11328125, + "learning_rate": 3.483110261312938e-07, + "loss": 0.0045, + "reward": 1.6048579216003418, + "reward_std": 0.08010513335466385, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6204828023910522, + "step": 2045 + }, + { + "completion_length": 218.125, + "epoch": 0.6520076481835564, + "grad_norm": 12.691168785095215, + "kl": 0.09619140625, + "learning_rate": 3.479923518164436e-07, + "loss": 0.0038, + "reward": 1.5560691356658936, + "reward_std": 0.08447499573230743, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4310690462589264, + "rewards/pad": 0.125, + "step": 2046 + }, + { + "completion_length": 283.890625, + "epoch": 0.6523263224984066, + "grad_norm": 24.001325607299805, + "kl": 0.08740234375, + "learning_rate": 3.4767367750159336e-07, + "loss": 0.0035, + "reward": 1.343886375427246, + "reward_std": 0.12355609983205795, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3595114052295685, + "step": 2047 + }, + { + "completion_length": 307.515625, + "epoch": 0.6526449968132568, + "grad_norm": 5.191986083984375, + "kl": 0.0673828125, + "learning_rate": 3.4735500318674317e-07, + "loss": 0.0027, + "reward": 1.5148013830184937, + "reward_std": 0.0558304563164711, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38980141282081604, + "rewards/pad": 0.125, + "step": 2048 + }, + { + "completion_length": 335.328125, + "epoch": 0.652963671128107, + "grad_norm": 11.43508529663086, + "kl": 0.21875, + "learning_rate": 3.470363288718929e-07, + "loss": 0.0087, + "reward": 1.516359806060791, + "reward_std": 0.13375940918922424, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.422609806060791, + "step": 2049 + }, + { + "completion_length": 221.71875, + "epoch": 0.6532823454429573, + "grad_norm": 12.406441688537598, + "kl": 0.07373046875, + "learning_rate": 3.467176545570427e-07, + "loss": 0.003, + "reward": 1.6535775661468506, + "reward_std": 0.09638911485671997, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5285775661468506, + "rewards/pad": 0.125, + "step": 2050 + }, + { + "completion_length": 325.890625, + "epoch": 0.6536010197578075, + "grad_norm": 13.22891616821289, + "kl": 0.062255859375, + "learning_rate": 3.4639898024219243e-07, + "loss": 0.0025, + "reward": 1.6790329217910767, + "reward_std": 0.09938417375087738, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44465792179107666, + "step": 2051 + }, + { + "completion_length": 138.46875, + "epoch": 0.6539196940726577, + "grad_norm": 16.85652732849121, + "kl": 0.1298828125, + "learning_rate": 3.4608030592734224e-07, + "loss": 0.0052, + "reward": 1.7120802402496338, + "reward_std": 0.09974721819162369, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7120803594589233, + "rewards/pad": 0.0, + "step": 2052 + }, + { + "completion_length": 282.78125, + "epoch": 0.6542383683875079, + "grad_norm": 20.882482528686523, + "kl": 0.083984375, + "learning_rate": 3.45761631612492e-07, + "loss": 0.0033, + "reward": 1.5891286134719849, + "reward_std": 0.15410195291042328, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5110036134719849, + "step": 2053 + }, + { + "completion_length": 251.390625, + "epoch": 0.6545570427023581, + "grad_norm": 13.868054389953613, + "kl": 0.0927734375, + "learning_rate": 3.454429572976418e-07, + "loss": 0.0037, + "reward": 1.6373168230056763, + "reward_std": 0.07170312851667404, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.637316882610321, + "step": 2054 + }, + { + "completion_length": 259.09375, + "epoch": 0.6548757170172084, + "grad_norm": 19.36661148071289, + "kl": 0.08642578125, + "learning_rate": 3.4512428298279155e-07, + "loss": 0.0035, + "reward": 1.507773995399475, + "reward_std": 0.12215866148471832, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3983990550041199, + "step": 2055 + }, + { + "completion_length": 215.59375, + "epoch": 0.6551943913320586, + "grad_norm": 8.673489570617676, + "kl": 0.0927734375, + "learning_rate": 3.4480560866794136e-07, + "loss": 0.0037, + "reward": 1.5000026226043701, + "reward_std": 0.05920401215553284, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37500256299972534, + "step": 2056 + }, + { + "completion_length": 215.34375, + "epoch": 0.6555130656469088, + "grad_norm": 18.26003646850586, + "kl": 0.09130859375, + "learning_rate": 3.444869343530911e-07, + "loss": 0.0036, + "reward": 1.7287582159042358, + "reward_std": 0.12288461625576019, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5725082159042358, + "step": 2057 + }, + { + "completion_length": 265.703125, + "epoch": 0.655831739961759, + "grad_norm": 11.515850067138672, + "kl": 0.07666015625, + "learning_rate": 3.441682600382409e-07, + "loss": 0.0031, + "reward": 1.430114507675171, + "reward_std": 0.13782833516597748, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3051145076751709, + "rewards/pad": 0.140625, + "step": 2058 + }, + { + "completion_length": 289.265625, + "epoch": 0.6561504142766093, + "grad_norm": 7.043642044067383, + "kl": 0.0927734375, + "learning_rate": 3.438495857233907e-07, + "loss": 0.0037, + "reward": 1.5840719938278198, + "reward_std": 0.15137723088264465, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5996969938278198, + "step": 2059 + }, + { + "completion_length": 187.375, + "epoch": 0.6564690885914596, + "grad_norm": 150.14413452148438, + "kl": 0.1044921875, + "learning_rate": 3.435309114085405e-07, + "loss": 0.0042, + "reward": 1.5122089385986328, + "reward_std": 0.09084093570709229, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.512208878993988, + "step": 2060 + }, + { + "completion_length": 346.515625, + "epoch": 0.6567877629063098, + "grad_norm": 8.237125396728516, + "kl": 0.064453125, + "learning_rate": 3.4321223709369024e-07, + "loss": 0.0026, + "reward": 1.3773670196533203, + "reward_std": 0.0877017229795456, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4086169898509979, + "step": 2061 + }, + { + "completion_length": 387.421875, + "epoch": 0.65710643722116, + "grad_norm": 14.507115364074707, + "kl": 0.07861328125, + "learning_rate": 3.4289356277884e-07, + "loss": 0.0031, + "reward": 1.3933491706848145, + "reward_std": 0.13431811332702637, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4089740514755249, + "step": 2062 + }, + { + "completion_length": 355.015625, + "epoch": 0.6574251115360102, + "grad_norm": 14.55032730102539, + "kl": 0.07177734375, + "learning_rate": 3.425748884639898e-07, + "loss": 0.0029, + "reward": 1.2918533086776733, + "reward_std": 0.08444277942180634, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.32310330867767334, + "rewards/pad": 0.0, + "step": 2063 + }, + { + "completion_length": 294.796875, + "epoch": 0.6577437858508605, + "grad_norm": 12.294832229614258, + "kl": 0.0810546875, + "learning_rate": 3.4225621414913955e-07, + "loss": 0.0032, + "reward": 1.4564380645751953, + "reward_std": 0.08586239069700241, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4564380347728729, + "step": 2064 + }, + { + "completion_length": 426.53125, + "epoch": 0.6580624601657107, + "grad_norm": 5.074521541595459, + "kl": 0.05078125, + "learning_rate": 3.4193753983428936e-07, + "loss": 0.002, + "reward": 1.4866843223571777, + "reward_std": 0.13601148128509521, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.2679342031478882, + "step": 2065 + }, + { + "completion_length": 304.984375, + "epoch": 0.6583811344805609, + "grad_norm": 32.57131576538086, + "kl": 0.09619140625, + "learning_rate": 3.416188655194391e-07, + "loss": 0.0038, + "reward": 1.3942128419876099, + "reward_std": 0.1586059182882309, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.42546284198760986, + "rewards/pad": 0.0, + "step": 2066 + }, + { + "completion_length": 214.171875, + "epoch": 0.6586998087954111, + "grad_norm": 7.813204765319824, + "kl": 0.10009765625, + "learning_rate": 3.413001912045889e-07, + "loss": 0.004, + "reward": 1.6111717224121094, + "reward_std": 0.09173314273357391, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6111717820167542, + "rewards/pad": 0.0, + "step": 2067 + }, + { + "completion_length": 244.546875, + "epoch": 0.6590184831102613, + "grad_norm": 10.449907302856445, + "kl": 0.09521484375, + "learning_rate": 3.409815168897387e-07, + "loss": 0.0038, + "reward": 1.6441354751586914, + "reward_std": 0.15547554194927216, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5347604751586914, + "step": 2068 + }, + { + "completion_length": 344.46875, + "epoch": 0.6593371574251116, + "grad_norm": 7.0181660652160645, + "kl": 0.059326171875, + "learning_rate": 3.406628425748885e-07, + "loss": 0.0024, + "reward": 1.453713297843933, + "reward_std": 0.08794372528791428, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.32871323823928833, + "step": 2069 + }, + { + "completion_length": 175.140625, + "epoch": 0.6596558317399618, + "grad_norm": 10.922768592834473, + "kl": 0.107421875, + "learning_rate": 3.403441682600382e-07, + "loss": 0.0043, + "reward": 1.7466790676116943, + "reward_std": 0.12864170968532562, + "rewards/pad": 0.296875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4498040974140167, + "step": 2070 + }, + { + "completion_length": 393.125, + "epoch": 0.659974506054812, + "grad_norm": 4.762542247772217, + "kl": 0.06591796875, + "learning_rate": 3.40025493945188e-07, + "loss": 0.0026, + "reward": 1.5451923608779907, + "reward_std": 0.10473742336034775, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5139423608779907, + "step": 2071 + }, + { + "completion_length": 382.59375, + "epoch": 0.6602931803696622, + "grad_norm": 9.2803373336792, + "kl": 0.07373046875, + "learning_rate": 3.3970681963033774e-07, + "loss": 0.003, + "reward": 1.369894027709961, + "reward_std": 0.1183726117014885, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.38551896810531616, + "rewards/pad": 0.0, + "step": 2072 + }, + { + "completion_length": 401.953125, + "epoch": 0.6606118546845124, + "grad_norm": 10.371095657348633, + "kl": 0.06640625, + "learning_rate": 3.3938814531548755e-07, + "loss": 0.0026, + "reward": 1.4870697259902954, + "reward_std": 0.11239289492368698, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5026946663856506, + "step": 2073 + }, + { + "completion_length": 335.0, + "epoch": 0.6609305289993627, + "grad_norm": 17.193138122558594, + "kl": 0.083984375, + "learning_rate": 3.390694710006373e-07, + "loss": 0.0034, + "reward": 1.389042854309082, + "reward_std": 0.10139884054660797, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4046678841114044, + "rewards/pad": 0.0, + "step": 2074 + }, + { + "completion_length": 448.859375, + "epoch": 0.6612492033142129, + "grad_norm": 3.9179635047912598, + "kl": 0.04833984375, + "learning_rate": 3.387507966857871e-07, + "loss": 0.0019, + "reward": 1.4355924129486084, + "reward_std": 0.11411859840154648, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45121729373931885, + "rewards/pad": 0.0, + "step": 2075 + }, + { + "completion_length": 218.078125, + "epoch": 0.6615678776290631, + "grad_norm": 10.488283157348633, + "kl": 0.11083984375, + "learning_rate": 3.3843212237093687e-07, + "loss": 0.0044, + "reward": 1.6504709720611572, + "reward_std": 0.07368594408035278, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.650471031665802, + "rewards/pad": 0.0, + "step": 2076 + }, + { + "completion_length": 330.015625, + "epoch": 0.6618865519439133, + "grad_norm": 5.749670505523682, + "kl": 0.08154296875, + "learning_rate": 3.3811344805608667e-07, + "loss": 0.0033, + "reward": 1.6113598346710205, + "reward_std": 0.08943575620651245, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6269847750663757, + "step": 2077 + }, + { + "completion_length": 230.9375, + "epoch": 0.6622052262587635, + "grad_norm": 12.148361206054688, + "kl": 0.07861328125, + "learning_rate": 3.3779477374123643e-07, + "loss": 0.0031, + "reward": 1.5412075519561768, + "reward_std": 0.17028933763504028, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4162076413631439, + "step": 2078 + }, + { + "completion_length": 165.96875, + "epoch": 0.6625239005736138, + "grad_norm": 40.98452377319336, + "kl": 0.1142578125, + "learning_rate": 3.3747609942638623e-07, + "loss": 0.0046, + "reward": 1.661864995956421, + "reward_std": 0.14011311531066895, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5524899363517761, + "step": 2079 + }, + { + "completion_length": 239.84375, + "epoch": 0.662842574888464, + "grad_norm": 6.925748348236084, + "kl": 0.0888671875, + "learning_rate": 3.37157425111536e-07, + "loss": 0.0036, + "reward": 1.6524909734725952, + "reward_std": 0.09404580295085907, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4024909734725952, + "rewards/pad": 0.25, + "step": 2080 + }, + { + "completion_length": 327.015625, + "epoch": 0.6631612492033142, + "grad_norm": 12.143636703491211, + "kl": 0.0791015625, + "learning_rate": 3.368387507966858e-07, + "loss": 0.0032, + "reward": 1.5867152214050293, + "reward_std": 0.12560738623142242, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6023401021957397, + "step": 2081 + }, + { + "completion_length": 278.578125, + "epoch": 0.6634799235181644, + "grad_norm": 6.4376749992370605, + "kl": 0.091796875, + "learning_rate": 3.3652007648183555e-07, + "loss": 0.0037, + "reward": 1.5167558193206787, + "reward_std": 0.08888687193393707, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5167558193206787, + "step": 2082 + }, + { + "completion_length": 334.828125, + "epoch": 0.6637985978330146, + "grad_norm": 10.173569679260254, + "kl": 0.095703125, + "learning_rate": 3.3620140216698536e-07, + "loss": 0.0038, + "reward": 1.6784933805465698, + "reward_std": 0.18250435590744019, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.6003684401512146, + "step": 2083 + }, + { + "completion_length": 233.4375, + "epoch": 0.6641172721478649, + "grad_norm": 7.580698013305664, + "kl": 0.1416015625, + "learning_rate": 3.358827278521351e-07, + "loss": 0.0056, + "reward": 1.357182264328003, + "reward_std": 0.13714271783828735, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.38843220472335815, + "rewards/pad": 0.0, + "step": 2084 + }, + { + "completion_length": 275.4375, + "epoch": 0.6644359464627151, + "grad_norm": 27.354007720947266, + "kl": 0.1357421875, + "learning_rate": 3.355640535372849e-07, + "loss": 0.0055, + "reward": 1.4340639114379883, + "reward_std": 0.1587117612361908, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.35593900084495544, + "rewards/pad": 0.09375, + "step": 2085 + }, + { + "completion_length": 226.390625, + "epoch": 0.6647546207775653, + "grad_norm": 11.366966247558594, + "kl": 0.0830078125, + "learning_rate": 3.3524537922243467e-07, + "loss": 0.0033, + "reward": 1.5681848526000977, + "reward_std": 0.13581296801567078, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4588099718093872, + "step": 2086 + }, + { + "completion_length": 180.953125, + "epoch": 0.6650732950924155, + "grad_norm": 15.23033332824707, + "kl": 0.09912109375, + "learning_rate": 3.349267049075845e-07, + "loss": 0.004, + "reward": 1.6432744264602661, + "reward_std": 0.1856716275215149, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.45577436685562134, + "step": 2087 + }, + { + "completion_length": 334.546875, + "epoch": 0.6653919694072657, + "grad_norm": 5.13187837600708, + "kl": 0.06640625, + "learning_rate": 3.3460803059273423e-07, + "loss": 0.0027, + "reward": 1.5802624225616455, + "reward_std": 0.09438225626945496, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4708874225616455, + "rewards/pad": 0.125, + "step": 2088 + }, + { + "completion_length": 392.328125, + "epoch": 0.665710643722116, + "grad_norm": 14.914807319641113, + "kl": 0.05810546875, + "learning_rate": 3.34289356277884e-07, + "loss": 0.0023, + "reward": 1.4108269214630127, + "reward_std": 0.07436959445476532, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4108269214630127, + "step": 2089 + }, + { + "completion_length": 307.3125, + "epoch": 0.6660293180369662, + "grad_norm": 11.673267364501953, + "kl": 0.0732421875, + "learning_rate": 3.3397068196303374e-07, + "loss": 0.0029, + "reward": 1.2841676473617554, + "reward_std": 0.11732315272092819, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.28416767716407776, + "rewards/pad": 0.015625, + "step": 2090 + }, + { + "completion_length": 515.8125, + "epoch": 0.6663479923518164, + "grad_norm": 3.9852609634399414, + "kl": 0.037109375, + "learning_rate": 3.336520076481835e-07, + "loss": 0.0015, + "reward": 1.4397344589233398, + "reward_std": 0.12903329730033875, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.377234548330307, + "step": 2091 + }, + { + "completion_length": 323.90625, + "epoch": 0.6666666666666666, + "grad_norm": 6.877256870269775, + "kl": 0.08642578125, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0034, + "reward": 1.566868782043457, + "reward_std": 0.09986849129199982, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5668688416481018, + "rewards/pad": 0.0, + "step": 2092 + }, + { + "completion_length": 424.84375, + "epoch": 0.6669853409815168, + "grad_norm": 4.864638805389404, + "kl": 0.050537109375, + "learning_rate": 3.3301465901848305e-07, + "loss": 0.002, + "reward": 1.5969088077545166, + "reward_std": 0.04110327735543251, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4719088077545166, + "step": 2093 + }, + { + "completion_length": 186.890625, + "epoch": 0.6673040152963671, + "grad_norm": 9.601995468139648, + "kl": 0.0966796875, + "learning_rate": 3.3269598470363286e-07, + "loss": 0.0039, + "reward": 1.6393721103668213, + "reward_std": 0.12142668664455414, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5143721103668213, + "rewards/pad": 0.125, + "step": 2094 + }, + { + "completion_length": 355.21875, + "epoch": 0.6676226896112173, + "grad_norm": 9.643255233764648, + "kl": 0.0732421875, + "learning_rate": 3.323773103887826e-07, + "loss": 0.0029, + "reward": 1.5644805431365967, + "reward_std": 0.045724526047706604, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4394805133342743, + "step": 2095 + }, + { + "completion_length": 376.03125, + "epoch": 0.6679413639260675, + "grad_norm": 6.067450523376465, + "kl": 0.053955078125, + "learning_rate": 3.320586360739324e-07, + "loss": 0.0022, + "reward": 1.5394401550292969, + "reward_std": 0.15540587902069092, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5706900954246521, + "step": 2096 + }, + { + "completion_length": 266.0625, + "epoch": 0.6682600382409177, + "grad_norm": 13.3276948928833, + "kl": 0.06201171875, + "learning_rate": 3.317399617590822e-07, + "loss": 0.0025, + "reward": 1.539819359779358, + "reward_std": 0.1312125325202942, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4304443299770355, + "step": 2097 + }, + { + "completion_length": 269.125, + "epoch": 0.668578712555768, + "grad_norm": 14.501863479614258, + "kl": 0.09375, + "learning_rate": 3.31421287444232e-07, + "loss": 0.0038, + "reward": 1.5091478824615479, + "reward_std": 0.09442204236984253, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5247730016708374, + "rewards/pad": 0.0, + "step": 2098 + }, + { + "completion_length": 321.046875, + "epoch": 0.6688973868706183, + "grad_norm": 4.701894760131836, + "kl": 0.0732421875, + "learning_rate": 3.3110261312938174e-07, + "loss": 0.0029, + "reward": 1.5360662937164307, + "reward_std": 0.19051328301429749, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4423162639141083, + "rewards/pad": 0.109375, + "step": 2099 + }, + { + "completion_length": 242.984375, + "epoch": 0.6692160611854685, + "grad_norm": 14.593493461608887, + "kl": 0.103515625, + "learning_rate": 3.3078393881453154e-07, + "loss": 0.0041, + "reward": 1.5114970207214355, + "reward_std": 0.11804000288248062, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5271220207214355, + "step": 2100 + }, + { + "completion_length": 270.296875, + "epoch": 0.6695347355003187, + "grad_norm": 11.10335636138916, + "kl": 0.0830078125, + "learning_rate": 3.304652644996813e-07, + "loss": 0.0033, + "reward": 1.50400972366333, + "reward_std": 0.09469564259052277, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5040097236633301, + "rewards/pad": 0.0, + "step": 2101 + }, + { + "completion_length": 310.46875, + "epoch": 0.6698534098151689, + "grad_norm": 10.861712455749512, + "kl": 0.09716796875, + "learning_rate": 3.301465901848311e-07, + "loss": 0.0039, + "reward": 1.2648383378982544, + "reward_std": 0.1338602602481842, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.2804632782936096, + "rewards/pad": 0.0, + "step": 2102 + }, + { + "completion_length": 215.671875, + "epoch": 0.6701720841300192, + "grad_norm": 17.331504821777344, + "kl": 0.08837890625, + "learning_rate": 3.2982791586998086e-07, + "loss": 0.0035, + "reward": 1.326276183128357, + "reward_std": 0.08590371906757355, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32627618312835693, + "step": 2103 + }, + { + "completion_length": 242.3125, + "epoch": 0.6704907584448694, + "grad_norm": 14.294371604919434, + "kl": 0.091796875, + "learning_rate": 3.2950924155513067e-07, + "loss": 0.0037, + "reward": 1.5341930389404297, + "reward_std": 0.10620848834514618, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5498180985450745, + "rewards/pad": 0.0, + "step": 2104 + }, + { + "completion_length": 276.578125, + "epoch": 0.6708094327597196, + "grad_norm": 5.867783546447754, + "kl": 0.0703125, + "learning_rate": 3.291905672402804e-07, + "loss": 0.0028, + "reward": 1.6233067512512207, + "reward_std": 0.057741183787584305, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4983067512512207, + "rewards/pad": 0.125, + "step": 2105 + }, + { + "completion_length": 380.515625, + "epoch": 0.6711281070745698, + "grad_norm": 10.394988059997559, + "kl": 0.06298828125, + "learning_rate": 3.2887189292543023e-07, + "loss": 0.0025, + "reward": 1.4723341464996338, + "reward_std": 0.2224801480770111, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.39420920610427856, + "step": 2106 + }, + { + "completion_length": 131.578125, + "epoch": 0.67144678138942, + "grad_norm": 10.822446823120117, + "kl": 0.103515625, + "learning_rate": 3.2855321861058e-07, + "loss": 0.0041, + "reward": 1.6905455589294434, + "reward_std": 0.20311430096626282, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5342955589294434, + "rewards/pad": 0.15625, + "step": 2107 + }, + { + "completion_length": 220.984375, + "epoch": 0.6717654557042703, + "grad_norm": 18.684009552001953, + "kl": 0.0791015625, + "learning_rate": 3.282345442957298e-07, + "loss": 0.0032, + "reward": 1.6459847688674927, + "reward_std": 0.12653863430023193, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5209848284721375, + "rewards/pad": 0.125, + "step": 2108 + }, + { + "completion_length": 252.4375, + "epoch": 0.6720841300191205, + "grad_norm": 6.713516712188721, + "kl": 0.083984375, + "learning_rate": 3.279158699808795e-07, + "loss": 0.0034, + "reward": 1.2996958494186401, + "reward_std": 0.0696154534816742, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.17469587922096252, + "step": 2109 + }, + { + "completion_length": 313.6875, + "epoch": 0.6724028043339707, + "grad_norm": 35.75420379638672, + "kl": 0.0703125, + "learning_rate": 3.275971956660293e-07, + "loss": 0.0028, + "reward": 1.653608798980713, + "reward_std": 0.05071251094341278, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5286088585853577, + "rewards/pad": 0.125, + "step": 2110 + }, + { + "completion_length": 396.140625, + "epoch": 0.6727214786488209, + "grad_norm": 6.315822601318359, + "kl": 0.057373046875, + "learning_rate": 3.2727852135117905e-07, + "loss": 0.0023, + "reward": 1.5398814678192139, + "reward_std": 0.1398405134677887, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.44613149762153625, + "step": 2111 + }, + { + "completion_length": 275.28125, + "epoch": 0.6730401529636711, + "grad_norm": 18.19247055053711, + "kl": 0.0859375, + "learning_rate": 3.2695984703632886e-07, + "loss": 0.0034, + "reward": 1.7365256547927856, + "reward_std": 0.18420612812042236, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5177757740020752, + "step": 2112 + }, + { + "completion_length": 216.171875, + "epoch": 0.6733588272785214, + "grad_norm": 17.706764221191406, + "kl": 0.08349609375, + "learning_rate": 3.266411727214786e-07, + "loss": 0.0033, + "reward": 1.43001389503479, + "reward_std": 0.11733804643154144, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44563889503479004, + "step": 2113 + }, + { + "completion_length": 280.859375, + "epoch": 0.6736775015933716, + "grad_norm": 12.78309154510498, + "kl": 0.07177734375, + "learning_rate": 3.263224984066284e-07, + "loss": 0.0029, + "reward": 1.4484806060791016, + "reward_std": 0.1356436014175415, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46410566568374634, + "rewards/pad": 0.0, + "step": 2114 + }, + { + "completion_length": 200.109375, + "epoch": 0.6739961759082218, + "grad_norm": 10.1220064163208, + "kl": 0.099609375, + "learning_rate": 3.2600382409177817e-07, + "loss": 0.004, + "reward": 1.534574270248413, + "reward_std": 0.13635796308517456, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4251992702484131, + "step": 2115 + }, + { + "completion_length": 333.5, + "epoch": 0.674314850223072, + "grad_norm": 13.432465553283691, + "kl": 0.07763671875, + "learning_rate": 3.25685149776928e-07, + "loss": 0.0031, + "reward": 1.4835143089294434, + "reward_std": 0.1128297820687294, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37413930892944336, + "step": 2116 + }, + { + "completion_length": 295.078125, + "epoch": 0.6746335245379222, + "grad_norm": 6.980226993560791, + "kl": 0.07421875, + "learning_rate": 3.2536647546207773e-07, + "loss": 0.003, + "reward": 1.530266523361206, + "reward_std": 0.05441119521856308, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.530266523361206, + "rewards/pad": 0.0, + "step": 2117 + }, + { + "completion_length": 268.21875, + "epoch": 0.6749521988527725, + "grad_norm": 5.705792427062988, + "kl": 0.083984375, + "learning_rate": 3.2504780114722754e-07, + "loss": 0.0034, + "reward": 1.416893482208252, + "reward_std": 0.07435958832502365, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4168934226036072, + "step": 2118 + }, + { + "completion_length": 309.875, + "epoch": 0.6752708731676227, + "grad_norm": 7.612441539764404, + "kl": 0.08154296875, + "learning_rate": 3.247291268323773e-07, + "loss": 0.0033, + "reward": 1.6166479587554932, + "reward_std": 0.18765589594841003, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5228978991508484, + "rewards/pad": 0.109375, + "step": 2119 + }, + { + "completion_length": 255.640625, + "epoch": 0.6755895474824729, + "grad_norm": 13.338653564453125, + "kl": 0.0830078125, + "learning_rate": 3.244104525175271e-07, + "loss": 0.0033, + "reward": 1.6340851783752441, + "reward_std": 0.12460452318191528, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5247102379798889, + "rewards/pad": 0.125, + "step": 2120 + }, + { + "completion_length": 187.09375, + "epoch": 0.6759082217973231, + "grad_norm": 65.28882598876953, + "kl": 0.11572265625, + "learning_rate": 3.2409177820267686e-07, + "loss": 0.0046, + "reward": 1.5655567646026611, + "reward_std": 0.08916109800338745, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5655567049980164, + "rewards/pad": 0.0, + "step": 2121 + }, + { + "completion_length": 208.203125, + "epoch": 0.6762268961121733, + "grad_norm": 12.348464012145996, + "kl": 0.08837890625, + "learning_rate": 3.2377310388782666e-07, + "loss": 0.0035, + "reward": 1.7495148181915283, + "reward_std": 0.09711165726184845, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4995148181915283, + "step": 2122 + }, + { + "completion_length": 207.9375, + "epoch": 0.6765455704270236, + "grad_norm": 7.536280155181885, + "kl": 0.0703125, + "learning_rate": 3.234544295729764e-07, + "loss": 0.0028, + "reward": 1.8844282627105713, + "reward_std": 0.06893973052501678, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5094282627105713, + "step": 2123 + }, + { + "completion_length": 253.8125, + "epoch": 0.6768642447418738, + "grad_norm": 6.380032062530518, + "kl": 0.0849609375, + "learning_rate": 3.2313575525812617e-07, + "loss": 0.0034, + "reward": 1.497983694076538, + "reward_std": 0.0429433137178421, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49798381328582764, + "rewards/pad": 0.0, + "step": 2124 + }, + { + "completion_length": 241.640625, + "epoch": 0.677182919056724, + "grad_norm": 8.918638229370117, + "kl": 0.09228515625, + "learning_rate": 3.22817080943276e-07, + "loss": 0.0037, + "reward": 1.4526426792144775, + "reward_std": 0.09334304183721542, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4526427388191223, + "step": 2125 + }, + { + "completion_length": 252.984375, + "epoch": 0.6775015933715742, + "grad_norm": 14.118407249450684, + "kl": 0.09228515625, + "learning_rate": 3.2249840662842573e-07, + "loss": 0.0037, + "reward": 1.5071016550064087, + "reward_std": 0.1294305920600891, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5227266550064087, + "step": 2126 + }, + { + "completion_length": 297.984375, + "epoch": 0.6778202676864244, + "grad_norm": 15.682186126708984, + "kl": 0.0771484375, + "learning_rate": 3.2217973231357554e-07, + "loss": 0.0031, + "reward": 1.5055906772613525, + "reward_std": 0.1808309257030487, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.41184061765670776, + "rewards/pad": 0.125, + "step": 2127 + }, + { + "completion_length": 317.921875, + "epoch": 0.6781389420012747, + "grad_norm": 10.266303062438965, + "kl": 0.0693359375, + "learning_rate": 3.2186105799872524e-07, + "loss": 0.0028, + "reward": 1.4385040998458862, + "reward_std": 0.057395100593566895, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.18850407004356384, + "step": 2128 + }, + { + "completion_length": 344.046875, + "epoch": 0.6784576163161249, + "grad_norm": 8.247588157653809, + "kl": 0.11572265625, + "learning_rate": 3.2154238368387505e-07, + "loss": 0.0046, + "reward": 1.4720286130905151, + "reward_std": 0.0752357468008995, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47202861309051514, + "rewards/pad": 0.0, + "step": 2129 + }, + { + "completion_length": 197.46875, + "epoch": 0.6787762906309751, + "grad_norm": 30.725082397460938, + "kl": 0.0986328125, + "learning_rate": 3.212237093690248e-07, + "loss": 0.0039, + "reward": 1.365682601928711, + "reward_std": 0.05450920760631561, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3656827509403229, + "rewards/pad": 0.0, + "step": 2130 + }, + { + "completion_length": 254.5625, + "epoch": 0.6790949649458253, + "grad_norm": 10.495698928833008, + "kl": 0.10595703125, + "learning_rate": 3.209050350541746e-07, + "loss": 0.0042, + "reward": 1.5750397443771362, + "reward_std": 0.08169522881507874, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.575039803981781, + "rewards/pad": 0.0, + "step": 2131 + }, + { + "completion_length": 306.203125, + "epoch": 0.6794136392606756, + "grad_norm": 16.586490631103516, + "kl": 0.07373046875, + "learning_rate": 3.2058636073932436e-07, + "loss": 0.0029, + "reward": 1.5276161432266235, + "reward_std": 0.07765782624483109, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.30886608362197876, + "step": 2132 + }, + { + "completion_length": 205.203125, + "epoch": 0.6797323135755258, + "grad_norm": 35.02643966674805, + "kl": 0.10302734375, + "learning_rate": 3.2026768642447417e-07, + "loss": 0.0041, + "reward": 1.581708312034607, + "reward_std": 0.09050709009170532, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5817083120346069, + "step": 2133 + }, + { + "completion_length": 232.484375, + "epoch": 0.680050987890376, + "grad_norm": 27.2739200592041, + "kl": 0.0869140625, + "learning_rate": 3.199490121096239e-07, + "loss": 0.0035, + "reward": 1.488992691040039, + "reward_std": 0.08124648034572601, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36399269104003906, + "rewards/pad": 0.125, + "step": 2134 + }, + { + "completion_length": 351.234375, + "epoch": 0.6803696622052262, + "grad_norm": 8.70663070678711, + "kl": 0.0771484375, + "learning_rate": 3.1963033779477373e-07, + "loss": 0.0031, + "reward": 1.446648120880127, + "reward_std": 0.03918899595737457, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4466480612754822, + "step": 2135 + }, + { + "completion_length": 297.484375, + "epoch": 0.6806883365200764, + "grad_norm": 25.142175674438477, + "kl": 0.062255859375, + "learning_rate": 3.193116634799235e-07, + "loss": 0.0025, + "reward": 1.6298236846923828, + "reward_std": 0.10432236641645432, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.39544859528541565, + "step": 2136 + }, + { + "completion_length": 224.484375, + "epoch": 0.6810070108349267, + "grad_norm": 7.5887627601623535, + "kl": 0.0830078125, + "learning_rate": 3.189929891650733e-07, + "loss": 0.0033, + "reward": 1.6977254152297974, + "reward_std": 0.0781145989894867, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5727253556251526, + "step": 2137 + }, + { + "completion_length": 242.59375, + "epoch": 0.681325685149777, + "grad_norm": 16.79660415649414, + "kl": 0.09130859375, + "learning_rate": 3.1867431485022304e-07, + "loss": 0.0037, + "reward": 1.3872432708740234, + "reward_std": 0.06355497241020203, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3872433304786682, + "rewards/pad": 0.0, + "step": 2138 + }, + { + "completion_length": 162.296875, + "epoch": 0.6816443594646272, + "grad_norm": 14.901323318481445, + "kl": 0.11474609375, + "learning_rate": 3.1835564053537285e-07, + "loss": 0.0046, + "reward": 1.5048975944519043, + "reward_std": 0.12836740911006927, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4892726242542267, + "rewards/pad": 0.015625, + "step": 2139 + }, + { + "completion_length": 202.75, + "epoch": 0.6819630337794774, + "grad_norm": 29.60565757751465, + "kl": 0.09716796875, + "learning_rate": 3.180369662205226e-07, + "loss": 0.0039, + "reward": 1.6599183082580566, + "reward_std": 0.10319438576698303, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5661684274673462, + "step": 2140 + }, + { + "completion_length": 303.40625, + "epoch": 0.6822817080943276, + "grad_norm": 20.659942626953125, + "kl": 0.06201171875, + "learning_rate": 3.177182919056724e-07, + "loss": 0.0025, + "reward": 1.590092420578003, + "reward_std": 0.11927967518568039, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5119674801826477, + "rewards/pad": 0.078125, + "step": 2141 + }, + { + "completion_length": 231.1875, + "epoch": 0.6826003824091779, + "grad_norm": 12.383918762207031, + "kl": 0.08203125, + "learning_rate": 3.1739961759082217e-07, + "loss": 0.0033, + "reward": 1.486088514328003, + "reward_std": 0.06917048245668411, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.36108851432800293, + "rewards/pad": 0.125, + "step": 2142 + }, + { + "completion_length": 244.96875, + "epoch": 0.6829190567240281, + "grad_norm": 4.997034549713135, + "kl": 0.09521484375, + "learning_rate": 3.17080943275972e-07, + "loss": 0.0038, + "reward": 1.5020737648010254, + "reward_std": 0.06794846802949905, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.37707382440567017, + "step": 2143 + }, + { + "completion_length": 206.140625, + "epoch": 0.6832377310388783, + "grad_norm": 13.505306243896484, + "kl": 0.08203125, + "learning_rate": 3.1676226896112173e-07, + "loss": 0.0033, + "reward": 1.6951855421066284, + "reward_std": 0.07091044634580612, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4451856017112732, + "step": 2144 + }, + { + "completion_length": 355.734375, + "epoch": 0.6835564053537285, + "grad_norm": 19.45197296142578, + "kl": 0.1767578125, + "learning_rate": 3.1644359464627153e-07, + "loss": 0.0071, + "reward": 1.5212619304656982, + "reward_std": 0.09661325812339783, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5368869304656982, + "step": 2145 + }, + { + "completion_length": 378.578125, + "epoch": 0.6838750796685787, + "grad_norm": 10.446003913879395, + "kl": 0.052978515625, + "learning_rate": 3.161249203314213e-07, + "loss": 0.0021, + "reward": 1.5361788272857666, + "reward_std": 0.15842914581298828, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4424287974834442, + "step": 2146 + }, + { + "completion_length": 224.28125, + "epoch": 0.684193753983429, + "grad_norm": 16.769617080688477, + "kl": 0.09130859375, + "learning_rate": 3.158062460165711e-07, + "loss": 0.0037, + "reward": 1.5884971618652344, + "reward_std": 0.1478859782218933, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6041221618652344, + "rewards/pad": 0.0, + "step": 2147 + }, + { + "completion_length": 209.46875, + "epoch": 0.6845124282982792, + "grad_norm": 14.459205627441406, + "kl": 0.0966796875, + "learning_rate": 3.154875717017208e-07, + "loss": 0.0039, + "reward": 1.5993014574050903, + "reward_std": 0.1308264136314392, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6149263978004456, + "rewards/pad": 0.0, + "step": 2148 + }, + { + "completion_length": 166.890625, + "epoch": 0.6848311026131294, + "grad_norm": 12.089001655578613, + "kl": 0.1171875, + "learning_rate": 3.151688973868706e-07, + "loss": 0.0047, + "reward": 1.6179935932159424, + "reward_std": 0.14248764514923096, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3836185932159424, + "rewards/pad": 0.25, + "step": 2149 + }, + { + "completion_length": 262.203125, + "epoch": 0.6851497769279796, + "grad_norm": 25.056171417236328, + "kl": 0.06884765625, + "learning_rate": 3.1485022307202036e-07, + "loss": 0.0028, + "reward": 1.6065418720245361, + "reward_std": 0.1253398358821869, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5752919912338257, + "step": 2150 + }, + { + "completion_length": 309.8125, + "epoch": 0.6854684512428298, + "grad_norm": 12.220285415649414, + "kl": 0.064453125, + "learning_rate": 3.1453154875717016e-07, + "loss": 0.0026, + "reward": 1.4543453454971313, + "reward_std": 0.12550035119056702, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34497037529945374, + "rewards/pad": 0.125, + "step": 2151 + }, + { + "completion_length": 238.40625, + "epoch": 0.6857871255576801, + "grad_norm": 9.260493278503418, + "kl": 0.09228515625, + "learning_rate": 3.142128744423199e-07, + "loss": 0.0037, + "reward": 1.6811506748199463, + "reward_std": 0.12158751487731934, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5405256748199463, + "rewards/pad": 0.140625, + "step": 2152 + }, + { + "completion_length": 247.828125, + "epoch": 0.6861057998725303, + "grad_norm": 16.205915451049805, + "kl": 0.11328125, + "learning_rate": 3.1389420012746967e-07, + "loss": 0.0045, + "reward": 1.6481878757476807, + "reward_std": 0.09217728674411774, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5231878757476807, + "step": 2153 + }, + { + "completion_length": 317.828125, + "epoch": 0.6864244741873805, + "grad_norm": 9.023106575012207, + "kl": 0.07275390625, + "learning_rate": 3.135755258126195e-07, + "loss": 0.0029, + "reward": 1.5768007040023804, + "reward_std": 0.09698163717985153, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4518006443977356, + "rewards/pad": 0.125, + "step": 2154 + }, + { + "completion_length": 239.109375, + "epoch": 0.6867431485022307, + "grad_norm": 9.951905250549316, + "kl": 0.115234375, + "learning_rate": 3.1325685149776923e-07, + "loss": 0.0046, + "reward": 1.6389799118041992, + "reward_std": 0.2226027250289917, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.5608548521995544, + "rewards/pad": 0.125, + "step": 2155 + }, + { + "completion_length": 201.875, + "epoch": 0.687061822817081, + "grad_norm": 13.409597396850586, + "kl": 0.087890625, + "learning_rate": 3.1293817718291904e-07, + "loss": 0.0035, + "reward": 1.8152843713760376, + "reward_std": 0.149655282497406, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.5965343713760376, + "step": 2156 + }, + { + "completion_length": 171.328125, + "epoch": 0.6873804971319312, + "grad_norm": 12.817052841186523, + "kl": 0.1357421875, + "learning_rate": 3.126195028680688e-07, + "loss": 0.0054, + "reward": 1.5036648511886597, + "reward_std": 0.1525655835866928, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4099148213863373, + "rewards/pad": 0.109375, + "step": 2157 + }, + { + "completion_length": 221.375, + "epoch": 0.6876991714467814, + "grad_norm": 10.492629051208496, + "kl": 0.10498046875, + "learning_rate": 3.123008285532186e-07, + "loss": 0.0042, + "reward": 1.6159346103668213, + "reward_std": 0.19899283349514008, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5221847295761108, + "step": 2158 + }, + { + "completion_length": 344.296875, + "epoch": 0.6880178457616316, + "grad_norm": 7.73666524887085, + "kl": 0.11279296875, + "learning_rate": 3.1198215423836836e-07, + "loss": 0.0045, + "reward": 1.3504173755645752, + "reward_std": 0.09438289701938629, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.36604249477386475, + "rewards/pad": 0.0, + "step": 2159 + }, + { + "completion_length": 256.203125, + "epoch": 0.6883365200764818, + "grad_norm": 23.789533615112305, + "kl": 0.07666015625, + "learning_rate": 3.1166347992351816e-07, + "loss": 0.0031, + "reward": 1.567455530166626, + "reward_std": 0.056159138679504395, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3174554705619812, + "step": 2160 + }, + { + "completion_length": 197.8125, + "epoch": 0.688655194391332, + "grad_norm": 10.632448196411133, + "kl": 0.091796875, + "learning_rate": 3.113448056086679e-07, + "loss": 0.0037, + "reward": 1.5515515804290771, + "reward_std": 0.14573228359222412, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47342658042907715, + "step": 2161 + }, + { + "completion_length": 266.59375, + "epoch": 0.6889738687061823, + "grad_norm": 18.066492080688477, + "kl": 0.0771484375, + "learning_rate": 3.110261312938177e-07, + "loss": 0.0031, + "reward": 1.5058605670928955, + "reward_std": 0.10740061849355698, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38086065649986267, + "step": 2162 + }, + { + "completion_length": 209.8125, + "epoch": 0.6892925430210325, + "grad_norm": 16.501205444335938, + "kl": 0.08544921875, + "learning_rate": 3.107074569789675e-07, + "loss": 0.0034, + "reward": 1.7047884464263916, + "reward_std": 0.07390573620796204, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4547883868217468, + "step": 2163 + }, + { + "completion_length": 153.609375, + "epoch": 0.6896112173358827, + "grad_norm": 11.284491539001465, + "kl": 0.11962890625, + "learning_rate": 3.103887826641173e-07, + "loss": 0.0048, + "reward": 1.6686841249465942, + "reward_std": 0.055707305669784546, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6686841249465942, + "rewards/pad": 0.0, + "step": 2164 + }, + { + "completion_length": 222.65625, + "epoch": 0.6899298916507329, + "grad_norm": 19.294475555419922, + "kl": 0.0947265625, + "learning_rate": 3.1007010834926704e-07, + "loss": 0.0038, + "reward": 1.5718317031860352, + "reward_std": 0.15449334681034088, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5093317031860352, + "rewards/pad": 0.0625, + "step": 2165 + }, + { + "completion_length": 262.125, + "epoch": 0.6902485659655831, + "grad_norm": 10.915752410888672, + "kl": 0.08642578125, + "learning_rate": 3.0975143403441685e-07, + "loss": 0.0035, + "reward": 1.6148755550384521, + "reward_std": 0.09238608181476593, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4898754954338074, + "step": 2166 + }, + { + "completion_length": 253.265625, + "epoch": 0.6905672402804334, + "grad_norm": 20.450952529907227, + "kl": 0.1025390625, + "learning_rate": 3.094327597195666e-07, + "loss": 0.0041, + "reward": 1.6529321670532227, + "reward_std": 0.05808892846107483, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5279321670532227, + "step": 2167 + }, + { + "completion_length": 267.890625, + "epoch": 0.6908859145952836, + "grad_norm": 9.19316577911377, + "kl": 0.07080078125, + "learning_rate": 3.0911408540471635e-07, + "loss": 0.0028, + "reward": 1.5765550136566162, + "reward_std": 0.12725546956062317, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4671799838542938, + "step": 2168 + }, + { + "completion_length": 202.6875, + "epoch": 0.6912045889101338, + "grad_norm": 19.015901565551758, + "kl": 0.11083984375, + "learning_rate": 3.087954110898661e-07, + "loss": 0.0044, + "reward": 1.5361312627792358, + "reward_std": 0.14626570045948029, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5517563223838806, + "rewards/pad": 0.0, + "step": 2169 + }, + { + "completion_length": 184.296875, + "epoch": 0.691523263224984, + "grad_norm": 14.142531394958496, + "kl": 0.0888671875, + "learning_rate": 3.084767367750159e-07, + "loss": 0.0036, + "reward": 1.4200398921966553, + "reward_std": 0.1185888797044754, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2950398325920105, + "step": 2170 + }, + { + "completion_length": 324.59375, + "epoch": 0.6918419375398343, + "grad_norm": 10.305068016052246, + "kl": 0.07275390625, + "learning_rate": 3.0815806246016567e-07, + "loss": 0.0029, + "reward": 1.5552586317062378, + "reward_std": 0.09846571087837219, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5708836317062378, + "step": 2171 + }, + { + "completion_length": 312.921875, + "epoch": 0.6921606118546845, + "grad_norm": 7.395571708679199, + "kl": 0.06298828125, + "learning_rate": 3.078393881453155e-07, + "loss": 0.0025, + "reward": 1.5822372436523438, + "reward_std": 0.13736650347709656, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4259873032569885, + "rewards/pad": 0.15625, + "step": 2172 + }, + { + "completion_length": 200.625, + "epoch": 0.6924792861695347, + "grad_norm": 13.26996898651123, + "kl": 0.10498046875, + "learning_rate": 3.0752071383046523e-07, + "loss": 0.0042, + "reward": 1.5964655876159668, + "reward_std": 0.04902723804116249, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5964655876159668, + "rewards/pad": 0.0, + "step": 2173 + }, + { + "completion_length": 196.3125, + "epoch": 0.6927979604843849, + "grad_norm": 19.934968948364258, + "kl": 0.10498046875, + "learning_rate": 3.0720203951561504e-07, + "loss": 0.0042, + "reward": 1.6381475925445557, + "reward_std": 0.11138331890106201, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5131475925445557, + "rewards/pad": 0.125, + "step": 2174 + }, + { + "completion_length": 140.171875, + "epoch": 0.6931166347992351, + "grad_norm": 13.879480361938477, + "kl": 0.1142578125, + "learning_rate": 3.068833652007648e-07, + "loss": 0.0046, + "reward": 1.5448079109191895, + "reward_std": 0.09733173996210098, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5448079109191895, + "rewards/pad": 0.0, + "step": 2175 + }, + { + "completion_length": 261.96875, + "epoch": 0.6934353091140854, + "grad_norm": 10.17612075805664, + "kl": 0.076171875, + "learning_rate": 3.065646908859146e-07, + "loss": 0.0031, + "reward": 1.7340949773788452, + "reward_std": 0.06729496270418167, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48409509658813477, + "rewards/pad": 0.25, + "step": 2176 + }, + { + "completion_length": 238.140625, + "epoch": 0.6937539834289357, + "grad_norm": 9.040535926818848, + "kl": 0.08447265625, + "learning_rate": 3.0624601657106435e-07, + "loss": 0.0034, + "reward": 1.5748553276062012, + "reward_std": 0.07171687483787537, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5748553276062012, + "rewards/pad": 0.0, + "step": 2177 + }, + { + "completion_length": 219.59375, + "epoch": 0.6940726577437859, + "grad_norm": 8.737715721130371, + "kl": 0.16015625, + "learning_rate": 3.0592734225621416e-07, + "loss": 0.0064, + "reward": 1.494781255722046, + "reward_std": 0.08365820348262787, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49478113651275635, + "step": 2178 + }, + { + "completion_length": 274.984375, + "epoch": 0.6943913320586361, + "grad_norm": 27.758153915405273, + "kl": 0.0771484375, + "learning_rate": 3.056086679413639e-07, + "loss": 0.0031, + "reward": 1.5322539806365967, + "reward_std": 0.09594956040382385, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5478790402412415, + "rewards/pad": 0.0, + "step": 2179 + }, + { + "completion_length": 278.5, + "epoch": 0.6947100063734863, + "grad_norm": 8.370512962341309, + "kl": 0.08544921875, + "learning_rate": 3.052899936265137e-07, + "loss": 0.0034, + "reward": 1.5229417085647583, + "reward_std": 0.09253714978694916, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5385667085647583, + "step": 2180 + }, + { + "completion_length": 155.171875, + "epoch": 0.6950286806883366, + "grad_norm": 38.28666687011719, + "kl": 0.1123046875, + "learning_rate": 3.049713193116635e-07, + "loss": 0.0045, + "reward": 1.7286458015441895, + "reward_std": 0.15620869398117065, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5098958015441895, + "rewards/pad": 0.234375, + "step": 2181 + }, + { + "completion_length": 345.28125, + "epoch": 0.6953473550031868, + "grad_norm": 9.88138484954834, + "kl": 0.05419921875, + "learning_rate": 3.046526449968133e-07, + "loss": 0.0022, + "reward": 1.5823705196380615, + "reward_std": 0.1268407106399536, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4104955792427063, + "step": 2182 + }, + { + "completion_length": 283.203125, + "epoch": 0.695666029318037, + "grad_norm": 19.334638595581055, + "kl": 0.06494140625, + "learning_rate": 3.0433397068196304e-07, + "loss": 0.0026, + "reward": 1.5107245445251465, + "reward_std": 0.09902769327163696, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40134957432746887, + "rewards/pad": 0.125, + "step": 2183 + }, + { + "completion_length": 257.359375, + "epoch": 0.6959847036328872, + "grad_norm": 7.854503154754639, + "kl": 0.07763671875, + "learning_rate": 3.0401529636711284e-07, + "loss": 0.0031, + "reward": 1.5665841102600098, + "reward_std": 0.16165412962436676, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.488459050655365, + "step": 2184 + }, + { + "completion_length": 102.671875, + "epoch": 0.6963033779477374, + "grad_norm": 18.900053024291992, + "kl": 0.1240234375, + "learning_rate": 3.036966220522626e-07, + "loss": 0.005, + "reward": 1.6835215091705322, + "reward_std": 0.05428507551550865, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.683521568775177, + "rewards/pad": 0.0, + "step": 2185 + }, + { + "completion_length": 171.46875, + "epoch": 0.6966220522625877, + "grad_norm": 12.507633209228516, + "kl": 0.09619140625, + "learning_rate": 3.0337794773741235e-07, + "loss": 0.0039, + "reward": 1.5419970750808716, + "reward_std": 0.08573351800441742, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5419970154762268, + "rewards/pad": 0.0, + "step": 2186 + }, + { + "completion_length": 291.046875, + "epoch": 0.6969407265774379, + "grad_norm": 12.020310401916504, + "kl": 0.0771484375, + "learning_rate": 3.030592734225621e-07, + "loss": 0.0031, + "reward": 1.5739872455596924, + "reward_std": 0.07773323357105255, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5739873647689819, + "step": 2187 + }, + { + "completion_length": 192.609375, + "epoch": 0.6972594008922881, + "grad_norm": 66.1197509765625, + "kl": 0.2138671875, + "learning_rate": 3.0274059910771186e-07, + "loss": 0.0085, + "reward": 1.4427610635757446, + "reward_std": 0.12355436384677887, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4583861231803894, + "rewards/pad": 0.0, + "step": 2188 + }, + { + "completion_length": 233.828125, + "epoch": 0.6975780752071383, + "grad_norm": 15.46832275390625, + "kl": 0.11328125, + "learning_rate": 3.0242192479286167e-07, + "loss": 0.0045, + "reward": 1.5915331840515137, + "reward_std": 0.11270304024219513, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6071581840515137, + "rewards/pad": 0.0, + "step": 2189 + }, + { + "completion_length": 304.765625, + "epoch": 0.6978967495219885, + "grad_norm": 11.094467163085938, + "kl": 0.1494140625, + "learning_rate": 3.021032504780114e-07, + "loss": 0.006, + "reward": 1.4202561378479004, + "reward_std": 0.09753061085939407, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4202560782432556, + "rewards/pad": 0.0, + "step": 2190 + }, + { + "completion_length": 205.84375, + "epoch": 0.6982154238368388, + "grad_norm": 12.484258651733398, + "kl": 0.08154296875, + "learning_rate": 3.017845761631612e-07, + "loss": 0.0033, + "reward": 1.753688931465149, + "reward_std": 0.07068890333175659, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3786890208721161, + "step": 2191 + }, + { + "completion_length": 192.96875, + "epoch": 0.698534098151689, + "grad_norm": 16.866378784179688, + "kl": 0.11083984375, + "learning_rate": 3.01465901848311e-07, + "loss": 0.0044, + "reward": 1.4792733192443848, + "reward_std": 0.12700484693050385, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.38552331924438477, + "rewards/pad": 0.125, + "step": 2192 + }, + { + "completion_length": 367.0625, + "epoch": 0.6988527724665392, + "grad_norm": 6.141723155975342, + "kl": 0.08349609375, + "learning_rate": 3.011472275334608e-07, + "loss": 0.0033, + "reward": 1.4423532485961914, + "reward_std": 0.0903225690126419, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.457978218793869, + "step": 2193 + }, + { + "completion_length": 239.140625, + "epoch": 0.6991714467813894, + "grad_norm": 7.0455732345581055, + "kl": 0.087890625, + "learning_rate": 3.0082855321861054e-07, + "loss": 0.0035, + "reward": 1.6241099834442139, + "reward_std": 0.0879855751991272, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49911004304885864, + "rewards/pad": 0.125, + "step": 2194 + }, + { + "completion_length": 280.640625, + "epoch": 0.6994901210962396, + "grad_norm": 17.386064529418945, + "kl": 0.07275390625, + "learning_rate": 3.0050987890376035e-07, + "loss": 0.0029, + "reward": 1.50416898727417, + "reward_std": 0.08903595060110092, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4260439872741699, + "step": 2195 + }, + { + "completion_length": 147.21875, + "epoch": 0.6998087954110899, + "grad_norm": 13.106439590454102, + "kl": 0.10888671875, + "learning_rate": 3.001912045889101e-07, + "loss": 0.0044, + "reward": 1.604874849319458, + "reward_std": 0.1985802948474884, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4173748195171356, + "rewards/pad": 0.203125, + "step": 2196 + }, + { + "completion_length": 192.84375, + "epoch": 0.7001274697259401, + "grad_norm": 16.086668014526367, + "kl": 0.09130859375, + "learning_rate": 2.998725302740599e-07, + "loss": 0.0036, + "reward": 1.608708381652832, + "reward_std": 0.057043883949518204, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6087083220481873, + "rewards/pad": 0.0, + "step": 2197 + }, + { + "completion_length": 232.875, + "epoch": 0.7004461440407903, + "grad_norm": 10.739706993103027, + "kl": 0.0908203125, + "learning_rate": 2.9955385595920966e-07, + "loss": 0.0036, + "reward": 1.5825889110565186, + "reward_std": 0.07782239466905594, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5825889110565186, + "rewards/pad": 0.0, + "step": 2198 + }, + { + "completion_length": 114.265625, + "epoch": 0.7007648183556405, + "grad_norm": 50.676753997802734, + "kl": 0.11865234375, + "learning_rate": 2.9923518164435947e-07, + "loss": 0.0047, + "reward": 1.7058982849121094, + "reward_std": 0.19980685412883759, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5652732849121094, + "step": 2199 + }, + { + "completion_length": 191.40625, + "epoch": 0.7010834926704907, + "grad_norm": 15.388541221618652, + "kl": 0.0849609375, + "learning_rate": 2.989165073295092e-07, + "loss": 0.0034, + "reward": 1.5542172193527222, + "reward_std": 0.06731617450714111, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3042171895503998, + "step": 2200 + }, + { + "completion_length": 230.0625, + "epoch": 0.701402166985341, + "grad_norm": 10.774901390075684, + "kl": 0.0859375, + "learning_rate": 2.9859783301465903e-07, + "loss": 0.0034, + "reward": 1.502295970916748, + "reward_std": 0.10654985159635544, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39292100071907043, + "rewards/pad": 0.125, + "step": 2201 + }, + { + "completion_length": 252.640625, + "epoch": 0.7017208413001912, + "grad_norm": 7.72211217880249, + "kl": 0.08056640625, + "learning_rate": 2.982791586998088e-07, + "loss": 0.0032, + "reward": 1.6727725267410278, + "reward_std": 0.129264697432518, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5633975863456726, + "step": 2202 + }, + { + "completion_length": 414.328125, + "epoch": 0.7020395156150414, + "grad_norm": 9.224809646606445, + "kl": 0.06982421875, + "learning_rate": 2.979604843849586e-07, + "loss": 0.0028, + "reward": 1.4881536960601807, + "reward_std": 0.16598357260227203, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3787786662578583, + "step": 2203 + }, + { + "completion_length": 289.796875, + "epoch": 0.7023581899298916, + "grad_norm": 7.270951271057129, + "kl": 0.083984375, + "learning_rate": 2.9764181007010835e-07, + "loss": 0.0034, + "reward": 1.5620064735412598, + "reward_std": 0.07426491379737854, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5620065331459045, + "rewards/pad": 0.0, + "step": 2204 + }, + { + "completion_length": 199.078125, + "epoch": 0.7026768642447419, + "grad_norm": 8.401556968688965, + "kl": 0.1005859375, + "learning_rate": 2.9732313575525815e-07, + "loss": 0.004, + "reward": 1.50137197971344, + "reward_std": 0.09893225878477097, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5013719797134399, + "step": 2205 + }, + { + "completion_length": 241.796875, + "epoch": 0.7029955385595921, + "grad_norm": 13.952909469604492, + "kl": 0.08642578125, + "learning_rate": 2.970044614404079e-07, + "loss": 0.0035, + "reward": 1.5688952207565308, + "reward_std": 0.22908249497413635, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.490770161151886, + "step": 2206 + }, + { + "completion_length": 305.25, + "epoch": 0.7033142128744423, + "grad_norm": 11.196319580078125, + "kl": 0.07470703125, + "learning_rate": 2.9668578712555766e-07, + "loss": 0.003, + "reward": 1.534885048866272, + "reward_std": 0.10356839001178741, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.550510048866272, + "step": 2207 + }, + { + "completion_length": 167.203125, + "epoch": 0.7036328871892925, + "grad_norm": 7.589974403381348, + "kl": 0.107421875, + "learning_rate": 2.963671128107074e-07, + "loss": 0.0043, + "reward": 1.5548350811004639, + "reward_std": 0.14660724997520447, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38296017050743103, + "rewards/pad": 0.171875, + "step": 2208 + }, + { + "completion_length": 317.625, + "epoch": 0.7039515615041427, + "grad_norm": 5.4483962059021, + "kl": 0.078125, + "learning_rate": 2.960484384958572e-07, + "loss": 0.0031, + "reward": 1.4976832866668701, + "reward_std": 0.12796911597251892, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5289332866668701, + "rewards/pad": 0.0, + "step": 2209 + }, + { + "completion_length": 262.40625, + "epoch": 0.704270235818993, + "grad_norm": 31.277687072753906, + "kl": 0.1015625, + "learning_rate": 2.95729764181007e-07, + "loss": 0.0041, + "reward": 1.6184595823287964, + "reward_std": 0.17132757604122162, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.6497095823287964, + "step": 2210 + }, + { + "completion_length": 238.25, + "epoch": 0.7045889101338432, + "grad_norm": 12.485980033874512, + "kl": 0.0869140625, + "learning_rate": 2.954110898661568e-07, + "loss": 0.0035, + "reward": 1.5100154876708984, + "reward_std": 0.04877918213605881, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38501548767089844, + "step": 2211 + }, + { + "completion_length": 194.09375, + "epoch": 0.7049075844486934, + "grad_norm": 9.918503761291504, + "kl": 0.0908203125, + "learning_rate": 2.9509241555130654e-07, + "loss": 0.0036, + "reward": 1.5856094360351562, + "reward_std": 0.07165414094924927, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4606093764305115, + "step": 2212 + }, + { + "completion_length": 336.96875, + "epoch": 0.7052262587635436, + "grad_norm": 5.346742153167725, + "kl": 0.0654296875, + "learning_rate": 2.9477374123645634e-07, + "loss": 0.0026, + "reward": 1.417320966720581, + "reward_std": 0.14546999335289001, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40169599652290344, + "rewards/pad": 0.03125, + "step": 2213 + }, + { + "completion_length": 274.9375, + "epoch": 0.7055449330783938, + "grad_norm": 12.8856201171875, + "kl": 0.072265625, + "learning_rate": 2.944550669216061e-07, + "loss": 0.0029, + "reward": 1.5375027656555176, + "reward_std": 0.11782203614711761, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5531278252601624, + "step": 2214 + }, + { + "completion_length": 251.359375, + "epoch": 0.705863607393244, + "grad_norm": 9.751949310302734, + "kl": 0.0830078125, + "learning_rate": 2.9413639260675585e-07, + "loss": 0.0033, + "reward": 1.5835912227630615, + "reward_std": 0.10557487607002258, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5210912227630615, + "rewards/pad": 0.0625, + "step": 2215 + }, + { + "completion_length": 152.375, + "epoch": 0.7061822817080943, + "grad_norm": 10.901607513427734, + "kl": 0.10009765625, + "learning_rate": 2.9381771829190566e-07, + "loss": 0.004, + "reward": 1.6085469722747803, + "reward_std": 0.15042608976364136, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45229700207710266, + "rewards/pad": 0.171875, + "step": 2216 + }, + { + "completion_length": 296.390625, + "epoch": 0.7065009560229446, + "grad_norm": 10.667808532714844, + "kl": 0.07861328125, + "learning_rate": 2.934990439770554e-07, + "loss": 0.0031, + "reward": 1.5774935483932495, + "reward_std": 0.060238465666770935, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4524935781955719, + "step": 2217 + }, + { + "completion_length": 209.09375, + "epoch": 0.7068196303377948, + "grad_norm": 7.474569797515869, + "kl": 0.07861328125, + "learning_rate": 2.931803696622052e-07, + "loss": 0.0031, + "reward": 1.9297740459442139, + "reward_std": 0.1745995581150055, + "rewards/answer_reward": 0.359375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5860241055488586, + "step": 2218 + }, + { + "completion_length": 301.015625, + "epoch": 0.707138304652645, + "grad_norm": 11.891040802001953, + "kl": 0.07080078125, + "learning_rate": 2.92861695347355e-07, + "loss": 0.0028, + "reward": 1.4260377883911133, + "reward_std": 0.12861579656600952, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44166284799575806, + "rewards/pad": 0.0, + "step": 2219 + }, + { + "completion_length": 387.03125, + "epoch": 0.7074569789674953, + "grad_norm": 12.634696960449219, + "kl": 0.0830078125, + "learning_rate": 2.925430210325048e-07, + "loss": 0.0033, + "reward": 1.5856103897094727, + "reward_std": 0.13534234464168549, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.49186044931411743, + "step": 2220 + }, + { + "completion_length": 266.484375, + "epoch": 0.7077756532823455, + "grad_norm": 8.80695629119873, + "kl": 0.083984375, + "learning_rate": 2.9222434671765454e-07, + "loss": 0.0033, + "reward": 1.4818050861358643, + "reward_std": 0.13370054960250854, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48180508613586426, + "step": 2221 + }, + { + "completion_length": 235.59375, + "epoch": 0.7080943275971957, + "grad_norm": 20.21587562561035, + "kl": 0.0869140625, + "learning_rate": 2.9190567240280434e-07, + "loss": 0.0035, + "reward": 1.457129955291748, + "reward_std": 0.15341603755950928, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47275495529174805, + "rewards/pad": 0.0, + "step": 2222 + }, + { + "completion_length": 143.390625, + "epoch": 0.7084130019120459, + "grad_norm": 21.482358932495117, + "kl": 0.125, + "learning_rate": 2.915869980879541e-07, + "loss": 0.005, + "reward": 1.4829232692718506, + "reward_std": 0.14291679859161377, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4985482692718506, + "rewards/pad": 0.0, + "step": 2223 + }, + { + "completion_length": 318.28125, + "epoch": 0.7087316762268961, + "grad_norm": 9.289806365966797, + "kl": 0.0830078125, + "learning_rate": 2.912683237731039e-07, + "loss": 0.0033, + "reward": 1.5218733549118042, + "reward_std": 0.1469164788722992, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4281233549118042, + "rewards/pad": 0.109375, + "step": 2224 + }, + { + "completion_length": 321.234375, + "epoch": 0.7090503505417464, + "grad_norm": 7.08596658706665, + "kl": 0.07470703125, + "learning_rate": 2.9094964945825366e-07, + "loss": 0.003, + "reward": 1.5650315284729004, + "reward_std": 0.06518622487783432, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5650315284729004, + "rewards/pad": 0.0, + "step": 2225 + }, + { + "completion_length": 190.640625, + "epoch": 0.7093690248565966, + "grad_norm": 15.268865585327148, + "kl": 0.087890625, + "learning_rate": 2.9063097514340346e-07, + "loss": 0.0035, + "reward": 1.6178052425384521, + "reward_std": 0.10819917172193527, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4303053319454193, + "step": 2226 + }, + { + "completion_length": 257.265625, + "epoch": 0.7096876991714468, + "grad_norm": 8.498229026794434, + "kl": 0.0966796875, + "learning_rate": 2.9031230082855317e-07, + "loss": 0.0039, + "reward": 1.4241349697113037, + "reward_std": 0.143952414393425, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4241349399089813, + "step": 2227 + }, + { + "completion_length": 190.03125, + "epoch": 0.710006373486297, + "grad_norm": 9.867719650268555, + "kl": 0.10888671875, + "learning_rate": 2.8999362651370297e-07, + "loss": 0.0044, + "reward": 1.3704220056533813, + "reward_std": 0.08321335911750793, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3860470652580261, + "rewards/pad": 0.0, + "step": 2228 + }, + { + "completion_length": 243.09375, + "epoch": 0.7103250478011472, + "grad_norm": 6.65676212310791, + "kl": 0.059814453125, + "learning_rate": 2.896749521988527e-07, + "loss": 0.0024, + "reward": 1.8582676649093628, + "reward_std": 0.20621733367443085, + "rewards/pad": 0.484375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3895176947116852, + "step": 2229 + }, + { + "completion_length": 259.859375, + "epoch": 0.7106437221159975, + "grad_norm": 12.128743171691895, + "kl": 0.09716796875, + "learning_rate": 2.8935627788400253e-07, + "loss": 0.0039, + "reward": 1.594397783279419, + "reward_std": 0.10106487572193146, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.594397783279419, + "rewards/pad": 0.0, + "step": 2230 + }, + { + "completion_length": 306.15625, + "epoch": 0.7109623964308477, + "grad_norm": 11.669578552246094, + "kl": 0.06396484375, + "learning_rate": 2.890376035691523e-07, + "loss": 0.0026, + "reward": 1.7158432006835938, + "reward_std": 0.08693593740463257, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6064682006835938, + "step": 2231 + }, + { + "completion_length": 206.515625, + "epoch": 0.7112810707456979, + "grad_norm": 18.729063034057617, + "kl": 0.12890625, + "learning_rate": 2.887189292543021e-07, + "loss": 0.0051, + "reward": 1.4813673496246338, + "reward_std": 0.12282691895961761, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.371992290019989, + "step": 2232 + }, + { + "completion_length": 315.859375, + "epoch": 0.7115997450605481, + "grad_norm": 29.33302116394043, + "kl": 0.09326171875, + "learning_rate": 2.8840025493945185e-07, + "loss": 0.0037, + "reward": 1.4940911531448364, + "reward_std": 0.05092655122280121, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4940911531448364, + "rewards/pad": 0.0, + "step": 2233 + }, + { + "completion_length": 266.46875, + "epoch": 0.7119184193753983, + "grad_norm": 21.585269927978516, + "kl": 0.1015625, + "learning_rate": 2.8808158062460166e-07, + "loss": 0.0041, + "reward": 1.5710127353668213, + "reward_std": 0.17600609362125397, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5397627949714661, + "rewards/pad": 0.046875, + "step": 2234 + }, + { + "completion_length": 267.984375, + "epoch": 0.7122370936902486, + "grad_norm": 7.6832451820373535, + "kl": 0.08251953125, + "learning_rate": 2.877629063097514e-07, + "loss": 0.0033, + "reward": 1.5236074924468994, + "reward_std": 0.1719309240579605, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5548574924468994, + "step": 2235 + }, + { + "completion_length": 152.515625, + "epoch": 0.7125557680050988, + "grad_norm": 20.99796485900879, + "kl": 0.125, + "learning_rate": 2.874442319949012e-07, + "loss": 0.005, + "reward": 1.5960841178894043, + "reward_std": 0.0930885523557663, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5960839986801147, + "rewards/pad": 0.0, + "step": 2236 + }, + { + "completion_length": 343.09375, + "epoch": 0.712874442319949, + "grad_norm": 21.34062385559082, + "kl": 0.0693359375, + "learning_rate": 2.8712555768005097e-07, + "loss": 0.0028, + "reward": 1.4536378383636475, + "reward_std": 0.10192979127168655, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4692626893520355, + "step": 2237 + }, + { + "completion_length": 207.6875, + "epoch": 0.7131931166347992, + "grad_norm": 46.177391052246094, + "kl": 0.10498046875, + "learning_rate": 2.868068833652008e-07, + "loss": 0.0042, + "reward": 1.6383215188980103, + "reward_std": 0.13554789125919342, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4508214294910431, + "rewards/pad": 0.1875, + "step": 2238 + }, + { + "completion_length": 195.5, + "epoch": 0.7135117909496494, + "grad_norm": 15.51397705078125, + "kl": 0.09228515625, + "learning_rate": 2.8648820905035053e-07, + "loss": 0.0037, + "reward": 1.394288420677185, + "reward_std": 0.05121123790740967, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3942883610725403, + "step": 2239 + }, + { + "completion_length": 258.84375, + "epoch": 0.7138304652644997, + "grad_norm": 16.36289405822754, + "kl": 0.0771484375, + "learning_rate": 2.8616953473550034e-07, + "loss": 0.0031, + "reward": 1.529281735420227, + "reward_std": 0.19541539251804352, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.35740670561790466, + "rewards/pad": 0.1875, + "step": 2240 + }, + { + "completion_length": 304.640625, + "epoch": 0.7141491395793499, + "grad_norm": 7.12017822265625, + "kl": 0.0751953125, + "learning_rate": 2.858508604206501e-07, + "loss": 0.003, + "reward": 1.5130261182785034, + "reward_std": 0.08079716563224792, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5130261182785034, + "rewards/pad": 0.0, + "step": 2241 + }, + { + "completion_length": 299.890625, + "epoch": 0.7144678138942001, + "grad_norm": 14.374198913574219, + "kl": 0.11279296875, + "learning_rate": 2.855321861057999e-07, + "loss": 0.0045, + "reward": 1.4909179210662842, + "reward_std": 0.10724478960037231, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5065428018569946, + "step": 2242 + }, + { + "completion_length": 316.75, + "epoch": 0.7147864882090503, + "grad_norm": 11.825737953186035, + "kl": 0.07568359375, + "learning_rate": 2.8521351179094965e-07, + "loss": 0.003, + "reward": 1.5616931915283203, + "reward_std": 0.17490637302398682, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.37419313192367554, + "step": 2243 + }, + { + "completion_length": 357.015625, + "epoch": 0.7151051625239006, + "grad_norm": 4.220314979553223, + "kl": 0.06591796875, + "learning_rate": 2.8489483747609946e-07, + "loss": 0.0026, + "reward": 1.5746936798095703, + "reward_std": 0.04713768512010574, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5746935606002808, + "step": 2244 + }, + { + "completion_length": 194.375, + "epoch": 0.7154238368387508, + "grad_norm": 9.533937454223633, + "kl": 0.08154296875, + "learning_rate": 2.845761631612492e-07, + "loss": 0.0033, + "reward": 1.9403507709503174, + "reward_std": 0.06749410927295685, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5653507709503174, + "rewards/pad": 0.375, + "step": 2245 + }, + { + "completion_length": 331.3125, + "epoch": 0.715742511153601, + "grad_norm": 7.48617696762085, + "kl": 0.0634765625, + "learning_rate": 2.842574888463989e-07, + "loss": 0.0025, + "reward": 1.603176236152649, + "reward_std": 0.1028289645910263, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4313012361526489, + "step": 2246 + }, + { + "completion_length": 250.5625, + "epoch": 0.7160611854684512, + "grad_norm": 10.276695251464844, + "kl": 0.10888671875, + "learning_rate": 2.839388145315487e-07, + "loss": 0.0044, + "reward": 1.653708815574646, + "reward_std": 0.1339433491230011, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.544333815574646, + "rewards/pad": 0.109375, + "step": 2247 + }, + { + "completion_length": 412.28125, + "epoch": 0.7163798597833014, + "grad_norm": 12.946741104125977, + "kl": 0.062255859375, + "learning_rate": 2.836201402166985e-07, + "loss": 0.0025, + "reward": 1.4689414501190186, + "reward_std": 0.11642967909574509, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4845663905143738, + "step": 2248 + }, + { + "completion_length": 282.09375, + "epoch": 0.7166985340981517, + "grad_norm": 7.711841583251953, + "kl": 0.0888671875, + "learning_rate": 2.833014659018483e-07, + "loss": 0.0035, + "reward": 1.492098331451416, + "reward_std": 0.11170074343681335, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.38272321224212646, + "step": 2249 + }, + { + "completion_length": 196.390625, + "epoch": 0.7170172084130019, + "grad_norm": 13.00545597076416, + "kl": 0.12255859375, + "learning_rate": 2.8298279158699804e-07, + "loss": 0.0049, + "reward": 1.6409862041473389, + "reward_std": 0.12882104516029358, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6566110849380493, + "step": 2250 + }, + { + "completion_length": 226.328125, + "epoch": 0.7173358827278521, + "grad_norm": 14.536446571350098, + "kl": 0.0986328125, + "learning_rate": 2.8266411727214784e-07, + "loss": 0.0039, + "reward": 1.617206335067749, + "reward_std": 0.08918547630310059, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.507831335067749, + "rewards/pad": 0.125, + "step": 2251 + }, + { + "completion_length": 316.0, + "epoch": 0.7176545570427023, + "grad_norm": 29.476350784301758, + "kl": 0.07958984375, + "learning_rate": 2.823454429572976e-07, + "loss": 0.0032, + "reward": 1.5252597332000732, + "reward_std": 0.10496915876865387, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5408847332000732, + "step": 2252 + }, + { + "completion_length": 218.40625, + "epoch": 0.7179732313575525, + "grad_norm": 39.07975387573242, + "kl": 0.1259765625, + "learning_rate": 2.820267686424474e-07, + "loss": 0.005, + "reward": 1.6677427291870117, + "reward_std": 0.0700826495885849, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5427427291870117, + "step": 2253 + }, + { + "completion_length": 382.328125, + "epoch": 0.7182919056724028, + "grad_norm": 33.672271728515625, + "kl": 0.06201171875, + "learning_rate": 2.8170809432759716e-07, + "loss": 0.0025, + "reward": 1.567516803741455, + "reward_std": 0.10253050923347473, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4581417441368103, + "step": 2254 + }, + { + "completion_length": 265.90625, + "epoch": 0.718610579987253, + "grad_norm": 7.391505718231201, + "kl": 0.07958984375, + "learning_rate": 2.8138942001274697e-07, + "loss": 0.0032, + "reward": 1.600637674331665, + "reward_std": 0.08258798718452454, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3506375551223755, + "rewards/pad": 0.25, + "step": 2255 + }, + { + "completion_length": 215.796875, + "epoch": 0.7189292543021033, + "grad_norm": 12.145453453063965, + "kl": 0.09716796875, + "learning_rate": 2.810707456978967e-07, + "loss": 0.0039, + "reward": 1.5579832792282104, + "reward_std": 0.06622102111577988, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4329833388328552, + "step": 2256 + }, + { + "completion_length": 157.46875, + "epoch": 0.7192479286169535, + "grad_norm": 26.469045639038086, + "kl": 0.12109375, + "learning_rate": 2.8075207138304653e-07, + "loss": 0.0048, + "reward": 1.8562612533569336, + "reward_std": 0.19814197719097137, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6687612533569336, + "rewards/pad": 0.1875, + "step": 2257 + }, + { + "completion_length": 164.8125, + "epoch": 0.7195666029318037, + "grad_norm": 12.529221534729004, + "kl": 0.125, + "learning_rate": 2.804333970681963e-07, + "loss": 0.005, + "reward": 1.5287246704101562, + "reward_std": 0.1076161339879036, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4037245512008667, + "rewards/pad": 0.125, + "step": 2258 + }, + { + "completion_length": 393.375, + "epoch": 0.719885277246654, + "grad_norm": 7.355383396148682, + "kl": 0.0498046875, + "learning_rate": 2.801147227533461e-07, + "loss": 0.002, + "reward": 1.5239171981811523, + "reward_std": 0.1284276247024536, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46141716837882996, + "step": 2259 + }, + { + "completion_length": 157.8125, + "epoch": 0.7202039515615042, + "grad_norm": 11.706124305725098, + "kl": 0.10498046875, + "learning_rate": 2.7979604843849584e-07, + "loss": 0.0042, + "reward": 1.599716067314148, + "reward_std": 0.28268083930015564, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5215910077095032, + "rewards/pad": 0.109375, + "step": 2260 + }, + { + "completion_length": 216.34375, + "epoch": 0.7205226258763544, + "grad_norm": 19.255443572998047, + "kl": 0.10986328125, + "learning_rate": 2.7947737412364565e-07, + "loss": 0.0044, + "reward": 1.5750744342803955, + "reward_std": 0.06895896792411804, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5750743746757507, + "rewards/pad": 0.0, + "step": 2261 + }, + { + "completion_length": 200.203125, + "epoch": 0.7208413001912046, + "grad_norm": 12.548673629760742, + "kl": 0.11083984375, + "learning_rate": 2.791586998087954e-07, + "loss": 0.0044, + "reward": 1.5564448833465576, + "reward_std": 0.09696632623672485, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4314449429512024, + "step": 2262 + }, + { + "completion_length": 211.109375, + "epoch": 0.7211599745060548, + "grad_norm": 16.141681671142578, + "kl": 0.12255859375, + "learning_rate": 2.788400254939452e-07, + "loss": 0.0049, + "reward": 1.605320930480957, + "reward_std": 0.1554790735244751, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5115709900856018, + "step": 2263 + }, + { + "completion_length": 172.0, + "epoch": 0.7214786488209051, + "grad_norm": 11.038640975952148, + "kl": 0.11669921875, + "learning_rate": 2.7852135117909496e-07, + "loss": 0.0047, + "reward": 1.8946161270141602, + "reward_std": 0.08197048306465149, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5196161866188049, + "rewards/pad": 0.375, + "step": 2264 + }, + { + "completion_length": 165.359375, + "epoch": 0.7217973231357553, + "grad_norm": 24.56603240966797, + "kl": 0.08984375, + "learning_rate": 2.7820267686424477e-07, + "loss": 0.0036, + "reward": 1.5874215364456177, + "reward_std": 0.12280981987714767, + "rewards/answer_reward": 0.28125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3061715364456177, + "step": 2265 + }, + { + "completion_length": 241.953125, + "epoch": 0.7221159974506055, + "grad_norm": 13.44584846496582, + "kl": 0.08203125, + "learning_rate": 2.7788400254939447e-07, + "loss": 0.0033, + "reward": 1.4582545757293701, + "reward_std": 0.04218538850545883, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.45825451612472534, + "step": 2266 + }, + { + "completion_length": 273.109375, + "epoch": 0.7224346717654557, + "grad_norm": 5.412264347076416, + "kl": 0.0859375, + "learning_rate": 2.775653282345443e-07, + "loss": 0.0034, + "reward": 1.7483843564987183, + "reward_std": 0.07010708749294281, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.49838435649871826, + "step": 2267 + }, + { + "completion_length": 203.796875, + "epoch": 0.722753346080306, + "grad_norm": 12.466039657592773, + "kl": 0.1083984375, + "learning_rate": 2.7724665391969403e-07, + "loss": 0.0043, + "reward": 1.511660099029541, + "reward_std": 0.047675177454948425, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.511660099029541, + "step": 2268 + }, + { + "completion_length": 299.453125, + "epoch": 0.7230720203951562, + "grad_norm": 41.059959411621094, + "kl": 0.076171875, + "learning_rate": 2.7692797960484384e-07, + "loss": 0.003, + "reward": 1.6285617351531982, + "reward_std": 0.10453909635543823, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5035616159439087, + "rewards/pad": 0.125, + "step": 2269 + }, + { + "completion_length": 216.765625, + "epoch": 0.7233906947100064, + "grad_norm": 9.3422212600708, + "kl": 0.0869140625, + "learning_rate": 2.766093052899936e-07, + "loss": 0.0035, + "reward": 1.7067334651947021, + "reward_std": 0.11028394103050232, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4723585247993469, + "step": 2270 + }, + { + "completion_length": 307.953125, + "epoch": 0.7237093690248566, + "grad_norm": 12.578081130981445, + "kl": 0.08251953125, + "learning_rate": 2.762906309751434e-07, + "loss": 0.0033, + "reward": 1.5892455577850342, + "reward_std": 0.23204761743545532, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.5267455577850342, + "step": 2271 + }, + { + "completion_length": 187.015625, + "epoch": 0.7240280433397068, + "grad_norm": 14.036943435668945, + "kl": 0.10498046875, + "learning_rate": 2.7597195666029316e-07, + "loss": 0.0042, + "reward": 1.6664389371871948, + "reward_std": 0.10293828696012497, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6664389967918396, + "step": 2272 + }, + { + "completion_length": 210.96875, + "epoch": 0.724346717654557, + "grad_norm": 25.174528121948242, + "kl": 0.10400390625, + "learning_rate": 2.7565328234544296e-07, + "loss": 0.0042, + "reward": 1.5564359426498413, + "reward_std": 0.06936076283454895, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43143588304519653, + "rewards/pad": 0.125, + "step": 2273 + }, + { + "completion_length": 155.6875, + "epoch": 0.7246653919694073, + "grad_norm": 4.558319091796875, + "kl": 0.134765625, + "learning_rate": 2.753346080305927e-07, + "loss": 0.0054, + "reward": 1.5315523147583008, + "reward_std": 0.0986432284116745, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.422177255153656, + "rewards/pad": 0.125, + "step": 2274 + }, + { + "completion_length": 313.28125, + "epoch": 0.7249840662842575, + "grad_norm": 9.25501537322998, + "kl": 0.060546875, + "learning_rate": 2.750159337157425e-07, + "loss": 0.0024, + "reward": 1.6354743242263794, + "reward_std": 0.1479807049036026, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5417243242263794, + "rewards/pad": 0.125, + "step": 2275 + }, + { + "completion_length": 112.90625, + "epoch": 0.7253027405991077, + "grad_norm": 19.316740036010742, + "kl": 0.107421875, + "learning_rate": 2.746972594008923e-07, + "loss": 0.0043, + "reward": 1.7034404277801514, + "reward_std": 0.05819811671972275, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.453440397977829, + "rewards/pad": 0.25, + "step": 2276 + }, + { + "completion_length": 339.046875, + "epoch": 0.7256214149139579, + "grad_norm": 6.1517815589904785, + "kl": 0.07763671875, + "learning_rate": 2.7437858508604203e-07, + "loss": 0.0031, + "reward": 1.485317349433899, + "reward_std": 0.15594914555549622, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37594231963157654, + "step": 2277 + }, + { + "completion_length": 237.09375, + "epoch": 0.7259400892288081, + "grad_norm": 22.668804168701172, + "kl": 0.0888671875, + "learning_rate": 2.7405991077119184e-07, + "loss": 0.0036, + "reward": 1.654649019241333, + "reward_std": 0.14173540472984314, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5921490788459778, + "rewards/pad": 0.0625, + "step": 2278 + }, + { + "completion_length": 179.484375, + "epoch": 0.7262587635436584, + "grad_norm": 13.707090377807617, + "kl": 0.11328125, + "learning_rate": 2.737412364563416e-07, + "loss": 0.0045, + "reward": 1.6936432123184204, + "reward_std": 0.1634928286075592, + "rewards/answer_reward": 0.265625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4280182123184204, + "step": 2279 + }, + { + "completion_length": 212.328125, + "epoch": 0.7265774378585086, + "grad_norm": 11.429169654846191, + "kl": 0.111328125, + "learning_rate": 2.734225621414914e-07, + "loss": 0.0044, + "reward": 1.50600004196167, + "reward_std": 0.13330598175525665, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5216250419616699, + "rewards/pad": 0.0, + "step": 2280 + }, + { + "completion_length": 312.8125, + "epoch": 0.7268961121733588, + "grad_norm": 9.42291259765625, + "kl": 0.0751953125, + "learning_rate": 2.7310388782664115e-07, + "loss": 0.003, + "reward": 1.5466002225875854, + "reward_std": 0.10822071880102158, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4372251033782959, + "step": 2281 + }, + { + "completion_length": 324.96875, + "epoch": 0.727214786488209, + "grad_norm": 13.664058685302734, + "kl": 0.06396484375, + "learning_rate": 2.7278521351179096e-07, + "loss": 0.0026, + "reward": 1.5122783184051514, + "reward_std": 0.09490574896335602, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.40290340781211853, + "step": 2282 + }, + { + "completion_length": 159.9375, + "epoch": 0.7275334608030593, + "grad_norm": 8.65300464630127, + "kl": 0.10009765625, + "learning_rate": 2.724665391969407e-07, + "loss": 0.004, + "reward": 1.7262582778930664, + "reward_std": 0.08647529780864716, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.47625815868377686, + "step": 2283 + }, + { + "completion_length": 281.9375, + "epoch": 0.7278521351179095, + "grad_norm": 14.41006088256836, + "kl": 0.076171875, + "learning_rate": 2.721478648820905e-07, + "loss": 0.003, + "reward": 1.639107346534729, + "reward_std": 0.10988225042819977, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5297324657440186, + "rewards/pad": 0.125, + "step": 2284 + }, + { + "completion_length": 111.6875, + "epoch": 0.7281708094327597, + "grad_norm": 24.305002212524414, + "kl": 0.138671875, + "learning_rate": 2.718291905672402e-07, + "loss": 0.0055, + "reward": 1.6760501861572266, + "reward_std": 0.19735486805438995, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5823003053665161, + "rewards/pad": 0.109375, + "step": 2285 + }, + { + "completion_length": 331.8125, + "epoch": 0.7284894837476099, + "grad_norm": 5.85117769241333, + "kl": 0.0625, + "learning_rate": 2.7151051625239003e-07, + "loss": 0.0025, + "reward": 1.3858469724655151, + "reward_std": 0.0686102956533432, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38584691286087036, + "step": 2286 + }, + { + "completion_length": 303.90625, + "epoch": 0.7288081580624601, + "grad_norm": 32.89085388183594, + "kl": 0.11181640625, + "learning_rate": 2.711918419375398e-07, + "loss": 0.0045, + "reward": 1.343575119972229, + "reward_std": 0.09186676144599915, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.343575119972229, + "rewards/pad": 0.0, + "step": 2287 + }, + { + "completion_length": 224.28125, + "epoch": 0.7291268323773104, + "grad_norm": 16.360694885253906, + "kl": 0.109375, + "learning_rate": 2.708731676226896e-07, + "loss": 0.0044, + "reward": 1.630078673362732, + "reward_std": 0.09970824420452118, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6457037329673767, + "step": 2288 + }, + { + "completion_length": 212.65625, + "epoch": 0.7294455066921606, + "grad_norm": 90.82035064697266, + "kl": 0.099609375, + "learning_rate": 2.7055449330783934e-07, + "loss": 0.004, + "reward": 1.3920116424560547, + "reward_std": 0.06360248476266861, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3920116424560547, + "rewards/pad": 0.0, + "step": 2289 + }, + { + "completion_length": 381.640625, + "epoch": 0.7297641810070108, + "grad_norm": 11.928213119506836, + "kl": 0.05517578125, + "learning_rate": 2.7023581899298915e-07, + "loss": 0.0022, + "reward": 1.4683802127838135, + "reward_std": 0.06154194846749306, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4683801829814911, + "rewards/pad": 0.0, + "step": 2290 + }, + { + "completion_length": 294.578125, + "epoch": 0.730082855321861, + "grad_norm": 25.566471099853516, + "kl": 0.09228515625, + "learning_rate": 2.699171446781389e-07, + "loss": 0.0037, + "reward": 1.4241492748260498, + "reward_std": 0.15201619267463684, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4553992748260498, + "step": 2291 + }, + { + "completion_length": 361.625, + "epoch": 0.7304015296367112, + "grad_norm": 38.10639190673828, + "kl": 0.059326171875, + "learning_rate": 2.695984703632887e-07, + "loss": 0.0024, + "reward": 1.5291061401367188, + "reward_std": 0.11724649369716644, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45098114013671875, + "rewards/pad": 0.078125, + "step": 2292 + }, + { + "completion_length": 207.078125, + "epoch": 0.7307202039515615, + "grad_norm": 15.148269653320312, + "kl": 0.1298828125, + "learning_rate": 2.6927979604843847e-07, + "loss": 0.0052, + "reward": 1.7448357343673706, + "reward_std": 0.19660058617591858, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5573357343673706, + "step": 2293 + }, + { + "completion_length": 113.921875, + "epoch": 0.7310388782664117, + "grad_norm": 26.689838409423828, + "kl": 0.134765625, + "learning_rate": 2.689611217335883e-07, + "loss": 0.0054, + "reward": 1.82138991355896, + "reward_std": 0.1653740108013153, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.57138991355896, + "step": 2294 + }, + { + "completion_length": 281.15625, + "epoch": 0.731357552581262, + "grad_norm": 10.188474655151367, + "kl": 0.091796875, + "learning_rate": 2.6864244741873803e-07, + "loss": 0.0037, + "reward": 1.4889552593231201, + "reward_std": 0.12610562145709991, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5045802593231201, + "rewards/pad": 0.0, + "step": 2295 + }, + { + "completion_length": 318.375, + "epoch": 0.7316762268961122, + "grad_norm": 5.968549728393555, + "kl": 0.06396484375, + "learning_rate": 2.6832377310388783e-07, + "loss": 0.0026, + "reward": 1.3989291191101074, + "reward_std": 0.16393840312957764, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3051791191101074, + "step": 2296 + }, + { + "completion_length": 303.234375, + "epoch": 0.7319949012109624, + "grad_norm": 10.959392547607422, + "kl": 0.06396484375, + "learning_rate": 2.680050987890376e-07, + "loss": 0.0026, + "reward": 1.6067535877227783, + "reward_std": 0.1501012146472931, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5130035877227783, + "step": 2297 + }, + { + "completion_length": 268.875, + "epoch": 0.7323135755258127, + "grad_norm": 11.456298828125, + "kl": 0.08447265625, + "learning_rate": 2.676864244741874e-07, + "loss": 0.0034, + "reward": 1.4659645557403564, + "reward_std": 0.11348327994346619, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46596449613571167, + "rewards/pad": 0.0, + "step": 2298 + }, + { + "completion_length": 230.703125, + "epoch": 0.7326322498406629, + "grad_norm": 41.293704986572266, + "kl": 0.123046875, + "learning_rate": 2.6736775015933715e-07, + "loss": 0.0049, + "reward": 1.5611324310302734, + "reward_std": 0.12068554759025574, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5298823714256287, + "rewards/pad": 0.03125, + "step": 2299 + }, + { + "completion_length": 248.234375, + "epoch": 0.7329509241555131, + "grad_norm": 18.223880767822266, + "kl": 0.080078125, + "learning_rate": 2.6704907584448696e-07, + "loss": 0.0032, + "reward": 1.879724383354187, + "reward_std": 0.1355212926864624, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5203494429588318, + "step": 2300 + }, + { + "completion_length": 153.40625, + "epoch": 0.7332695984703633, + "grad_norm": 22.09395980834961, + "kl": 0.115234375, + "learning_rate": 2.667304015296367e-07, + "loss": 0.0046, + "reward": 1.470009207725525, + "reward_std": 0.14496728777885437, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4700092077255249, + "rewards/pad": 0.0, + "step": 2301 + }, + { + "completion_length": 210.1875, + "epoch": 0.7335882727852135, + "grad_norm": 9.805888175964355, + "kl": 0.109375, + "learning_rate": 2.664117272147865e-07, + "loss": 0.0044, + "reward": 1.6735904216766357, + "reward_std": 0.08329583704471588, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.548590362071991, + "rewards/pad": 0.125, + "step": 2302 + }, + { + "completion_length": 106.1875, + "epoch": 0.7339069471000638, + "grad_norm": 15.89492416381836, + "kl": 0.1728515625, + "learning_rate": 2.6609305289993627e-07, + "loss": 0.0069, + "reward": 1.5772063732147217, + "reward_std": 0.13957607746124268, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45220643281936646, + "rewards/pad": 0.125, + "step": 2303 + }, + { + "completion_length": 268.859375, + "epoch": 0.734225621414914, + "grad_norm": 9.0073881149292, + "kl": 0.08544921875, + "learning_rate": 2.657743785850861e-07, + "loss": 0.0034, + "reward": 1.5893383026123047, + "reward_std": 0.10896583646535873, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4955883026123047, + "step": 2304 + }, + { + "completion_length": 243.046875, + "epoch": 0.7345442957297642, + "grad_norm": 6.57396936416626, + "kl": 0.09130859375, + "learning_rate": 2.654557042702358e-07, + "loss": 0.0036, + "reward": 1.7307865619659424, + "reward_std": 0.10434369742870331, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6057864427566528, + "rewards/pad": 0.125, + "step": 2305 + }, + { + "completion_length": 288.90625, + "epoch": 0.7348629700446144, + "grad_norm": 5.094200134277344, + "kl": 0.064453125, + "learning_rate": 2.651370299553856e-07, + "loss": 0.0026, + "reward": 1.5313658714294434, + "reward_std": 0.10805720835924149, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4376158118247986, + "rewards/pad": 0.09375, + "step": 2306 + }, + { + "completion_length": 193.25, + "epoch": 0.7351816443594646, + "grad_norm": 43.746063232421875, + "kl": 0.0947265625, + "learning_rate": 2.6481835564053534e-07, + "loss": 0.0038, + "reward": 1.545185923576355, + "reward_std": 0.15073031187057495, + "rewards/answer_reward": 0.078125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4670608937740326, + "step": 2307 + }, + { + "completion_length": 220.703125, + "epoch": 0.7355003186743149, + "grad_norm": 11.71030330657959, + "kl": 0.11865234375, + "learning_rate": 2.644996813256851e-07, + "loss": 0.0048, + "reward": 1.5706113576889038, + "reward_std": 0.1904187798500061, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4612364172935486, + "step": 2308 + }, + { + "completion_length": 197.125, + "epoch": 0.7358189929891651, + "grad_norm": 23.52389907836914, + "kl": 0.09716796875, + "learning_rate": 2.641810070108349e-07, + "loss": 0.0039, + "reward": 1.6343523263931274, + "reward_std": 0.14043483138084412, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5093523263931274, + "rewards/pad": 0.125, + "step": 2309 + }, + { + "completion_length": 292.390625, + "epoch": 0.7361376673040153, + "grad_norm": 23.054824829101562, + "kl": 0.06640625, + "learning_rate": 2.6386233269598466e-07, + "loss": 0.0027, + "reward": 1.565664529800415, + "reward_std": 0.163468599319458, + "rewards/pad": 0.1875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.39378952980041504, + "step": 2310 + }, + { + "completion_length": 217.765625, + "epoch": 0.7364563416188655, + "grad_norm": 9.4763765335083, + "kl": 0.076171875, + "learning_rate": 2.6354365838113446e-07, + "loss": 0.003, + "reward": 1.7144595384597778, + "reward_std": 0.10374715924263, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4800845682621002, + "rewards/pad": 0.234375, + "step": 2311 + }, + { + "completion_length": 234.046875, + "epoch": 0.7367750159337157, + "grad_norm": 14.154435157775879, + "kl": 0.1064453125, + "learning_rate": 2.632249840662842e-07, + "loss": 0.0043, + "reward": 1.5973759889602661, + "reward_std": 0.13200649619102478, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4880009591579437, + "rewards/pad": 0.125, + "step": 2312 + }, + { + "completion_length": 242.140625, + "epoch": 0.737093690248566, + "grad_norm": 9.745589256286621, + "kl": 0.0869140625, + "learning_rate": 2.62906309751434e-07, + "loss": 0.0035, + "reward": 1.4887791872024536, + "reward_std": 0.0891360491514206, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3637791574001312, + "step": 2313 + }, + { + "completion_length": 182.015625, + "epoch": 0.7374123645634162, + "grad_norm": 55.06594467163086, + "kl": 0.09765625, + "learning_rate": 2.625876354365838e-07, + "loss": 0.0039, + "reward": 1.7070443630218506, + "reward_std": 0.072098508477211, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5820443630218506, + "step": 2314 + }, + { + "completion_length": 253.75, + "epoch": 0.7377310388782664, + "grad_norm": 34.84463882446289, + "kl": 0.078125, + "learning_rate": 2.622689611217336e-07, + "loss": 0.0031, + "reward": 1.5879429578781128, + "reward_std": 0.15225547552108765, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4316929578781128, + "rewards/pad": 0.15625, + "step": 2315 + }, + { + "completion_length": 223.125, + "epoch": 0.7380497131931166, + "grad_norm": 8.495549201965332, + "kl": 0.0810546875, + "learning_rate": 2.6195028680688334e-07, + "loss": 0.0032, + "reward": 1.5012521743774414, + "reward_std": 0.14926576614379883, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.34500211477279663, + "step": 2316 + }, + { + "completion_length": 147.171875, + "epoch": 0.7383683875079669, + "grad_norm": 7.767884254455566, + "kl": 0.10498046875, + "learning_rate": 2.6163161249203315e-07, + "loss": 0.0042, + "reward": 1.7319977283477783, + "reward_std": 0.04964713752269745, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6069977283477783, + "step": 2317 + }, + { + "completion_length": 240.015625, + "epoch": 0.7386870618228171, + "grad_norm": 41.17066955566406, + "kl": 0.07373046875, + "learning_rate": 2.613129381771829e-07, + "loss": 0.0029, + "reward": 1.4253292083740234, + "reward_std": 0.13328561186790466, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4409542679786682, + "step": 2318 + }, + { + "completion_length": 106.796875, + "epoch": 0.7390057361376673, + "grad_norm": 428.2945556640625, + "kl": 1.484375, + "learning_rate": 2.609942638623327e-07, + "loss": 0.0595, + "reward": 1.5589778423309326, + "reward_std": 0.1439211368560791, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.433977872133255, + "rewards/pad": 0.125, + "step": 2319 + }, + { + "completion_length": 243.734375, + "epoch": 0.7393244104525175, + "grad_norm": 30.449155807495117, + "kl": 0.08642578125, + "learning_rate": 2.6067558954748246e-07, + "loss": 0.0034, + "reward": 1.480348825454712, + "reward_std": 0.11358199268579483, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3553488254547119, + "step": 2320 + }, + { + "completion_length": 334.75, + "epoch": 0.7396430847673677, + "grad_norm": 8.932828903198242, + "kl": 0.07275390625, + "learning_rate": 2.6035691523263227e-07, + "loss": 0.0029, + "reward": 1.5339847803115845, + "reward_std": 0.06272891163825989, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5339848399162292, + "step": 2321 + }, + { + "completion_length": 280.109375, + "epoch": 0.739961759082218, + "grad_norm": 14.46866512298584, + "kl": 0.0712890625, + "learning_rate": 2.60038240917782e-07, + "loss": 0.0028, + "reward": 1.4635485410690308, + "reward_std": 0.0832393690943718, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3385485112667084, + "rewards/pad": 0.125, + "step": 2322 + }, + { + "completion_length": 303.859375, + "epoch": 0.7402804333970682, + "grad_norm": 7.381771564483643, + "kl": 0.052734375, + "learning_rate": 2.5971956660293183e-07, + "loss": 0.0021, + "reward": 1.7793185710906982, + "reward_std": 0.07244474440813065, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.529318630695343, + "step": 2323 + }, + { + "completion_length": 136.515625, + "epoch": 0.7405991077119184, + "grad_norm": 9.891907691955566, + "kl": 0.126953125, + "learning_rate": 2.594008922880816e-07, + "loss": 0.0051, + "reward": 1.4616608619689941, + "reward_std": 0.07319878041744232, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4616607427597046, + "rewards/pad": 0.0, + "step": 2324 + }, + { + "completion_length": 243.015625, + "epoch": 0.7409177820267686, + "grad_norm": 23.877626419067383, + "kl": 0.0849609375, + "learning_rate": 2.5908221797323134e-07, + "loss": 0.0034, + "reward": 1.4740469455718994, + "reward_std": 0.21762168407440186, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3802970051765442, + "step": 2325 + }, + { + "completion_length": 286.09375, + "epoch": 0.7412364563416188, + "grad_norm": 6.268517971038818, + "kl": 0.06640625, + "learning_rate": 2.587635436583811e-07, + "loss": 0.0027, + "reward": 1.504499912261963, + "reward_std": 0.0882675051689148, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3951248824596405, + "rewards/pad": 0.125, + "step": 2326 + }, + { + "completion_length": 194.625, + "epoch": 0.741555130656469, + "grad_norm": 6.7995429039001465, + "kl": 0.10595703125, + "learning_rate": 2.584448693435309e-07, + "loss": 0.0042, + "reward": 1.5988112688064575, + "reward_std": 0.11524415016174316, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4738112986087799, + "rewards/pad": 0.125, + "step": 2327 + }, + { + "completion_length": 111.0625, + "epoch": 0.7418738049713193, + "grad_norm": 18.028059005737305, + "kl": 0.10791015625, + "learning_rate": 2.5812619502868065e-07, + "loss": 0.0043, + "reward": 1.900662899017334, + "reward_std": 0.12899482250213623, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5256630182266235, + "step": 2328 + }, + { + "completion_length": 199.359375, + "epoch": 0.7421924792861695, + "grad_norm": 24.136371612548828, + "kl": 0.0966796875, + "learning_rate": 2.5780752071383046e-07, + "loss": 0.0039, + "reward": 1.6215674877166748, + "reward_std": 0.11621560156345367, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5121925473213196, + "rewards/pad": 0.125, + "step": 2329 + }, + { + "completion_length": 317.078125, + "epoch": 0.7425111536010197, + "grad_norm": 5.354953765869141, + "kl": 0.078125, + "learning_rate": 2.574888463989802e-07, + "loss": 0.0031, + "reward": 1.3678014278411865, + "reward_std": 0.03476231172680855, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3678014278411865, + "step": 2330 + }, + { + "completion_length": 203.90625, + "epoch": 0.7428298279158699, + "grad_norm": 13.329392433166504, + "kl": 0.109375, + "learning_rate": 2.5717017208413e-07, + "loss": 0.0044, + "reward": 1.6113553047180176, + "reward_std": 0.12751217186450958, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5488553047180176, + "step": 2331 + }, + { + "completion_length": 350.703125, + "epoch": 0.7431485022307202, + "grad_norm": 14.267176628112793, + "kl": 0.07666015625, + "learning_rate": 2.568514977692798e-07, + "loss": 0.0031, + "reward": 1.491163969039917, + "reward_std": 0.03700249269604683, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49116405844688416, + "step": 2332 + }, + { + "completion_length": 192.890625, + "epoch": 0.7434671765455704, + "grad_norm": 24.421035766601562, + "kl": 0.0849609375, + "learning_rate": 2.565328234544296e-07, + "loss": 0.0034, + "reward": 1.6897307634353638, + "reward_std": 0.06476552784442902, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43973079323768616, + "step": 2333 + }, + { + "completion_length": 242.234375, + "epoch": 0.7437858508604207, + "grad_norm": 10.342123031616211, + "kl": 0.0830078125, + "learning_rate": 2.5621414913957934e-07, + "loss": 0.0033, + "reward": 1.543015956878662, + "reward_std": 0.09575996547937393, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5430158972740173, + "rewards/pad": 0.0, + "step": 2334 + }, + { + "completion_length": 141.1875, + "epoch": 0.7441045251752709, + "grad_norm": 10.832265853881836, + "kl": 0.1162109375, + "learning_rate": 2.5589547482472914e-07, + "loss": 0.0046, + "reward": 1.7195611000061035, + "reward_std": 0.10429985076189041, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5945611000061035, + "rewards/pad": 0.125, + "step": 2335 + }, + { + "completion_length": 282.234375, + "epoch": 0.7444231994901211, + "grad_norm": 10.534006118774414, + "kl": 0.08984375, + "learning_rate": 2.555768005098789e-07, + "loss": 0.0036, + "reward": 1.4840242862701416, + "reward_std": 0.06869726628065109, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3590242266654968, + "step": 2336 + }, + { + "completion_length": 243.046875, + "epoch": 0.7447418738049714, + "grad_norm": 8.319096565246582, + "kl": 0.0927734375, + "learning_rate": 2.552581261950287e-07, + "loss": 0.0037, + "reward": 1.5507136583328247, + "reward_std": 0.12487111240625381, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5663386583328247, + "step": 2337 + }, + { + "completion_length": 192.734375, + "epoch": 0.7450605481198216, + "grad_norm": 9.370587348937988, + "kl": 0.103515625, + "learning_rate": 2.5493945188017846e-07, + "loss": 0.0041, + "reward": 1.6424996852874756, + "reward_std": 0.09512320160865784, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5174996852874756, + "step": 2338 + }, + { + "completion_length": 214.1875, + "epoch": 0.7453792224346718, + "grad_norm": 13.986674308776855, + "kl": 0.08984375, + "learning_rate": 2.546207775653282e-07, + "loss": 0.0036, + "reward": 1.5424704551696777, + "reward_std": 0.036111533641815186, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5424704551696777, + "rewards/pad": 0.0, + "step": 2339 + }, + { + "completion_length": 142.5, + "epoch": 0.745697896749522, + "grad_norm": 11.817384719848633, + "kl": 0.1171875, + "learning_rate": 2.54302103250478e-07, + "loss": 0.0047, + "reward": 1.5492959022521973, + "reward_std": 0.10030539333820343, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5649209022521973, + "rewards/pad": 0.0, + "step": 2340 + }, + { + "completion_length": 179.484375, + "epoch": 0.7460165710643722, + "grad_norm": 10.856732368469238, + "kl": 0.11376953125, + "learning_rate": 2.5398342893562777e-07, + "loss": 0.0046, + "reward": 1.516695261001587, + "reward_std": 0.08297878503799438, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3916953206062317, + "step": 2341 + }, + { + "completion_length": 218.71875, + "epoch": 0.7463352453792225, + "grad_norm": 23.0338077545166, + "kl": 0.08544921875, + "learning_rate": 2.536647546207776e-07, + "loss": 0.0034, + "reward": 1.5617284774780273, + "reward_std": 0.09402166306972504, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.45235344767570496, + "step": 2342 + }, + { + "completion_length": 149.796875, + "epoch": 0.7466539196940727, + "grad_norm": 11.982497215270996, + "kl": 0.10546875, + "learning_rate": 2.5334608030592733e-07, + "loss": 0.0042, + "reward": 1.6303856372833252, + "reward_std": 0.17340770363807678, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5835106372833252, + "rewards/pad": 0.046875, + "step": 2343 + }, + { + "completion_length": 346.765625, + "epoch": 0.7469725940089229, + "grad_norm": 6.833131313323975, + "kl": 0.05517578125, + "learning_rate": 2.530274059910771e-07, + "loss": 0.0022, + "reward": 1.2404136657714844, + "reward_std": 0.1525970995426178, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.2716636657714844, + "rewards/pad": 0.0, + "step": 2344 + }, + { + "completion_length": 192.640625, + "epoch": 0.7472912683237731, + "grad_norm": 9.509944915771484, + "kl": 0.087890625, + "learning_rate": 2.5270873167622684e-07, + "loss": 0.0035, + "reward": 1.6787506341934204, + "reward_std": 0.057284578680992126, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4287506341934204, + "step": 2345 + }, + { + "completion_length": 268.9375, + "epoch": 0.7476099426386233, + "grad_norm": 33.24864959716797, + "kl": 0.07666015625, + "learning_rate": 2.5239005736137665e-07, + "loss": 0.0031, + "reward": 1.7066154479980469, + "reward_std": 0.11836501955986023, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5972404479980469, + "step": 2346 + }, + { + "completion_length": 223.25, + "epoch": 0.7479286169534736, + "grad_norm": 21.398069381713867, + "kl": 0.080078125, + "learning_rate": 2.520713830465264e-07, + "loss": 0.0032, + "reward": 1.3415656089782715, + "reward_std": 0.06385195255279541, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34156566858291626, + "rewards/pad": 0.0, + "step": 2347 + }, + { + "completion_length": 223.0, + "epoch": 0.7482472912683238, + "grad_norm": 10.627423286437988, + "kl": 0.0859375, + "learning_rate": 2.517527087316762e-07, + "loss": 0.0034, + "reward": 1.5851722955703735, + "reward_std": 0.06106871739029884, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46017220616340637, + "rewards/pad": 0.125, + "step": 2348 + }, + { + "completion_length": 222.15625, + "epoch": 0.748565965583174, + "grad_norm": 94.22887420654297, + "kl": 0.10009765625, + "learning_rate": 2.5143403441682596e-07, + "loss": 0.004, + "reward": 1.6433560848236084, + "reward_std": 0.14962291717529297, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5652311444282532, + "step": 2349 + }, + { + "completion_length": 202.609375, + "epoch": 0.7488846398980242, + "grad_norm": 52.1790657043457, + "kl": 0.08154296875, + "learning_rate": 2.5111536010197577e-07, + "loss": 0.0033, + "reward": 1.7037962675094604, + "reward_std": 0.16860975325107574, + "rewards/answer_reward": 0.140625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.57879638671875, + "step": 2350 + }, + { + "completion_length": 229.140625, + "epoch": 0.7492033142128744, + "grad_norm": 6.283476829528809, + "kl": 0.0908203125, + "learning_rate": 2.507966857871255e-07, + "loss": 0.0036, + "reward": 1.5790940523147583, + "reward_std": 0.14463365077972412, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5634690523147583, + "rewards/pad": 0.015625, + "step": 2351 + }, + { + "completion_length": 183.25, + "epoch": 0.7495219885277247, + "grad_norm": 9.41490650177002, + "kl": 0.1044921875, + "learning_rate": 2.5047801147227533e-07, + "loss": 0.0042, + "reward": 1.7538079023361206, + "reward_std": 0.11042501032352448, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6131829023361206, + "rewards/pad": 0.140625, + "step": 2352 + }, + { + "completion_length": 267.78125, + "epoch": 0.7498406628425749, + "grad_norm": 15.176810264587402, + "kl": 0.07763671875, + "learning_rate": 2.501593371574251e-07, + "loss": 0.0031, + "reward": 1.6882057189941406, + "reward_std": 0.04086899757385254, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5632058382034302, + "step": 2353 + }, + { + "completion_length": 241.53125, + "epoch": 0.7501593371574251, + "grad_norm": 8.03711223602295, + "kl": 0.09326171875, + "learning_rate": 2.498406628425749e-07, + "loss": 0.0037, + "reward": 1.7172772884368896, + "reward_std": 0.13023245334625244, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46727728843688965, + "rewards/pad": 0.25, + "step": 2354 + }, + { + "completion_length": 317.484375, + "epoch": 0.7504780114722753, + "grad_norm": 12.903837203979492, + "kl": 0.06640625, + "learning_rate": 2.4952198852772465e-07, + "loss": 0.0027, + "reward": 1.5770788192749023, + "reward_std": 0.035880204290151596, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5770787000656128, + "step": 2355 + }, + { + "completion_length": 234.71875, + "epoch": 0.7507966857871256, + "grad_norm": 11.72558307647705, + "kl": 0.08740234375, + "learning_rate": 2.4920331421287445e-07, + "loss": 0.0035, + "reward": 1.4853765964508057, + "reward_std": 0.09491764008998871, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5010015964508057, + "step": 2356 + }, + { + "completion_length": 268.03125, + "epoch": 0.7511153601019758, + "grad_norm": 10.013938903808594, + "kl": 0.0751953125, + "learning_rate": 2.488846398980242e-07, + "loss": 0.003, + "reward": 1.596638798713684, + "reward_std": 0.043350279331207275, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5966388583183289, + "rewards/pad": 0.0, + "step": 2357 + }, + { + "completion_length": 199.015625, + "epoch": 0.751434034416826, + "grad_norm": 7.718585968017578, + "kl": 0.115234375, + "learning_rate": 2.48565965583174e-07, + "loss": 0.0046, + "reward": 1.4091236591339111, + "reward_std": 0.07670184969902039, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40912362933158875, + "rewards/pad": 0.0, + "step": 2358 + }, + { + "completion_length": 198.34375, + "epoch": 0.7517527087316762, + "grad_norm": 18.98740577697754, + "kl": 0.09619140625, + "learning_rate": 2.4824729126832377e-07, + "loss": 0.0038, + "reward": 1.7292869091033936, + "reward_std": 0.13945679366588593, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5105368494987488, + "step": 2359 + }, + { + "completion_length": 152.421875, + "epoch": 0.7520713830465264, + "grad_norm": 11.604713439941406, + "kl": 0.10693359375, + "learning_rate": 2.479286169534735e-07, + "loss": 0.0043, + "reward": 1.7420519590377808, + "reward_std": 0.09547455608844757, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49205198884010315, + "rewards/pad": 0.25, + "step": 2360 + }, + { + "completion_length": 300.265625, + "epoch": 0.7523900573613767, + "grad_norm": 10.19469165802002, + "kl": 0.1982421875, + "learning_rate": 2.4760994263862333e-07, + "loss": 0.0079, + "reward": 1.6528478860855103, + "reward_std": 0.06703270226716995, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.527847945690155, + "step": 2361 + }, + { + "completion_length": 177.453125, + "epoch": 0.7527087316762269, + "grad_norm": 11.560866355895996, + "kl": 0.1064453125, + "learning_rate": 2.472912683237731e-07, + "loss": 0.0043, + "reward": 1.546684980392456, + "reward_std": 0.16256418824195862, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.43731003999710083, + "step": 2362 + }, + { + "completion_length": 150.296875, + "epoch": 0.7530274059910771, + "grad_norm": 15.46419620513916, + "kl": 0.10693359375, + "learning_rate": 2.469725940089229e-07, + "loss": 0.0043, + "reward": 1.5246479511260986, + "reward_std": 0.17527268826961517, + "rewards/answer_reward": 0.046875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.47777289152145386, + "step": 2363 + }, + { + "completion_length": 155.6875, + "epoch": 0.7533460803059273, + "grad_norm": 21.624813079833984, + "kl": 0.12353515625, + "learning_rate": 2.4665391969407264e-07, + "loss": 0.0049, + "reward": 1.560457706451416, + "reward_std": 0.14548426866531372, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.576082706451416, + "step": 2364 + }, + { + "completion_length": 146.671875, + "epoch": 0.7536647546207775, + "grad_norm": 13.269795417785645, + "kl": 0.095703125, + "learning_rate": 2.4633524537922245e-07, + "loss": 0.0038, + "reward": 1.6994431018829346, + "reward_std": 0.08348685503005981, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4494430422782898, + "rewards/pad": 0.25, + "step": 2365 + }, + { + "completion_length": 230.625, + "epoch": 0.7539834289356278, + "grad_norm": 27.45595932006836, + "kl": 0.0927734375, + "learning_rate": 2.460165710643722e-07, + "loss": 0.0037, + "reward": 1.59385347366333, + "reward_std": 0.16194438934326172, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4063534140586853, + "step": 2366 + }, + { + "completion_length": 185.765625, + "epoch": 0.754302103250478, + "grad_norm": 3.6742031574249268, + "kl": 0.12158203125, + "learning_rate": 2.45697896749522e-07, + "loss": 0.0049, + "reward": 1.4821596145629883, + "reward_std": 0.05528702214360237, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48215949535369873, + "rewards/pad": 0.0, + "step": 2367 + }, + { + "completion_length": 99.921875, + "epoch": 0.7546207775653282, + "grad_norm": 7.76267671585083, + "kl": 0.12890625, + "learning_rate": 2.4537922243467177e-07, + "loss": 0.0051, + "reward": 1.4437228441238403, + "reward_std": 0.16315695643424988, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45934778451919556, + "rewards/pad": 0.0, + "step": 2368 + }, + { + "completion_length": 257.046875, + "epoch": 0.7549394518801784, + "grad_norm": 9.175533294677734, + "kl": 0.0751953125, + "learning_rate": 2.450605481198215e-07, + "loss": 0.003, + "reward": 1.649479866027832, + "reward_std": 0.1376563012599945, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.524479866027832, + "step": 2369 + }, + { + "completion_length": 251.6875, + "epoch": 0.7552581261950286, + "grad_norm": 15.279226303100586, + "kl": 0.09619140625, + "learning_rate": 2.447418738049713e-07, + "loss": 0.0039, + "reward": 1.6367669105529785, + "reward_std": 0.11726522445678711, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4961419701576233, + "rewards/pad": 0.140625, + "step": 2370 + }, + { + "completion_length": 230.90625, + "epoch": 0.7555768005098789, + "grad_norm": 17.72090721130371, + "kl": 0.0947265625, + "learning_rate": 2.444231994901211e-07, + "loss": 0.0038, + "reward": 1.4760117530822754, + "reward_std": 0.17733457684516907, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5072617530822754, + "step": 2371 + }, + { + "completion_length": 200.296875, + "epoch": 0.7558954748247291, + "grad_norm": 17.369386672973633, + "kl": 0.1064453125, + "learning_rate": 2.4410452517527084e-07, + "loss": 0.0042, + "reward": 1.7264306545257568, + "reward_std": 0.13656508922576904, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5076805949211121, + "rewards/pad": 0.21875, + "step": 2372 + }, + { + "completion_length": 178.359375, + "epoch": 0.7562141491395793, + "grad_norm": 7.309889793395996, + "kl": 0.09619140625, + "learning_rate": 2.4378585086042064e-07, + "loss": 0.0038, + "reward": 1.7076586484909058, + "reward_std": 0.08293125033378601, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.582658588886261, + "step": 2373 + }, + { + "completion_length": 289.359375, + "epoch": 0.7565328234544296, + "grad_norm": 11.607328414916992, + "kl": 0.0966796875, + "learning_rate": 2.434671765455704e-07, + "loss": 0.0039, + "reward": 1.4018770456314087, + "reward_std": 0.1482263058423996, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4331270456314087, + "step": 2374 + }, + { + "completion_length": 208.671875, + "epoch": 0.7568514977692798, + "grad_norm": 16.60245132446289, + "kl": 0.09716796875, + "learning_rate": 2.431485022307202e-07, + "loss": 0.0039, + "reward": 1.4880285263061523, + "reward_std": 0.07213404774665833, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48802849650382996, + "step": 2375 + }, + { + "completion_length": 269.8125, + "epoch": 0.7571701720841301, + "grad_norm": 5.0662760734558105, + "kl": 0.07080078125, + "learning_rate": 2.4282982791586996e-07, + "loss": 0.0028, + "reward": 1.5826705694198608, + "reward_std": 0.08732300996780396, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47329556941986084, + "step": 2376 + }, + { + "completion_length": 290.34375, + "epoch": 0.7574888463989803, + "grad_norm": 14.884427070617676, + "kl": 0.059814453125, + "learning_rate": 2.4251115360101976e-07, + "loss": 0.0024, + "reward": 1.7173967361450195, + "reward_std": 0.10928597301244736, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6080216765403748, + "rewards/pad": 0.109375, + "step": 2377 + }, + { + "completion_length": 288.875, + "epoch": 0.7578075207138305, + "grad_norm": 6.457363128662109, + "kl": 0.0703125, + "learning_rate": 2.421924792861695e-07, + "loss": 0.0028, + "reward": 1.7575962543487549, + "reward_std": 0.07571257650852203, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38259628415107727, + "rewards/pad": 0.375, + "step": 2378 + }, + { + "completion_length": 169.484375, + "epoch": 0.7581261950286807, + "grad_norm": 18.78461265563965, + "kl": 0.11572265625, + "learning_rate": 2.4187380497131927e-07, + "loss": 0.0046, + "reward": 1.405072808265686, + "reward_std": 0.11775682866573334, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3894478380680084, + "rewards/pad": 0.015625, + "step": 2379 + }, + { + "completion_length": 235.234375, + "epoch": 0.758444869343531, + "grad_norm": 11.631933212280273, + "kl": 0.07373046875, + "learning_rate": 2.415551306564691e-07, + "loss": 0.0029, + "reward": 1.5759958028793335, + "reward_std": 0.32423967123031616, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.3259957730770111, + "rewards/pad": 0.296875, + "step": 2380 + }, + { + "completion_length": 190.09375, + "epoch": 0.7587635436583812, + "grad_norm": 10.003499984741211, + "kl": 0.09375, + "learning_rate": 2.4123645634161883e-07, + "loss": 0.0038, + "reward": 1.4936577081680298, + "reward_std": 0.0779888927936554, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.368657648563385, + "rewards/pad": 0.125, + "step": 2381 + }, + { + "completion_length": 207.703125, + "epoch": 0.7590822179732314, + "grad_norm": 8.13753604888916, + "kl": 0.091796875, + "learning_rate": 2.4091778202676864e-07, + "loss": 0.0037, + "reward": 1.6867070198059082, + "reward_std": 0.10507681965827942, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5617070198059082, + "rewards/pad": 0.125, + "step": 2382 + }, + { + "completion_length": 350.234375, + "epoch": 0.7594008922880816, + "grad_norm": 4.736698150634766, + "kl": 0.048583984375, + "learning_rate": 2.405991077119184e-07, + "loss": 0.0019, + "reward": 1.3926687240600586, + "reward_std": 0.046149738132953644, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3926687240600586, + "step": 2383 + }, + { + "completion_length": 226.28125, + "epoch": 0.7597195666029318, + "grad_norm": 15.912691116333008, + "kl": 0.09228515625, + "learning_rate": 2.402804333970682e-07, + "loss": 0.0037, + "reward": 1.5672473907470703, + "reward_std": 0.09506212174892426, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5672474503517151, + "step": 2384 + }, + { + "completion_length": 285.9375, + "epoch": 0.760038240917782, + "grad_norm": 8.352356910705566, + "kl": 0.06396484375, + "learning_rate": 2.3996175908221796e-07, + "loss": 0.0026, + "reward": 1.492800235748291, + "reward_std": 0.1296931505203247, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5084253549575806, + "step": 2385 + }, + { + "completion_length": 207.90625, + "epoch": 0.7603569152326323, + "grad_norm": 11.334542274475098, + "kl": 0.125, + "learning_rate": 2.3964308476736776e-07, + "loss": 0.005, + "reward": 1.5617526769638062, + "reward_std": 0.17248529195785522, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43675270676612854, + "rewards/pad": 0.140625, + "step": 2386 + }, + { + "completion_length": 222.9375, + "epoch": 0.7606755895474825, + "grad_norm": 9.127038955688477, + "kl": 0.08447265625, + "learning_rate": 2.393244104525175e-07, + "loss": 0.0034, + "reward": 1.5596591234207153, + "reward_std": 0.08980844914913177, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5596591830253601, + "rewards/pad": 0.0, + "step": 2387 + }, + { + "completion_length": 310.453125, + "epoch": 0.7609942638623327, + "grad_norm": 8.183820724487305, + "kl": 0.068359375, + "learning_rate": 2.3900573613766727e-07, + "loss": 0.0027, + "reward": 1.5660204887390137, + "reward_std": 0.12294905632734299, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5816453695297241, + "step": 2388 + }, + { + "completion_length": 279.1875, + "epoch": 0.7613129381771829, + "grad_norm": 6.7372846603393555, + "kl": 0.060546875, + "learning_rate": 2.386870618228171e-07, + "loss": 0.0024, + "reward": 1.493821382522583, + "reward_std": 0.11108361184597015, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3844463527202606, + "step": 2389 + }, + { + "completion_length": 153.015625, + "epoch": 0.7616316124920331, + "grad_norm": 8.310419082641602, + "kl": 0.103515625, + "learning_rate": 2.3836838750796683e-07, + "loss": 0.0041, + "reward": 1.6152290105819702, + "reward_std": 0.1089547723531723, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3652289807796478, + "rewards/pad": 0.25, + "step": 2390 + }, + { + "completion_length": 181.953125, + "epoch": 0.7619502868068834, + "grad_norm": 21.235824584960938, + "kl": 0.11181640625, + "learning_rate": 2.380497131931166e-07, + "loss": 0.0045, + "reward": 1.490411400794983, + "reward_std": 0.11310833692550659, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4904113709926605, + "rewards/pad": 0.0, + "step": 2391 + }, + { + "completion_length": 192.09375, + "epoch": 0.7622689611217336, + "grad_norm": 22.150575637817383, + "kl": 0.1005859375, + "learning_rate": 2.377310388782664e-07, + "loss": 0.004, + "reward": 1.3348021507263184, + "reward_std": 0.12773533165454865, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.31917715072631836, + "rewards/pad": 0.015625, + "step": 2392 + }, + { + "completion_length": 90.96875, + "epoch": 0.7625876354365838, + "grad_norm": 10.078266143798828, + "kl": 0.1357421875, + "learning_rate": 2.3741236456341617e-07, + "loss": 0.0055, + "reward": 1.7374215126037598, + "reward_std": 0.09872999042272568, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7374216318130493, + "rewards/pad": 0.0, + "step": 2393 + }, + { + "completion_length": 287.53125, + "epoch": 0.762906309751434, + "grad_norm": 12.642020225524902, + "kl": 0.06982421875, + "learning_rate": 2.3709369024856595e-07, + "loss": 0.0028, + "reward": 1.5701704025268555, + "reward_std": 0.10735349357128143, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47642046213150024, + "step": 2394 + }, + { + "completion_length": 303.734375, + "epoch": 0.7632249840662843, + "grad_norm": 6.236365795135498, + "kl": 0.0712890625, + "learning_rate": 2.3677501593371573e-07, + "loss": 0.0029, + "reward": 1.4030518531799316, + "reward_std": 0.05589941516518593, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4030519127845764, + "rewards/pad": 0.0, + "step": 2395 + }, + { + "completion_length": 208.3125, + "epoch": 0.7635436583811345, + "grad_norm": 20.96642303466797, + "kl": 0.08642578125, + "learning_rate": 2.3645634161886551e-07, + "loss": 0.0035, + "reward": 1.8169206380844116, + "reward_std": 0.13159474730491638, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5200456976890564, + "step": 2396 + }, + { + "completion_length": 337.28125, + "epoch": 0.7638623326959847, + "grad_norm": 6.730218887329102, + "kl": 0.05859375, + "learning_rate": 2.361376673040153e-07, + "loss": 0.0024, + "reward": 1.4287188053131104, + "reward_std": 0.030213970690965652, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4287187457084656, + "step": 2397 + }, + { + "completion_length": 270.734375, + "epoch": 0.7641810070108349, + "grad_norm": 9.62966537475586, + "kl": 0.083984375, + "learning_rate": 2.3581899298916505e-07, + "loss": 0.0034, + "reward": 1.7294020652770996, + "reward_std": 0.14015543460845947, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5731521248817444, + "rewards/pad": 0.15625, + "step": 2398 + }, + { + "completion_length": 317.75, + "epoch": 0.7644996813256851, + "grad_norm": 8.748055458068848, + "kl": 0.06787109375, + "learning_rate": 2.3550031867431483e-07, + "loss": 0.0027, + "reward": 1.4744210243225098, + "reward_std": 0.10222569853067398, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4900459349155426, + "step": 2399 + }, + { + "completion_length": 207.265625, + "epoch": 0.7648183556405354, + "grad_norm": 14.838871955871582, + "kl": 0.0986328125, + "learning_rate": 2.351816443594646e-07, + "loss": 0.0039, + "reward": 1.5980523824691772, + "reward_std": 0.08159129321575165, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6136773824691772, + "rewards/pad": 0.0, + "step": 2400 + }, + { + "completion_length": 195.15625, + "epoch": 0.7651370299553856, + "grad_norm": 10.845717430114746, + "kl": 0.09521484375, + "learning_rate": 2.348629700446144e-07, + "loss": 0.0038, + "reward": 1.7929725646972656, + "reward_std": 0.09547235071659088, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6679726243019104, + "step": 2401 + }, + { + "completion_length": 190.25, + "epoch": 0.7654557042702358, + "grad_norm": 10.455423355102539, + "kl": 0.111328125, + "learning_rate": 2.3454429572976417e-07, + "loss": 0.0045, + "reward": 1.7947559356689453, + "reward_std": 0.17043697834014893, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5447558164596558, + "step": 2402 + }, + { + "completion_length": 149.828125, + "epoch": 0.765774378585086, + "grad_norm": 21.63300895690918, + "kl": 0.11279296875, + "learning_rate": 2.3422562141491395e-07, + "loss": 0.0045, + "reward": 1.5531684160232544, + "reward_std": 0.0856570452451706, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.428168386220932, + "rewards/pad": 0.125, + "step": 2403 + }, + { + "completion_length": 210.390625, + "epoch": 0.7660930528999362, + "grad_norm": 28.16766357421875, + "kl": 0.0830078125, + "learning_rate": 2.3390694710006373e-07, + "loss": 0.0033, + "reward": 1.4170100688934326, + "reward_std": 0.0960836410522461, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4170100688934326, + "rewards/pad": 0.0, + "step": 2404 + }, + { + "completion_length": 276.203125, + "epoch": 0.7664117272147865, + "grad_norm": 7.249959945678711, + "kl": 0.07373046875, + "learning_rate": 2.335882727852135e-07, + "loss": 0.0029, + "reward": 1.4971795082092285, + "reward_std": 0.036122873425483704, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49717944860458374, + "step": 2405 + }, + { + "completion_length": 394.9375, + "epoch": 0.7667304015296367, + "grad_norm": 4.664658069610596, + "kl": 0.04638671875, + "learning_rate": 2.332695984703633e-07, + "loss": 0.0019, + "reward": 1.3320510387420654, + "reward_std": 0.06756319850683212, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3164260983467102, + "step": 2406 + }, + { + "completion_length": 243.921875, + "epoch": 0.7670490758444869, + "grad_norm": 7.838922023773193, + "kl": 0.1376953125, + "learning_rate": 2.3295092415551307e-07, + "loss": 0.0055, + "reward": 1.678990125656128, + "reward_std": 0.13129065930843353, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44461509585380554, + "step": 2407 + }, + { + "completion_length": 270.34375, + "epoch": 0.7673677501593371, + "grad_norm": 7.979779243469238, + "kl": 0.0712890625, + "learning_rate": 2.3263224984066283e-07, + "loss": 0.0028, + "reward": 1.5450541973114014, + "reward_std": 0.11059615761041641, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2950543165206909, + "step": 2408 + }, + { + "completion_length": 220.84375, + "epoch": 0.7676864244741873, + "grad_norm": 8.696356773376465, + "kl": 0.0888671875, + "learning_rate": 2.323135755258126e-07, + "loss": 0.0036, + "reward": 1.5947052240371704, + "reward_std": 0.13390368223190308, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4384552240371704, + "step": 2409 + }, + { + "completion_length": 292.03125, + "epoch": 0.7680050987890376, + "grad_norm": 6.767152309417725, + "kl": 0.076171875, + "learning_rate": 2.319949012109624e-07, + "loss": 0.003, + "reward": 1.6036784648895264, + "reward_std": 0.08758947253227234, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49430355429649353, + "rewards/pad": 0.125, + "step": 2410 + }, + { + "completion_length": 271.140625, + "epoch": 0.7683237731038878, + "grad_norm": 23.512475967407227, + "kl": 0.06982421875, + "learning_rate": 2.3167622689611217e-07, + "loss": 0.0028, + "reward": 1.5783004760742188, + "reward_std": 0.12401663511991501, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46892544627189636, + "rewards/pad": 0.109375, + "step": 2411 + }, + { + "completion_length": 282.5, + "epoch": 0.768642447418738, + "grad_norm": 12.914742469787598, + "kl": 0.07666015625, + "learning_rate": 2.3135755258126195e-07, + "loss": 0.0031, + "reward": 1.5491721630096436, + "reward_std": 0.12556979060173035, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5647971034049988, + "step": 2412 + }, + { + "completion_length": 263.421875, + "epoch": 0.7689611217335883, + "grad_norm": 25.450551986694336, + "kl": 0.08837890625, + "learning_rate": 2.3103887826641173e-07, + "loss": 0.0035, + "reward": 1.4551992416381836, + "reward_std": 0.059495627880096436, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.455199271440506, + "rewards/pad": 0.0, + "step": 2413 + }, + { + "completion_length": 266.90625, + "epoch": 0.7692797960484385, + "grad_norm": 8.406025886535645, + "kl": 0.0830078125, + "learning_rate": 2.307202039515615e-07, + "loss": 0.0033, + "reward": 1.5796451568603516, + "reward_std": 0.12612810730934143, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4702701270580292, + "step": 2414 + }, + { + "completion_length": 261.5, + "epoch": 0.7695984703632888, + "grad_norm": 15.752683639526367, + "kl": 0.10498046875, + "learning_rate": 2.304015296367113e-07, + "loss": 0.0042, + "reward": 1.632777214050293, + "reward_std": 0.18482252955436707, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49215221405029297, + "rewards/pad": 0.140625, + "step": 2415 + }, + { + "completion_length": 271.46875, + "epoch": 0.769917144678139, + "grad_norm": 6.456062316894531, + "kl": 0.0751953125, + "learning_rate": 2.3008285532186105e-07, + "loss": 0.003, + "reward": 1.5452253818511963, + "reward_std": 0.13085828721523285, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4358502924442291, + "step": 2416 + }, + { + "completion_length": 239.734375, + "epoch": 0.7702358189929892, + "grad_norm": 23.77017593383789, + "kl": 0.09619140625, + "learning_rate": 2.2976418100701083e-07, + "loss": 0.0038, + "reward": 1.5104858875274658, + "reward_std": 0.13161501288414001, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5261107683181763, + "step": 2417 + }, + { + "completion_length": 236.46875, + "epoch": 0.7705544933078394, + "grad_norm": 6.151467323303223, + "kl": 0.08984375, + "learning_rate": 2.2944550669216058e-07, + "loss": 0.0036, + "reward": 1.5481237173080444, + "reward_std": 0.18944257497787476, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.45437365770339966, + "step": 2418 + }, + { + "completion_length": 184.203125, + "epoch": 0.7708731676226896, + "grad_norm": 10.371918678283691, + "kl": 0.09130859375, + "learning_rate": 2.2912683237731036e-07, + "loss": 0.0037, + "reward": 1.4940639734268188, + "reward_std": 0.08268411457538605, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4940640330314636, + "step": 2419 + }, + { + "completion_length": 291.65625, + "epoch": 0.7711918419375399, + "grad_norm": 7.256937503814697, + "kl": 0.06982421875, + "learning_rate": 2.2880815806246014e-07, + "loss": 0.0028, + "reward": 1.4115976095199585, + "reward_std": 0.038619644939899445, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4115976095199585, + "rewards/pad": 0.0, + "step": 2420 + }, + { + "completion_length": 273.34375, + "epoch": 0.7715105162523901, + "grad_norm": 63.12538146972656, + "kl": 0.08056640625, + "learning_rate": 2.2848948374760992e-07, + "loss": 0.0032, + "reward": 1.4360949993133545, + "reward_std": 0.03568316251039505, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4360950291156769, + "step": 2421 + }, + { + "completion_length": 340.1875, + "epoch": 0.7718291905672403, + "grad_norm": 8.08768367767334, + "kl": 0.06396484375, + "learning_rate": 2.281708094327597e-07, + "loss": 0.0025, + "reward": 1.5484213829040527, + "reward_std": 0.1030707061290741, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43904638290405273, + "rewards/pad": 0.125, + "step": 2422 + }, + { + "completion_length": 194.046875, + "epoch": 0.7721478648820905, + "grad_norm": 12.11047077178955, + "kl": 0.0869140625, + "learning_rate": 2.2785213511790948e-07, + "loss": 0.0035, + "reward": 1.5583499670028687, + "reward_std": 0.04292614012956619, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3083499073982239, + "step": 2423 + }, + { + "completion_length": 268.515625, + "epoch": 0.7724665391969407, + "grad_norm": 28.141538619995117, + "kl": 0.091796875, + "learning_rate": 2.2753346080305926e-07, + "loss": 0.0037, + "reward": 1.4259402751922607, + "reward_std": 0.15826770663261414, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4259403347969055, + "step": 2424 + }, + { + "completion_length": 207.0625, + "epoch": 0.772785213511791, + "grad_norm": 16.846036911010742, + "kl": 0.10400390625, + "learning_rate": 2.2721478648820904e-07, + "loss": 0.0042, + "reward": 1.5542716979980469, + "reward_std": 0.1610041856765747, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44489675760269165, + "rewards/pad": 0.125, + "step": 2425 + }, + { + "completion_length": 186.0625, + "epoch": 0.7731038878266412, + "grad_norm": 12.483802795410156, + "kl": 0.0888671875, + "learning_rate": 2.2689611217335882e-07, + "loss": 0.0036, + "reward": 1.666634202003479, + "reward_std": 0.050067052245140076, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.666634202003479, + "step": 2426 + }, + { + "completion_length": 200.03125, + "epoch": 0.7734225621414914, + "grad_norm": 11.658013343811035, + "kl": 0.0986328125, + "learning_rate": 2.265774378585086e-07, + "loss": 0.0039, + "reward": 1.7182862758636475, + "reward_std": 0.06905974447727203, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.46828627586364746, + "step": 2427 + }, + { + "completion_length": 233.90625, + "epoch": 0.7737412364563416, + "grad_norm": 7.01774787902832, + "kl": 0.087890625, + "learning_rate": 2.2625876354365836e-07, + "loss": 0.0035, + "reward": 1.5699102878570557, + "reward_std": 0.15561619400978088, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.96875, + "rewards/iou_glue_reward": 0.4761601686477661, + "step": 2428 + }, + { + "completion_length": 175.859375, + "epoch": 0.7740599107711919, + "grad_norm": 25.093128204345703, + "kl": 0.11181640625, + "learning_rate": 2.2594008922880814e-07, + "loss": 0.0045, + "reward": 1.3243741989135742, + "reward_std": 0.13296332955360413, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.33999931812286377, + "step": 2429 + }, + { + "completion_length": 232.96875, + "epoch": 0.7743785850860421, + "grad_norm": 13.453207969665527, + "kl": 0.08544921875, + "learning_rate": 2.2562141491395792e-07, + "loss": 0.0034, + "reward": 1.6193019151687622, + "reward_std": 0.12025083601474762, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.509926974773407, + "step": 2430 + }, + { + "completion_length": 298.03125, + "epoch": 0.7746972594008923, + "grad_norm": 6.668832778930664, + "kl": 0.068359375, + "learning_rate": 2.253027405991077e-07, + "loss": 0.0027, + "reward": 1.5202369689941406, + "reward_std": 0.04842230677604675, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39523690938949585, + "step": 2431 + }, + { + "completion_length": 258.515625, + "epoch": 0.7750159337157425, + "grad_norm": 10.22235107421875, + "kl": 0.078125, + "learning_rate": 2.2498406628425748e-07, + "loss": 0.0031, + "reward": 1.458024501800537, + "reward_std": 0.058063872158527374, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3330245614051819, + "step": 2432 + }, + { + "completion_length": 385.96875, + "epoch": 0.7753346080305927, + "grad_norm": 9.327546119689941, + "kl": 0.056884765625, + "learning_rate": 2.2466539196940726e-07, + "loss": 0.0023, + "reward": 1.412497639656067, + "reward_std": 0.04687918350100517, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4124976098537445, + "step": 2433 + }, + { + "completion_length": 253.546875, + "epoch": 0.775653282345443, + "grad_norm": 9.379130363464355, + "kl": 0.10546875, + "learning_rate": 2.2434671765455704e-07, + "loss": 0.0042, + "reward": 1.610476016998291, + "reward_std": 0.10061073303222656, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6104759573936462, + "step": 2434 + }, + { + "completion_length": 286.59375, + "epoch": 0.7759719566602932, + "grad_norm": 13.452876091003418, + "kl": 0.08544921875, + "learning_rate": 2.2402804333970682e-07, + "loss": 0.0034, + "reward": 1.4705870151519775, + "reward_std": 0.10605093836784363, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4862120449542999, + "step": 2435 + }, + { + "completion_length": 287.171875, + "epoch": 0.7762906309751434, + "grad_norm": 11.824004173278809, + "kl": 0.08251953125, + "learning_rate": 2.237093690248566e-07, + "loss": 0.0033, + "reward": 1.4830718040466309, + "reward_std": 0.08090022951364517, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.37369686365127563, + "rewards/pad": 0.125, + "step": 2436 + }, + { + "completion_length": 307.96875, + "epoch": 0.7766093052899936, + "grad_norm": 12.284720420837402, + "kl": 0.06640625, + "learning_rate": 2.2339069471000636e-07, + "loss": 0.0027, + "reward": 1.5076161623001099, + "reward_std": 0.08636889606714249, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4919911026954651, + "step": 2437 + }, + { + "completion_length": 199.875, + "epoch": 0.7769279796048438, + "grad_norm": 13.545829772949219, + "kl": 0.1328125, + "learning_rate": 2.2307202039515614e-07, + "loss": 0.0053, + "reward": 1.460900068283081, + "reward_std": 0.07126356661319733, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.46090012788772583, + "rewards/pad": 0.0, + "step": 2438 + }, + { + "completion_length": 228.984375, + "epoch": 0.777246653919694, + "grad_norm": 36.11991500854492, + "kl": 0.09033203125, + "learning_rate": 2.2275334608030592e-07, + "loss": 0.0036, + "reward": 1.6342846155166626, + "reward_std": 0.1392236053943634, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5405345559120178, + "rewards/pad": 0.09375, + "step": 2439 + }, + { + "completion_length": 314.921875, + "epoch": 0.7775653282345443, + "grad_norm": 3.917736530303955, + "kl": 0.055419921875, + "learning_rate": 2.224346717654557e-07, + "loss": 0.0022, + "reward": 1.6305112838745117, + "reward_std": 0.09095712006092072, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4898863732814789, + "step": 2440 + }, + { + "completion_length": 244.375, + "epoch": 0.7778840025493945, + "grad_norm": 11.698638916015625, + "kl": 0.09423828125, + "learning_rate": 2.2211599745060548e-07, + "loss": 0.0038, + "reward": 1.5138826370239258, + "reward_std": 0.042464740574359894, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5138826966285706, + "step": 2441 + }, + { + "completion_length": 197.640625, + "epoch": 0.7782026768642447, + "grad_norm": 13.141349792480469, + "kl": 0.11328125, + "learning_rate": 2.2179732313575526e-07, + "loss": 0.0045, + "reward": 1.6041779518127441, + "reward_std": 0.09736239165067673, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6041779518127441, + "rewards/pad": 0.0, + "step": 2442 + }, + { + "completion_length": 254.28125, + "epoch": 0.7785213511790949, + "grad_norm": 3.905799627304077, + "kl": 0.08203125, + "learning_rate": 2.2147864882090504e-07, + "loss": 0.0033, + "reward": 1.56472909450531, + "reward_std": 0.04493452236056328, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43972909450531006, + "step": 2443 + }, + { + "completion_length": 217.59375, + "epoch": 0.7788400254939452, + "grad_norm": 32.241065979003906, + "kl": 0.076171875, + "learning_rate": 2.2115997450605482e-07, + "loss": 0.003, + "reward": 1.9587368965148926, + "reward_std": 0.1502833217382431, + "rewards/pad": 0.4375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5212369561195374, + "step": 2444 + }, + { + "completion_length": 276.984375, + "epoch": 0.7791586998087954, + "grad_norm": 13.728992462158203, + "kl": 0.072265625, + "learning_rate": 2.208413001912046e-07, + "loss": 0.0029, + "reward": 1.7761509418487549, + "reward_std": 0.14614000916481018, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5574009418487549, + "step": 2445 + }, + { + "completion_length": 344.59375, + "epoch": 0.7794773741236456, + "grad_norm": 15.054845809936523, + "kl": 0.06982421875, + "learning_rate": 2.2052262587635438e-07, + "loss": 0.0028, + "reward": 1.475869059562683, + "reward_std": 0.04769786819815636, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4758690595626831, + "step": 2446 + }, + { + "completion_length": 223.125, + "epoch": 0.7797960484384958, + "grad_norm": 7.250488758087158, + "kl": 0.1015625, + "learning_rate": 2.202039515615041e-07, + "loss": 0.0041, + "reward": 1.597805380821228, + "reward_std": 0.18581557273864746, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42593032121658325, + "rewards/pad": 0.171875, + "step": 2447 + }, + { + "completion_length": 325.015625, + "epoch": 0.780114722753346, + "grad_norm": 6.287956237792969, + "kl": 0.08056640625, + "learning_rate": 2.198852772466539e-07, + "loss": 0.0032, + "reward": 1.547116994857788, + "reward_std": 0.037797972559928894, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5471169948577881, + "rewards/pad": 0.0, + "step": 2448 + }, + { + "completion_length": 208.078125, + "epoch": 0.7804333970681963, + "grad_norm": 13.886652946472168, + "kl": 0.10400390625, + "learning_rate": 2.1956660293180367e-07, + "loss": 0.0042, + "reward": 1.5772626399993896, + "reward_std": 0.02991614118218422, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4522625803947449, + "rewards/pad": 0.125, + "step": 2449 + }, + { + "completion_length": 204.296875, + "epoch": 0.7807520713830465, + "grad_norm": 22.697803497314453, + "kl": 0.08251953125, + "learning_rate": 2.1924792861695345e-07, + "loss": 0.0033, + "reward": 1.5583763122558594, + "reward_std": 0.07844780385494232, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43337637186050415, + "rewards/pad": 0.125, + "step": 2450 + }, + { + "completion_length": 147.875, + "epoch": 0.7810707456978967, + "grad_norm": 19.415813446044922, + "kl": 0.1005859375, + "learning_rate": 2.1892925430210323e-07, + "loss": 0.004, + "reward": 1.5877751111984253, + "reward_std": 0.077241450548172, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5877752304077148, + "rewards/pad": 0.0, + "step": 2451 + }, + { + "completion_length": 358.546875, + "epoch": 0.781389420012747, + "grad_norm": 28.12146759033203, + "kl": 0.056884765625, + "learning_rate": 2.18610579987253e-07, + "loss": 0.0023, + "reward": 1.4711517095565796, + "reward_std": 0.0944700762629509, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4867766499519348, + "step": 2452 + }, + { + "completion_length": 204.546875, + "epoch": 0.7817080943275972, + "grad_norm": 22.27834701538086, + "kl": 0.0986328125, + "learning_rate": 2.182919056724028e-07, + "loss": 0.0039, + "reward": 1.2419202327728271, + "reward_std": 0.04451475292444229, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.24192023277282715, + "rewards/pad": 0.0, + "step": 2453 + }, + { + "completion_length": 146.03125, + "epoch": 0.7820267686424475, + "grad_norm": 19.50977325439453, + "kl": 0.1083984375, + "learning_rate": 2.1797323135755257e-07, + "loss": 0.0043, + "reward": 1.8846417665481567, + "reward_std": 0.0772833451628685, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6346417665481567, + "step": 2454 + }, + { + "completion_length": 232.625, + "epoch": 0.7823454429572977, + "grad_norm": 10.139874458312988, + "kl": 0.0751953125, + "learning_rate": 2.1765455704270235e-07, + "loss": 0.003, + "reward": 1.4167118072509766, + "reward_std": 0.07653843611478806, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41671180725097656, + "rewards/pad": 0.0, + "step": 2455 + }, + { + "completion_length": 153.390625, + "epoch": 0.7826641172721479, + "grad_norm": 23.29401969909668, + "kl": 0.109375, + "learning_rate": 2.1733588272785213e-07, + "loss": 0.0044, + "reward": 1.5589635372161865, + "reward_std": 0.1103920266032219, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5589635372161865, + "rewards/pad": 0.0, + "step": 2456 + }, + { + "completion_length": 230.0625, + "epoch": 0.7829827915869981, + "grad_norm": 13.375690460205078, + "kl": 0.0947265625, + "learning_rate": 2.170172084130019e-07, + "loss": 0.0038, + "reward": 1.518528938293457, + "reward_std": 0.048551395535469055, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.518528938293457, + "rewards/pad": 0.0, + "step": 2457 + }, + { + "completion_length": 191.640625, + "epoch": 0.7833014659018483, + "grad_norm": 8.836167335510254, + "kl": 0.11328125, + "learning_rate": 2.1669853409815167e-07, + "loss": 0.0045, + "reward": 1.5738965272903442, + "reward_std": 0.08173016458749771, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5738966464996338, + "rewards/pad": 0.0, + "step": 2458 + }, + { + "completion_length": 151.890625, + "epoch": 0.7836201402166986, + "grad_norm": 15.70499324798584, + "kl": 0.19140625, + "learning_rate": 2.1637985978330145e-07, + "loss": 0.0077, + "reward": 1.6416845321655273, + "reward_std": 0.11252018809318542, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5166846513748169, + "step": 2459 + }, + { + "completion_length": 296.609375, + "epoch": 0.7839388145315488, + "grad_norm": 6.298742294311523, + "kl": 0.06884765625, + "learning_rate": 2.1606118546845123e-07, + "loss": 0.0028, + "reward": 1.72971773147583, + "reward_std": 0.04087499901652336, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6047176718711853, + "step": 2460 + }, + { + "completion_length": 276.859375, + "epoch": 0.784257488846399, + "grad_norm": 6.319355010986328, + "kl": 0.07861328125, + "learning_rate": 2.15742511153601e-07, + "loss": 0.0031, + "reward": 1.771275520324707, + "reward_std": 0.10621865093708038, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5369004607200623, + "step": 2461 + }, + { + "completion_length": 285.140625, + "epoch": 0.7845761631612492, + "grad_norm": 17.279361724853516, + "kl": 0.1044921875, + "learning_rate": 2.154238368387508e-07, + "loss": 0.0042, + "reward": 1.602461576461792, + "reward_std": 0.045225173234939575, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6024615168571472, + "rewards/pad": 0.0, + "step": 2462 + }, + { + "completion_length": 345.984375, + "epoch": 0.7848948374760994, + "grad_norm": 8.349935531616211, + "kl": 0.0634765625, + "learning_rate": 2.1510516252390057e-07, + "loss": 0.0025, + "reward": 1.5142486095428467, + "reward_std": 0.06760507822036743, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5142486691474915, + "rewards/pad": 0.0, + "step": 2463 + }, + { + "completion_length": 381.5625, + "epoch": 0.7852135117909497, + "grad_norm": 13.86041259765625, + "kl": 0.0673828125, + "learning_rate": 2.1478648820905035e-07, + "loss": 0.0027, + "reward": 1.4394135475158691, + "reward_std": 0.03722523897886276, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4394136071205139, + "step": 2464 + }, + { + "completion_length": 258.828125, + "epoch": 0.7855321861057999, + "grad_norm": 6.705070495605469, + "kl": 0.1279296875, + "learning_rate": 2.1446781389420013e-07, + "loss": 0.0051, + "reward": 1.3438005447387695, + "reward_std": 0.12043680995702744, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.37505069375038147, + "rewards/pad": 0.0, + "step": 2465 + }, + { + "completion_length": 194.75, + "epoch": 0.7858508604206501, + "grad_norm": 13.5197172164917, + "kl": 0.0888671875, + "learning_rate": 2.141491395793499e-07, + "loss": 0.0036, + "reward": 1.8589537143707275, + "reward_std": 0.07028871774673462, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6089537739753723, + "rewards/pad": 0.25, + "step": 2466 + }, + { + "completion_length": 124.28125, + "epoch": 0.7861695347355003, + "grad_norm": 10.612565040588379, + "kl": 0.103515625, + "learning_rate": 2.1383046526449967e-07, + "loss": 0.0041, + "reward": 1.6125763654708862, + "reward_std": 0.09464748203754425, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4875763952732086, + "step": 2467 + }, + { + "completion_length": 243.03125, + "epoch": 0.7864882090503506, + "grad_norm": 10.93760871887207, + "kl": 0.09912109375, + "learning_rate": 2.1351179094964945e-07, + "loss": 0.004, + "reward": 1.4043235778808594, + "reward_std": 0.11670240759849548, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.419948548078537, + "step": 2468 + }, + { + "completion_length": 323.078125, + "epoch": 0.7868068833652008, + "grad_norm": 11.92921257019043, + "kl": 0.08642578125, + "learning_rate": 2.1319311663479923e-07, + "loss": 0.0035, + "reward": 1.4428250789642334, + "reward_std": 0.047589220106601715, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4428250789642334, + "step": 2469 + }, + { + "completion_length": 293.140625, + "epoch": 0.787125557680051, + "grad_norm": 5.340349197387695, + "kl": 0.07763671875, + "learning_rate": 2.12874442319949e-07, + "loss": 0.0031, + "reward": 1.5190386772155762, + "reward_std": 0.04808990657329559, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5190386176109314, + "rewards/pad": 0.0, + "step": 2470 + }, + { + "completion_length": 161.796875, + "epoch": 0.7874442319949012, + "grad_norm": 23.361160278320312, + "kl": 0.11279296875, + "learning_rate": 2.125557680050988e-07, + "loss": 0.0045, + "reward": 1.4743843078613281, + "reward_std": 0.09407757222652435, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47438421845436096, + "rewards/pad": 0.0, + "step": 2471 + }, + { + "completion_length": 255.15625, + "epoch": 0.7877629063097514, + "grad_norm": 12.378373146057129, + "kl": 0.08642578125, + "learning_rate": 2.1223709369024857e-07, + "loss": 0.0034, + "reward": 1.305260419845581, + "reward_std": 0.05898979306221008, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.30526041984558105, + "step": 2472 + }, + { + "completion_length": 286.140625, + "epoch": 0.7880815806246017, + "grad_norm": 20.72404670715332, + "kl": 0.0869140625, + "learning_rate": 2.1191841937539835e-07, + "loss": 0.0035, + "reward": 1.698275089263916, + "reward_std": 0.07133931666612625, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.573275089263916, + "step": 2473 + }, + { + "completion_length": 297.484375, + "epoch": 0.7884002549394519, + "grad_norm": 8.229775428771973, + "kl": 0.0654296875, + "learning_rate": 2.1159974506054813e-07, + "loss": 0.0026, + "reward": 1.6661851406097412, + "reward_std": 0.10517589747905731, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5568101406097412, + "step": 2474 + }, + { + "completion_length": 282.421875, + "epoch": 0.7887189292543021, + "grad_norm": 30.26580238342285, + "kl": 0.07275390625, + "learning_rate": 2.112810707456979e-07, + "loss": 0.0029, + "reward": 1.5597797632217407, + "reward_std": 0.10693672299385071, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4972797632217407, + "step": 2475 + }, + { + "completion_length": 307.28125, + "epoch": 0.7890376035691523, + "grad_norm": 183.1418914794922, + "kl": 0.0576171875, + "learning_rate": 2.109623964308477e-07, + "loss": 0.0023, + "reward": 1.5799769163131714, + "reward_std": 0.05586535483598709, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32997697591781616, + "step": 2476 + }, + { + "completion_length": 283.734375, + "epoch": 0.7893562778840025, + "grad_norm": 20.426721572875977, + "kl": 0.09033203125, + "learning_rate": 2.1064372211599742e-07, + "loss": 0.0036, + "reward": 1.4326732158660889, + "reward_std": 0.05556642264127731, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4326731562614441, + "rewards/pad": 0.0, + "step": 2477 + }, + { + "completion_length": 347.671875, + "epoch": 0.7896749521988528, + "grad_norm": 7.112791538238525, + "kl": 0.109375, + "learning_rate": 2.103250478011472e-07, + "loss": 0.0044, + "reward": 1.5783393383026123, + "reward_std": 0.16577112674713135, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.6095892786979675, + "rewards/pad": 0.0, + "step": 2478 + }, + { + "completion_length": 197.96875, + "epoch": 0.789993626513703, + "grad_norm": 7.802004814147949, + "kl": 0.1123046875, + "learning_rate": 2.1000637348629698e-07, + "loss": 0.0045, + "reward": 1.6445430517196655, + "reward_std": 0.06837106496095657, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5195431113243103, + "rewards/pad": 0.125, + "step": 2479 + }, + { + "completion_length": 287.765625, + "epoch": 0.7903123008285532, + "grad_norm": 6.5852742195129395, + "kl": 0.0888671875, + "learning_rate": 2.0968769917144676e-07, + "loss": 0.0036, + "reward": 1.404374122619629, + "reward_std": 0.038719989359378815, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40437403321266174, + "step": 2480 + }, + { + "completion_length": 375.984375, + "epoch": 0.7906309751434034, + "grad_norm": 21.665122985839844, + "kl": 0.07373046875, + "learning_rate": 2.0936902485659654e-07, + "loss": 0.003, + "reward": 1.4499353170394897, + "reward_std": 0.06959246844053268, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44993534684181213, + "step": 2481 + }, + { + "completion_length": 362.21875, + "epoch": 0.7909496494582536, + "grad_norm": 4.866166114807129, + "kl": 0.05615234375, + "learning_rate": 2.0905035054174632e-07, + "loss": 0.0022, + "reward": 1.5529453754425049, + "reward_std": 0.053469765931367874, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5529453754425049, + "step": 2482 + }, + { + "completion_length": 154.125, + "epoch": 0.7912683237731039, + "grad_norm": 9.112666130065918, + "kl": 0.09765625, + "learning_rate": 2.087316762268961e-07, + "loss": 0.0039, + "reward": 1.6775808334350586, + "reward_std": 0.07012620568275452, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5525809526443481, + "rewards/pad": 0.125, + "step": 2483 + }, + { + "completion_length": 243.015625, + "epoch": 0.7915869980879541, + "grad_norm": 9.58671760559082, + "kl": 0.08251953125, + "learning_rate": 2.0841300191204588e-07, + "loss": 0.0033, + "reward": 1.7703495025634766, + "reward_std": 0.10711017996072769, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5359745025634766, + "step": 2484 + }, + { + "completion_length": 379.703125, + "epoch": 0.7919056724028043, + "grad_norm": 8.184744834899902, + "kl": 0.06005859375, + "learning_rate": 2.0809432759719566e-07, + "loss": 0.0024, + "reward": 1.3782320022583008, + "reward_std": 0.030829811468720436, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.25323206186294556, + "step": 2485 + }, + { + "completion_length": 243.953125, + "epoch": 0.7922243467176545, + "grad_norm": 13.434769630432129, + "kl": 0.083984375, + "learning_rate": 2.0777565328234542e-07, + "loss": 0.0034, + "reward": 1.5161212682724, + "reward_std": 0.16218221187591553, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5317463278770447, + "step": 2486 + }, + { + "completion_length": 190.1875, + "epoch": 0.7925430210325047, + "grad_norm": 13.576552391052246, + "kl": 0.091796875, + "learning_rate": 2.074569789674952e-07, + "loss": 0.0037, + "reward": 1.6686426401138306, + "reward_std": 0.041518136858940125, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5436426997184753, + "rewards/pad": 0.125, + "step": 2487 + }, + { + "completion_length": 201.40625, + "epoch": 0.792861695347355, + "grad_norm": 24.43422508239746, + "kl": 0.1103515625, + "learning_rate": 2.0713830465264498e-07, + "loss": 0.0044, + "reward": 1.7355256080627441, + "reward_std": 0.11372970044612885, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.6261506080627441, + "step": 2488 + }, + { + "completion_length": 222.03125, + "epoch": 0.7931803696622052, + "grad_norm": 12.765557289123535, + "kl": 0.09033203125, + "learning_rate": 2.0681963033779476e-07, + "loss": 0.0036, + "reward": 1.6278808116912842, + "reward_std": 0.09283436834812164, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5028807520866394, + "rewards/pad": 0.125, + "step": 2489 + }, + { + "completion_length": 145.171875, + "epoch": 0.7934990439770554, + "grad_norm": 12.70199966430664, + "kl": 0.123046875, + "learning_rate": 2.0650095602294454e-07, + "loss": 0.0049, + "reward": 1.6430946588516235, + "reward_std": 0.0791141539812088, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6430947184562683, + "rewards/pad": 0.0, + "step": 2490 + }, + { + "completion_length": 260.375, + "epoch": 0.7938177182919057, + "grad_norm": 10.663228988647461, + "kl": 0.0927734375, + "learning_rate": 2.0618228170809432e-07, + "loss": 0.0037, + "reward": 1.3959004878997803, + "reward_std": 0.09733359515666962, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3959004580974579, + "step": 2491 + }, + { + "completion_length": 401.734375, + "epoch": 0.794136392606756, + "grad_norm": 24.024709701538086, + "kl": 0.04931640625, + "learning_rate": 2.058636073932441e-07, + "loss": 0.002, + "reward": 1.450697898864746, + "reward_std": 0.026952486485242844, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32569780945777893, + "step": 2492 + }, + { + "completion_length": 221.40625, + "epoch": 0.7944550669216062, + "grad_norm": 9.899370193481445, + "kl": 0.09765625, + "learning_rate": 2.0554493307839388e-07, + "loss": 0.0039, + "reward": 1.6808327436447144, + "reward_std": 0.11763062328100204, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44645780324935913, + "step": 2493 + }, + { + "completion_length": 407.140625, + "epoch": 0.7947737412364564, + "grad_norm": 22.02533721923828, + "kl": 0.052978515625, + "learning_rate": 2.0522625876354366e-07, + "loss": 0.0021, + "reward": 1.3636521100997925, + "reward_std": 0.027319133281707764, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3636520504951477, + "step": 2494 + }, + { + "completion_length": 198.53125, + "epoch": 0.7950924155513066, + "grad_norm": 9.314778327941895, + "kl": 0.08837890625, + "learning_rate": 2.0490758444869344e-07, + "loss": 0.0035, + "reward": 1.634337306022644, + "reward_std": 0.09219536185264587, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3843373954296112, + "rewards/pad": 0.25, + "step": 2495 + }, + { + "completion_length": 110.8125, + "epoch": 0.7954110898661568, + "grad_norm": 7.935081481933594, + "kl": 0.10986328125, + "learning_rate": 2.045889101338432e-07, + "loss": 0.0044, + "reward": 1.6727542877197266, + "reward_std": 0.06368027627468109, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.42275434732437134, + "rewards/pad": 0.25, + "step": 2496 + }, + { + "completion_length": 145.6875, + "epoch": 0.795729764181007, + "grad_norm": 9.061888694763184, + "kl": 0.1103515625, + "learning_rate": 2.0427023581899297e-07, + "loss": 0.0044, + "reward": 1.5904161930084229, + "reward_std": 0.05827527120709419, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5904163122177124, + "rewards/pad": 0.0, + "step": 2497 + }, + { + "completion_length": 256.625, + "epoch": 0.7960484384958573, + "grad_norm": 8.258540153503418, + "kl": 0.083984375, + "learning_rate": 2.0395156150414276e-07, + "loss": 0.0034, + "reward": 1.4634907245635986, + "reward_std": 0.08649997413158417, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4634907841682434, + "rewards/pad": 0.0, + "step": 2498 + }, + { + "completion_length": 207.59375, + "epoch": 0.7963671128107075, + "grad_norm": 10.039529800415039, + "kl": 0.11474609375, + "learning_rate": 2.0363288718929254e-07, + "loss": 0.0046, + "reward": 1.4351905584335327, + "reward_std": 0.08591999113559723, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4195655584335327, + "step": 2499 + }, + { + "completion_length": 341.109375, + "epoch": 0.7966857871255577, + "grad_norm": 7.5028395652771, + "kl": 0.058837890625, + "learning_rate": 2.0331421287444232e-07, + "loss": 0.0024, + "reward": 1.5099382400512695, + "reward_std": 0.0751279965043068, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38493824005126953, + "step": 2500 + }, + { + "completion_length": 395.671875, + "epoch": 0.7970044614404079, + "grad_norm": 7.090115070343018, + "kl": 0.059326171875, + "learning_rate": 2.029955385595921e-07, + "loss": 0.0024, + "reward": 1.5254924297332764, + "reward_std": 0.07720702141523361, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5254924297332764, + "rewards/pad": 0.0, + "step": 2501 + }, + { + "completion_length": 207.59375, + "epoch": 0.7973231357552581, + "grad_norm": 7.278848171234131, + "kl": 0.0849609375, + "learning_rate": 2.0267686424474188e-07, + "loss": 0.0034, + "reward": 1.6780972480773926, + "reward_std": 0.0748632550239563, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4280971586704254, + "rewards/pad": 0.25, + "step": 2502 + }, + { + "completion_length": 302.0, + "epoch": 0.7976418100701084, + "grad_norm": 5.311611175537109, + "kl": 0.07177734375, + "learning_rate": 2.0235818992989166e-07, + "loss": 0.0029, + "reward": 1.7154650688171387, + "reward_std": 0.058912493288517, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5904650688171387, + "step": 2503 + }, + { + "completion_length": 317.140625, + "epoch": 0.7979604843849586, + "grad_norm": 12.959879875183105, + "kl": 0.0615234375, + "learning_rate": 2.0203951561504144e-07, + "loss": 0.0025, + "reward": 1.583716630935669, + "reward_std": 0.09927290678024292, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3493417799472809, + "rewards/pad": 0.25, + "step": 2504 + }, + { + "completion_length": 240.96875, + "epoch": 0.7982791586998088, + "grad_norm": 8.32062816619873, + "kl": 0.07373046875, + "learning_rate": 2.0172084130019122e-07, + "loss": 0.0029, + "reward": 1.3935962915420532, + "reward_std": 0.042469993233680725, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39359626173973083, + "step": 2505 + }, + { + "completion_length": 228.125, + "epoch": 0.798597833014659, + "grad_norm": 22.151350021362305, + "kl": 0.0859375, + "learning_rate": 2.0140216698534097e-07, + "loss": 0.0034, + "reward": 1.4923195838928223, + "reward_std": 0.03624539077281952, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4923197031021118, + "step": 2506 + }, + { + "completion_length": 317.390625, + "epoch": 0.7989165073295093, + "grad_norm": 11.759739875793457, + "kl": 0.0693359375, + "learning_rate": 2.0108349267049073e-07, + "loss": 0.0028, + "reward": 1.5153393745422363, + "reward_std": 0.07655253261327744, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3903394043445587, + "step": 2507 + }, + { + "completion_length": 242.796875, + "epoch": 0.7992351816443595, + "grad_norm": 7.063753128051758, + "kl": 0.09375, + "learning_rate": 2.007648183556405e-07, + "loss": 0.0037, + "reward": 1.5463790893554688, + "reward_std": 0.08136877417564392, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5463790893554688, + "rewards/pad": 0.0, + "step": 2508 + }, + { + "completion_length": 146.03125, + "epoch": 0.7995538559592097, + "grad_norm": 8.412872314453125, + "kl": 0.1064453125, + "learning_rate": 2.004461440407903e-07, + "loss": 0.0043, + "reward": 1.742506742477417, + "reward_std": 0.09771303087472916, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.664381742477417, + "rewards/pad": 0.078125, + "step": 2509 + }, + { + "completion_length": 272.90625, + "epoch": 0.7998725302740599, + "grad_norm": 16.04229736328125, + "kl": 0.080078125, + "learning_rate": 2.0012746972594007e-07, + "loss": 0.0032, + "reward": 1.7979576587677002, + "reward_std": 0.07358251512050629, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5479576587677002, + "step": 2510 + }, + { + "completion_length": 151.4375, + "epoch": 0.8001912045889101, + "grad_norm": 15.322516441345215, + "kl": 0.1298828125, + "learning_rate": 1.9980879541108985e-07, + "loss": 0.0052, + "reward": 1.6307207345962524, + "reward_std": 0.08559076488018036, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6307207345962524, + "step": 2511 + }, + { + "completion_length": 361.640625, + "epoch": 0.8005098789037604, + "grad_norm": 8.897799491882324, + "kl": 0.07470703125, + "learning_rate": 1.9949012109623963e-07, + "loss": 0.003, + "reward": 1.5593959093093872, + "reward_std": 0.05124068260192871, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5593959093093872, + "step": 2512 + }, + { + "completion_length": 280.03125, + "epoch": 0.8008285532186106, + "grad_norm": 10.985091209411621, + "kl": 0.0830078125, + "learning_rate": 1.991714467813894e-07, + "loss": 0.0033, + "reward": 1.8225293159484863, + "reward_std": 0.1210012435913086, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5881543159484863, + "step": 2513 + }, + { + "completion_length": 292.1875, + "epoch": 0.8011472275334608, + "grad_norm": 7.801126956939697, + "kl": 0.087890625, + "learning_rate": 1.988527724665392e-07, + "loss": 0.0035, + "reward": 1.5416383743286133, + "reward_std": 0.05828118696808815, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4166383743286133, + "step": 2514 + }, + { + "completion_length": 350.984375, + "epoch": 0.801465901848311, + "grad_norm": 5.292451858520508, + "kl": 0.06005859375, + "learning_rate": 1.9853409815168897e-07, + "loss": 0.0024, + "reward": 1.5692921876907349, + "reward_std": 0.032198529690504074, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5692921280860901, + "rewards/pad": 0.0, + "step": 2515 + }, + { + "completion_length": 286.109375, + "epoch": 0.8017845761631612, + "grad_norm": 9.00987720489502, + "kl": 0.08251953125, + "learning_rate": 1.9821542383683872e-07, + "loss": 0.0033, + "reward": 1.565173864364624, + "reward_std": 0.07822804898023605, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4557989239692688, + "rewards/pad": 0.109375, + "step": 2516 + }, + { + "completion_length": 305.640625, + "epoch": 0.8021032504780115, + "grad_norm": 8.103169441223145, + "kl": 0.06787109375, + "learning_rate": 1.978967495219885e-07, + "loss": 0.0027, + "reward": 1.5759283304214478, + "reward_std": 0.06781023740768433, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.575928270816803, + "step": 2517 + }, + { + "completion_length": 220.609375, + "epoch": 0.8024219247928617, + "grad_norm": 10.552177429199219, + "kl": 0.095703125, + "learning_rate": 1.9757807520713829e-07, + "loss": 0.0038, + "reward": 1.570098876953125, + "reward_std": 0.09314015507698059, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4450989067554474, + "rewards/pad": 0.125, + "step": 2518 + }, + { + "completion_length": 256.75, + "epoch": 0.8027405991077119, + "grad_norm": 13.762317657470703, + "kl": 0.06982421875, + "learning_rate": 1.9725940089228807e-07, + "loss": 0.0028, + "reward": 1.7126691341400146, + "reward_std": 0.12554529309272766, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4782941937446594, + "rewards/pad": 0.234375, + "step": 2519 + }, + { + "completion_length": 219.484375, + "epoch": 0.8030592734225621, + "grad_norm": 10.376080513000488, + "kl": 0.08544921875, + "learning_rate": 1.9694072657743785e-07, + "loss": 0.0034, + "reward": 1.743795394897461, + "reward_std": 0.07224259525537491, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4937954843044281, + "step": 2520 + }, + { + "completion_length": 316.265625, + "epoch": 0.8033779477374123, + "grad_norm": 7.719282150268555, + "kl": 0.0703125, + "learning_rate": 1.9662205226258763e-07, + "loss": 0.0028, + "reward": 1.6823227405548096, + "reward_std": 0.19630685448646545, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5416978597640991, + "step": 2521 + }, + { + "completion_length": 245.0625, + "epoch": 0.8036966220522626, + "grad_norm": 9.966045379638672, + "kl": 0.08544921875, + "learning_rate": 1.963033779477374e-07, + "loss": 0.0034, + "reward": 1.722424864768982, + "reward_std": 0.04164808616042137, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5974248647689819, + "step": 2522 + }, + { + "completion_length": 404.546875, + "epoch": 0.8040152963671128, + "grad_norm": 9.246283531188965, + "kl": 0.0625, + "learning_rate": 1.959847036328872e-07, + "loss": 0.0025, + "reward": 1.5026819705963135, + "reward_std": 0.13315097987651825, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4089319407939911, + "step": 2523 + }, + { + "completion_length": 249.3125, + "epoch": 0.804333970681963, + "grad_norm": 7.0430169105529785, + "kl": 0.0791015625, + "learning_rate": 1.9566602931803697e-07, + "loss": 0.0032, + "reward": 1.777950406074524, + "reward_std": 0.15799814462661743, + "rewards/pad": 0.34375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43420034646987915, + "step": 2524 + }, + { + "completion_length": 300.484375, + "epoch": 0.8046526449968132, + "grad_norm": 14.292428016662598, + "kl": 0.078125, + "learning_rate": 1.9534735500318675e-07, + "loss": 0.0031, + "reward": 1.4989312887191772, + "reward_std": 0.05096810311079025, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49893131852149963, + "rewards/pad": 0.0, + "step": 2525 + }, + { + "completion_length": 301.5625, + "epoch": 0.8049713193116634, + "grad_norm": 8.920188903808594, + "kl": 0.083984375, + "learning_rate": 1.950286806883365e-07, + "loss": 0.0034, + "reward": 1.684213399887085, + "reward_std": 0.13281765580177307, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5748384594917297, + "rewards/pad": 0.125, + "step": 2526 + }, + { + "completion_length": 248.40625, + "epoch": 0.8052899936265137, + "grad_norm": 21.75054931640625, + "kl": 0.0703125, + "learning_rate": 1.9471000637348628e-07, + "loss": 0.0028, + "reward": 1.7569079399108887, + "reward_std": 0.12462820112705231, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5381580591201782, + "rewards/pad": 0.25, + "step": 2527 + }, + { + "completion_length": 294.828125, + "epoch": 0.8056086679413639, + "grad_norm": 6.906178951263428, + "kl": 0.0615234375, + "learning_rate": 1.9439133205863606e-07, + "loss": 0.0025, + "reward": 1.5684860944747925, + "reward_std": 0.10319699347019196, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4591110646724701, + "step": 2528 + }, + { + "completion_length": 127.1875, + "epoch": 0.8059273422562141, + "grad_norm": 53.47869110107422, + "kl": 0.1376953125, + "learning_rate": 1.9407265774378584e-07, + "loss": 0.0055, + "reward": 1.8714401721954346, + "reward_std": 0.11145314574241638, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.49644017219543457, + "step": 2529 + }, + { + "completion_length": 225.71875, + "epoch": 0.8062460165710643, + "grad_norm": 8.28803539276123, + "kl": 0.09326171875, + "learning_rate": 1.9375398342893563e-07, + "loss": 0.0037, + "reward": 1.4719030857086182, + "reward_std": 0.07403188943862915, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.471902996301651, + "step": 2530 + }, + { + "completion_length": 255.265625, + "epoch": 0.8065646908859146, + "grad_norm": 15.238947868347168, + "kl": 0.08544921875, + "learning_rate": 1.934353091140854e-07, + "loss": 0.0034, + "reward": 1.6258214712142944, + "reward_std": 0.06549122929573059, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6258213520050049, + "step": 2531 + }, + { + "completion_length": 240.4375, + "epoch": 0.8068833652007649, + "grad_norm": 16.703702926635742, + "kl": 0.091796875, + "learning_rate": 1.9311663479923519e-07, + "loss": 0.0037, + "reward": 1.4397114515304565, + "reward_std": 0.13731428980827332, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42408645153045654, + "rewards/pad": 0.03125, + "step": 2532 + }, + { + "completion_length": 342.203125, + "epoch": 0.8072020395156151, + "grad_norm": 9.475166320800781, + "kl": 0.0771484375, + "learning_rate": 1.9279796048438497e-07, + "loss": 0.0031, + "reward": 1.584092617034912, + "reward_std": 0.14641991257667542, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.6153426766395569, + "step": 2533 + }, + { + "completion_length": 293.265625, + "epoch": 0.8075207138304653, + "grad_norm": 10.645809173583984, + "kl": 0.06689453125, + "learning_rate": 1.9247928616953475e-07, + "loss": 0.0027, + "reward": 1.8401165008544922, + "reward_std": 0.0870843380689621, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.715116560459137, + "rewards/pad": 0.125, + "step": 2534 + }, + { + "completion_length": 313.296875, + "epoch": 0.8078393881453155, + "grad_norm": 4.8072428703308105, + "kl": 0.146484375, + "learning_rate": 1.921606118546845e-07, + "loss": 0.0059, + "reward": 1.4932180643081665, + "reward_std": 0.06363866478204727, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3682180643081665, + "step": 2535 + }, + { + "completion_length": 225.609375, + "epoch": 0.8081580624601657, + "grad_norm": 17.09768295288086, + "kl": 0.07958984375, + "learning_rate": 1.9184193753983428e-07, + "loss": 0.0032, + "reward": 1.644900918006897, + "reward_std": 0.14637455344200134, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.41052594780921936, + "step": 2536 + }, + { + "completion_length": 242.953125, + "epoch": 0.808476736775016, + "grad_norm": 85.25773620605469, + "kl": 0.0830078125, + "learning_rate": 1.9152326322498406e-07, + "loss": 0.0033, + "reward": 1.539243459701538, + "reward_std": 0.05695287883281708, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5392435193061829, + "rewards/pad": 0.0, + "step": 2537 + }, + { + "completion_length": 318.828125, + "epoch": 0.8087954110898662, + "grad_norm": 9.385004997253418, + "kl": 0.07421875, + "learning_rate": 1.9120458891013382e-07, + "loss": 0.003, + "reward": 1.5466983318328857, + "reward_std": 0.030538583174347878, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5466983318328857, + "step": 2538 + }, + { + "completion_length": 362.859375, + "epoch": 0.8091140854047164, + "grad_norm": 17.053071975708008, + "kl": 0.08349609375, + "learning_rate": 1.908859145952836e-07, + "loss": 0.0033, + "reward": 1.459040641784668, + "reward_std": 0.04085429757833481, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45904070138931274, + "step": 2539 + }, + { + "completion_length": 199.28125, + "epoch": 0.8094327597195666, + "grad_norm": 11.204050064086914, + "kl": 0.1015625, + "learning_rate": 1.9056724028043338e-07, + "loss": 0.0041, + "reward": 1.6462476253509521, + "reward_std": 0.08865442872047424, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5212478041648865, + "step": 2540 + }, + { + "completion_length": 103.6875, + "epoch": 0.8097514340344169, + "grad_norm": 17.3836612701416, + "kl": 0.134765625, + "learning_rate": 1.9024856596558316e-07, + "loss": 0.0054, + "reward": 1.7457025051116943, + "reward_std": 0.08871598541736603, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6207026243209839, + "rewards/pad": 0.125, + "step": 2541 + }, + { + "completion_length": 219.5, + "epoch": 0.8100701083492671, + "grad_norm": 12.529458999633789, + "kl": 0.10009765625, + "learning_rate": 1.8992989165073294e-07, + "loss": 0.004, + "reward": 1.6205840110778809, + "reward_std": 0.054811395704746246, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49558398127555847, + "rewards/pad": 0.125, + "step": 2542 + }, + { + "completion_length": 153.234375, + "epoch": 0.8103887826641173, + "grad_norm": 21.83299446105957, + "kl": 0.0986328125, + "learning_rate": 1.8961121733588272e-07, + "loss": 0.0039, + "reward": 1.8295905590057373, + "reward_std": 0.14615780115127563, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5952155590057373, + "step": 2543 + }, + { + "completion_length": 434.453125, + "epoch": 0.8107074569789675, + "grad_norm": 13.690393447875977, + "kl": 0.0498046875, + "learning_rate": 1.892925430210325e-07, + "loss": 0.002, + "reward": 1.5930736064910889, + "reward_std": 0.03718571364879608, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4680735766887665, + "step": 2544 + }, + { + "completion_length": 109.1875, + "epoch": 0.8110261312938177, + "grad_norm": 17.369874954223633, + "kl": 0.234375, + "learning_rate": 1.8897386870618225e-07, + "loss": 0.0094, + "reward": 1.766268253326416, + "reward_std": 0.10231577605009079, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.641268253326416, + "rewards/pad": 0.125, + "step": 2545 + }, + { + "completion_length": 246.703125, + "epoch": 0.811344805608668, + "grad_norm": 22.70635986328125, + "kl": 0.09326171875, + "learning_rate": 1.8865519439133203e-07, + "loss": 0.0037, + "reward": 1.5770329236984253, + "reward_std": 0.06733634322881699, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5770329236984253, + "rewards/pad": 0.0, + "step": 2546 + }, + { + "completion_length": 154.921875, + "epoch": 0.8116634799235182, + "grad_norm": 118.38398742675781, + "kl": 0.10986328125, + "learning_rate": 1.8833652007648181e-07, + "loss": 0.0044, + "reward": 1.5974708795547485, + "reward_std": 0.06037134677171707, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5974709391593933, + "rewards/pad": 0.0, + "step": 2547 + }, + { + "completion_length": 427.75, + "epoch": 0.8119821542383684, + "grad_norm": 7.559087753295898, + "kl": 0.049072265625, + "learning_rate": 1.880178457616316e-07, + "loss": 0.002, + "reward": 1.5248229503631592, + "reward_std": 0.12492824345827103, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4154479503631592, + "step": 2548 + }, + { + "completion_length": 266.109375, + "epoch": 0.8123008285532186, + "grad_norm": 14.66143798828125, + "kl": 0.0732421875, + "learning_rate": 1.8769917144678138e-07, + "loss": 0.0029, + "reward": 1.530194640159607, + "reward_std": 0.12328522652387619, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40519458055496216, + "rewards/pad": 0.125, + "step": 2549 + }, + { + "completion_length": 203.40625, + "epoch": 0.8126195028680688, + "grad_norm": 12.825209617614746, + "kl": 0.11279296875, + "learning_rate": 1.8738049713193116e-07, + "loss": 0.0045, + "reward": 1.687673568725586, + "reward_std": 0.14001277089118958, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5782985687255859, + "step": 2550 + }, + { + "completion_length": 341.703125, + "epoch": 0.812938177182919, + "grad_norm": 5.511659622192383, + "kl": 0.056396484375, + "learning_rate": 1.8706182281708094e-07, + "loss": 0.0023, + "reward": 1.509248971939087, + "reward_std": 0.05761463940143585, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5092489719390869, + "step": 2551 + }, + { + "completion_length": 297.109375, + "epoch": 0.8132568514977693, + "grad_norm": 15.541271209716797, + "kl": 0.060791015625, + "learning_rate": 1.8674314850223072e-07, + "loss": 0.0024, + "reward": 1.5715813636779785, + "reward_std": 0.085521399974823, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3372064232826233, + "rewards/pad": 0.25, + "step": 2552 + }, + { + "completion_length": 269.765625, + "epoch": 0.8135755258126195, + "grad_norm": 6.019388198852539, + "kl": 0.08935546875, + "learning_rate": 1.864244741873805e-07, + "loss": 0.0036, + "reward": 1.7263768911361694, + "reward_std": 0.10301776975393295, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6013767719268799, + "step": 2553 + }, + { + "completion_length": 311.90625, + "epoch": 0.8138942001274697, + "grad_norm": 5.30941104888916, + "kl": 0.0849609375, + "learning_rate": 1.8610579987253028e-07, + "loss": 0.0034, + "reward": 1.5381710529327393, + "reward_std": 0.16679885983467102, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.47567105293273926, + "step": 2554 + }, + { + "completion_length": 304.328125, + "epoch": 0.8142128744423199, + "grad_norm": 14.053765296936035, + "kl": 0.068359375, + "learning_rate": 1.8578712555768003e-07, + "loss": 0.0027, + "reward": 1.3786784410476685, + "reward_std": 0.1604098379611969, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.33180350065231323, + "rewards/pad": 0.0625, + "step": 2555 + }, + { + "completion_length": 320.921875, + "epoch": 0.8145315487571702, + "grad_norm": 7.8060479164123535, + "kl": 0.0751953125, + "learning_rate": 1.854684512428298e-07, + "loss": 0.003, + "reward": 1.6688108444213867, + "reward_std": 0.1284911185503006, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4344358742237091, + "rewards/pad": 0.25, + "step": 2556 + }, + { + "completion_length": 310.8125, + "epoch": 0.8148502230720204, + "grad_norm": 9.67390251159668, + "kl": 0.0634765625, + "learning_rate": 1.851497769279796e-07, + "loss": 0.0025, + "reward": 1.7895619869232178, + "reward_std": 0.05394981428980827, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.414561927318573, + "rewards/pad": 0.375, + "step": 2557 + }, + { + "completion_length": 285.828125, + "epoch": 0.8151688973868706, + "grad_norm": 5.931857109069824, + "kl": 0.07763671875, + "learning_rate": 1.8483110261312937e-07, + "loss": 0.0031, + "reward": 1.5367307662963867, + "reward_std": 0.04691781848669052, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5367306470870972, + "step": 2558 + }, + { + "completion_length": 220.203125, + "epoch": 0.8154875717017208, + "grad_norm": 14.692937850952148, + "kl": 0.08642578125, + "learning_rate": 1.8451242829827915e-07, + "loss": 0.0035, + "reward": 1.648263931274414, + "reward_std": 0.07860487699508667, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5232639312744141, + "rewards/pad": 0.125, + "step": 2559 + }, + { + "completion_length": 394.859375, + "epoch": 0.815806246016571, + "grad_norm": 18.749855041503906, + "kl": 0.06982421875, + "learning_rate": 1.8419375398342893e-07, + "loss": 0.0028, + "reward": 1.6806821823120117, + "reward_std": 0.055952079594135284, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5556822419166565, + "step": 2560 + }, + { + "completion_length": 409.359375, + "epoch": 0.8161249203314213, + "grad_norm": 6.769157409667969, + "kl": 0.0556640625, + "learning_rate": 1.8387507966857871e-07, + "loss": 0.0022, + "reward": 1.4579136371612549, + "reward_std": 0.10316675901412964, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.47353869676589966, + "rewards/pad": 0.0, + "step": 2561 + }, + { + "completion_length": 229.09375, + "epoch": 0.8164435946462715, + "grad_norm": 16.578529357910156, + "kl": 0.07421875, + "learning_rate": 1.835564053537285e-07, + "loss": 0.003, + "reward": 1.8228061199188232, + "reward_std": 0.10866206139326096, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46343109011650085, + "step": 2562 + }, + { + "completion_length": 221.390625, + "epoch": 0.8167622689611217, + "grad_norm": 6.562174320220947, + "kl": 0.0908203125, + "learning_rate": 1.8323773103887828e-07, + "loss": 0.0036, + "reward": 1.7422194480895996, + "reward_std": 0.14775794744491577, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5078444480895996, + "rewards/pad": 0.25, + "step": 2563 + }, + { + "completion_length": 348.828125, + "epoch": 0.8170809432759719, + "grad_norm": 11.428489685058594, + "kl": 0.0556640625, + "learning_rate": 1.8291905672402806e-07, + "loss": 0.0022, + "reward": 1.4955813884735107, + "reward_std": 0.0692591667175293, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49558135867118835, + "step": 2564 + }, + { + "completion_length": 271.328125, + "epoch": 0.8173996175908221, + "grad_norm": 10.209710121154785, + "kl": 0.08251953125, + "learning_rate": 1.826003824091778e-07, + "loss": 0.0033, + "reward": 1.4857373237609863, + "reward_std": 0.05850472301244736, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48573732376098633, + "rewards/pad": 0.0, + "step": 2565 + }, + { + "completion_length": 197.640625, + "epoch": 0.8177182919056724, + "grad_norm": 8.506927490234375, + "kl": 0.10400390625, + "learning_rate": 1.822817080943276e-07, + "loss": 0.0042, + "reward": 1.485414743423462, + "reward_std": 0.07199355959892273, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4854147434234619, + "step": 2566 + }, + { + "completion_length": 255.4375, + "epoch": 0.8180369662205226, + "grad_norm": 8.188014030456543, + "kl": 0.0732421875, + "learning_rate": 1.8196303377947737e-07, + "loss": 0.0029, + "reward": 1.5830761194229126, + "reward_std": 0.1581452488899231, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3643261194229126, + "step": 2567 + }, + { + "completion_length": 287.8125, + "epoch": 0.8183556405353728, + "grad_norm": 9.338285446166992, + "kl": 0.07421875, + "learning_rate": 1.8164435946462715e-07, + "loss": 0.003, + "reward": 1.59792160987854, + "reward_std": 0.07224056124687195, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.59792160987854, + "step": 2568 + }, + { + "completion_length": 261.0625, + "epoch": 0.818674314850223, + "grad_norm": 43.08826446533203, + "kl": 0.083984375, + "learning_rate": 1.813256851497769e-07, + "loss": 0.0034, + "reward": 1.6259100437164307, + "reward_std": 0.15336614847183228, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39153510332107544, + "rewards/pad": 0.234375, + "step": 2569 + }, + { + "completion_length": 206.421875, + "epoch": 0.8189929891650733, + "grad_norm": 10.55008602142334, + "kl": 0.0888671875, + "learning_rate": 1.8100701083492669e-07, + "loss": 0.0036, + "reward": 1.8721909523010254, + "reward_std": 0.0830574557185173, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.7471910119056702, + "step": 2570 + }, + { + "completion_length": 211.5, + "epoch": 0.8193116634799236, + "grad_norm": 13.250950813293457, + "kl": 0.07958984375, + "learning_rate": 1.8068833652007647e-07, + "loss": 0.0032, + "reward": 1.6348446607589722, + "reward_std": 0.09269119054079056, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4004696011543274, + "rewards/pad": 0.234375, + "step": 2571 + }, + { + "completion_length": 259.75, + "epoch": 0.8196303377947738, + "grad_norm": 9.012632369995117, + "kl": 0.08837890625, + "learning_rate": 1.8036966220522625e-07, + "loss": 0.0035, + "reward": 1.5878616571426392, + "reward_std": 0.05733507499098778, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4628615975379944, + "rewards/pad": 0.125, + "step": 2572 + }, + { + "completion_length": 174.53125, + "epoch": 0.819949012109624, + "grad_norm": 23.10243034362793, + "kl": 0.08642578125, + "learning_rate": 1.8005098789037603e-07, + "loss": 0.0035, + "reward": 1.8872909545898438, + "reward_std": 0.09623947739601135, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6372909545898438, + "step": 2573 + }, + { + "completion_length": 259.578125, + "epoch": 0.8202676864244742, + "grad_norm": 10.32063102722168, + "kl": 0.1435546875, + "learning_rate": 1.797323135755258e-07, + "loss": 0.0057, + "reward": 1.5099594593048096, + "reward_std": 0.04081812873482704, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5099595785140991, + "step": 2574 + }, + { + "completion_length": 153.90625, + "epoch": 0.8205863607393244, + "grad_norm": 18.978233337402344, + "kl": 0.10009765625, + "learning_rate": 1.7941363926067556e-07, + "loss": 0.004, + "reward": 1.6833765506744385, + "reward_std": 0.15838143229484558, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.44900164008140564, + "step": 2575 + }, + { + "completion_length": 164.03125, + "epoch": 0.8209050350541747, + "grad_norm": 23.3841495513916, + "kl": 0.0869140625, + "learning_rate": 1.7909496494582534e-07, + "loss": 0.0035, + "reward": 1.5571430921554565, + "reward_std": 0.1657448410987854, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5258930325508118, + "rewards/pad": 0.046875, + "step": 2576 + }, + { + "completion_length": 208.15625, + "epoch": 0.8212237093690249, + "grad_norm": 11.401373863220215, + "kl": 0.07861328125, + "learning_rate": 1.7877629063097512e-07, + "loss": 0.0031, + "reward": 1.849107265472412, + "reward_std": 0.07179224491119385, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5991072058677673, + "step": 2577 + }, + { + "completion_length": 207.796875, + "epoch": 0.8215423836838751, + "grad_norm": 6.517545223236084, + "kl": 0.07763671875, + "learning_rate": 1.784576163161249e-07, + "loss": 0.0031, + "reward": 1.6891777515411377, + "reward_std": 0.12289692461490631, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3298027217388153, + "step": 2578 + }, + { + "completion_length": 400.546875, + "epoch": 0.8218610579987253, + "grad_norm": 3.41658353805542, + "kl": 0.052490234375, + "learning_rate": 1.7813894200127468e-07, + "loss": 0.0021, + "reward": 1.3222894668579102, + "reward_std": 0.026268957182765007, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.1972896009683609, + "step": 2579 + }, + { + "completion_length": 336.28125, + "epoch": 0.8221797323135756, + "grad_norm": 6.9868974685668945, + "kl": 0.05615234375, + "learning_rate": 1.7782026768642447e-07, + "loss": 0.0023, + "reward": 1.4629323482513428, + "reward_std": 0.1270006000995636, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47855743765830994, + "step": 2580 + }, + { + "completion_length": 169.578125, + "epoch": 0.8224984066284258, + "grad_norm": 41.07691955566406, + "kl": 0.115234375, + "learning_rate": 1.7750159337157425e-07, + "loss": 0.0046, + "reward": 1.686937689781189, + "reward_std": 0.17813950777053833, + "rewards/answer_reward": 0.21875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.46818774938583374, + "step": 2581 + }, + { + "completion_length": 268.8125, + "epoch": 0.822817080943276, + "grad_norm": 7.311337947845459, + "kl": 0.08544921875, + "learning_rate": 1.7718291905672403e-07, + "loss": 0.0034, + "reward": 1.6053481101989746, + "reward_std": 0.05105286091566086, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6053481698036194, + "step": 2582 + }, + { + "completion_length": 252.90625, + "epoch": 0.8231357552581262, + "grad_norm": 9.382682800292969, + "kl": 0.1025390625, + "learning_rate": 1.768642447418738e-07, + "loss": 0.0041, + "reward": 1.7015126943588257, + "reward_std": 0.14050918817520142, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4827626943588257, + "rewards/pad": 0.234375, + "step": 2583 + }, + { + "completion_length": 216.9375, + "epoch": 0.8234544295729764, + "grad_norm": 9.241425514221191, + "kl": 0.1171875, + "learning_rate": 1.7654557042702356e-07, + "loss": 0.0047, + "reward": 1.474588394165039, + "reward_std": 0.0695609524846077, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47458839416503906, + "rewards/pad": 0.0, + "step": 2584 + }, + { + "completion_length": 348.734375, + "epoch": 0.8237731038878267, + "grad_norm": 7.2932448387146, + "kl": 0.0673828125, + "learning_rate": 1.7622689611217334e-07, + "loss": 0.0027, + "reward": 1.5508739948272705, + "reward_std": 0.0942344218492508, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5664990544319153, + "rewards/pad": 0.0, + "step": 2585 + }, + { + "completion_length": 191.140625, + "epoch": 0.8240917782026769, + "grad_norm": 15.595159530639648, + "kl": 0.08935546875, + "learning_rate": 1.7590822179732312e-07, + "loss": 0.0036, + "reward": 1.4914007186889648, + "reward_std": 0.0594581663608551, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.366400808095932, + "rewards/pad": 0.125, + "step": 2586 + }, + { + "completion_length": 209.671875, + "epoch": 0.8244104525175271, + "grad_norm": 10.024636268615723, + "kl": 0.11279296875, + "learning_rate": 1.755895474824729e-07, + "loss": 0.0045, + "reward": 1.5031325817108154, + "reward_std": 0.09866450726985931, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5187575221061707, + "rewards/pad": 0.0, + "step": 2587 + }, + { + "completion_length": 204.75, + "epoch": 0.8247291268323773, + "grad_norm": 24.551767349243164, + "kl": 0.10595703125, + "learning_rate": 1.7527087316762268e-07, + "loss": 0.0042, + "reward": 1.5804860591888428, + "reward_std": 0.10609886050224304, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5961111187934875, + "step": 2588 + }, + { + "completion_length": 212.84375, + "epoch": 0.8250478011472275, + "grad_norm": 13.566366195678711, + "kl": 0.08935546875, + "learning_rate": 1.7495219885277246e-07, + "loss": 0.0036, + "reward": 1.7298917770385742, + "reward_std": 0.10028212517499924, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6048917174339294, + "rewards/pad": 0.125, + "step": 2589 + }, + { + "completion_length": 300.640625, + "epoch": 0.8253664754620778, + "grad_norm": 10.322278022766113, + "kl": 0.07568359375, + "learning_rate": 1.7463352453792224e-07, + "loss": 0.003, + "reward": 1.5587718486785889, + "reward_std": 0.16429835557937622, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.953125, + "rewards/tracking_iou_reward": 0.48064684867858887, + "step": 2590 + }, + { + "completion_length": 256.109375, + "epoch": 0.825685149776928, + "grad_norm": 7.206241607666016, + "kl": 0.08544921875, + "learning_rate": 1.7431485022307202e-07, + "loss": 0.0034, + "reward": 1.5976800918579102, + "reward_std": 0.09668730199337006, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4883049726486206, + "step": 2591 + }, + { + "completion_length": 214.484375, + "epoch": 0.8260038240917782, + "grad_norm": 10.769760131835938, + "kl": 0.0849609375, + "learning_rate": 1.739961759082218e-07, + "loss": 0.0034, + "reward": 1.6574702262878418, + "reward_std": 0.06978029012680054, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5324702262878418, + "rewards/pad": 0.125, + "step": 2592 + }, + { + "completion_length": 371.171875, + "epoch": 0.8263224984066284, + "grad_norm": 9.234620094299316, + "kl": 0.05908203125, + "learning_rate": 1.7367750159337159e-07, + "loss": 0.0024, + "reward": 1.3417534828186035, + "reward_std": 0.08860547840595245, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.35737860202789307, + "step": 2593 + }, + { + "completion_length": 268.015625, + "epoch": 0.8266411727214786, + "grad_norm": 40.45941925048828, + "kl": 0.0751953125, + "learning_rate": 1.7335882727852134e-07, + "loss": 0.003, + "reward": 1.4371353387832642, + "reward_std": 0.12356918305158615, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.218385249376297, + "step": 2594 + }, + { + "completion_length": 292.671875, + "epoch": 0.8269598470363289, + "grad_norm": 9.114712715148926, + "kl": 0.08056640625, + "learning_rate": 1.7304015296367112e-07, + "loss": 0.0032, + "reward": 1.538050889968872, + "reward_std": 0.057188645005226135, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5380508303642273, + "step": 2595 + }, + { + "completion_length": 225.78125, + "epoch": 0.8272785213511791, + "grad_norm": 13.69244384765625, + "kl": 0.07470703125, + "learning_rate": 1.727214786488209e-07, + "loss": 0.003, + "reward": 1.7152938842773438, + "reward_std": 0.05327215790748596, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46529388427734375, + "step": 2596 + }, + { + "completion_length": 337.6875, + "epoch": 0.8275971956660293, + "grad_norm": 19.979782104492188, + "kl": 0.0703125, + "learning_rate": 1.7240280433397068e-07, + "loss": 0.0028, + "reward": 1.6229982376098633, + "reward_std": 0.11573445796966553, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.38862332701683044, + "rewards/pad": 0.25, + "step": 2597 + }, + { + "completion_length": 288.65625, + "epoch": 0.8279158699808795, + "grad_norm": 10.287363052368164, + "kl": 0.0751953125, + "learning_rate": 1.7208413001912046e-07, + "loss": 0.003, + "reward": 1.3860034942626953, + "reward_std": 0.08018055558204651, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3860034942626953, + "rewards/pad": 0.0, + "step": 2598 + }, + { + "completion_length": 314.4375, + "epoch": 0.8282345442957297, + "grad_norm": 31.766921997070312, + "kl": 0.076171875, + "learning_rate": 1.7176545570427024e-07, + "loss": 0.003, + "reward": 1.3634631633758545, + "reward_std": 0.10952109098434448, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37908822298049927, + "step": 2599 + }, + { + "completion_length": 271.296875, + "epoch": 0.82855321861058, + "grad_norm": 16.726682662963867, + "kl": 0.07080078125, + "learning_rate": 1.7144678138942e-07, + "loss": 0.0028, + "reward": 1.6758776903152466, + "reward_std": 0.07480183988809586, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5508776903152466, + "rewards/pad": 0.125, + "step": 2600 + }, + { + "completion_length": 241.3125, + "epoch": 0.8288718929254302, + "grad_norm": 14.826166152954102, + "kl": 0.1044921875, + "learning_rate": 1.7112810707456978e-07, + "loss": 0.0042, + "reward": 1.5708891153335571, + "reward_std": 0.08528805524110794, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5708891153335571, + "rewards/pad": 0.0, + "step": 2601 + }, + { + "completion_length": 371.375, + "epoch": 0.8291905672402804, + "grad_norm": 22.454978942871094, + "kl": 0.0556640625, + "learning_rate": 1.7080943275971956e-07, + "loss": 0.0022, + "reward": 1.581399917602539, + "reward_std": 0.11798357963562012, + "rewards/pad": 0.0625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5345250368118286, + "step": 2602 + }, + { + "completion_length": 356.171875, + "epoch": 0.8295092415551306, + "grad_norm": 24.175840377807617, + "kl": 0.045654296875, + "learning_rate": 1.7049075844486934e-07, + "loss": 0.0018, + "reward": 1.6505095958709717, + "reward_std": 0.038452018052339554, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4005095958709717, + "rewards/pad": 0.25, + "step": 2603 + }, + { + "completion_length": 161.25, + "epoch": 0.8298279158699808, + "grad_norm": 22.794755935668945, + "kl": 0.11279296875, + "learning_rate": 1.701720841300191e-07, + "loss": 0.0045, + "reward": 1.7162601947784424, + "reward_std": 0.09409304708242416, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5912601947784424, + "step": 2604 + }, + { + "completion_length": 156.90625, + "epoch": 0.8301465901848311, + "grad_norm": 8.154170036315918, + "kl": 0.1123046875, + "learning_rate": 1.6985340981516887e-07, + "loss": 0.0045, + "reward": 1.560814380645752, + "reward_std": 0.049747712910175323, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5608144402503967, + "rewards/pad": 0.0, + "step": 2605 + }, + { + "completion_length": 282.671875, + "epoch": 0.8304652644996813, + "grad_norm": 12.08977222442627, + "kl": 0.07275390625, + "learning_rate": 1.6953473550031865e-07, + "loss": 0.0029, + "reward": 1.6754486560821533, + "reward_std": 0.0713881179690361, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5660737156867981, + "step": 2606 + }, + { + "completion_length": 252.90625, + "epoch": 0.8307839388145315, + "grad_norm": 14.890015602111816, + "kl": 0.07568359375, + "learning_rate": 1.6921606118546843e-07, + "loss": 0.003, + "reward": 1.662458896636963, + "reward_std": 0.07578548789024353, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5374589562416077, + "step": 2607 + }, + { + "completion_length": 208.03125, + "epoch": 0.8311026131293817, + "grad_norm": 9.151792526245117, + "kl": 0.07666015625, + "learning_rate": 1.6889738687061821e-07, + "loss": 0.0031, + "reward": 1.6875064373016357, + "reward_std": 0.1356935203075409, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46875646710395813, + "step": 2608 + }, + { + "completion_length": 246.5, + "epoch": 0.831421287444232, + "grad_norm": 12.123498916625977, + "kl": 0.09619140625, + "learning_rate": 1.68578712555768e-07, + "loss": 0.0039, + "reward": 1.754729986190796, + "reward_std": 0.10019632428884506, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5047299861907959, + "rewards/pad": 0.25, + "step": 2609 + }, + { + "completion_length": 257.328125, + "epoch": 0.8317399617590823, + "grad_norm": 30.473678588867188, + "kl": 0.07568359375, + "learning_rate": 1.6826003824091777e-07, + "loss": 0.003, + "reward": 1.5858802795410156, + "reward_std": 0.1852986067533493, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5077552199363708, + "step": 2610 + }, + { + "completion_length": 400.0, + "epoch": 0.8320586360739325, + "grad_norm": 6.983999252319336, + "kl": 0.064453125, + "learning_rate": 1.6794136392606755e-07, + "loss": 0.0026, + "reward": 1.3887776136398315, + "reward_std": 0.09014880657196045, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.27940258383750916, + "step": 2611 + }, + { + "completion_length": 386.953125, + "epoch": 0.8323773103887827, + "grad_norm": 8.778849601745605, + "kl": 0.049560546875, + "learning_rate": 1.6762268961121734e-07, + "loss": 0.002, + "reward": 1.4016900062561035, + "reward_std": 0.0967012494802475, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4173150956630707, + "rewards/pad": 0.0, + "step": 2612 + }, + { + "completion_length": 115.78125, + "epoch": 0.8326959847036329, + "grad_norm": 12.407329559326172, + "kl": 0.1259765625, + "learning_rate": 1.6730401529636712e-07, + "loss": 0.005, + "reward": 1.786607027053833, + "reward_std": 0.1045801043510437, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.661607027053833, + "step": 2613 + }, + { + "completion_length": 261.046875, + "epoch": 0.8330146590184832, + "grad_norm": 13.980108261108398, + "kl": 0.091796875, + "learning_rate": 1.6698534098151687e-07, + "loss": 0.0037, + "reward": 1.6479519605636597, + "reward_std": 0.04881848394870758, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5229519605636597, + "step": 2614 + }, + { + "completion_length": 305.546875, + "epoch": 0.8333333333333334, + "grad_norm": 5.3638434410095215, + "kl": 0.08837890625, + "learning_rate": 1.6666666666666665e-07, + "loss": 0.0035, + "reward": 1.4231975078582764, + "reward_std": 0.12304175645112991, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4544474482536316, + "step": 2615 + }, + { + "completion_length": 263.234375, + "epoch": 0.8336520076481836, + "grad_norm": 8.397769927978516, + "kl": 0.080078125, + "learning_rate": 1.6634799235181643e-07, + "loss": 0.0032, + "reward": 1.6945033073425293, + "reward_std": 0.057246167212724686, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4445032775402069, + "step": 2616 + }, + { + "completion_length": 420.171875, + "epoch": 0.8339706819630338, + "grad_norm": 12.449578285217285, + "kl": 0.06298828125, + "learning_rate": 1.660293180369662e-07, + "loss": 0.0025, + "reward": 1.3212941884994507, + "reward_std": 0.041054822504520416, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32129421830177307, + "rewards/pad": 0.0, + "step": 2617 + }, + { + "completion_length": 204.796875, + "epoch": 0.834289356277884, + "grad_norm": 19.821781158447266, + "kl": 0.08447265625, + "learning_rate": 1.65710643722116e-07, + "loss": 0.0034, + "reward": 1.7124440670013428, + "reward_std": 0.049534693360328674, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.587444007396698, + "rewards/pad": 0.125, + "step": 2618 + }, + { + "completion_length": 149.09375, + "epoch": 0.8346080305927343, + "grad_norm": 17.232561111450195, + "kl": 0.115234375, + "learning_rate": 1.6539196940726577e-07, + "loss": 0.0046, + "reward": 1.5556838512420654, + "reward_std": 0.0869147777557373, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5556838512420654, + "rewards/pad": 0.0, + "step": 2619 + }, + { + "completion_length": 304.625, + "epoch": 0.8349267049075845, + "grad_norm": 5.9082255363464355, + "kl": 0.06298828125, + "learning_rate": 1.6507329509241555e-07, + "loss": 0.0025, + "reward": 1.6162960529327393, + "reward_std": 0.08056774735450745, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4912959933280945, + "rewards/pad": 0.125, + "step": 2620 + }, + { + "completion_length": 224.484375, + "epoch": 0.8352453792224347, + "grad_norm": 11.629868507385254, + "kl": 0.0849609375, + "learning_rate": 1.6475462077756533e-07, + "loss": 0.0034, + "reward": 1.3327964544296265, + "reward_std": 0.06474357098340988, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3327964246273041, + "rewards/pad": 0.0, + "step": 2621 + }, + { + "completion_length": 165.796875, + "epoch": 0.8355640535372849, + "grad_norm": 139.20846557617188, + "kl": 0.12353515625, + "learning_rate": 1.6443594646271511e-07, + "loss": 0.005, + "reward": 1.564035415649414, + "reward_std": 0.07771393656730652, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4390353560447693, + "rewards/pad": 0.125, + "step": 2622 + }, + { + "completion_length": 273.6875, + "epoch": 0.8358827278521351, + "grad_norm": 10.315620422363281, + "kl": 0.06787109375, + "learning_rate": 1.641172721478649e-07, + "loss": 0.0027, + "reward": 1.5206451416015625, + "reward_std": 0.07278362661600113, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4112701416015625, + "step": 2623 + }, + { + "completion_length": 221.265625, + "epoch": 0.8362014021669854, + "grad_norm": 13.258879661560059, + "kl": 0.09423828125, + "learning_rate": 1.6379859783301465e-07, + "loss": 0.0038, + "reward": 1.6685354709625244, + "reward_std": 0.1319824755191803, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5904104709625244, + "rewards/pad": 0.078125, + "step": 2624 + }, + { + "completion_length": 263.875, + "epoch": 0.8365200764818356, + "grad_norm": 12.78669261932373, + "kl": 0.08251953125, + "learning_rate": 1.6347992351816443e-07, + "loss": 0.0033, + "reward": 1.412626028060913, + "reward_std": 0.04722040146589279, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4126260578632355, + "rewards/pad": 0.0, + "step": 2625 + }, + { + "completion_length": 296.828125, + "epoch": 0.8368387507966858, + "grad_norm": 22.007631301879883, + "kl": 0.0556640625, + "learning_rate": 1.631612492033142e-07, + "loss": 0.0022, + "reward": 1.7572367191314697, + "reward_std": 0.11621341109275818, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5228617191314697, + "rewards/pad": 0.234375, + "step": 2626 + }, + { + "completion_length": 313.0, + "epoch": 0.837157425111536, + "grad_norm": 6.745031356811523, + "kl": 0.080078125, + "learning_rate": 1.62842574888464e-07, + "loss": 0.0032, + "reward": 1.3981493711471558, + "reward_std": 0.10961896181106567, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4293993413448334, + "step": 2627 + }, + { + "completion_length": 347.609375, + "epoch": 0.8374760994263862, + "grad_norm": 9.913939476013184, + "kl": 0.060791015625, + "learning_rate": 1.6252390057361377e-07, + "loss": 0.0024, + "reward": 1.5316874980926514, + "reward_std": 0.11328306794166565, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5473124980926514, + "rewards/pad": 0.0, + "step": 2628 + }, + { + "completion_length": 157.21875, + "epoch": 0.8377947737412365, + "grad_norm": 20.527606964111328, + "kl": 0.08251953125, + "learning_rate": 1.6220522625876355e-07, + "loss": 0.0033, + "reward": 1.8070180416107178, + "reward_std": 0.15403233468532562, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.572642982006073, + "step": 2629 + }, + { + "completion_length": 299.59375, + "epoch": 0.8381134480560867, + "grad_norm": 6.451737880706787, + "kl": 0.06689453125, + "learning_rate": 1.6188655194391333e-07, + "loss": 0.0027, + "reward": 1.6873538494110107, + "reward_std": 0.13507935404777527, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5779789090156555, + "step": 2630 + }, + { + "completion_length": 289.46875, + "epoch": 0.8384321223709369, + "grad_norm": 17.178695678710938, + "kl": 0.07958984375, + "learning_rate": 1.6156787762906309e-07, + "loss": 0.0032, + "reward": 1.43893301486969, + "reward_std": 0.057767391204833984, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43893301486968994, + "step": 2631 + }, + { + "completion_length": 264.78125, + "epoch": 0.8387507966857871, + "grad_norm": 20.748149871826172, + "kl": 0.07080078125, + "learning_rate": 1.6124920331421287e-07, + "loss": 0.0028, + "reward": 1.5110678672790527, + "reward_std": 0.10878480970859528, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3860679566860199, + "step": 2632 + }, + { + "completion_length": 303.09375, + "epoch": 0.8390694710006373, + "grad_norm": 10.335489273071289, + "kl": 0.06689453125, + "learning_rate": 1.6093052899936262e-07, + "loss": 0.0027, + "reward": 1.610609531402588, + "reward_std": 0.061860740184783936, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4856095314025879, + "step": 2633 + }, + { + "completion_length": 207.921875, + "epoch": 0.8393881453154876, + "grad_norm": 13.028353691101074, + "kl": 0.0908203125, + "learning_rate": 1.606118546845124e-07, + "loss": 0.0036, + "reward": 1.662575125694275, + "reward_std": 0.0714992880821228, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5375750064849854, + "rewards/pad": 0.125, + "step": 2634 + }, + { + "completion_length": 286.265625, + "epoch": 0.8397068196303378, + "grad_norm": 20.43387222290039, + "kl": 0.078125, + "learning_rate": 1.6029318036966218e-07, + "loss": 0.0031, + "reward": 1.6184622049331665, + "reward_std": 0.08133666217327118, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5090871453285217, + "step": 2635 + }, + { + "completion_length": 292.96875, + "epoch": 0.840025493945188, + "grad_norm": 9.506821632385254, + "kl": 0.1123046875, + "learning_rate": 1.5997450605481196e-07, + "loss": 0.0045, + "reward": 1.5370888710021973, + "reward_std": 0.1493566632270813, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44333887100219727, + "step": 2636 + }, + { + "completion_length": 311.421875, + "epoch": 0.8403441682600382, + "grad_norm": 15.433694839477539, + "kl": 0.07568359375, + "learning_rate": 1.5965583173996174e-07, + "loss": 0.003, + "reward": 1.522940993309021, + "reward_std": 0.1161201074719429, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.413565993309021, + "rewards/pad": 0.125, + "step": 2637 + }, + { + "completion_length": 209.765625, + "epoch": 0.8406628425748884, + "grad_norm": 9.586259841918945, + "kl": 0.0869140625, + "learning_rate": 1.5933715742511152e-07, + "loss": 0.0035, + "reward": 1.6498695611953735, + "reward_std": 0.14003899693489075, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47799453139305115, + "step": 2638 + }, + { + "completion_length": 244.203125, + "epoch": 0.8409815168897387, + "grad_norm": 29.076448440551758, + "kl": 0.08203125, + "learning_rate": 1.590184831102613e-07, + "loss": 0.0033, + "reward": 1.7706998586654663, + "reward_std": 0.12016107887029648, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6144499182701111, + "rewards/pad": 0.15625, + "step": 2639 + }, + { + "completion_length": 261.046875, + "epoch": 0.8413001912045889, + "grad_norm": 8.538639068603516, + "kl": 0.07373046875, + "learning_rate": 1.5869980879541108e-07, + "loss": 0.003, + "reward": 1.514484167098999, + "reward_std": 0.05714438855648041, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.514484167098999, + "rewards/pad": 0.0, + "step": 2640 + }, + { + "completion_length": 364.96875, + "epoch": 0.8416188655194391, + "grad_norm": 24.829381942749023, + "kl": 0.072265625, + "learning_rate": 1.5838113448056086e-07, + "loss": 0.0029, + "reward": 1.4145584106445312, + "reward_std": 0.06661966443061829, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41455844044685364, + "step": 2641 + }, + { + "completion_length": 212.734375, + "epoch": 0.8419375398342893, + "grad_norm": 9.248557090759277, + "kl": 0.0966796875, + "learning_rate": 1.5806246016571064e-07, + "loss": 0.0039, + "reward": 1.6410948038101196, + "reward_std": 0.07060378789901733, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5160947442054749, + "rewards/pad": 0.125, + "step": 2642 + }, + { + "completion_length": 242.828125, + "epoch": 0.8422562141491395, + "grad_norm": 10.468927383422852, + "kl": 0.09912109375, + "learning_rate": 1.577437858508604e-07, + "loss": 0.004, + "reward": 1.6543145179748535, + "reward_std": 0.044029463082551956, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6543145179748535, + "rewards/pad": 0.0, + "step": 2643 + }, + { + "completion_length": 217.28125, + "epoch": 0.8425748884639898, + "grad_norm": 7.1923065185546875, + "kl": 0.1591796875, + "learning_rate": 1.5742511153601018e-07, + "loss": 0.0064, + "reward": 1.661090612411499, + "reward_std": 0.08890962600708008, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.536090612411499, + "step": 2644 + }, + { + "completion_length": 355.34375, + "epoch": 0.84289356277884, + "grad_norm": 19.381942749023438, + "kl": 0.0751953125, + "learning_rate": 1.5710643722115996e-07, + "loss": 0.003, + "reward": 1.3861980438232422, + "reward_std": 0.1279515027999878, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2768230736255646, + "step": 2645 + }, + { + "completion_length": 299.25, + "epoch": 0.8432122370936902, + "grad_norm": 10.264814376831055, + "kl": 0.0830078125, + "learning_rate": 1.5678776290630974e-07, + "loss": 0.0033, + "reward": 1.570202112197876, + "reward_std": 0.11829538643360138, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.570202112197876, + "step": 2646 + }, + { + "completion_length": 249.5, + "epoch": 0.8435309114085404, + "grad_norm": 11.279900550842285, + "kl": 0.08447265625, + "learning_rate": 1.5646908859145952e-07, + "loss": 0.0034, + "reward": 1.544940710067749, + "reward_std": 0.06372781842947006, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.419940710067749, + "step": 2647 + }, + { + "completion_length": 211.84375, + "epoch": 0.8438495857233907, + "grad_norm": 14.549507141113281, + "kl": 0.08935546875, + "learning_rate": 1.561504142766093e-07, + "loss": 0.0036, + "reward": 1.6167564392089844, + "reward_std": 0.11819630861282349, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4917564392089844, + "rewards/pad": 0.125, + "step": 2648 + }, + { + "completion_length": 214.484375, + "epoch": 0.844168260038241, + "grad_norm": 18.767784118652344, + "kl": 0.107421875, + "learning_rate": 1.5583173996175908e-07, + "loss": 0.0043, + "reward": 1.6396679878234863, + "reward_std": 0.15843240916728973, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.6709179878234863, + "step": 2649 + }, + { + "completion_length": 352.203125, + "epoch": 0.8444869343530912, + "grad_norm": 7.323569297790527, + "kl": 0.07080078125, + "learning_rate": 1.5551306564690886e-07, + "loss": 0.0028, + "reward": 1.368403673171997, + "reward_std": 0.05267880856990814, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3684036433696747, + "rewards/pad": 0.0, + "step": 2650 + }, + { + "completion_length": 248.84375, + "epoch": 0.8448056086679414, + "grad_norm": 14.06733226776123, + "kl": 0.09423828125, + "learning_rate": 1.5519439133205864e-07, + "loss": 0.0038, + "reward": 1.4895764589309692, + "reward_std": 0.19936680793762207, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.9375, + "rewards/tracking_iou_reward": 0.5520764589309692, + "step": 2651 + }, + { + "completion_length": 146.21875, + "epoch": 0.8451242829827916, + "grad_norm": 14.021964073181152, + "kl": 0.1103515625, + "learning_rate": 1.5487571701720842e-07, + "loss": 0.0044, + "reward": 1.5428390502929688, + "reward_std": 0.10221075266599655, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5428390502929688, + "rewards/pad": 0.0, + "step": 2652 + }, + { + "completion_length": 262.390625, + "epoch": 0.8454429572976419, + "grad_norm": 5.666436195373535, + "kl": 0.08203125, + "learning_rate": 1.5455704270235818e-07, + "loss": 0.0033, + "reward": 1.78913414478302, + "reward_std": 0.07610681653022766, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5391342043876648, + "step": 2653 + }, + { + "completion_length": 275.375, + "epoch": 0.8457616316124921, + "grad_norm": 8.450581550598145, + "kl": 0.07861328125, + "learning_rate": 1.5423836838750796e-07, + "loss": 0.0031, + "reward": 1.5056837797164917, + "reward_std": 0.034122534096241, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5056837201118469, + "rewards/pad": 0.0, + "step": 2654 + }, + { + "completion_length": 295.03125, + "epoch": 0.8460803059273423, + "grad_norm": 9.418177604675293, + "kl": 0.0791015625, + "learning_rate": 1.5391969407265774e-07, + "loss": 0.0032, + "reward": 1.5818204879760742, + "reward_std": 0.09731556475162506, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4724454879760742, + "step": 2655 + }, + { + "completion_length": 194.40625, + "epoch": 0.8463989802421925, + "grad_norm": 36.22610092163086, + "kl": 0.099609375, + "learning_rate": 1.5360101975780752e-07, + "loss": 0.004, + "reward": 1.5101938247680664, + "reward_std": 0.09336835891008377, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5101937651634216, + "rewards/pad": 0.0, + "step": 2656 + }, + { + "completion_length": 334.34375, + "epoch": 0.8467176545570427, + "grad_norm": 18.423858642578125, + "kl": 0.07275390625, + "learning_rate": 1.532823454429573e-07, + "loss": 0.0029, + "reward": 1.5258831977844238, + "reward_std": 0.13142189383506775, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5415083169937134, + "step": 2657 + }, + { + "completion_length": 164.921875, + "epoch": 0.847036328871893, + "grad_norm": 29.420068740844727, + "kl": 0.11328125, + "learning_rate": 1.5296367112810708e-07, + "loss": 0.0045, + "reward": 1.5939750671386719, + "reward_std": 0.10682038962841034, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5939749479293823, + "rewards/pad": 0.0, + "step": 2658 + }, + { + "completion_length": 157.109375, + "epoch": 0.8473550031867432, + "grad_norm": 44.709381103515625, + "kl": 0.11376953125, + "learning_rate": 1.5264499681325686e-07, + "loss": 0.0046, + "reward": 1.6389029026031494, + "reward_std": 0.14254002273082733, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.513903021812439, + "rewards/pad": 0.125, + "step": 2659 + }, + { + "completion_length": 393.3125, + "epoch": 0.8476736775015934, + "grad_norm": 3.54472017288208, + "kl": 0.046630859375, + "learning_rate": 1.5232632249840664e-07, + "loss": 0.0019, + "reward": 1.4698128700256348, + "reward_std": 0.04882584884762764, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46981289982795715, + "step": 2660 + }, + { + "completion_length": 279.828125, + "epoch": 0.8479923518164436, + "grad_norm": 6.6133832931518555, + "kl": 0.09765625, + "learning_rate": 1.5200764818355642e-07, + "loss": 0.0039, + "reward": 1.4522119760513306, + "reward_std": 0.15472984313964844, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4834619462490082, + "rewards/pad": 0.0, + "step": 2661 + }, + { + "completion_length": 270.28125, + "epoch": 0.8483110261312938, + "grad_norm": 22.683170318603516, + "kl": 0.06982421875, + "learning_rate": 1.5168897386870618e-07, + "loss": 0.0028, + "reward": 1.789536714553833, + "reward_std": 0.17844700813293457, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4614117741584778, + "rewards/pad": 0.34375, + "step": 2662 + }, + { + "completion_length": 353.015625, + "epoch": 0.848629700446144, + "grad_norm": 4.951906681060791, + "kl": 0.0634765625, + "learning_rate": 1.5137029955385593e-07, + "loss": 0.0025, + "reward": 1.4932067394256592, + "reward_std": 0.08742601424455643, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.383831650018692, + "rewards/pad": 0.125, + "step": 2663 + }, + { + "completion_length": 313.75, + "epoch": 0.8489483747609943, + "grad_norm": 5.683970928192139, + "kl": 0.07275390625, + "learning_rate": 1.510516252390057e-07, + "loss": 0.0029, + "reward": 1.6920580863952637, + "reward_std": 0.13436323404312134, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5826830863952637, + "step": 2664 + }, + { + "completion_length": 276.703125, + "epoch": 0.8492670490758445, + "grad_norm": 7.645228385925293, + "kl": 0.10546875, + "learning_rate": 1.507329509241555e-07, + "loss": 0.0042, + "reward": 1.4928104877471924, + "reward_std": 0.06374173611402512, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4928104877471924, + "rewards/pad": 0.0, + "step": 2665 + }, + { + "completion_length": 308.59375, + "epoch": 0.8495857233906947, + "grad_norm": 7.788649559020996, + "kl": 0.0693359375, + "learning_rate": 1.5041427660930527e-07, + "loss": 0.0028, + "reward": 1.4845178127288818, + "reward_std": 0.09160245954990387, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5001428723335266, + "rewards/pad": 0.0, + "step": 2666 + }, + { + "completion_length": 285.078125, + "epoch": 0.8499043977055449, + "grad_norm": 16.08645248413086, + "kl": 0.0791015625, + "learning_rate": 1.5009560229445505e-07, + "loss": 0.0032, + "reward": 1.4720804691314697, + "reward_std": 0.11547581106424332, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4877054691314697, + "rewards/pad": 0.0, + "step": 2667 + }, + { + "completion_length": 340.0, + "epoch": 0.8502230720203952, + "grad_norm": 7.84180212020874, + "kl": 0.07177734375, + "learning_rate": 1.4977692797960483e-07, + "loss": 0.0029, + "reward": 1.407630443572998, + "reward_std": 0.1025407463312149, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4232555031776428, + "step": 2668 + }, + { + "completion_length": 273.625, + "epoch": 0.8505417463352454, + "grad_norm": 8.022253036499023, + "kl": 0.091796875, + "learning_rate": 1.494582536647546e-07, + "loss": 0.0037, + "reward": 1.5171573162078857, + "reward_std": 0.16217833757400513, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.32965725660324097, + "step": 2669 + }, + { + "completion_length": 305.609375, + "epoch": 0.8508604206500956, + "grad_norm": 11.595643997192383, + "kl": 0.06298828125, + "learning_rate": 1.491395793499044e-07, + "loss": 0.0025, + "reward": 1.793088674545288, + "reward_std": 0.1004258394241333, + "rewards/pad": 0.203125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5899636745452881, + "step": 2670 + }, + { + "completion_length": 367.6875, + "epoch": 0.8511790949649458, + "grad_norm": 7.779849529266357, + "kl": 0.05859375, + "learning_rate": 1.4882090503505417e-07, + "loss": 0.0023, + "reward": 1.4648220539093018, + "reward_std": 0.07983763515949249, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44919702410697937, + "step": 2671 + }, + { + "completion_length": 235.9375, + "epoch": 0.851497769279796, + "grad_norm": 16.491291046142578, + "kl": 0.08251953125, + "learning_rate": 1.4850223072020395e-07, + "loss": 0.0033, + "reward": 1.4662169218063354, + "reward_std": 0.047906674444675446, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4662169814109802, + "step": 2672 + }, + { + "completion_length": 248.765625, + "epoch": 0.8518164435946463, + "grad_norm": 8.279892921447754, + "kl": 0.06884765625, + "learning_rate": 1.481835564053537e-07, + "loss": 0.0028, + "reward": 1.8111933469772339, + "reward_std": 0.07763614505529404, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5611932873725891, + "step": 2673 + }, + { + "completion_length": 332.0, + "epoch": 0.8521351179094965, + "grad_norm": 47.811439514160156, + "kl": 0.07373046875, + "learning_rate": 1.478648820905035e-07, + "loss": 0.003, + "reward": 1.4083024263381958, + "reward_std": 0.03486569970846176, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.408302366733551, + "rewards/pad": 0.0, + "step": 2674 + }, + { + "completion_length": 352.671875, + "epoch": 0.8524537922243467, + "grad_norm": 15.739791870117188, + "kl": 0.048828125, + "learning_rate": 1.4754620777565327e-07, + "loss": 0.0019, + "reward": 1.4655406475067139, + "reward_std": 0.049866218119859695, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34054064750671387, + "step": 2675 + }, + { + "completion_length": 206.34375, + "epoch": 0.8527724665391969, + "grad_norm": 9.068931579589844, + "kl": 0.1337890625, + "learning_rate": 1.4722753346080305e-07, + "loss": 0.0054, + "reward": 1.5625362396240234, + "reward_std": 0.08124029636383057, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43753618001937866, + "rewards/pad": 0.125, + "step": 2676 + }, + { + "completion_length": 277.78125, + "epoch": 0.8530911408540471, + "grad_norm": 26.652000427246094, + "kl": 0.07373046875, + "learning_rate": 1.4690885914595283e-07, + "loss": 0.0029, + "reward": 1.658882975578308, + "reward_std": 0.03863655775785446, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6588829159736633, + "rewards/pad": 0.0, + "step": 2677 + }, + { + "completion_length": 309.40625, + "epoch": 0.8534098151688974, + "grad_norm": 11.035650253295898, + "kl": 0.0703125, + "learning_rate": 1.465901848311026e-07, + "loss": 0.0028, + "reward": 1.4774425029754639, + "reward_std": 0.1272467076778412, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49306756258010864, + "rewards/pad": 0.0, + "step": 2678 + }, + { + "completion_length": 173.75, + "epoch": 0.8537284894837476, + "grad_norm": 8.464993476867676, + "kl": 0.0888671875, + "learning_rate": 1.462715105162524e-07, + "loss": 0.0036, + "reward": 1.6125917434692383, + "reward_std": 0.059951480478048325, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6125918030738831, + "rewards/pad": 0.0, + "step": 2679 + }, + { + "completion_length": 269.875, + "epoch": 0.8540471637985978, + "grad_norm": 5.818058490753174, + "kl": 0.09521484375, + "learning_rate": 1.4595283620140217e-07, + "loss": 0.0038, + "reward": 1.5869768857955933, + "reward_std": 0.1202688217163086, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.47760194540023804, + "step": 2680 + }, + { + "completion_length": 188.140625, + "epoch": 0.854365838113448, + "grad_norm": 15.608235359191895, + "kl": 0.09912109375, + "learning_rate": 1.4563416188655195e-07, + "loss": 0.004, + "reward": 1.6593985557556152, + "reward_std": 0.06854577362537384, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6593986749649048, + "rewards/pad": 0.0, + "step": 2681 + }, + { + "completion_length": 360.71875, + "epoch": 0.8546845124282982, + "grad_norm": 11.706225395202637, + "kl": 0.0673828125, + "learning_rate": 1.4531548757170173e-07, + "loss": 0.0027, + "reward": 1.5080907344818115, + "reward_std": 0.11667753756046295, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39871570467948914, + "rewards/pad": 0.125, + "step": 2682 + }, + { + "completion_length": 194.328125, + "epoch": 0.8550031867431485, + "grad_norm": 16.0618953704834, + "kl": 0.099609375, + "learning_rate": 1.4499681325685149e-07, + "loss": 0.004, + "reward": 1.6319692134857178, + "reward_std": 0.055502306669950485, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6319692730903625, + "rewards/pad": 0.0, + "step": 2683 + }, + { + "completion_length": 188.96875, + "epoch": 0.8553218610579987, + "grad_norm": 38.90488815307617, + "kl": 0.1162109375, + "learning_rate": 1.4467813894200127e-07, + "loss": 0.0047, + "reward": 1.5885021686553955, + "reward_std": 0.07763275504112244, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5885021686553955, + "rewards/pad": 0.0, + "step": 2684 + }, + { + "completion_length": 279.625, + "epoch": 0.8556405353728489, + "grad_norm": 5.486372470855713, + "kl": 0.06982421875, + "learning_rate": 1.4435946462715105e-07, + "loss": 0.0028, + "reward": 1.6683847904205322, + "reward_std": 0.060479871928691864, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5433847904205322, + "step": 2685 + }, + { + "completion_length": 198.859375, + "epoch": 0.8559592096876991, + "grad_norm": 15.562231063842773, + "kl": 0.08837890625, + "learning_rate": 1.4404079031230083e-07, + "loss": 0.0035, + "reward": 1.700699806213379, + "reward_std": 0.08188517391681671, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5756998062133789, + "rewards/pad": 0.125, + "step": 2686 + }, + { + "completion_length": 229.46875, + "epoch": 0.8562778840025494, + "grad_norm": 16.420684814453125, + "kl": 0.07666015625, + "learning_rate": 1.437221159974506e-07, + "loss": 0.0031, + "reward": 1.7302379608154297, + "reward_std": 0.04194648563861847, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48023805022239685, + "step": 2687 + }, + { + "completion_length": 264.03125, + "epoch": 0.8565965583173997, + "grad_norm": 9.722140312194824, + "kl": 0.08447265625, + "learning_rate": 1.434034416826004e-07, + "loss": 0.0034, + "reward": 1.574733853340149, + "reward_std": 0.05277637392282486, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4497338533401489, + "rewards/pad": 0.125, + "step": 2688 + }, + { + "completion_length": 195.390625, + "epoch": 0.8569152326322499, + "grad_norm": 14.187362670898438, + "kl": 0.09326171875, + "learning_rate": 1.4308476736775017e-07, + "loss": 0.0037, + "reward": 1.424652099609375, + "reward_std": 0.07914784550666809, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4246521592140198, + "step": 2689 + }, + { + "completion_length": 247.71875, + "epoch": 0.8572339069471001, + "grad_norm": 5.567068099975586, + "kl": 0.08837890625, + "learning_rate": 1.4276609305289995e-07, + "loss": 0.0035, + "reward": 1.4163810014724731, + "reward_std": 0.08065269887447357, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41638097167015076, + "step": 2690 + }, + { + "completion_length": 315.296875, + "epoch": 0.8575525812619503, + "grad_norm": 11.564667701721191, + "kl": 0.07373046875, + "learning_rate": 1.4244741873804973e-07, + "loss": 0.0029, + "reward": 1.5152671337127686, + "reward_std": 0.10225661844015121, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4683920443058014, + "step": 2691 + }, + { + "completion_length": 254.421875, + "epoch": 0.8578712555768006, + "grad_norm": 21.23450469970703, + "kl": 0.07177734375, + "learning_rate": 1.4212874442319946e-07, + "loss": 0.0029, + "reward": 1.4381251335144043, + "reward_std": 0.04972090572118759, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3131251335144043, + "rewards/pad": 0.125, + "step": 2692 + }, + { + "completion_length": 297.359375, + "epoch": 0.8581899298916508, + "grad_norm": 9.087606430053711, + "kl": 0.0693359375, + "learning_rate": 1.4181007010834924e-07, + "loss": 0.0028, + "reward": 1.5673494338989258, + "reward_std": 0.08237025141716003, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5829744338989258, + "rewards/pad": 0.0, + "step": 2693 + }, + { + "completion_length": 240.21875, + "epoch": 0.858508604206501, + "grad_norm": 15.42308521270752, + "kl": 0.09521484375, + "learning_rate": 1.4149139579349902e-07, + "loss": 0.0038, + "reward": 1.4050732851028442, + "reward_std": 0.132409930229187, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.42069828510284424, + "step": 2694 + }, + { + "completion_length": 306.671875, + "epoch": 0.8588272785213512, + "grad_norm": 13.6598482131958, + "kl": 0.056884765625, + "learning_rate": 1.411727214786488e-07, + "loss": 0.0023, + "reward": 1.866136074066162, + "reward_std": 0.10433478653430939, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4911360442638397, + "step": 2695 + }, + { + "completion_length": 223.0, + "epoch": 0.8591459528362014, + "grad_norm": 11.171992301940918, + "kl": 0.09130859375, + "learning_rate": 1.4085404716379858e-07, + "loss": 0.0037, + "reward": 1.6910674571990967, + "reward_std": 0.1258898377418518, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44106757640838623, + "rewards/pad": 0.25, + "step": 2696 + }, + { + "completion_length": 208.546875, + "epoch": 0.8594646271510517, + "grad_norm": 19.124313354492188, + "kl": 0.1171875, + "learning_rate": 1.4053537284894836e-07, + "loss": 0.0047, + "reward": 1.6321409940719604, + "reward_std": 0.1274343580007553, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5071409940719604, + "rewards/pad": 0.125, + "step": 2697 + }, + { + "completion_length": 247.859375, + "epoch": 0.8597833014659019, + "grad_norm": 9.619743347167969, + "kl": 0.07373046875, + "learning_rate": 1.4021669853409814e-07, + "loss": 0.0029, + "reward": 1.7662237882614136, + "reward_std": 0.11302575469017029, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5318487286567688, + "step": 2698 + }, + { + "completion_length": 196.3125, + "epoch": 0.8601019757807521, + "grad_norm": 12.168063163757324, + "kl": 0.0927734375, + "learning_rate": 1.3989802421924792e-07, + "loss": 0.0037, + "reward": 1.6692471504211426, + "reward_std": 0.08357395231723785, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6692471504211426, + "rewards/pad": 0.0, + "step": 2699 + }, + { + "completion_length": 271.71875, + "epoch": 0.8604206500956023, + "grad_norm": 18.2286376953125, + "kl": 0.07421875, + "learning_rate": 1.395793499043977e-07, + "loss": 0.003, + "reward": 1.4802391529083252, + "reward_std": 0.0642646849155426, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4802391827106476, + "rewards/pad": 0.0, + "step": 2700 + }, + { + "completion_length": 297.078125, + "epoch": 0.8607393244104525, + "grad_norm": 7.670638561248779, + "kl": 0.07666015625, + "learning_rate": 1.3926067558954748e-07, + "loss": 0.0031, + "reward": 1.6246676445007324, + "reward_std": 0.056935422122478485, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49966752529144287, + "step": 2701 + }, + { + "completion_length": 272.28125, + "epoch": 0.8610579987253028, + "grad_norm": 11.62264347076416, + "kl": 0.09375, + "learning_rate": 1.3894200127469724e-07, + "loss": 0.0038, + "reward": 1.3839402198791504, + "reward_std": 0.09366807341575623, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3839401602745056, + "rewards/pad": 0.0, + "step": 2702 + }, + { + "completion_length": 249.40625, + "epoch": 0.861376673040153, + "grad_norm": 8.985459327697754, + "kl": 0.0849609375, + "learning_rate": 1.3862332695984702e-07, + "loss": 0.0034, + "reward": 1.6927094459533691, + "reward_std": 0.08651595562696457, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5677094459533691, + "step": 2703 + }, + { + "completion_length": 152.25, + "epoch": 0.8616953473550032, + "grad_norm": 12.082322120666504, + "kl": 0.11865234375, + "learning_rate": 1.383046526449968e-07, + "loss": 0.0048, + "reward": 1.6535862684249878, + "reward_std": 0.06323709338903427, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6535862684249878, + "rewards/pad": 0.0, + "step": 2704 + }, + { + "completion_length": 253.703125, + "epoch": 0.8620140216698534, + "grad_norm": 9.971209526062012, + "kl": 0.09619140625, + "learning_rate": 1.3798597833014658e-07, + "loss": 0.0039, + "reward": 1.4282680749893188, + "reward_std": 0.13955871760845184, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.44389307498931885, + "rewards/pad": 0.0, + "step": 2705 + }, + { + "completion_length": 150.5625, + "epoch": 0.8623326959847036, + "grad_norm": 20.146631240844727, + "kl": 0.1328125, + "learning_rate": 1.3766730401529636e-07, + "loss": 0.0053, + "reward": 1.5105044841766357, + "reward_std": 0.16675220429897308, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.541754424571991, + "rewards/pad": 0.0, + "step": 2706 + }, + { + "completion_length": 256.875, + "epoch": 0.8626513702995539, + "grad_norm": 6.620942115783691, + "kl": 0.09716796875, + "learning_rate": 1.3734862970044614e-07, + "loss": 0.0039, + "reward": 1.4991254806518555, + "reward_std": 0.05944891646504402, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49912554025650024, + "step": 2707 + }, + { + "completion_length": 217.640625, + "epoch": 0.8629700446144041, + "grad_norm": 10.063702583312988, + "kl": 0.091796875, + "learning_rate": 1.3702995538559592e-07, + "loss": 0.0037, + "reward": 1.7139660120010376, + "reward_std": 0.15648072957992554, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.6045910716056824, + "step": 2708 + }, + { + "completion_length": 310.6875, + "epoch": 0.8632887189292543, + "grad_norm": 8.95224380493164, + "kl": 0.06982421875, + "learning_rate": 1.367112810707457e-07, + "loss": 0.0028, + "reward": 1.5477850437164307, + "reward_std": 0.032181769609451294, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4227849841117859, + "step": 2709 + }, + { + "completion_length": 211.4375, + "epoch": 0.8636073932441045, + "grad_norm": 12.4854736328125, + "kl": 0.091796875, + "learning_rate": 1.3639260675589548e-07, + "loss": 0.0037, + "reward": 1.8467566967010498, + "reward_std": 0.055249616503715515, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4717566967010498, + "step": 2710 + }, + { + "completion_length": 288.5, + "epoch": 0.8639260675589547, + "grad_norm": 16.495664596557617, + "kl": 0.06494140625, + "learning_rate": 1.3607393244104526e-07, + "loss": 0.0026, + "reward": 1.6814583539962769, + "reward_std": 0.08853200823068619, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5720832943916321, + "step": 2711 + }, + { + "completion_length": 163.34375, + "epoch": 0.864244741873805, + "grad_norm": 29.02391242980957, + "kl": 0.1220703125, + "learning_rate": 1.3575525812619501e-07, + "loss": 0.0049, + "reward": 1.8566462993621826, + "reward_std": 0.19339735805988312, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6535212397575378, + "rewards/pad": 0.203125, + "step": 2712 + }, + { + "completion_length": 259.15625, + "epoch": 0.8645634161886552, + "grad_norm": 5.97607421875, + "kl": 0.0703125, + "learning_rate": 1.354365838113448e-07, + "loss": 0.0028, + "reward": 1.30307137966156, + "reward_std": 0.03802892565727234, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.30307137966156006, + "rewards/pad": 0.0, + "step": 2713 + }, + { + "completion_length": 358.9375, + "epoch": 0.8648820905035054, + "grad_norm": 12.710834503173828, + "kl": 0.0673828125, + "learning_rate": 1.3511790949649458e-07, + "loss": 0.0027, + "reward": 1.3558748960494995, + "reward_std": 0.10074446350336075, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3714999556541443, + "rewards/pad": 0.0, + "step": 2714 + }, + { + "completion_length": 211.765625, + "epoch": 0.8652007648183556, + "grad_norm": 10.720243453979492, + "kl": 0.0849609375, + "learning_rate": 1.3479923518164436e-07, + "loss": 0.0034, + "reward": 1.6154134273529053, + "reward_std": 0.05592378228902817, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.61541348695755, + "step": 2715 + }, + { + "completion_length": 263.359375, + "epoch": 0.8655194391332058, + "grad_norm": 10.494246482849121, + "kl": 0.07421875, + "learning_rate": 1.3448056086679414e-07, + "loss": 0.003, + "reward": 1.6904199123382568, + "reward_std": 0.08571196347475052, + "rewards/pad": 0.296875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3935449719429016, + "step": 2716 + }, + { + "completion_length": 412.921875, + "epoch": 0.8658381134480561, + "grad_norm": 6.365828514099121, + "kl": 0.051513671875, + "learning_rate": 1.3416188655194392e-07, + "loss": 0.0021, + "reward": 1.421118974685669, + "reward_std": 0.04486546665430069, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42111900448799133, + "step": 2717 + }, + { + "completion_length": 218.015625, + "epoch": 0.8661567877629063, + "grad_norm": 64.41060638427734, + "kl": 0.09130859375, + "learning_rate": 1.338432122370937e-07, + "loss": 0.0037, + "reward": 1.6570738554000854, + "reward_std": 0.1489730179309845, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5945739150047302, + "rewards/pad": 0.0625, + "step": 2718 + }, + { + "completion_length": 173.34375, + "epoch": 0.8664754620777565, + "grad_norm": 18.154401779174805, + "kl": 0.0966796875, + "learning_rate": 1.3352453792224348e-07, + "loss": 0.0039, + "reward": 1.8273169994354248, + "reward_std": 0.11780412495136261, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5929418802261353, + "step": 2719 + }, + { + "completion_length": 192.921875, + "epoch": 0.8667941363926067, + "grad_norm": 13.782989501953125, + "kl": 0.09130859375, + "learning_rate": 1.3320586360739326e-07, + "loss": 0.0036, + "reward": 1.579949140548706, + "reward_std": 0.125904381275177, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4393240511417389, + "step": 2720 + }, + { + "completion_length": 370.375, + "epoch": 0.8671128107074569, + "grad_norm": 4.068702220916748, + "kl": 0.06298828125, + "learning_rate": 1.3288718929254304e-07, + "loss": 0.0025, + "reward": 1.5640076398849487, + "reward_std": 0.12377434968948364, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.47025763988494873, + "step": 2721 + }, + { + "completion_length": 254.09375, + "epoch": 0.8674314850223072, + "grad_norm": 9.885491371154785, + "kl": 0.0771484375, + "learning_rate": 1.325685149776928e-07, + "loss": 0.0031, + "reward": 1.5922003984451294, + "reward_std": 0.08162754029035568, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4672004282474518, + "rewards/pad": 0.125, + "step": 2722 + }, + { + "completion_length": 368.609375, + "epoch": 0.8677501593371574, + "grad_norm": 9.013411521911621, + "kl": 0.05859375, + "learning_rate": 1.3224984066284255e-07, + "loss": 0.0023, + "reward": 1.5695838928222656, + "reward_std": 0.0959939956665039, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49145886301994324, + "step": 2723 + }, + { + "completion_length": 289.75, + "epoch": 0.8680688336520076, + "grad_norm": 9.853470802307129, + "kl": 0.07177734375, + "learning_rate": 1.3193116634799233e-07, + "loss": 0.0029, + "reward": 1.5496944189071655, + "reward_std": 0.1058008000254631, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.534069299697876, + "rewards/pad": 0.015625, + "step": 2724 + }, + { + "completion_length": 205.625, + "epoch": 0.8683875079668578, + "grad_norm": 15.999686241149902, + "kl": 0.111328125, + "learning_rate": 1.316124920331421e-07, + "loss": 0.0045, + "reward": 1.5291621685028076, + "reward_std": 0.11018738150596619, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5291622281074524, + "step": 2725 + }, + { + "completion_length": 263.171875, + "epoch": 0.868706182281708, + "grad_norm": 19.691360473632812, + "kl": 0.0771484375, + "learning_rate": 1.312938177182919e-07, + "loss": 0.0031, + "reward": 1.544965386390686, + "reward_std": 0.033744215965270996, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.544965386390686, + "rewards/pad": 0.0, + "step": 2726 + }, + { + "completion_length": 162.203125, + "epoch": 0.8690248565965584, + "grad_norm": 15.681041717529297, + "kl": 0.11279296875, + "learning_rate": 1.3097514340344167e-07, + "loss": 0.0045, + "reward": 1.5828311443328857, + "reward_std": 0.11496913433074951, + "rewards/answer_reward": 0.09375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4890812337398529, + "step": 2727 + }, + { + "completion_length": 246.6875, + "epoch": 0.8693435309114086, + "grad_norm": 7.064988613128662, + "kl": 0.09033203125, + "learning_rate": 1.3065646908859145e-07, + "loss": 0.0036, + "reward": 1.592658519744873, + "reward_std": 0.057133033871650696, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.592658519744873, + "step": 2728 + }, + { + "completion_length": 263.90625, + "epoch": 0.8696622052262588, + "grad_norm": 13.716483116149902, + "kl": 0.0751953125, + "learning_rate": 1.3033779477374123e-07, + "loss": 0.003, + "reward": 1.7002770900726318, + "reward_std": 0.09610147774219513, + "rewards/answer_reward": 0.15625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5440271496772766, + "step": 2729 + }, + { + "completion_length": 204.828125, + "epoch": 0.869980879541109, + "grad_norm": 11.942748069763184, + "kl": 0.1171875, + "learning_rate": 1.30019120458891e-07, + "loss": 0.0047, + "reward": 1.5761370658874512, + "reward_std": 0.09762246161699295, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45113712549209595, + "rewards/pad": 0.125, + "step": 2730 + }, + { + "completion_length": 314.28125, + "epoch": 0.8702995538559593, + "grad_norm": 9.208845138549805, + "kl": 0.0751953125, + "learning_rate": 1.297004461440408e-07, + "loss": 0.003, + "reward": 1.6383988857269287, + "reward_std": 0.11062052845954895, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5290238261222839, + "step": 2731 + }, + { + "completion_length": 168.3125, + "epoch": 0.8706182281708095, + "grad_norm": 8.693392753601074, + "kl": 0.13671875, + "learning_rate": 1.2938177182919055e-07, + "loss": 0.0055, + "reward": 1.375337839126587, + "reward_std": 0.10295344889163971, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.39096277952194214, + "rewards/pad": 0.0, + "step": 2732 + }, + { + "completion_length": 306.65625, + "epoch": 0.8709369024856597, + "grad_norm": 19.472307205200195, + "kl": 0.09130859375, + "learning_rate": 1.2906309751434033e-07, + "loss": 0.0036, + "reward": 1.4047869443893433, + "reward_std": 0.04510766640305519, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4047868847846985, + "step": 2733 + }, + { + "completion_length": 227.859375, + "epoch": 0.8712555768005099, + "grad_norm": 7.83612585067749, + "kl": 0.09765625, + "learning_rate": 1.287444231994901e-07, + "loss": 0.0039, + "reward": 1.5649466514587402, + "reward_std": 0.11199548840522766, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5649466514587402, + "step": 2734 + }, + { + "completion_length": 306.8125, + "epoch": 0.8715742511153601, + "grad_norm": 12.435422897338867, + "kl": 0.09716796875, + "learning_rate": 1.284257488846399e-07, + "loss": 0.0039, + "reward": 1.689566969871521, + "reward_std": 0.0790565237402916, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.564566969871521, + "step": 2735 + }, + { + "completion_length": 341.9375, + "epoch": 0.8718929254302104, + "grad_norm": 4.984146595001221, + "kl": 0.07470703125, + "learning_rate": 1.2810707456978967e-07, + "loss": 0.003, + "reward": 1.5292251110076904, + "reward_std": 0.06255487352609634, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5292251110076904, + "step": 2736 + }, + { + "completion_length": 245.65625, + "epoch": 0.8722115997450606, + "grad_norm": 20.56971549987793, + "kl": 0.076171875, + "learning_rate": 1.2778840025493945e-07, + "loss": 0.003, + "reward": 1.6221401691436768, + "reward_std": 0.2308957576751709, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.481515109539032, + "rewards/pad": 0.15625, + "step": 2737 + }, + { + "completion_length": 325.453125, + "epoch": 0.8725302740599108, + "grad_norm": 9.56176471710205, + "kl": 0.0673828125, + "learning_rate": 1.2746972594008923e-07, + "loss": 0.0027, + "reward": 1.4896824359893799, + "reward_std": 0.07883624732494354, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3959324359893799, + "step": 2738 + }, + { + "completion_length": 275.0625, + "epoch": 0.872848948374761, + "grad_norm": 12.742481231689453, + "kl": 0.064453125, + "learning_rate": 1.27151051625239e-07, + "loss": 0.0026, + "reward": 1.5121251344680786, + "reward_std": 0.12479326128959656, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5277501940727234, + "step": 2739 + }, + { + "completion_length": 194.65625, + "epoch": 0.8731676226896112, + "grad_norm": 23.468538284301758, + "kl": 0.11279296875, + "learning_rate": 1.268323773103888e-07, + "loss": 0.0045, + "reward": 1.4817605018615723, + "reward_std": 0.06805586069822311, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48176059126853943, + "step": 2740 + }, + { + "completion_length": 158.875, + "epoch": 0.8734862970044615, + "grad_norm": 14.54665756225586, + "kl": 0.10400390625, + "learning_rate": 1.2651370299553854e-07, + "loss": 0.0042, + "reward": 1.7623220682144165, + "reward_std": 0.11608093976974487, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6216970086097717, + "step": 2741 + }, + { + "completion_length": 313.140625, + "epoch": 0.8738049713193117, + "grad_norm": 16.616018295288086, + "kl": 0.06591796875, + "learning_rate": 1.2619502868068832e-07, + "loss": 0.0026, + "reward": 1.532200813293457, + "reward_std": 0.04246199131011963, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.2822008430957794, + "step": 2742 + }, + { + "completion_length": 213.953125, + "epoch": 0.8741236456341619, + "grad_norm": 12.715255737304688, + "kl": 0.0908203125, + "learning_rate": 1.258763543658381e-07, + "loss": 0.0036, + "reward": 1.5940533876419067, + "reward_std": 0.10762360692024231, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46905332803726196, + "step": 2743 + }, + { + "completion_length": 204.671875, + "epoch": 0.8744423199490121, + "grad_norm": 10.193065643310547, + "kl": 0.083984375, + "learning_rate": 1.2555768005098789e-07, + "loss": 0.0034, + "reward": 1.8008592128753662, + "reward_std": 0.06925635039806366, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6758592128753662, + "rewards/pad": 0.125, + "step": 2744 + }, + { + "completion_length": 206.671875, + "epoch": 0.8747609942638623, + "grad_norm": 12.41603946685791, + "kl": 0.1083984375, + "learning_rate": 1.2523900573613767e-07, + "loss": 0.0043, + "reward": 1.6150926351547241, + "reward_std": 0.11028784513473511, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6150925159454346, + "rewards/pad": 0.0, + "step": 2745 + }, + { + "completion_length": 321.78125, + "epoch": 0.8750796685787126, + "grad_norm": 6.864892482757568, + "kl": 0.087890625, + "learning_rate": 1.2492033142128745e-07, + "loss": 0.0035, + "reward": 1.5100083351135254, + "reward_std": 0.06830344349145889, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5100083351135254, + "step": 2746 + }, + { + "completion_length": 253.78125, + "epoch": 0.8753983428935628, + "grad_norm": 8.223762512207031, + "kl": 0.08984375, + "learning_rate": 1.2460165710643723e-07, + "loss": 0.0036, + "reward": 1.8030331134796143, + "reward_std": 0.23443907499313354, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.5842832326889038, + "rewards/pad": 0.25, + "step": 2747 + }, + { + "completion_length": 240.453125, + "epoch": 0.875717017208413, + "grad_norm": 359.0318298339844, + "kl": 0.0859375, + "learning_rate": 1.24282982791587e-07, + "loss": 0.0034, + "reward": 1.5661985874176025, + "reward_std": 0.05305684357881546, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5661985874176025, + "step": 2748 + }, + { + "completion_length": 374.515625, + "epoch": 0.8760356915232632, + "grad_norm": 33.193660736083984, + "kl": 0.0693359375, + "learning_rate": 1.2396430847673676e-07, + "loss": 0.0028, + "reward": 1.477623701095581, + "reward_std": 0.07416988909244537, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36824867129325867, + "step": 2749 + }, + { + "completion_length": 200.328125, + "epoch": 0.8763543658381134, + "grad_norm": 27.331974029541016, + "kl": 0.0947265625, + "learning_rate": 1.2364563416188654e-07, + "loss": 0.0038, + "reward": 1.3313639163970947, + "reward_std": 0.08686615526676178, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3313639760017395, + "step": 2750 + }, + { + "completion_length": 289.078125, + "epoch": 0.8766730401529637, + "grad_norm": 7.792123317718506, + "kl": 0.064453125, + "learning_rate": 1.2332695984703632e-07, + "loss": 0.0026, + "reward": 1.655777931213379, + "reward_std": 0.13871659338474274, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5307778716087341, + "step": 2751 + }, + { + "completion_length": 150.4375, + "epoch": 0.8769917144678139, + "grad_norm": 33.16511154174805, + "kl": 0.0908203125, + "learning_rate": 1.230082855321861e-07, + "loss": 0.0036, + "reward": 1.5670239925384521, + "reward_std": 0.11330273747444153, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5826489925384521, + "rewards/pad": 0.0, + "step": 2752 + }, + { + "completion_length": 214.890625, + "epoch": 0.8773103887826641, + "grad_norm": 13.900517463684082, + "kl": 0.0859375, + "learning_rate": 1.2268961121733588e-07, + "loss": 0.0034, + "reward": 1.5166895389556885, + "reward_std": 0.08768896758556366, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.39168962836265564, + "step": 2753 + }, + { + "completion_length": 251.96875, + "epoch": 0.8776290630975143, + "grad_norm": 7.415778636932373, + "kl": 0.07421875, + "learning_rate": 1.2237093690248564e-07, + "loss": 0.003, + "reward": 1.5468295812606812, + "reward_std": 0.10374996066093445, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45307961106300354, + "step": 2754 + }, + { + "completion_length": 208.3125, + "epoch": 0.8779477374123645, + "grad_norm": 14.48386287689209, + "kl": 0.08740234375, + "learning_rate": 1.2205226258763542e-07, + "loss": 0.0035, + "reward": 1.5509672164916992, + "reward_std": 0.06657718122005463, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5509671568870544, + "rewards/pad": 0.0, + "step": 2755 + }, + { + "completion_length": 146.421875, + "epoch": 0.8782664117272148, + "grad_norm": 11.69698715209961, + "kl": 0.107421875, + "learning_rate": 1.217335882727852e-07, + "loss": 0.0043, + "reward": 1.556875467300415, + "reward_std": 0.057311661541461945, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5568753480911255, + "rewards/pad": 0.0, + "step": 2756 + }, + { + "completion_length": 347.984375, + "epoch": 0.878585086042065, + "grad_norm": 8.806641578674316, + "kl": 0.06494140625, + "learning_rate": 1.2141491395793498e-07, + "loss": 0.0026, + "reward": 1.3751672506332397, + "reward_std": 0.1003868505358696, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3907921612262726, + "step": 2757 + }, + { + "completion_length": 261.796875, + "epoch": 0.8789037603569152, + "grad_norm": 9.35265827178955, + "kl": 0.0849609375, + "learning_rate": 1.2109623964308476e-07, + "loss": 0.0034, + "reward": 1.5465914011001587, + "reward_std": 0.09646390378475189, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5465914011001587, + "rewards/pad": 0.0, + "step": 2758 + }, + { + "completion_length": 219.0625, + "epoch": 0.8792224346717654, + "grad_norm": 31.54623794555664, + "kl": 0.0859375, + "learning_rate": 1.2077756532823454e-07, + "loss": 0.0034, + "reward": 1.446937084197998, + "reward_std": 0.04907142370939255, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32193702459335327, + "step": 2759 + }, + { + "completion_length": 271.890625, + "epoch": 0.8795411089866156, + "grad_norm": 14.196030616760254, + "kl": 0.10986328125, + "learning_rate": 1.2045889101338432e-07, + "loss": 0.0044, + "reward": 1.5679893493652344, + "reward_std": 0.24801567196846008, + "rewards/format_reward_tg": 0.953125, + "rewards/iou_timestamp_reward": 0.4429892897605896, + "rewards/pad": 0.171875, + "step": 2760 + }, + { + "completion_length": 268.984375, + "epoch": 0.8798597833014659, + "grad_norm": 9.754602432250977, + "kl": 0.0751953125, + "learning_rate": 1.201402166985341e-07, + "loss": 0.003, + "reward": 1.6172590255737305, + "reward_std": 0.11558352410793304, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49225905537605286, + "step": 2761 + }, + { + "completion_length": 296.71875, + "epoch": 0.8801784576163161, + "grad_norm": 6.027721405029297, + "kl": 0.068359375, + "learning_rate": 1.1982154238368388e-07, + "loss": 0.0027, + "reward": 1.869599461555481, + "reward_std": 0.18542581796646118, + "rewards/pad": 0.359375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.525849461555481, + "step": 2762 + }, + { + "completion_length": 143.796875, + "epoch": 0.8804971319311663, + "grad_norm": 22.146970748901367, + "kl": 0.12353515625, + "learning_rate": 1.1950286806883364e-07, + "loss": 0.0049, + "reward": 1.416458010673523, + "reward_std": 0.07813596725463867, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4164580702781677, + "rewards/pad": 0.0, + "step": 2763 + }, + { + "completion_length": 252.796875, + "epoch": 0.8808158062460165, + "grad_norm": 21.758710861206055, + "kl": 0.095703125, + "learning_rate": 1.1918419375398342e-07, + "loss": 0.0038, + "reward": 1.4576029777526855, + "reward_std": 0.06436774134635925, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4576028883457184, + "step": 2764 + }, + { + "completion_length": 262.53125, + "epoch": 0.8811344805608667, + "grad_norm": 85.56441497802734, + "kl": 0.080078125, + "learning_rate": 1.188655194391332e-07, + "loss": 0.0032, + "reward": 1.5895549058914185, + "reward_std": 0.10457515716552734, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.48017996549606323, + "step": 2765 + }, + { + "completion_length": 293.046875, + "epoch": 0.8814531548757171, + "grad_norm": 109.34986877441406, + "kl": 0.07275390625, + "learning_rate": 1.1854684512428298e-07, + "loss": 0.0029, + "reward": 1.5263152122497559, + "reward_std": 0.03732115402817726, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40131521224975586, + "rewards/pad": 0.125, + "step": 2766 + }, + { + "completion_length": 173.46875, + "epoch": 0.8817718291905673, + "grad_norm": 14.634538650512695, + "kl": 0.10595703125, + "learning_rate": 1.1822817080943276e-07, + "loss": 0.0042, + "reward": 1.7670618295669556, + "reward_std": 0.15223199129104614, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5639367699623108, + "rewards/pad": 0.203125, + "step": 2767 + }, + { + "completion_length": 287.375, + "epoch": 0.8820905035054175, + "grad_norm": 9.117862701416016, + "kl": 0.068359375, + "learning_rate": 1.1790949649458252e-07, + "loss": 0.0027, + "reward": 1.5190690755844116, + "reward_std": 0.04640193283557892, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39406919479370117, + "rewards/pad": 0.125, + "step": 2768 + }, + { + "completion_length": 289.34375, + "epoch": 0.8824091778202677, + "grad_norm": 11.593138694763184, + "kl": 0.06396484375, + "learning_rate": 1.175908221797323e-07, + "loss": 0.0026, + "reward": 1.5423412322998047, + "reward_std": 0.1217973604798317, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4954662322998047, + "rewards/pad": 0.046875, + "step": 2769 + }, + { + "completion_length": 298.53125, + "epoch": 0.882727852135118, + "grad_norm": 11.1407470703125, + "kl": 0.06640625, + "learning_rate": 1.1727214786488209e-07, + "loss": 0.0027, + "reward": 1.5723299980163574, + "reward_std": 0.048630490899086, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5723298788070679, + "rewards/pad": 0.0, + "step": 2770 + }, + { + "completion_length": 194.359375, + "epoch": 0.8830465264499682, + "grad_norm": 18.737730026245117, + "kl": 0.087890625, + "learning_rate": 1.1695347355003187e-07, + "loss": 0.0035, + "reward": 1.5446221828460693, + "reward_std": 0.0422695018351078, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5446221828460693, + "step": 2771 + }, + { + "completion_length": 202.625, + "epoch": 0.8833652007648184, + "grad_norm": 16.50901222229004, + "kl": 0.0859375, + "learning_rate": 1.1663479923518165e-07, + "loss": 0.0034, + "reward": 1.4691429138183594, + "reward_std": 0.17112360894680023, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3597680628299713, + "rewards/pad": 0.109375, + "step": 2772 + }, + { + "completion_length": 232.125, + "epoch": 0.8836838750796686, + "grad_norm": 14.47046184539795, + "kl": 0.119140625, + "learning_rate": 1.1631612492033141e-07, + "loss": 0.0048, + "reward": 1.4901710748672485, + "reward_std": 0.08002850413322449, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4901711344718933, + "rewards/pad": 0.0, + "step": 2773 + }, + { + "completion_length": 195.40625, + "epoch": 0.8840025493945188, + "grad_norm": 68.46620178222656, + "kl": 0.0888671875, + "learning_rate": 1.159974506054812e-07, + "loss": 0.0036, + "reward": 1.6715497970581055, + "reward_std": 0.15683436393737793, + "rewards/answer_reward": 0.265625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.40592485666275024, + "step": 2774 + }, + { + "completion_length": 184.78125, + "epoch": 0.884321223709369, + "grad_norm": 20.505659103393555, + "kl": 0.1201171875, + "learning_rate": 1.1567877629063097e-07, + "loss": 0.0048, + "reward": 1.5303431749343872, + "reward_std": 0.22911837697029114, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4053431451320648, + "rewards/pad": 0.140625, + "step": 2775 + }, + { + "completion_length": 267.0625, + "epoch": 0.8846398980242193, + "grad_norm": 8.295364379882812, + "kl": 0.126953125, + "learning_rate": 1.1536010197578076e-07, + "loss": 0.0051, + "reward": 1.6413304805755615, + "reward_std": 0.06875017285346985, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5163305401802063, + "rewards/pad": 0.125, + "step": 2776 + }, + { + "completion_length": 422.984375, + "epoch": 0.8849585723390695, + "grad_norm": 5.138932228088379, + "kl": 0.05224609375, + "learning_rate": 1.1504142766093052e-07, + "loss": 0.0021, + "reward": 1.5415451526641846, + "reward_std": 0.1087242066860199, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.43217024207115173, + "step": 2777 + }, + { + "completion_length": 175.25, + "epoch": 0.8852772466539197, + "grad_norm": 13.963083267211914, + "kl": 0.08837890625, + "learning_rate": 1.1472275334608029e-07, + "loss": 0.0035, + "reward": 1.8134742975234985, + "reward_std": 0.08984021097421646, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5634742975234985, + "rewards/pad": 0.25, + "step": 2778 + }, + { + "completion_length": 296.796875, + "epoch": 0.8855959209687699, + "grad_norm": 66.78641510009766, + "kl": 0.119140625, + "learning_rate": 1.1440407903123007e-07, + "loss": 0.0047, + "reward": 1.3360178470611572, + "reward_std": 0.03126439452171326, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33601781725883484, + "step": 2779 + }, + { + "completion_length": 348.859375, + "epoch": 0.8859145952836202, + "grad_norm": 20.583782196044922, + "kl": 0.06982421875, + "learning_rate": 1.1408540471637985e-07, + "loss": 0.0028, + "reward": 1.4972288608551025, + "reward_std": 0.056556351482868195, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49722886085510254, + "rewards/pad": 0.0, + "step": 2780 + }, + { + "completion_length": 311.40625, + "epoch": 0.8862332695984704, + "grad_norm": 13.552074432373047, + "kl": 0.0712890625, + "learning_rate": 1.1376673040152963e-07, + "loss": 0.0028, + "reward": 1.6806656122207642, + "reward_std": 0.07932628691196442, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5556654930114746, + "step": 2781 + }, + { + "completion_length": 191.140625, + "epoch": 0.8865519439133206, + "grad_norm": 19.05376434326172, + "kl": 0.091796875, + "learning_rate": 1.1344805608667941e-07, + "loss": 0.0037, + "reward": 1.6291983127593994, + "reward_std": 0.17766210436820984, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5510733127593994, + "step": 2782 + }, + { + "completion_length": 204.453125, + "epoch": 0.8868706182281708, + "grad_norm": 17.7001895904541, + "kl": 0.10888671875, + "learning_rate": 1.1312938177182918e-07, + "loss": 0.0044, + "reward": 1.477161169052124, + "reward_std": 0.07692308723926544, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47716113924980164, + "rewards/pad": 0.0, + "step": 2783 + }, + { + "completion_length": 157.640625, + "epoch": 0.887189292543021, + "grad_norm": 10.790077209472656, + "kl": 0.11865234375, + "learning_rate": 1.1281070745697896e-07, + "loss": 0.0048, + "reward": 1.555767297744751, + "reward_std": 0.08283720910549164, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5557674169540405, + "rewards/pad": 0.0, + "step": 2784 + }, + { + "completion_length": 249.6875, + "epoch": 0.8875079668578713, + "grad_norm": 23.696714401245117, + "kl": 0.0986328125, + "learning_rate": 1.1249203314212874e-07, + "loss": 0.0039, + "reward": 1.5565049648284912, + "reward_std": 0.07266457378864288, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5565049052238464, + "rewards/pad": 0.0, + "step": 2785 + }, + { + "completion_length": 243.3125, + "epoch": 0.8878266411727215, + "grad_norm": 12.905336380004883, + "kl": 0.07763671875, + "learning_rate": 1.1217335882727852e-07, + "loss": 0.0031, + "reward": 1.6837048530578613, + "reward_std": 0.13713642954826355, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6993297934532166, + "step": 2786 + }, + { + "completion_length": 209.0, + "epoch": 0.8881453154875717, + "grad_norm": 11.713064193725586, + "kl": 0.08984375, + "learning_rate": 1.118546845124283e-07, + "loss": 0.0036, + "reward": 1.5936157703399658, + "reward_std": 0.16193625330924988, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4998658001422882, + "rewards/pad": 0.125, + "step": 2787 + }, + { + "completion_length": 206.03125, + "epoch": 0.8884639898024219, + "grad_norm": 20.389999389648438, + "kl": 0.1005859375, + "learning_rate": 1.1153601019757807e-07, + "loss": 0.004, + "reward": 1.4149174690246582, + "reward_std": 0.1828393191099167, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.43054234981536865, + "rewards/pad": 0.0, + "step": 2788 + }, + { + "completion_length": 279.6875, + "epoch": 0.8887826641172721, + "grad_norm": 10.964557647705078, + "kl": 0.0859375, + "learning_rate": 1.1121733588272785e-07, + "loss": 0.0034, + "reward": 1.7892601490020752, + "reward_std": 0.09654922038316727, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6642600893974304, + "rewards/pad": 0.125, + "step": 2789 + }, + { + "completion_length": 349.515625, + "epoch": 0.8891013384321224, + "grad_norm": 7.1359758377075195, + "kl": 0.053466796875, + "learning_rate": 1.1089866156787763e-07, + "loss": 0.0021, + "reward": 1.4936069250106812, + "reward_std": 0.10209286212921143, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5092318654060364, + "step": 2790 + }, + { + "completion_length": 331.53125, + "epoch": 0.8894200127469726, + "grad_norm": 10.255035400390625, + "kl": 0.05712890625, + "learning_rate": 1.1057998725302741e-07, + "loss": 0.0023, + "reward": 1.6107176542282104, + "reward_std": 0.04477081447839737, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48571765422821045, + "rewards/pad": 0.125, + "step": 2791 + }, + { + "completion_length": 360.96875, + "epoch": 0.8897386870618228, + "grad_norm": 17.346084594726562, + "kl": 0.06591796875, + "learning_rate": 1.1026131293817719e-07, + "loss": 0.0026, + "reward": 1.4452977180480957, + "reward_std": 0.13373248279094696, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4765477478504181, + "step": 2792 + }, + { + "completion_length": 179.078125, + "epoch": 0.890057361376673, + "grad_norm": 13.599401473999023, + "kl": 0.0908203125, + "learning_rate": 1.0994263862332694e-07, + "loss": 0.0036, + "reward": 1.6920461654663086, + "reward_std": 0.09594403207302094, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5670461058616638, + "rewards/pad": 0.125, + "step": 2793 + }, + { + "completion_length": 200.421875, + "epoch": 0.8903760356915232, + "grad_norm": 10.814448356628418, + "kl": 0.09814453125, + "learning_rate": 1.0962396430847672e-07, + "loss": 0.0039, + "reward": 1.4065415859222412, + "reward_std": 0.052019260823726654, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.406541645526886, + "rewards/pad": 0.0, + "step": 2794 + }, + { + "completion_length": 248.234375, + "epoch": 0.8906947100063735, + "grad_norm": 12.750199317932129, + "kl": 0.06494140625, + "learning_rate": 1.093052899936265e-07, + "loss": 0.0026, + "reward": 1.776698350906372, + "reward_std": 0.1367223858833313, + "rewards/pad": 0.328125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44857338070869446, + "step": 2795 + }, + { + "completion_length": 257.71875, + "epoch": 0.8910133843212237, + "grad_norm": 7.891810417175293, + "kl": 0.083984375, + "learning_rate": 1.0898661567877629e-07, + "loss": 0.0034, + "reward": 1.8464381694793701, + "reward_std": 0.08025862276554108, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5964381694793701, + "rewards/pad": 0.25, + "step": 2796 + }, + { + "completion_length": 196.890625, + "epoch": 0.8913320586360739, + "grad_norm": 12.372171401977539, + "kl": 0.0869140625, + "learning_rate": 1.0866794136392607e-07, + "loss": 0.0035, + "reward": 1.5370149612426758, + "reward_std": 0.0749216079711914, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5370149612426758, + "rewards/pad": 0.0, + "step": 2797 + }, + { + "completion_length": 210.09375, + "epoch": 0.8916507329509241, + "grad_norm": 11.810192108154297, + "kl": 0.0966796875, + "learning_rate": 1.0834926704907583e-07, + "loss": 0.0039, + "reward": 1.3395678997039795, + "reward_std": 0.1803453117609024, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.2614429295063019, + "rewards/pad": 0.09375, + "step": 2798 + }, + { + "completion_length": 265.53125, + "epoch": 0.8919694072657743, + "grad_norm": 18.495361328125, + "kl": 0.10693359375, + "learning_rate": 1.0803059273422561e-07, + "loss": 0.0043, + "reward": 1.8013211488723755, + "reward_std": 0.1095597967505455, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5669461488723755, + "rewards/pad": 0.234375, + "step": 2799 + }, + { + "completion_length": 256.640625, + "epoch": 0.8922880815806246, + "grad_norm": 15.504286766052246, + "kl": 0.078125, + "learning_rate": 1.077119184193754e-07, + "loss": 0.0031, + "reward": 1.6308748722076416, + "reward_std": 0.03704064339399338, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5058748126029968, + "step": 2800 + }, + { + "completion_length": 328.734375, + "epoch": 0.8926067558954748, + "grad_norm": 14.028239250183105, + "kl": 0.06201171875, + "learning_rate": 1.0739324410452518e-07, + "loss": 0.0025, + "reward": 1.3260129690170288, + "reward_std": 0.0780874714255333, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32601290941238403, + "step": 2801 + }, + { + "completion_length": 382.078125, + "epoch": 0.892925430210325, + "grad_norm": 42.63294219970703, + "kl": 0.09814453125, + "learning_rate": 1.0707456978967496e-07, + "loss": 0.0039, + "reward": 1.3472610712051392, + "reward_std": 0.15402668714523315, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.37851107120513916, + "step": 2802 + }, + { + "completion_length": 197.203125, + "epoch": 0.8932441045251752, + "grad_norm": 11.023219108581543, + "kl": 0.08935546875, + "learning_rate": 1.0675589547482472e-07, + "loss": 0.0036, + "reward": 1.520681381225586, + "reward_std": 0.10876883566379547, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5050563812255859, + "rewards/pad": 0.015625, + "step": 2803 + }, + { + "completion_length": 319.359375, + "epoch": 0.8935627788400254, + "grad_norm": 143.46206665039062, + "kl": 0.072265625, + "learning_rate": 1.064372211599745e-07, + "loss": 0.0029, + "reward": 1.4527530670166016, + "reward_std": 0.046053387224674225, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.32775312662124634, + "step": 2804 + }, + { + "completion_length": 249.546875, + "epoch": 0.8938814531548758, + "grad_norm": 30.37599754333496, + "kl": 0.08203125, + "learning_rate": 1.0611854684512428e-07, + "loss": 0.0033, + "reward": 1.5027915239334106, + "reward_std": 0.06004762649536133, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5027914643287659, + "rewards/pad": 0.0, + "step": 2805 + }, + { + "completion_length": 235.421875, + "epoch": 0.894200127469726, + "grad_norm": 8.845436096191406, + "kl": 0.08447265625, + "learning_rate": 1.0579987253027406e-07, + "loss": 0.0034, + "reward": 1.6691420078277588, + "reward_std": 0.06246839463710785, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5441420674324036, + "rewards/pad": 0.125, + "step": 2806 + }, + { + "completion_length": 250.03125, + "epoch": 0.8945188017845762, + "grad_norm": 5.992687225341797, + "kl": 0.0849609375, + "learning_rate": 1.0548119821542384e-07, + "loss": 0.0034, + "reward": 1.6259146928787231, + "reward_std": 0.06487414240837097, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5009146928787231, + "rewards/pad": 0.125, + "step": 2807 + }, + { + "completion_length": 307.234375, + "epoch": 0.8948374760994264, + "grad_norm": 9.169029235839844, + "kl": 0.0869140625, + "learning_rate": 1.051625239005736e-07, + "loss": 0.0035, + "reward": 1.4168381690979004, + "reward_std": 0.1290677785873413, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4324631690979004, + "rewards/pad": 0.0, + "step": 2808 + }, + { + "completion_length": 204.796875, + "epoch": 0.8951561504142767, + "grad_norm": 21.92626190185547, + "kl": 0.0859375, + "learning_rate": 1.0484384958572338e-07, + "loss": 0.0034, + "reward": 1.656879186630249, + "reward_std": 0.12416574358940125, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5475040674209595, + "rewards/pad": 0.125, + "step": 2809 + }, + { + "completion_length": 260.203125, + "epoch": 0.8954748247291269, + "grad_norm": 14.02978515625, + "kl": 0.10693359375, + "learning_rate": 1.0452517527087316e-07, + "loss": 0.0043, + "reward": 1.5584192276000977, + "reward_std": 0.08877848088741302, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4490443170070648, + "step": 2810 + }, + { + "completion_length": 370.109375, + "epoch": 0.8957934990439771, + "grad_norm": 7.561759948730469, + "kl": 0.0595703125, + "learning_rate": 1.0420650095602294e-07, + "loss": 0.0024, + "reward": 1.6742053031921387, + "reward_std": 0.03354213014245033, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5492053031921387, + "step": 2811 + }, + { + "completion_length": 317.0625, + "epoch": 0.8961121733588273, + "grad_norm": 54.87419128417969, + "kl": 0.0712890625, + "learning_rate": 1.0388782664117271e-07, + "loss": 0.0029, + "reward": 1.637012004852295, + "reward_std": 0.16005094349384308, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5432620048522949, + "step": 2812 + }, + { + "completion_length": 160.796875, + "epoch": 0.8964308476736775, + "grad_norm": 6.998462200164795, + "kl": 0.08544921875, + "learning_rate": 1.0356915232632249e-07, + "loss": 0.0034, + "reward": 1.7436039447784424, + "reward_std": 0.047648873180150986, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36860400438308716, + "step": 2813 + }, + { + "completion_length": 266.59375, + "epoch": 0.8967495219885278, + "grad_norm": 9.871182441711426, + "kl": 0.078125, + "learning_rate": 1.0325047801147227e-07, + "loss": 0.0031, + "reward": 1.5711551904678345, + "reward_std": 0.06365922838449478, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5711551904678345, + "step": 2814 + }, + { + "completion_length": 212.109375, + "epoch": 0.897068196303378, + "grad_norm": 9.782853126525879, + "kl": 0.09619140625, + "learning_rate": 1.0293180369662205e-07, + "loss": 0.0038, + "reward": 1.6640100479125977, + "reward_std": 0.06419562548398972, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5390099883079529, + "step": 2815 + }, + { + "completion_length": 196.34375, + "epoch": 0.8973868706182282, + "grad_norm": 10.195688247680664, + "kl": 0.087890625, + "learning_rate": 1.0261312938177183e-07, + "loss": 0.0035, + "reward": 1.5186760425567627, + "reward_std": 0.10513772070407867, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.40930113196372986, + "rewards/pad": 0.125, + "step": 2816 + }, + { + "completion_length": 347.6875, + "epoch": 0.8977055449330784, + "grad_norm": 5.789995193481445, + "kl": 0.0791015625, + "learning_rate": 1.022944550669216e-07, + "loss": 0.0032, + "reward": 1.5678385496139526, + "reward_std": 0.0854002982378006, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.44283854961395264, + "step": 2817 + }, + { + "completion_length": 267.203125, + "epoch": 0.8980242192479286, + "grad_norm": 23.427282333374023, + "kl": 0.08154296875, + "learning_rate": 1.0197578075207138e-07, + "loss": 0.0033, + "reward": 1.720076084136963, + "reward_std": 0.14952677488327026, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5169510245323181, + "rewards/pad": 0.21875, + "step": 2818 + }, + { + "completion_length": 244.140625, + "epoch": 0.8983428935627789, + "grad_norm": 9.095246315002441, + "kl": 0.09130859375, + "learning_rate": 1.0165710643722116e-07, + "loss": 0.0037, + "reward": 1.6292340755462646, + "reward_std": 0.0824042558670044, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5042339563369751, + "rewards/pad": 0.125, + "step": 2819 + }, + { + "completion_length": 236.90625, + "epoch": 0.8986615678776291, + "grad_norm": 21.445959091186523, + "kl": 0.0888671875, + "learning_rate": 1.0133843212237094e-07, + "loss": 0.0035, + "reward": 1.616790771484375, + "reward_std": 0.16625714302062988, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.507415771484375, + "step": 2820 + }, + { + "completion_length": 226.953125, + "epoch": 0.8989802421924793, + "grad_norm": 9.607512474060059, + "kl": 0.1025390625, + "learning_rate": 1.0101975780752072e-07, + "loss": 0.0041, + "reward": 1.4582407474517822, + "reward_std": 0.06423820555210114, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.33324068784713745, + "rewards/pad": 0.125, + "step": 2821 + }, + { + "completion_length": 202.296875, + "epoch": 0.8992989165073295, + "grad_norm": 40.7913818359375, + "kl": 0.10498046875, + "learning_rate": 1.0070108349267049e-07, + "loss": 0.0042, + "reward": 1.6470917463302612, + "reward_std": 0.1350969672203064, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5377167463302612, + "rewards/pad": 0.125, + "step": 2822 + }, + { + "completion_length": 292.65625, + "epoch": 0.8996175908221797, + "grad_norm": 12.874184608459473, + "kl": 0.06640625, + "learning_rate": 1.0038240917782025e-07, + "loss": 0.0027, + "reward": 1.6831600666046143, + "reward_std": 0.05511980876326561, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5581599473953247, + "step": 2823 + }, + { + "completion_length": 195.5625, + "epoch": 0.89993626513703, + "grad_norm": 15.63112735748291, + "kl": 0.083984375, + "learning_rate": 1.0006373486297003e-07, + "loss": 0.0034, + "reward": 1.7760124206542969, + "reward_std": 0.060271888971328735, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6510124206542969, + "rewards/pad": 0.125, + "step": 2824 + }, + { + "completion_length": 368.421875, + "epoch": 0.9002549394518802, + "grad_norm": 10.406145095825195, + "kl": 0.07470703125, + "learning_rate": 9.974506054811981e-08, + "loss": 0.003, + "reward": 1.6415812969207764, + "reward_std": 0.1609392762184143, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5478312969207764, + "rewards/pad": 0.109375, + "step": 2825 + }, + { + "completion_length": 194.78125, + "epoch": 0.9005736137667304, + "grad_norm": 23.362503051757812, + "kl": 0.0966796875, + "learning_rate": 9.94263862332696e-08, + "loss": 0.0039, + "reward": 1.7137928009033203, + "reward_std": 0.09469905495643616, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7137926816940308, + "rewards/pad": 0.0, + "step": 2826 + }, + { + "completion_length": 155.046875, + "epoch": 0.9008922880815806, + "grad_norm": 36.5787467956543, + "kl": 0.0986328125, + "learning_rate": 9.910771191841936e-08, + "loss": 0.0039, + "reward": 1.466389536857605, + "reward_std": 0.12822285294532776, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.482014536857605, + "step": 2827 + }, + { + "completion_length": 246.375, + "epoch": 0.9012109623964308, + "grad_norm": 30.5788516998291, + "kl": 0.0908203125, + "learning_rate": 9.878903760356914e-08, + "loss": 0.0036, + "reward": 1.5797438621520996, + "reward_std": 0.13034319877624512, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4859938323497772, + "rewards/pad": 0.109375, + "step": 2828 + }, + { + "completion_length": 249.953125, + "epoch": 0.9015296367112811, + "grad_norm": 7.016419410705566, + "kl": 0.09521484375, + "learning_rate": 9.847036328871892e-08, + "loss": 0.0038, + "reward": 1.6300158500671387, + "reward_std": 0.09085611253976822, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5050158500671387, + "rewards/pad": 0.125, + "step": 2829 + }, + { + "completion_length": 248.46875, + "epoch": 0.9018483110261313, + "grad_norm": 12.229598045349121, + "kl": 0.1162109375, + "learning_rate": 9.81516889738687e-08, + "loss": 0.0046, + "reward": 1.2450551986694336, + "reward_std": 0.0686068907380104, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.2450551837682724, + "step": 2830 + }, + { + "completion_length": 105.421875, + "epoch": 0.9021669853409815, + "grad_norm": 17.319347381591797, + "kl": 0.173828125, + "learning_rate": 9.783301465901848e-08, + "loss": 0.0069, + "reward": 1.8770604133605957, + "reward_std": 0.18187379837036133, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6426854729652405, + "rewards/pad": 0.234375, + "step": 2831 + }, + { + "completion_length": 244.40625, + "epoch": 0.9024856596558317, + "grad_norm": 13.74586296081543, + "kl": 0.076171875, + "learning_rate": 9.751434034416825e-08, + "loss": 0.003, + "reward": 1.508291482925415, + "reward_std": 0.08148925751447678, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.508291482925415, + "rewards/pad": 0.0, + "step": 2832 + }, + { + "completion_length": 239.046875, + "epoch": 0.9028043339706819, + "grad_norm": 10.063822746276855, + "kl": 0.08447265625, + "learning_rate": 9.719566602931803e-08, + "loss": 0.0034, + "reward": 1.597080945968628, + "reward_std": 0.056462403386831284, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5970809459686279, + "step": 2833 + }, + { + "completion_length": 278.8125, + "epoch": 0.9031230082855322, + "grad_norm": 4.26906681060791, + "kl": 0.078125, + "learning_rate": 9.687699171446781e-08, + "loss": 0.0031, + "reward": 1.5676623582839966, + "reward_std": 0.07192361354827881, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.45828741788864136, + "step": 2834 + }, + { + "completion_length": 218.5, + "epoch": 0.9034416826003824, + "grad_norm": 18.245553970336914, + "kl": 0.08203125, + "learning_rate": 9.655831739961759e-08, + "loss": 0.0033, + "reward": 1.528798222541809, + "reward_std": 0.043091028928756714, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4037981331348419, + "rewards/pad": 0.125, + "step": 2835 + }, + { + "completion_length": 159.328125, + "epoch": 0.9037603569152326, + "grad_norm": 7.013727188110352, + "kl": 0.103515625, + "learning_rate": 9.623964308476737e-08, + "loss": 0.0041, + "reward": 1.5832340717315674, + "reward_std": 0.12680181860923767, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.473859041929245, + "step": 2836 + }, + { + "completion_length": 237.671875, + "epoch": 0.9040790312300828, + "grad_norm": 20.34369468688965, + "kl": 0.095703125, + "learning_rate": 9.592096876991714e-08, + "loss": 0.0038, + "reward": 1.3584239482879639, + "reward_std": 0.1278751641511917, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.37404897809028625, + "step": 2837 + }, + { + "completion_length": 289.9375, + "epoch": 0.904397705544933, + "grad_norm": 16.990022659301758, + "kl": 0.0712890625, + "learning_rate": 9.560229445506691e-08, + "loss": 0.0029, + "reward": 1.6058906316757202, + "reward_std": 0.06301936507225037, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4808906316757202, + "rewards/pad": 0.125, + "step": 2838 + }, + { + "completion_length": 319.953125, + "epoch": 0.9047163798597833, + "grad_norm": 8.548789978027344, + "kl": 0.0791015625, + "learning_rate": 9.528362014021669e-08, + "loss": 0.0032, + "reward": 1.5624927282333374, + "reward_std": 0.11143027245998383, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4218676686286926, + "step": 2839 + }, + { + "completion_length": 187.125, + "epoch": 0.9050350541746335, + "grad_norm": 10.730565071105957, + "kl": 0.10205078125, + "learning_rate": 9.496494582536647e-08, + "loss": 0.0041, + "reward": 1.679841160774231, + "reward_std": 0.1068398505449295, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.570466160774231, + "rewards/pad": 0.109375, + "step": 2840 + }, + { + "completion_length": 185.015625, + "epoch": 0.9053537284894837, + "grad_norm": 27.30592155456543, + "kl": 0.07568359375, + "learning_rate": 9.464627151051625e-08, + "loss": 0.003, + "reward": 1.6507123708724976, + "reward_std": 0.18122225999832153, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4944624602794647, + "step": 2841 + }, + { + "completion_length": 245.65625, + "epoch": 0.9056724028043339, + "grad_norm": 5.355472564697266, + "kl": 0.0712890625, + "learning_rate": 9.432759719566602e-08, + "loss": 0.0029, + "reward": 1.4526417255401611, + "reward_std": 0.07515285909175873, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4370167553424835, + "rewards/pad": 0.015625, + "step": 2842 + }, + { + "completion_length": 150.125, + "epoch": 0.9059910771191841, + "grad_norm": 186.78187561035156, + "kl": 0.11279296875, + "learning_rate": 9.40089228808158e-08, + "loss": 0.0045, + "reward": 1.5947678089141846, + "reward_std": 0.07983440905809402, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5947678685188293, + "rewards/pad": 0.0, + "step": 2843 + }, + { + "completion_length": 226.046875, + "epoch": 0.9063097514340345, + "grad_norm": 13.87447452545166, + "kl": 0.07373046875, + "learning_rate": 9.369024856596558e-08, + "loss": 0.0029, + "reward": 1.6612658500671387, + "reward_std": 0.1668727546930313, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4112658202648163, + "step": 2844 + }, + { + "completion_length": 327.59375, + "epoch": 0.9066284257488847, + "grad_norm": 7.351214408874512, + "kl": 0.060791015625, + "learning_rate": 9.337157425111536e-08, + "loss": 0.0024, + "reward": 1.4562897682189941, + "reward_std": 0.06675264239311218, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4562898874282837, + "step": 2845 + }, + { + "completion_length": 229.640625, + "epoch": 0.9069471000637349, + "grad_norm": 14.260663032531738, + "kl": 0.08154296875, + "learning_rate": 9.305289993626514e-08, + "loss": 0.0033, + "reward": 1.507227897644043, + "reward_std": 0.04547261819243431, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.507227897644043, + "rewards/pad": 0.0, + "step": 2846 + }, + { + "completion_length": 274.28125, + "epoch": 0.9072657743785851, + "grad_norm": 4.880247592926025, + "kl": 0.0927734375, + "learning_rate": 9.27342256214149e-08, + "loss": 0.0037, + "reward": 1.577436923980713, + "reward_std": 0.18369559943675995, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.48368698358535767, + "rewards/pad": 0.125, + "step": 2847 + }, + { + "completion_length": 301.828125, + "epoch": 0.9075844486934354, + "grad_norm": 7.844675064086914, + "kl": 0.123046875, + "learning_rate": 9.241555130656469e-08, + "loss": 0.0049, + "reward": 1.560899257659912, + "reward_std": 0.06565877795219421, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5608993172645569, + "step": 2848 + }, + { + "completion_length": 224.046875, + "epoch": 0.9079031230082856, + "grad_norm": 15.078897476196289, + "kl": 0.08349609375, + "learning_rate": 9.209687699171447e-08, + "loss": 0.0033, + "reward": 1.7017287015914917, + "reward_std": 0.09162631630897522, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4517287611961365, + "step": 2849 + }, + { + "completion_length": 221.53125, + "epoch": 0.9082217973231358, + "grad_norm": 14.18359661102295, + "kl": 0.095703125, + "learning_rate": 9.177820267686425e-08, + "loss": 0.0038, + "reward": 1.5834238529205322, + "reward_std": 0.12328074872493744, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3646739721298218, + "rewards/pad": 0.21875, + "step": 2850 + }, + { + "completion_length": 207.4375, + "epoch": 0.908540471637986, + "grad_norm": 16.44442367553711, + "kl": 0.08740234375, + "learning_rate": 9.145952836201403e-08, + "loss": 0.0035, + "reward": 1.647652268409729, + "reward_std": 0.11559803783893585, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.538277268409729, + "rewards/pad": 0.125, + "step": 2851 + }, + { + "completion_length": 332.234375, + "epoch": 0.9088591459528362, + "grad_norm": 70.49515533447266, + "kl": 0.058837890625, + "learning_rate": 9.11408540471638e-08, + "loss": 0.0024, + "reward": 1.549206018447876, + "reward_std": 0.16914981603622437, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.4554559886455536, + "rewards/pad": 0.125, + "step": 2852 + }, + { + "completion_length": 387.90625, + "epoch": 0.9091778202676865, + "grad_norm": 9.856443405151367, + "kl": 0.053955078125, + "learning_rate": 9.082217973231358e-08, + "loss": 0.0022, + "reward": 1.5135695934295654, + "reward_std": 0.04079330712556839, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5135695934295654, + "step": 2853 + }, + { + "completion_length": 153.375, + "epoch": 0.9094964945825367, + "grad_norm": 22.22443962097168, + "kl": 0.11083984375, + "learning_rate": 9.050350541746334e-08, + "loss": 0.0044, + "reward": 1.5554393529891968, + "reward_std": 0.03890087455511093, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.430439293384552, + "rewards/pad": 0.125, + "step": 2854 + }, + { + "completion_length": 242.296875, + "epoch": 0.9098151688973869, + "grad_norm": 7.04885196685791, + "kl": 0.08056640625, + "learning_rate": 9.018483110261312e-08, + "loss": 0.0032, + "reward": 1.5216234922409058, + "reward_std": 0.04502193629741669, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.39662352204322815, + "step": 2855 + }, + { + "completion_length": 310.09375, + "epoch": 0.9101338432122371, + "grad_norm": 7.436180591583252, + "kl": 0.0703125, + "learning_rate": 8.98661567877629e-08, + "loss": 0.0028, + "reward": 1.6724367141723633, + "reward_std": 0.052072592079639435, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5474367737770081, + "step": 2856 + }, + { + "completion_length": 257.375, + "epoch": 0.9104525175270873, + "grad_norm": 13.737147331237793, + "kl": 0.08642578125, + "learning_rate": 8.954748247291267e-08, + "loss": 0.0035, + "reward": 1.5169252157211304, + "reward_std": 0.13501378893852234, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47005027532577515, + "step": 2857 + }, + { + "completion_length": 286.03125, + "epoch": 0.9107711918419376, + "grad_norm": 14.250129699707031, + "kl": 0.08447265625, + "learning_rate": 8.922880815806245e-08, + "loss": 0.0034, + "reward": 1.4254100322723389, + "reward_std": 0.10079248249530792, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.42541009187698364, + "step": 2858 + }, + { + "completion_length": 226.6875, + "epoch": 0.9110898661567878, + "grad_norm": 16.881498336791992, + "kl": 0.06396484375, + "learning_rate": 8.891013384321223e-08, + "loss": 0.0026, + "reward": 1.9186750650405884, + "reward_std": 0.10284863412380219, + "rewards/answer_reward": 0.5, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.41867509484291077, + "step": 2859 + }, + { + "completion_length": 356.6875, + "epoch": 0.911408540471638, + "grad_norm": 7.63905143737793, + "kl": 0.052490234375, + "learning_rate": 8.859145952836201e-08, + "loss": 0.0021, + "reward": 1.4054787158966064, + "reward_std": 0.11696632206439972, + "rewards/pad": 0.09375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.31172871589660645, + "step": 2860 + }, + { + "completion_length": 305.28125, + "epoch": 0.9117272147864882, + "grad_norm": 23.72255516052246, + "kl": 0.054443359375, + "learning_rate": 8.827278521351178e-08, + "loss": 0.0022, + "reward": 1.6052396297454834, + "reward_std": 0.13836698234081268, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.511489748954773, + "step": 2861 + }, + { + "completion_length": 257.546875, + "epoch": 0.9120458891013384, + "grad_norm": 9.212669372558594, + "kl": 0.0791015625, + "learning_rate": 8.795411089866156e-08, + "loss": 0.0032, + "reward": 1.6073306798934937, + "reward_std": 0.08537907898426056, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.3729557394981384, + "step": 2862 + }, + { + "completion_length": 219.921875, + "epoch": 0.9123645634161887, + "grad_norm": 10.81699275970459, + "kl": 0.08837890625, + "learning_rate": 8.763543658381134e-08, + "loss": 0.0035, + "reward": 1.4224504232406616, + "reward_std": 0.15301409363746643, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4224504232406616, + "step": 2863 + }, + { + "completion_length": 266.21875, + "epoch": 0.9126832377310389, + "grad_norm": 11.685783386230469, + "kl": 0.08203125, + "learning_rate": 8.731676226896112e-08, + "loss": 0.0033, + "reward": 1.657499074935913, + "reward_std": 0.07901878654956818, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4074990451335907, + "step": 2864 + }, + { + "completion_length": 201.703125, + "epoch": 0.9130019120458891, + "grad_norm": 109.4502944946289, + "kl": 0.09130859375, + "learning_rate": 8.69980879541109e-08, + "loss": 0.0036, + "reward": 1.5427440404891968, + "reward_std": 0.11305812746286392, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5583691000938416, + "rewards/pad": 0.0, + "step": 2865 + }, + { + "completion_length": 219.46875, + "epoch": 0.9133205863607393, + "grad_norm": 52.34642028808594, + "kl": 0.10009765625, + "learning_rate": 8.667941363926067e-08, + "loss": 0.004, + "reward": 1.4690560102462769, + "reward_std": 0.08319102227687836, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34405606985092163, + "rewards/pad": 0.125, + "step": 2866 + }, + { + "completion_length": 231.09375, + "epoch": 0.9136392606755895, + "grad_norm": 9.66303539276123, + "kl": 0.07470703125, + "learning_rate": 8.636073932441045e-08, + "loss": 0.003, + "reward": 1.5846589803695679, + "reward_std": 0.1445649415254593, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5221589803695679, + "rewards/pad": 0.0625, + "step": 2867 + }, + { + "completion_length": 200.296875, + "epoch": 0.9139579349904398, + "grad_norm": 20.801359176635742, + "kl": 0.1103515625, + "learning_rate": 8.604206500956023e-08, + "loss": 0.0044, + "reward": 1.584848403930664, + "reward_std": 0.11578701436519623, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5848484039306641, + "rewards/pad": 0.0, + "step": 2868 + }, + { + "completion_length": 208.046875, + "epoch": 0.91427660930529, + "grad_norm": 16.8826847076416, + "kl": 0.1025390625, + "learning_rate": 8.572339069471e-08, + "loss": 0.0041, + "reward": 1.6622300148010254, + "reward_std": 0.07939045131206512, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6622299551963806, + "step": 2869 + }, + { + "completion_length": 242.828125, + "epoch": 0.9145952836201402, + "grad_norm": 5.844204902648926, + "kl": 0.07666015625, + "learning_rate": 8.540471637985978e-08, + "loss": 0.0031, + "reward": 1.5444097518920898, + "reward_std": 0.031050903722643852, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4194098711013794, + "step": 2870 + }, + { + "completion_length": 210.890625, + "epoch": 0.9149139579349904, + "grad_norm": 15.819625854492188, + "kl": 0.10302734375, + "learning_rate": 8.508604206500955e-08, + "loss": 0.0041, + "reward": 1.4890446662902832, + "reward_std": 0.11497621238231659, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4734196066856384, + "step": 2871 + }, + { + "completion_length": 279.6875, + "epoch": 0.9152326322498406, + "grad_norm": 14.926518440246582, + "kl": 0.0751953125, + "learning_rate": 8.476736775015933e-08, + "loss": 0.003, + "reward": 1.5654652118682861, + "reward_std": 0.058053113520145416, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4404652714729309, + "step": 2872 + }, + { + "completion_length": 294.046875, + "epoch": 0.9155513065646909, + "grad_norm": 8.7820463180542, + "kl": 0.087890625, + "learning_rate": 8.444869343530911e-08, + "loss": 0.0035, + "reward": 1.6744558811187744, + "reward_std": 0.07011964917182922, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5494560599327087, + "step": 2873 + }, + { + "completion_length": 201.15625, + "epoch": 0.9158699808795411, + "grad_norm": 21.165136337280273, + "kl": 0.087890625, + "learning_rate": 8.413001912045889e-08, + "loss": 0.0035, + "reward": 1.5898141860961914, + "reward_std": 0.08441634476184845, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5898141860961914, + "step": 2874 + }, + { + "completion_length": 215.3125, + "epoch": 0.9161886551943913, + "grad_norm": 14.786849021911621, + "kl": 0.1259765625, + "learning_rate": 8.381134480560867e-08, + "loss": 0.005, + "reward": 1.7169456481933594, + "reward_std": 0.09836380928754807, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48257070779800415, + "step": 2875 + }, + { + "completion_length": 261.90625, + "epoch": 0.9165073295092415, + "grad_norm": 8.064598083496094, + "kl": 0.083984375, + "learning_rate": 8.349267049075843e-08, + "loss": 0.0034, + "reward": 1.6613750457763672, + "reward_std": 0.06930655241012573, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.41137492656707764, + "rewards/pad": 0.25, + "step": 2876 + }, + { + "completion_length": 257.5, + "epoch": 0.9168260038240917, + "grad_norm": 15.72818374633789, + "kl": 0.0859375, + "learning_rate": 8.317399617590822e-08, + "loss": 0.0034, + "reward": 1.6899898052215576, + "reward_std": 0.06479023396968842, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43998971581459045, + "rewards/pad": 0.25, + "step": 2877 + }, + { + "completion_length": 305.796875, + "epoch": 0.917144678138942, + "grad_norm": 76.97195434570312, + "kl": 0.1611328125, + "learning_rate": 8.2855321861058e-08, + "loss": 0.0065, + "reward": 1.6203289031982422, + "reward_std": 0.10637544095516205, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4953289031982422, + "rewards/pad": 0.125, + "step": 2878 + }, + { + "completion_length": 283.390625, + "epoch": 0.9174633524537922, + "grad_norm": 7.364353656768799, + "kl": 0.0771484375, + "learning_rate": 8.253664754620778e-08, + "loss": 0.0031, + "reward": 1.577134132385254, + "reward_std": 0.13365203142166138, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5927591323852539, + "step": 2879 + }, + { + "completion_length": 210.359375, + "epoch": 0.9177820267686424, + "grad_norm": 18.346982955932617, + "kl": 0.09326171875, + "learning_rate": 8.221797323135756e-08, + "loss": 0.0037, + "reward": 1.4383864402770996, + "reward_std": 0.14986774325370789, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40713638067245483, + "rewards/pad": 0.03125, + "step": 2880 + }, + { + "completion_length": 205.109375, + "epoch": 0.9181007010834926, + "grad_norm": 11.897275924682617, + "kl": 0.09521484375, + "learning_rate": 8.189929891650732e-08, + "loss": 0.0038, + "reward": 1.6616731882095337, + "reward_std": 0.06684476137161255, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4116731584072113, + "step": 2881 + }, + { + "completion_length": 189.671875, + "epoch": 0.9184193753983428, + "grad_norm": 9.623435974121094, + "kl": 0.087890625, + "learning_rate": 8.15806246016571e-08, + "loss": 0.0035, + "reward": 1.5173976421356201, + "reward_std": 0.051018401980400085, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5173976421356201, + "rewards/pad": 0.0, + "step": 2882 + }, + { + "completion_length": 277.890625, + "epoch": 0.9187380497131931, + "grad_norm": 7.127283096313477, + "kl": 0.07177734375, + "learning_rate": 8.126195028680689e-08, + "loss": 0.0029, + "reward": 1.5310813188552856, + "reward_std": 0.09413205087184906, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5467062592506409, + "step": 2883 + }, + { + "completion_length": 280.34375, + "epoch": 0.9190567240280434, + "grad_norm": 6.712581157684326, + "kl": 0.07470703125, + "learning_rate": 8.094327597195667e-08, + "loss": 0.003, + "reward": 1.4587035179138184, + "reward_std": 0.07780376821756363, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.34932857751846313, + "rewards/pad": 0.125, + "step": 2884 + }, + { + "completion_length": 200.96875, + "epoch": 0.9193753983428936, + "grad_norm": 10.461310386657715, + "kl": 0.099609375, + "learning_rate": 8.062460165710643e-08, + "loss": 0.004, + "reward": 1.4560800790786743, + "reward_std": 0.09152737259864807, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4560800790786743, + "step": 2885 + }, + { + "completion_length": 195.28125, + "epoch": 0.9196940726577438, + "grad_norm": 19.952253341674805, + "kl": 0.2265625, + "learning_rate": 8.03059273422562e-08, + "loss": 0.0091, + "reward": 1.5408905744552612, + "reward_std": 0.06746616214513779, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5408905744552612, + "rewards/pad": 0.0, + "step": 2886 + }, + { + "completion_length": 204.234375, + "epoch": 0.920012746972594, + "grad_norm": 27.799915313720703, + "kl": 0.07373046875, + "learning_rate": 7.998725302740598e-08, + "loss": 0.0029, + "reward": 1.5065892934799194, + "reward_std": 0.12254871428012848, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.38158929347991943, + "rewards/pad": 0.125, + "step": 2887 + }, + { + "completion_length": 259.484375, + "epoch": 0.9203314212874443, + "grad_norm": 17.611223220825195, + "kl": 0.080078125, + "learning_rate": 7.966857871255576e-08, + "loss": 0.0032, + "reward": 1.7014851570129395, + "reward_std": 0.10515861958265305, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5764852166175842, + "step": 2888 + }, + { + "completion_length": 262.984375, + "epoch": 0.9206500956022945, + "grad_norm": 14.8787841796875, + "kl": 0.07080078125, + "learning_rate": 7.934990439770554e-08, + "loss": 0.0028, + "reward": 1.4765360355377197, + "reward_std": 0.10343047976493835, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3515360355377197, + "rewards/pad": 0.125, + "step": 2889 + }, + { + "completion_length": 216.078125, + "epoch": 0.9209687699171447, + "grad_norm": 11.560380935668945, + "kl": 0.11474609375, + "learning_rate": 7.903123008285532e-08, + "loss": 0.0046, + "reward": 1.5976759195327759, + "reward_std": 0.1031763032078743, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47267594933509827, + "rewards/pad": 0.125, + "step": 2890 + }, + { + "completion_length": 211.40625, + "epoch": 0.9212874442319949, + "grad_norm": 11.442146301269531, + "kl": 0.1005859375, + "learning_rate": 7.871255576800509e-08, + "loss": 0.004, + "reward": 1.5785408020019531, + "reward_std": 0.11714388430118561, + "rewards/pad": 0.140625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4379158020019531, + "step": 2891 + }, + { + "completion_length": 286.515625, + "epoch": 0.9216061185468452, + "grad_norm": 7.306763648986816, + "kl": 0.06884765625, + "learning_rate": 7.839388145315487e-08, + "loss": 0.0028, + "reward": 1.492227554321289, + "reward_std": 0.0733942911028862, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4922274649143219, + "step": 2892 + }, + { + "completion_length": 232.046875, + "epoch": 0.9219247928616954, + "grad_norm": 10.7002534866333, + "kl": 0.0869140625, + "learning_rate": 7.807520713830465e-08, + "loss": 0.0035, + "reward": 1.589614748954773, + "reward_std": 0.04720890522003174, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.589614748954773, + "step": 2893 + }, + { + "completion_length": 294.625, + "epoch": 0.9222434671765456, + "grad_norm": 8.909106254577637, + "kl": 0.138671875, + "learning_rate": 7.775653282345443e-08, + "loss": 0.0055, + "reward": 1.6108899116516113, + "reward_std": 0.08002126216888428, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6108898520469666, + "step": 2894 + }, + { + "completion_length": 153.015625, + "epoch": 0.9225621414913958, + "grad_norm": 18.675016403198242, + "kl": 0.1328125, + "learning_rate": 7.743785850860421e-08, + "loss": 0.0053, + "reward": 1.551439881324768, + "reward_std": 0.09808996319770813, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5514398813247681, + "rewards/pad": 0.0, + "step": 2895 + }, + { + "completion_length": 278.375, + "epoch": 0.922880815806246, + "grad_norm": 39.86183547973633, + "kl": 0.0673828125, + "learning_rate": 7.711918419375398e-08, + "loss": 0.0027, + "reward": 1.4703035354614258, + "reward_std": 0.1023271381855011, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4703035354614258, + "step": 2896 + }, + { + "completion_length": 189.25, + "epoch": 0.9231994901210963, + "grad_norm": 7.673598289489746, + "kl": 0.09716796875, + "learning_rate": 7.680050987890376e-08, + "loss": 0.0039, + "reward": 1.5865952968597412, + "reward_std": 0.048405349254608154, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5865952372550964, + "rewards/pad": 0.0, + "step": 2897 + }, + { + "completion_length": 304.890625, + "epoch": 0.9235181644359465, + "grad_norm": 6.57677698135376, + "kl": 0.06494140625, + "learning_rate": 7.648183556405354e-08, + "loss": 0.0026, + "reward": 1.470102071762085, + "reward_std": 0.06978422403335571, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47010213136672974, + "step": 2898 + }, + { + "completion_length": 336.0, + "epoch": 0.9238368387507967, + "grad_norm": 10.75872802734375, + "kl": 0.055908203125, + "learning_rate": 7.616316124920332e-08, + "loss": 0.0022, + "reward": 1.6533763408660889, + "reward_std": 0.05971824750304222, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5283763408660889, + "step": 2899 + }, + { + "completion_length": 232.671875, + "epoch": 0.9241555130656469, + "grad_norm": 6.692727565765381, + "kl": 0.08154296875, + "learning_rate": 7.584448693435309e-08, + "loss": 0.0033, + "reward": 1.5848937034606934, + "reward_std": 0.0841706320643425, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45989376306533813, + "rewards/pad": 0.125, + "step": 2900 + }, + { + "completion_length": 213.953125, + "epoch": 0.9244741873804971, + "grad_norm": 19.63039779663086, + "kl": 0.07275390625, + "learning_rate": 7.552581261950285e-08, + "loss": 0.0029, + "reward": 1.582329273223877, + "reward_std": 0.09304428100585938, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4573292136192322, + "step": 2901 + }, + { + "completion_length": 204.390625, + "epoch": 0.9247928616953474, + "grad_norm": 29.540363311767578, + "kl": 0.08056640625, + "learning_rate": 7.520713830465264e-08, + "loss": 0.0032, + "reward": 1.8274729251861572, + "reward_std": 0.15914708375930786, + "rewards/answer_reward": 0.3125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.514972984790802, + "step": 2902 + }, + { + "completion_length": 256.71875, + "epoch": 0.9251115360101976, + "grad_norm": 14.957947731018066, + "kl": 0.16796875, + "learning_rate": 7.488846398980242e-08, + "loss": 0.0067, + "reward": 1.803660273551941, + "reward_std": 0.05807485803961754, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5536603927612305, + "step": 2903 + }, + { + "completion_length": 162.78125, + "epoch": 0.9254302103250478, + "grad_norm": 39.77608871459961, + "kl": 0.0927734375, + "learning_rate": 7.45697896749522e-08, + "loss": 0.0037, + "reward": 1.5235381126403809, + "reward_std": 0.0962752103805542, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.39853811264038086, + "rewards/pad": 0.125, + "step": 2904 + }, + { + "completion_length": 211.46875, + "epoch": 0.925748884639898, + "grad_norm": 25.95490074157715, + "kl": 0.09765625, + "learning_rate": 7.425111536010198e-08, + "loss": 0.0039, + "reward": 1.7679393291473389, + "reward_std": 0.11521582305431366, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.6585642099380493, + "step": 2905 + }, + { + "completion_length": 266.78125, + "epoch": 0.9260675589547482, + "grad_norm": 24.75307846069336, + "kl": 0.349609375, + "learning_rate": 7.393244104525174e-08, + "loss": 0.014, + "reward": 1.5795177221298218, + "reward_std": 0.07660073786973953, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45451778173446655, + "rewards/pad": 0.125, + "step": 2906 + }, + { + "completion_length": 165.796875, + "epoch": 0.9263862332695985, + "grad_norm": 9.585287094116211, + "kl": 0.0986328125, + "learning_rate": 7.361376673040152e-08, + "loss": 0.0039, + "reward": 1.718969464302063, + "reward_std": 0.10125716775655746, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5939695239067078, + "rewards/pad": 0.125, + "step": 2907 + }, + { + "completion_length": 253.953125, + "epoch": 0.9267049075844487, + "grad_norm": 15.626591682434082, + "kl": 0.10009765625, + "learning_rate": 7.32950924155513e-08, + "loss": 0.004, + "reward": 1.4436264038085938, + "reward_std": 0.10515650361776352, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4592515230178833, + "step": 2908 + }, + { + "completion_length": 221.015625, + "epoch": 0.9270235818992989, + "grad_norm": 11.700095176696777, + "kl": 0.08154296875, + "learning_rate": 7.297641810070109e-08, + "loss": 0.0033, + "reward": 1.6605483293533325, + "reward_std": 0.07712937146425247, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5355482697486877, + "step": 2909 + }, + { + "completion_length": 233.765625, + "epoch": 0.9273422562141491, + "grad_norm": 9.402727127075195, + "kl": 0.08837890625, + "learning_rate": 7.265774378585087e-08, + "loss": 0.0035, + "reward": 1.52135169506073, + "reward_std": 0.08518072962760925, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.52135169506073, + "rewards/pad": 0.0, + "step": 2910 + }, + { + "completion_length": 247.359375, + "epoch": 0.9276609305289993, + "grad_norm": 10.16604232788086, + "kl": 0.10546875, + "learning_rate": 7.233906947100063e-08, + "loss": 0.0042, + "reward": 1.5909231901168823, + "reward_std": 0.0987737774848938, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5909231901168823, + "step": 2911 + }, + { + "completion_length": 157.125, + "epoch": 0.9279796048438496, + "grad_norm": 17.30365562438965, + "kl": 0.09326171875, + "learning_rate": 7.202039515615041e-08, + "loss": 0.0037, + "reward": 1.5729458332061768, + "reward_std": 0.08636891841888428, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.44794589281082153, + "step": 2912 + }, + { + "completion_length": 282.265625, + "epoch": 0.9282982791586998, + "grad_norm": 16.395715713500977, + "kl": 0.087890625, + "learning_rate": 7.17017208413002e-08, + "loss": 0.0035, + "reward": 1.5452919006347656, + "reward_std": 0.053568463772535324, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5452919602394104, + "step": 2913 + }, + { + "completion_length": 405.84375, + "epoch": 0.92861695347355, + "grad_norm": 7.0628485679626465, + "kl": 0.048095703125, + "learning_rate": 7.138304652644997e-08, + "loss": 0.0019, + "reward": 1.531229019165039, + "reward_std": 0.04710269346833229, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5312290191650391, + "step": 2914 + }, + { + "completion_length": 205.46875, + "epoch": 0.9289356277884002, + "grad_norm": 8.386787414550781, + "kl": 0.11083984375, + "learning_rate": 7.106437221159973e-08, + "loss": 0.0044, + "reward": 1.684080958366394, + "reward_std": 0.10352471470832825, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.574705958366394, + "step": 2915 + }, + { + "completion_length": 266.25, + "epoch": 0.9292543021032504, + "grad_norm": 17.049253463745117, + "kl": 0.080078125, + "learning_rate": 7.074569789674951e-08, + "loss": 0.0032, + "reward": 1.6391890048980713, + "reward_std": 0.08314747363328934, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3891889750957489, + "step": 2916 + }, + { + "completion_length": 258.8125, + "epoch": 0.9295729764181007, + "grad_norm": 11.82989501953125, + "kl": 0.08056640625, + "learning_rate": 7.042702358189929e-08, + "loss": 0.0032, + "reward": 1.608884572982788, + "reward_std": 0.12440791726112366, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4526345431804657, + "step": 2917 + }, + { + "completion_length": 294.09375, + "epoch": 0.9298916507329509, + "grad_norm": 14.522891998291016, + "kl": 0.08203125, + "learning_rate": 7.010834926704907e-08, + "loss": 0.0033, + "reward": 1.634501338005066, + "reward_std": 0.08189462870359421, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.6501263976097107, + "rewards/pad": 0.0, + "step": 2918 + }, + { + "completion_length": 355.28125, + "epoch": 0.9302103250478011, + "grad_norm": 10.51163101196289, + "kl": 0.0576171875, + "learning_rate": 6.978967495219885e-08, + "loss": 0.0023, + "reward": 1.5610953569412231, + "reward_std": 0.1257803738117218, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.46734535694122314, + "step": 2919 + }, + { + "completion_length": 245.546875, + "epoch": 0.9305289993626513, + "grad_norm": 14.03441047668457, + "kl": 0.099609375, + "learning_rate": 6.947100063734862e-08, + "loss": 0.004, + "reward": 1.5305681228637695, + "reward_std": 0.06954821199178696, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5305681228637695, + "rewards/pad": 0.0, + "step": 2920 + }, + { + "completion_length": 312.1875, + "epoch": 0.9308476736775015, + "grad_norm": 10.062077522277832, + "kl": 0.0732421875, + "learning_rate": 6.91523263224984e-08, + "loss": 0.0029, + "reward": 1.5083460807800293, + "reward_std": 0.08206836134195328, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5239709615707397, + "rewards/pad": 0.0, + "step": 2921 + }, + { + "completion_length": 298.671875, + "epoch": 0.9311663479923518, + "grad_norm": 7.746303081512451, + "kl": 0.087890625, + "learning_rate": 6.883365200764818e-08, + "loss": 0.0035, + "reward": 1.5460608005523682, + "reward_std": 0.041064534336328506, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5460608005523682, + "rewards/pad": 0.0, + "step": 2922 + }, + { + "completion_length": 217.46875, + "epoch": 0.9314850223072021, + "grad_norm": 8.386974334716797, + "kl": 0.09130859375, + "learning_rate": 6.851497769279796e-08, + "loss": 0.0036, + "reward": 1.5595431327819824, + "reward_std": 0.07949882000684738, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.43454310297966003, + "rewards/pad": 0.125, + "step": 2923 + }, + { + "completion_length": 346.375, + "epoch": 0.9318036966220523, + "grad_norm": 6.066806316375732, + "kl": 0.076171875, + "learning_rate": 6.819630337794774e-08, + "loss": 0.003, + "reward": 1.5769262313842773, + "reward_std": 0.0969569981098175, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5925512909889221, + "rewards/pad": 0.0, + "step": 2924 + }, + { + "completion_length": 312.3125, + "epoch": 0.9321223709369025, + "grad_norm": 7.170636177062988, + "kl": 0.080078125, + "learning_rate": 6.787762906309751e-08, + "loss": 0.0032, + "reward": 1.492858648300171, + "reward_std": 0.12179407477378845, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5084837079048157, + "step": 2925 + }, + { + "completion_length": 197.78125, + "epoch": 0.9324410452517528, + "grad_norm": 18.992835998535156, + "kl": 0.09423828125, + "learning_rate": 6.755895474824729e-08, + "loss": 0.0038, + "reward": 1.4061367511749268, + "reward_std": 0.09154429286718369, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.42176181077957153, + "step": 2926 + }, + { + "completion_length": 220.6875, + "epoch": 0.932759719566603, + "grad_norm": 23.679418563842773, + "kl": 0.0888671875, + "learning_rate": 6.724028043339707e-08, + "loss": 0.0035, + "reward": 1.6889541149139404, + "reward_std": 0.09088733792304993, + "rewards/answer_reward": 0.171875, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5170789957046509, + "step": 2927 + }, + { + "completion_length": 214.296875, + "epoch": 0.9330783938814532, + "grad_norm": 10.682872772216797, + "kl": 0.076171875, + "learning_rate": 6.692160611854685e-08, + "loss": 0.003, + "reward": 1.7833725214004517, + "reward_std": 0.06558363139629364, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5333724617958069, + "step": 2928 + }, + { + "completion_length": 309.59375, + "epoch": 0.9333970681963034, + "grad_norm": 5.863774299621582, + "kl": 0.064453125, + "learning_rate": 6.660293180369663e-08, + "loss": 0.0026, + "reward": 1.6170300245285034, + "reward_std": 0.06263009458780289, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4920300841331482, + "step": 2929 + }, + { + "completion_length": 307.515625, + "epoch": 0.9337157425111536, + "grad_norm": 34.34066390991211, + "kl": 0.07373046875, + "learning_rate": 6.62842574888464e-08, + "loss": 0.0029, + "reward": 1.4313735961914062, + "reward_std": 0.05077674239873886, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43137362599372864, + "step": 2930 + }, + { + "completion_length": 214.859375, + "epoch": 0.9340344168260039, + "grad_norm": 19.838254928588867, + "kl": 0.1279296875, + "learning_rate": 6.596558317399616e-08, + "loss": 0.0051, + "reward": 1.5669782161712646, + "reward_std": 0.11023472994565964, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.44197824597358704, + "rewards/pad": 0.125, + "step": 2931 + }, + { + "completion_length": 262.6875, + "epoch": 0.9343530911408541, + "grad_norm": 6.702688217163086, + "kl": 0.0703125, + "learning_rate": 6.564690885914594e-08, + "loss": 0.0028, + "reward": 1.4523699283599854, + "reward_std": 0.04695475473999977, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3273698091506958, + "step": 2932 + }, + { + "completion_length": 163.0625, + "epoch": 0.9346717654557043, + "grad_norm": 11.438345909118652, + "kl": 0.0966796875, + "learning_rate": 6.532823454429572e-08, + "loss": 0.0039, + "reward": 1.7000656127929688, + "reward_std": 0.05705934017896652, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5750656127929688, + "step": 2933 + }, + { + "completion_length": 305.65625, + "epoch": 0.9349904397705545, + "grad_norm": 23.206321716308594, + "kl": 0.072265625, + "learning_rate": 6.50095602294455e-08, + "loss": 0.0029, + "reward": 1.7570136785507202, + "reward_std": 0.07544252276420593, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5070135593414307, + "step": 2934 + }, + { + "completion_length": 339.765625, + "epoch": 0.9353091140854047, + "grad_norm": 13.489253997802734, + "kl": 0.07568359375, + "learning_rate": 6.469088591459527e-08, + "loss": 0.003, + "reward": 1.8512710332870483, + "reward_std": 0.09367866814136505, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4762710928916931, + "step": 2935 + }, + { + "completion_length": 256.5, + "epoch": 0.935627788400255, + "grad_norm": 19.88876724243164, + "kl": 0.08837890625, + "learning_rate": 6.437221159974505e-08, + "loss": 0.0035, + "reward": 1.566627860069275, + "reward_std": 0.04243381693959236, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5666278600692749, + "step": 2936 + }, + { + "completion_length": 277.828125, + "epoch": 0.9359464627151052, + "grad_norm": 12.794341087341309, + "kl": 0.076171875, + "learning_rate": 6.405353728489483e-08, + "loss": 0.003, + "reward": 1.5573514699935913, + "reward_std": 0.14613482356071472, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4479764699935913, + "step": 2937 + }, + { + "completion_length": 164.578125, + "epoch": 0.9362651370299554, + "grad_norm": 10.677695274353027, + "kl": 0.1064453125, + "learning_rate": 6.373486297004461e-08, + "loss": 0.0043, + "reward": 1.8390767574310303, + "reward_std": 0.07184389978647232, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5890767574310303, + "rewards/pad": 0.25, + "step": 2938 + }, + { + "completion_length": 268.046875, + "epoch": 0.9365838113448056, + "grad_norm": 9.086060523986816, + "kl": 0.08837890625, + "learning_rate": 6.34161886551944e-08, + "loss": 0.0035, + "reward": 1.5117340087890625, + "reward_std": 0.04868802800774574, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5117339491844177, + "rewards/pad": 0.0, + "step": 2939 + }, + { + "completion_length": 349.109375, + "epoch": 0.9369024856596558, + "grad_norm": 16.004676818847656, + "kl": 0.054931640625, + "learning_rate": 6.309751434034416e-08, + "loss": 0.0022, + "reward": 1.4877666234970093, + "reward_std": 0.10770290344953537, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3783915936946869, + "rewards/pad": 0.125, + "step": 2940 + }, + { + "completion_length": 371.609375, + "epoch": 0.9372211599745061, + "grad_norm": 29.801067352294922, + "kl": 0.055908203125, + "learning_rate": 6.277884002549394e-08, + "loss": 0.0022, + "reward": 1.3484127521514893, + "reward_std": 0.16493690013885498, + "rewards/answer_reward": 0.015625, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3484126329421997, + "step": 2941 + }, + { + "completion_length": 183.03125, + "epoch": 0.9375398342893563, + "grad_norm": 15.473187446594238, + "kl": 0.09375, + "learning_rate": 6.246016571064372e-08, + "loss": 0.0038, + "reward": 1.7514030933380127, + "reward_std": 0.13864727318286896, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5326529741287231, + "rewards/pad": 0.21875, + "step": 2942 + }, + { + "completion_length": 221.15625, + "epoch": 0.9378585086042065, + "grad_norm": 35.03718948364258, + "kl": 0.0888671875, + "learning_rate": 6.21414913957935e-08, + "loss": 0.0036, + "reward": 1.5468283891677856, + "reward_std": 0.1377798318862915, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5624533891677856, + "rewards/pad": 0.0, + "step": 2943 + }, + { + "completion_length": 279.015625, + "epoch": 0.9381771829190567, + "grad_norm": 10.888705253601074, + "kl": 0.068359375, + "learning_rate": 6.182281708094327e-08, + "loss": 0.0027, + "reward": 1.8214476108551025, + "reward_std": 0.06730944663286209, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5714476108551025, + "step": 2944 + }, + { + "completion_length": 263.1875, + "epoch": 0.9384958572339069, + "grad_norm": 6.947081089019775, + "kl": 0.080078125, + "learning_rate": 6.150414276609305e-08, + "loss": 0.0032, + "reward": 1.4168049097061157, + "reward_std": 0.15775421261787415, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4480549395084381, + "step": 2945 + }, + { + "completion_length": 254.953125, + "epoch": 0.9388145315487572, + "grad_norm": 16.894763946533203, + "kl": 0.087890625, + "learning_rate": 6.118546845124282e-08, + "loss": 0.0035, + "reward": 1.4989348649978638, + "reward_std": 0.12767580151557922, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.514559805393219, + "step": 2946 + }, + { + "completion_length": 293.296875, + "epoch": 0.9391332058636074, + "grad_norm": 18.079181671142578, + "kl": 0.0732421875, + "learning_rate": 6.08667941363926e-08, + "loss": 0.0029, + "reward": 1.4271761178970337, + "reward_std": 0.029198169708251953, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4271760880947113, + "step": 2947 + }, + { + "completion_length": 303.421875, + "epoch": 0.9394518801784576, + "grad_norm": 9.24788761138916, + "kl": 0.0771484375, + "learning_rate": 6.054811982154238e-08, + "loss": 0.0031, + "reward": 1.5654431581497192, + "reward_std": 0.07180614024400711, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5654430985450745, + "step": 2948 + }, + { + "completion_length": 252.984375, + "epoch": 0.9397705544933078, + "grad_norm": 12.085131645202637, + "kl": 0.09375, + "learning_rate": 6.022944550669216e-08, + "loss": 0.0038, + "reward": 1.5537967681884766, + "reward_std": 0.06366712599992752, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5537967681884766, + "step": 2949 + }, + { + "completion_length": 309.9375, + "epoch": 0.940089228808158, + "grad_norm": 5.42282772064209, + "kl": 0.06640625, + "learning_rate": 5.991077119184194e-08, + "loss": 0.0026, + "reward": 1.6361989974975586, + "reward_std": 0.04752662777900696, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3861989378929138, + "step": 2950 + }, + { + "completion_length": 304.859375, + "epoch": 0.9404079031230083, + "grad_norm": 21.39364242553711, + "kl": 0.0908203125, + "learning_rate": 5.959209687699171e-08, + "loss": 0.0036, + "reward": 1.4632328748703003, + "reward_std": 0.1383098065853119, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4788578748703003, + "step": 2951 + }, + { + "completion_length": 266.921875, + "epoch": 0.9407265774378585, + "grad_norm": 17.54840850830078, + "kl": 0.08203125, + "learning_rate": 5.927342256214149e-08, + "loss": 0.0033, + "reward": 1.537185788154602, + "reward_std": 0.10543005913496017, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4121857285499573, + "rewards/pad": 0.125, + "step": 2952 + }, + { + "completion_length": 164.421875, + "epoch": 0.9410452517527087, + "grad_norm": 7.701272010803223, + "kl": 0.095703125, + "learning_rate": 5.895474824729126e-08, + "loss": 0.0038, + "reward": 1.5821001529693604, + "reward_std": 0.09362144768238068, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45710012316703796, + "rewards/pad": 0.125, + "step": 2953 + }, + { + "completion_length": 335.703125, + "epoch": 0.9413639260675589, + "grad_norm": 8.84483528137207, + "kl": 0.06787109375, + "learning_rate": 5.863607393244104e-08, + "loss": 0.0027, + "reward": 1.5164875984191895, + "reward_std": 0.03204556554555893, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5164875984191895, + "step": 2954 + }, + { + "completion_length": 288.28125, + "epoch": 0.9416826003824091, + "grad_norm": 5.423102378845215, + "kl": 0.06591796875, + "learning_rate": 5.831739961759082e-08, + "loss": 0.0026, + "reward": 1.6633018255233765, + "reward_std": 0.11971494555473328, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4758017361164093, + "rewards/pad": 0.1875, + "step": 2955 + }, + { + "completion_length": 228.59375, + "epoch": 0.9420012746972594, + "grad_norm": 9.51950454711914, + "kl": 0.08056640625, + "learning_rate": 5.79987253027406e-08, + "loss": 0.0032, + "reward": 1.7245519161224365, + "reward_std": 0.07877679169178009, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4745519757270813, + "step": 2956 + }, + { + "completion_length": 242.96875, + "epoch": 0.9423199490121096, + "grad_norm": 16.62901496887207, + "kl": 0.07080078125, + "learning_rate": 5.768005098789038e-08, + "loss": 0.0028, + "reward": 1.5734515190124512, + "reward_std": 0.0714656189084053, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5734515190124512, + "step": 2957 + }, + { + "completion_length": 167.28125, + "epoch": 0.9426386233269598, + "grad_norm": 27.43533706665039, + "kl": 0.10888671875, + "learning_rate": 5.7361376673040145e-08, + "loss": 0.0044, + "reward": 1.538001537322998, + "reward_std": 0.12343016266822815, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.538001537322998, + "rewards/pad": 0.0, + "step": 2958 + }, + { + "completion_length": 211.296875, + "epoch": 0.94295729764181, + "grad_norm": 16.95347785949707, + "kl": 0.08544921875, + "learning_rate": 5.7042702358189925e-08, + "loss": 0.0034, + "reward": 1.5702875852584839, + "reward_std": 0.056008193641901016, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4452875554561615, + "rewards/pad": 0.125, + "step": 2959 + }, + { + "completion_length": 198.75, + "epoch": 0.9432759719566602, + "grad_norm": 8.831547737121582, + "kl": 0.1005859375, + "learning_rate": 5.6724028043339706e-08, + "loss": 0.004, + "reward": 1.3954182863235474, + "reward_std": 0.09478885680437088, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3954182267189026, + "rewards/pad": 0.0, + "step": 2960 + }, + { + "completion_length": 243.953125, + "epoch": 0.9435946462715105, + "grad_norm": 10.098033905029297, + "kl": 0.07568359375, + "learning_rate": 5.640535372848948e-08, + "loss": 0.003, + "reward": 1.6640007495880127, + "reward_std": 0.05323734134435654, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5390008687973022, + "step": 2961 + }, + { + "completion_length": 207.359375, + "epoch": 0.9439133205863608, + "grad_norm": 21.087514877319336, + "kl": 0.08447265625, + "learning_rate": 5.608667941363926e-08, + "loss": 0.0034, + "reward": 1.7884836196899414, + "reward_std": 0.20367951691150665, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5541085600852966, + "rewards/pad": 0.234375, + "step": 2962 + }, + { + "completion_length": 246.984375, + "epoch": 0.944231994901211, + "grad_norm": 11.247840881347656, + "kl": 0.08984375, + "learning_rate": 5.5768005098789034e-08, + "loss": 0.0036, + "reward": 1.6184732913970947, + "reward_std": 0.1049545407295227, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49347323179244995, + "rewards/pad": 0.125, + "step": 2963 + }, + { + "completion_length": 370.34375, + "epoch": 0.9445506692160612, + "grad_norm": 8.138402938842773, + "kl": 0.056884765625, + "learning_rate": 5.5449330783938815e-08, + "loss": 0.0023, + "reward": 1.4136632680892944, + "reward_std": 0.10494749248027802, + "rewards/pad": 0.078125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.33553823828697205, + "step": 2964 + }, + { + "completion_length": 167.796875, + "epoch": 0.9448693435309115, + "grad_norm": 67.82130432128906, + "kl": 0.11962890625, + "learning_rate": 5.5130656469088595e-08, + "loss": 0.0048, + "reward": 1.7002036571502686, + "reward_std": 0.17276358604431152, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.46582862734794617, + "step": 2965 + }, + { + "completion_length": 315.9375, + "epoch": 0.9451880178457617, + "grad_norm": 8.57858943939209, + "kl": 0.087890625, + "learning_rate": 5.481198215423836e-08, + "loss": 0.0035, + "reward": 1.7237110137939453, + "reward_std": 0.06048474460840225, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4737110137939453, + "rewards/pad": 0.25, + "step": 2966 + }, + { + "completion_length": 243.09375, + "epoch": 0.9455066921606119, + "grad_norm": 9.491878509521484, + "kl": 0.08203125, + "learning_rate": 5.449330783938814e-08, + "loss": 0.0033, + "reward": 1.6464054584503174, + "reward_std": 0.12060653418302536, + "rewards/pad": 0.109375, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5526554584503174, + "step": 2967 + }, + { + "completion_length": 252.5, + "epoch": 0.9458253664754621, + "grad_norm": 15.32401180267334, + "kl": 0.072265625, + "learning_rate": 5.417463352453792e-08, + "loss": 0.0029, + "reward": 1.6297492980957031, + "reward_std": 0.11908349394798279, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.41099923849105835, + "rewards/pad": 0.234375, + "step": 2968 + }, + { + "completion_length": 274.90625, + "epoch": 0.9461440407903123, + "grad_norm": 6.705809593200684, + "kl": 0.078125, + "learning_rate": 5.38559592096877e-08, + "loss": 0.0031, + "reward": 1.565004587173462, + "reward_std": 0.0942590981721878, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45562952756881714, + "rewards/pad": 0.125, + "step": 2969 + }, + { + "completion_length": 210.984375, + "epoch": 0.9464627151051626, + "grad_norm": 19.868122100830078, + "kl": 0.08984375, + "learning_rate": 5.353728489483748e-08, + "loss": 0.0036, + "reward": 1.6228530406951904, + "reward_std": 0.06939147412776947, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49785304069519043, + "rewards/pad": 0.125, + "step": 2970 + }, + { + "completion_length": 212.421875, + "epoch": 0.9467813894200128, + "grad_norm": 11.730999946594238, + "kl": 0.09228515625, + "learning_rate": 5.321861057998725e-08, + "loss": 0.0037, + "reward": 1.6018955707550049, + "reward_std": 0.05306608974933624, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.47689566016197205, + "rewards/pad": 0.125, + "step": 2971 + }, + { + "completion_length": 216.578125, + "epoch": 0.947100063734863, + "grad_norm": 10.840819358825684, + "kl": 0.10400390625, + "learning_rate": 5.289993626513703e-08, + "loss": 0.0042, + "reward": 1.5596673488616943, + "reward_std": 0.1178320050239563, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4659172594547272, + "rewards/pad": 0.09375, + "step": 2972 + }, + { + "completion_length": 258.984375, + "epoch": 0.9474187380497132, + "grad_norm": 21.22210693359375, + "kl": 0.07177734375, + "learning_rate": 5.25812619502868e-08, + "loss": 0.0029, + "reward": 1.6138757467269897, + "reward_std": 0.062143437564373016, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.48887574672698975, + "step": 2973 + }, + { + "completion_length": 399.984375, + "epoch": 0.9477374123645634, + "grad_norm": 11.115797996520996, + "kl": 0.05224609375, + "learning_rate": 5.226258763543658e-08, + "loss": 0.0021, + "reward": 1.3853999376296997, + "reward_std": 0.14694945514202118, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.4166499674320221, + "step": 2974 + }, + { + "completion_length": 325.015625, + "epoch": 0.9480560866794137, + "grad_norm": 140.2318115234375, + "kl": 0.061279296875, + "learning_rate": 5.1943913320586354e-08, + "loss": 0.0024, + "reward": 1.497594952583313, + "reward_std": 0.09182921797037125, + "rewards/pad": 0.03125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.466344952583313, + "step": 2975 + }, + { + "completion_length": 112.921875, + "epoch": 0.9483747609942639, + "grad_norm": 14.602428436279297, + "kl": 0.1201171875, + "learning_rate": 5.1625239005736134e-08, + "loss": 0.0048, + "reward": 1.7921075820922852, + "reward_std": 0.12751063704490662, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5577325820922852, + "rewards/pad": 0.234375, + "step": 2976 + }, + { + "completion_length": 180.671875, + "epoch": 0.9486934353091141, + "grad_norm": 28.626598358154297, + "kl": 0.0888671875, + "learning_rate": 5.1306564690885915e-08, + "loss": 0.0036, + "reward": 1.3622570037841797, + "reward_std": 0.0660131573677063, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.23725706338882446, + "step": 2977 + }, + { + "completion_length": 260.421875, + "epoch": 0.9490121096239643, + "grad_norm": 8.978132247924805, + "kl": 0.125, + "learning_rate": 5.098789037603569e-08, + "loss": 0.005, + "reward": 1.5364545583724976, + "reward_std": 0.12177839875221252, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.552079439163208, + "rewards/pad": 0.0, + "step": 2978 + }, + { + "completion_length": 203.125, + "epoch": 0.9493307839388145, + "grad_norm": 12.183414459228516, + "kl": 0.10107421875, + "learning_rate": 5.066921606118547e-08, + "loss": 0.004, + "reward": 1.5364055633544922, + "reward_std": 0.11670367419719696, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5520305037498474, + "rewards/pad": 0.0, + "step": 2979 + }, + { + "completion_length": 331.40625, + "epoch": 0.9496494582536648, + "grad_norm": 21.630046844482422, + "kl": 0.056640625, + "learning_rate": 5.035054174633524e-08, + "loss": 0.0023, + "reward": 1.5015981197357178, + "reward_std": 0.14297401905059814, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4547232687473297, + "rewards/pad": 0.0625, + "step": 2980 + }, + { + "completion_length": 311.828125, + "epoch": 0.949968132568515, + "grad_norm": 10.797045707702637, + "kl": 0.09423828125, + "learning_rate": 5.003186743148502e-08, + "loss": 0.0038, + "reward": 1.3974107503890991, + "reward_std": 0.04455138370394707, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.27241069078445435, + "step": 2981 + }, + { + "completion_length": 265.359375, + "epoch": 0.9502868068833652, + "grad_norm": 12.558472633361816, + "kl": 0.07763671875, + "learning_rate": 4.97131931166348e-08, + "loss": 0.0031, + "reward": 1.7381936311721802, + "reward_std": 0.13941644132137299, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.5194436311721802, + "step": 2982 + }, + { + "completion_length": 150.96875, + "epoch": 0.9506054811982154, + "grad_norm": 36.841224670410156, + "kl": 0.1220703125, + "learning_rate": 4.939451880178457e-08, + "loss": 0.0049, + "reward": 1.7243318557739258, + "reward_std": 0.10196399688720703, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.599331796169281, + "rewards/pad": 0.125, + "step": 2983 + }, + { + "completion_length": 398.90625, + "epoch": 0.9509241555130656, + "grad_norm": 5.658376216888428, + "kl": 0.047119140625, + "learning_rate": 4.907584448693435e-08, + "loss": 0.0019, + "reward": 1.5583215951919556, + "reward_std": 0.07376955449581146, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.44894659519195557, + "step": 2984 + }, + { + "completion_length": 262.96875, + "epoch": 0.9512428298279159, + "grad_norm": 14.046929359436035, + "kl": 0.0712890625, + "learning_rate": 4.8757170172084126e-08, + "loss": 0.0029, + "reward": 1.5000898838043213, + "reward_std": 0.12163540720939636, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.3907149136066437, + "rewards/pad": 0.125, + "step": 2985 + }, + { + "completion_length": 149.234375, + "epoch": 0.9515615041427661, + "grad_norm": 11.515385627746582, + "kl": 0.1005859375, + "learning_rate": 4.8438495857233906e-08, + "loss": 0.004, + "reward": 1.8945035934448242, + "reward_std": 0.11986218392848969, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6445035934448242, + "rewards/pad": 0.25, + "step": 2986 + }, + { + "completion_length": 211.65625, + "epoch": 0.9518801784576163, + "grad_norm": 63.49930953979492, + "kl": 0.08837890625, + "learning_rate": 4.811982154238369e-08, + "loss": 0.0035, + "reward": 1.7222895622253418, + "reward_std": 0.07968949526548386, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5972896814346313, + "rewards/pad": 0.125, + "step": 2987 + }, + { + "completion_length": 252.140625, + "epoch": 0.9521988527724665, + "grad_norm": 9.624911308288574, + "kl": 0.0966796875, + "learning_rate": 4.7801147227533454e-08, + "loss": 0.0039, + "reward": 1.764341950416565, + "reward_std": 0.15120723843574524, + "rewards/pad": 0.171875, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5924668908119202, + "step": 2988 + }, + { + "completion_length": 277.484375, + "epoch": 0.9525175270873167, + "grad_norm": 7.495218276977539, + "kl": 0.07861328125, + "learning_rate": 4.7482472912683235e-08, + "loss": 0.0032, + "reward": 1.6148617267608643, + "reward_std": 0.08406472206115723, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.48986178636550903, + "rewards/pad": 0.125, + "step": 2989 + }, + { + "completion_length": 398.421875, + "epoch": 0.952836201402167, + "grad_norm": 10.273070335388184, + "kl": 0.060546875, + "learning_rate": 4.716379859783301e-08, + "loss": 0.0024, + "reward": 1.4720619916915894, + "reward_std": 0.03158481419086456, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.47206199169158936, + "step": 2990 + }, + { + "completion_length": 337.234375, + "epoch": 0.9531548757170172, + "grad_norm": 8.842098236083984, + "kl": 0.0517578125, + "learning_rate": 4.684512428298279e-08, + "loss": 0.0021, + "reward": 1.4910411834716797, + "reward_std": 0.05693596601486206, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49104124307632446, + "step": 2991 + }, + { + "completion_length": 268.4375, + "epoch": 0.9534735500318674, + "grad_norm": 7.0008745193481445, + "kl": 0.08837890625, + "learning_rate": 4.652644996813257e-08, + "loss": 0.0035, + "reward": 1.5434587001800537, + "reward_std": 0.05424448847770691, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5434587001800537, + "step": 2992 + }, + { + "completion_length": 112.6875, + "epoch": 0.9537922243467176, + "grad_norm": 19.789148330688477, + "kl": 0.09619140625, + "learning_rate": 4.6207775653282343e-08, + "loss": 0.0038, + "reward": 2.011953830718994, + "reward_std": 0.14079678058624268, + "rewards/answer_reward": 0.359375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6525790095329285, + "step": 2993 + }, + { + "completion_length": 262.640625, + "epoch": 0.9541108986615678, + "grad_norm": 8.151646614074707, + "kl": 0.08154296875, + "learning_rate": 4.5889101338432124e-08, + "loss": 0.0033, + "reward": 1.683987021446228, + "reward_std": 0.15515395998954773, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.574612021446228, + "rewards/pad": 0.109375, + "step": 2994 + }, + { + "completion_length": 295.46875, + "epoch": 0.9544295729764181, + "grad_norm": 12.131200790405273, + "kl": 0.083984375, + "learning_rate": 4.55704270235819e-08, + "loss": 0.0034, + "reward": 1.648716926574707, + "reward_std": 0.14547915756702423, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.539341926574707, + "step": 2995 + }, + { + "completion_length": 228.140625, + "epoch": 0.9547482472912683, + "grad_norm": 14.250265121459961, + "kl": 0.07958984375, + "learning_rate": 4.525175270873167e-08, + "loss": 0.0032, + "reward": 1.8075170516967773, + "reward_std": 0.18245282769203186, + "rewards/answer_reward": 0.4375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.37001708149909973, + "step": 2996 + }, + { + "completion_length": 321.1875, + "epoch": 0.9550669216061185, + "grad_norm": 5.624185085296631, + "kl": 0.0712890625, + "learning_rate": 4.493307839388145e-08, + "loss": 0.0029, + "reward": 1.5331188440322876, + "reward_std": 0.06479167938232422, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40811875462532043, + "rewards/pad": 0.125, + "step": 2997 + }, + { + "completion_length": 195.109375, + "epoch": 0.9553855959209687, + "grad_norm": 14.505610466003418, + "kl": 0.107421875, + "learning_rate": 4.4614404079031226e-08, + "loss": 0.0043, + "reward": 1.5591657161712646, + "reward_std": 0.06373967230319977, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5591658353805542, + "step": 2998 + }, + { + "completion_length": 223.515625, + "epoch": 0.9557042702358189, + "grad_norm": 13.931330680847168, + "kl": 0.091796875, + "learning_rate": 4.4295729764181007e-08, + "loss": 0.0037, + "reward": 1.6169328689575195, + "reward_std": 0.04845960810780525, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6169329285621643, + "rewards/pad": 0.0, + "step": 2999 + }, + { + "completion_length": 316.390625, + "epoch": 0.9560229445506692, + "grad_norm": 16.772754669189453, + "kl": 0.0625, + "learning_rate": 4.397705544933078e-08, + "loss": 0.0025, + "reward": 1.7339085340499878, + "reward_std": 0.09912504255771637, + "rewards/pad": 0.234375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4995335340499878, + "step": 3000 + }, + { + "completion_length": 300.765625, + "epoch": 0.9563416188655195, + "grad_norm": 7.473287582397461, + "kl": 0.07958984375, + "learning_rate": 4.365838113448056e-08, + "loss": 0.0032, + "reward": 1.4801604747772217, + "reward_std": 0.04294013977050781, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4801604151725769, + "rewards/pad": 0.0, + "step": 3001 + }, + { + "completion_length": 289.828125, + "epoch": 0.9566602931803697, + "grad_norm": 11.7194242477417, + "kl": 0.06640625, + "learning_rate": 4.3339706819630335e-08, + "loss": 0.0027, + "reward": 1.5888422727584839, + "reward_std": 0.04296314716339111, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4638422429561615, + "rewards/pad": 0.125, + "step": 3002 + }, + { + "completion_length": 325.671875, + "epoch": 0.9569789674952199, + "grad_norm": 28.08829116821289, + "kl": 0.0673828125, + "learning_rate": 4.3021032504780115e-08, + "loss": 0.0027, + "reward": 1.465879201889038, + "reward_std": 0.059772029519081116, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.34087926149368286, + "step": 3003 + }, + { + "completion_length": 211.9375, + "epoch": 0.9572976418100702, + "grad_norm": 12.590349197387695, + "kl": 0.1513671875, + "learning_rate": 4.270235818992989e-08, + "loss": 0.0061, + "reward": 1.5774794816970825, + "reward_std": 0.10665491223335266, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5774794816970825, + "rewards/pad": 0.0, + "step": 3004 + }, + { + "completion_length": 187.5, + "epoch": 0.9576163161249204, + "grad_norm": 16.79472541809082, + "kl": 0.12255859375, + "learning_rate": 4.238368387507966e-08, + "loss": 0.0049, + "reward": 1.6055787801742554, + "reward_std": 0.08426319062709808, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6055787801742554, + "step": 3005 + }, + { + "completion_length": 263.3125, + "epoch": 0.9579349904397706, + "grad_norm": 11.45641803741455, + "kl": 0.0849609375, + "learning_rate": 4.2065009560229444e-08, + "loss": 0.0034, + "reward": 1.622631311416626, + "reward_std": 0.059184275567531586, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.497631311416626, + "rewards/pad": 0.125, + "step": 3006 + }, + { + "completion_length": 196.921875, + "epoch": 0.9582536647546208, + "grad_norm": 5.744304180145264, + "kl": 0.076171875, + "learning_rate": 4.174633524537922e-08, + "loss": 0.003, + "reward": 1.3995598554611206, + "reward_std": 0.15788444876670837, + "rewards/format_reward_tg": 0.96875, + "rewards/iou_timestamp_reward": 0.3058098256587982, + "rewards/pad": 0.125, + "step": 3007 + }, + { + "completion_length": 203.890625, + "epoch": 0.958572339069471, + "grad_norm": 11.544098854064941, + "kl": 0.08154296875, + "learning_rate": 4.1427660930529e-08, + "loss": 0.0033, + "reward": 1.6876447200775146, + "reward_std": 0.07707478106021881, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4376447796821594, + "step": 3008 + }, + { + "completion_length": 241.15625, + "epoch": 0.9588910133843213, + "grad_norm": 11.547026634216309, + "kl": 0.08056640625, + "learning_rate": 4.110898661567878e-08, + "loss": 0.0032, + "reward": 1.5496833324432373, + "reward_std": 0.06832106411457062, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5496833920478821, + "rewards/pad": 0.0, + "step": 3009 + }, + { + "completion_length": 247.8125, + "epoch": 0.9592096876991715, + "grad_norm": 10.342385292053223, + "kl": 0.0751953125, + "learning_rate": 4.079031230082855e-08, + "loss": 0.003, + "reward": 1.7064106464385986, + "reward_std": 0.08886480331420898, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5814106464385986, + "step": 3010 + }, + { + "completion_length": 301.1875, + "epoch": 0.9595283620140217, + "grad_norm": 9.354042053222656, + "kl": 0.09716796875, + "learning_rate": 4.047163798597833e-08, + "loss": 0.0039, + "reward": 1.569082498550415, + "reward_std": 0.09403829276561737, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.584707498550415, + "rewards/pad": 0.0, + "step": 3011 + }, + { + "completion_length": 207.21875, + "epoch": 0.9598470363288719, + "grad_norm": 27.846111297607422, + "kl": 0.07421875, + "learning_rate": 4.01529636711281e-08, + "loss": 0.003, + "reward": 1.7822484970092773, + "reward_std": 0.10999485850334167, + "rewards/pad": 0.375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.40724843740463257, + "step": 3012 + }, + { + "completion_length": 151.546875, + "epoch": 0.9601657106437221, + "grad_norm": 13.443730354309082, + "kl": 0.12060546875, + "learning_rate": 3.983428935627788e-08, + "loss": 0.0048, + "reward": 1.720609188079834, + "reward_std": 0.05341852456331253, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.7206092476844788, + "step": 3013 + }, + { + "completion_length": 299.65625, + "epoch": 0.9604843849585724, + "grad_norm": 14.898906707763672, + "kl": 0.0625, + "learning_rate": 3.951561504142766e-08, + "loss": 0.0025, + "reward": 1.4509539604187012, + "reward_std": 0.11394208669662476, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.46657896041870117, + "step": 3014 + }, + { + "completion_length": 360.953125, + "epoch": 0.9608030592734226, + "grad_norm": 7.477005481719971, + "kl": 0.07666015625, + "learning_rate": 3.9196940726577435e-08, + "loss": 0.0031, + "reward": 1.4555785655975342, + "reward_std": 0.11029116809368134, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4712035357952118, + "rewards/pad": 0.0, + "step": 3015 + }, + { + "completion_length": 217.671875, + "epoch": 0.9611217335882728, + "grad_norm": 11.383493423461914, + "kl": 0.0869140625, + "learning_rate": 3.8878266411727215e-08, + "loss": 0.0035, + "reward": 1.6601845026016235, + "reward_std": 0.1018948182463646, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6601845622062683, + "rewards/pad": 0.0, + "step": 3016 + }, + { + "completion_length": 242.90625, + "epoch": 0.961440407903123, + "grad_norm": 10.623103141784668, + "kl": 0.0810546875, + "learning_rate": 3.855959209687699e-08, + "loss": 0.0032, + "reward": 1.6002237796783447, + "reward_std": 0.07772623002529144, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6002237796783447, + "step": 3017 + }, + { + "completion_length": 327.765625, + "epoch": 0.9617590822179732, + "grad_norm": 14.112882614135742, + "kl": 0.064453125, + "learning_rate": 3.824091778202677e-08, + "loss": 0.0026, + "reward": 1.7028043270111084, + "reward_std": 0.19873130321502686, + "rewards/pad": 0.21875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4996793270111084, + "step": 3018 + }, + { + "completion_length": 407.71875, + "epoch": 0.9620777565328235, + "grad_norm": 21.201560974121094, + "kl": 0.05224609375, + "learning_rate": 3.7922243467176544e-08, + "loss": 0.0021, + "reward": 1.4655060768127441, + "reward_std": 0.08513548970222473, + "rewards/answer_reward": 0.109375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.35613104701042175, + "step": 3019 + }, + { + "completion_length": 226.421875, + "epoch": 0.9623964308476737, + "grad_norm": 5.293650150299072, + "kl": 0.07861328125, + "learning_rate": 3.760356915232632e-08, + "loss": 0.0031, + "reward": 1.4824628829956055, + "reward_std": 0.056464605033397675, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.48246294260025024, + "step": 3020 + }, + { + "completion_length": 311.5, + "epoch": 0.9627151051625239, + "grad_norm": 13.04458999633789, + "kl": 0.08251953125, + "learning_rate": 3.72848948374761e-08, + "loss": 0.0033, + "reward": 1.4492220878601074, + "reward_std": 0.04893079772591591, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4492220878601074, + "rewards/pad": 0.0, + "step": 3021 + }, + { + "completion_length": 225.640625, + "epoch": 0.9630337794773741, + "grad_norm": 16.370359420776367, + "kl": 0.1796875, + "learning_rate": 3.696622052262587e-08, + "loss": 0.0072, + "reward": 1.5129005908966064, + "reward_std": 0.10321009904146194, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.38790059089660645, + "step": 3022 + }, + { + "completion_length": 162.375, + "epoch": 0.9633524537922243, + "grad_norm": 17.733623504638672, + "kl": 0.095703125, + "learning_rate": 3.664754620777565e-08, + "loss": 0.0038, + "reward": 1.785008430480957, + "reward_std": 0.07291992753744125, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5350083708763123, + "rewards/pad": 0.25, + "step": 3023 + }, + { + "completion_length": 312.4375, + "epoch": 0.9636711281070746, + "grad_norm": 10.358829498291016, + "kl": 0.06640625, + "learning_rate": 3.632887189292543e-08, + "loss": 0.0027, + "reward": 1.5295699834823608, + "reward_std": 0.1087612509727478, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.29519492387771606, + "rewards/pad": 0.25, + "step": 3024 + }, + { + "completion_length": 313.203125, + "epoch": 0.9639898024219248, + "grad_norm": 10.354582786560059, + "kl": 0.0712890625, + "learning_rate": 3.601019757807521e-08, + "loss": 0.0028, + "reward": 1.6526461839675903, + "reward_std": 0.04885178059339523, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5276461243629456, + "step": 3025 + }, + { + "completion_length": 216.9375, + "epoch": 0.964308476736775, + "grad_norm": 6.7993011474609375, + "kl": 0.0947265625, + "learning_rate": 3.569152326322499e-08, + "loss": 0.0038, + "reward": 1.5121433734893799, + "reward_std": 0.15271823108196259, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4027683734893799, + "rewards/pad": 0.125, + "step": 3026 + }, + { + "completion_length": 323.609375, + "epoch": 0.9646271510516252, + "grad_norm": 6.0091166496276855, + "kl": 0.08935546875, + "learning_rate": 3.5372848948374755e-08, + "loss": 0.0036, + "reward": 1.5815134048461914, + "reward_std": 0.04523976147174835, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4565134644508362, + "step": 3027 + }, + { + "completion_length": 259.109375, + "epoch": 0.9649458253664754, + "grad_norm": 123.76586151123047, + "kl": 0.0859375, + "learning_rate": 3.5054174633524535e-08, + "loss": 0.0034, + "reward": 1.640549898147583, + "reward_std": 0.06378833204507828, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.515549898147583, + "step": 3028 + }, + { + "completion_length": 305.84375, + "epoch": 0.9652644996813257, + "grad_norm": 7.624619483947754, + "kl": 0.08984375, + "learning_rate": 3.473550031867431e-08, + "loss": 0.0036, + "reward": 1.532264232635498, + "reward_std": 0.15833765268325806, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4228892922401428, + "step": 3029 + }, + { + "completion_length": 350.484375, + "epoch": 0.9655831739961759, + "grad_norm": 10.998796463012695, + "kl": 0.07666015625, + "learning_rate": 3.441682600382409e-08, + "loss": 0.0031, + "reward": 1.5304185152053833, + "reward_std": 0.02771463245153427, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4054185152053833, + "rewards/pad": 0.125, + "step": 3030 + }, + { + "completion_length": 186.796875, + "epoch": 0.9659018483110261, + "grad_norm": 18.013072967529297, + "kl": 0.095703125, + "learning_rate": 3.409815168897387e-08, + "loss": 0.0038, + "reward": 1.7294105291366577, + "reward_std": 0.1627049595117569, + "rewards/answer_reward": 0.34375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3856606185436249, + "step": 3031 + }, + { + "completion_length": 245.1875, + "epoch": 0.9662205226258763, + "grad_norm": 21.26409912109375, + "kl": 0.0986328125, + "learning_rate": 3.3779477374123644e-08, + "loss": 0.004, + "reward": 1.552625298500061, + "reward_std": 0.1314411163330078, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.45887523889541626, + "rewards/pad": 0.109375, + "step": 3032 + }, + { + "completion_length": 314.453125, + "epoch": 0.9665391969407265, + "grad_norm": 10.736640930175781, + "kl": 0.09033203125, + "learning_rate": 3.3460803059273424e-08, + "loss": 0.0036, + "reward": 1.3965399265289307, + "reward_std": 0.08710728585720062, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.41216492652893066, + "rewards/pad": 0.0, + "step": 3033 + }, + { + "completion_length": 421.6875, + "epoch": 0.9668578712555768, + "grad_norm": 5.672484397888184, + "kl": 0.06689453125, + "learning_rate": 3.31421287444232e-08, + "loss": 0.0027, + "reward": 1.3580100536346436, + "reward_std": 0.15528865158557892, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.3892601430416107, + "step": 3034 + }, + { + "completion_length": 214.609375, + "epoch": 0.967176545570427, + "grad_norm": 14.969710350036621, + "kl": 0.1025390625, + "learning_rate": 3.282345442957297e-08, + "loss": 0.0041, + "reward": 1.651845932006836, + "reward_std": 0.11346770823001862, + "rewards/answer_reward": 0.203125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.4643459916114807, + "step": 3035 + }, + { + "completion_length": 242.296875, + "epoch": 0.9674952198852772, + "grad_norm": 41.26520919799805, + "kl": 0.08837890625, + "learning_rate": 3.250478011472275e-08, + "loss": 0.0035, + "reward": 1.4151108264923096, + "reward_std": 0.059655025601387024, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.41511082649230957, + "step": 3036 + }, + { + "completion_length": 254.078125, + "epoch": 0.9678138942001274, + "grad_norm": 10.369619369506836, + "kl": 0.08544921875, + "learning_rate": 3.2186105799872527e-08, + "loss": 0.0034, + "reward": 1.386507272720337, + "reward_std": 0.05853161960840225, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.3865073323249817, + "rewards/pad": 0.0, + "step": 3037 + }, + { + "completion_length": 318.75, + "epoch": 0.9681325685149776, + "grad_norm": 11.73049259185791, + "kl": 0.08056640625, + "learning_rate": 3.186743148502231e-08, + "loss": 0.0032, + "reward": 1.4719185829162598, + "reward_std": 0.11228282004594803, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.48754358291625977, + "step": 3038 + }, + { + "completion_length": 196.453125, + "epoch": 0.9684512428298279, + "grad_norm": 31.66900634765625, + "kl": 0.0966796875, + "learning_rate": 3.154875717017208e-08, + "loss": 0.0039, + "reward": 1.5930705070495605, + "reward_std": 0.08366744220256805, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5930704474449158, + "step": 3039 + }, + { + "completion_length": 270.4375, + "epoch": 0.9687699171446782, + "grad_norm": 6.654904365539551, + "kl": 0.09375, + "learning_rate": 3.123008285532186e-08, + "loss": 0.0038, + "reward": 1.749192237854004, + "reward_std": 0.052496008574962616, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6241921782493591, + "step": 3040 + }, + { + "completion_length": 288.640625, + "epoch": 0.9690885914595284, + "grad_norm": 11.12962818145752, + "kl": 0.0869140625, + "learning_rate": 3.0911408540471635e-08, + "loss": 0.0035, + "reward": 1.713568925857544, + "reward_std": 0.13127964735031128, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.604193925857544, + "step": 3041 + }, + { + "completion_length": 217.484375, + "epoch": 0.9694072657743786, + "grad_norm": 13.664684295654297, + "kl": 0.103515625, + "learning_rate": 3.059273422562141e-08, + "loss": 0.0041, + "reward": 1.5261034965515137, + "reward_std": 0.07350903749465942, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5261036157608032, + "step": 3042 + }, + { + "completion_length": 209.25, + "epoch": 0.9697259400892289, + "grad_norm": 11.630436897277832, + "kl": 0.08837890625, + "learning_rate": 3.027405991077119e-08, + "loss": 0.0035, + "reward": 1.5359015464782715, + "reward_std": 0.11306829750537872, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5202765464782715, + "rewards/pad": 0.015625, + "step": 3043 + }, + { + "completion_length": 262.640625, + "epoch": 0.9700446144040791, + "grad_norm": 8.941901206970215, + "kl": 0.087890625, + "learning_rate": 2.995538559592097e-08, + "loss": 0.0035, + "reward": 1.4941205978393555, + "reward_std": 0.07758435606956482, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.49412065744400024, + "step": 3044 + }, + { + "completion_length": 284.078125, + "epoch": 0.9703632887189293, + "grad_norm": 5.911899566650391, + "kl": 0.06591796875, + "learning_rate": 2.9636711281070744e-08, + "loss": 0.0026, + "reward": 1.6351535320281982, + "reward_std": 0.05527539551258087, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5101535320281982, + "step": 3045 + }, + { + "completion_length": 166.40625, + "epoch": 0.9706819630337795, + "grad_norm": 15.447898864746094, + "kl": 0.09765625, + "learning_rate": 2.931803696622052e-08, + "loss": 0.0039, + "reward": 1.826016902923584, + "reward_std": 0.10933683067560196, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5760170221328735, + "rewards/pad": 0.25, + "step": 3046 + }, + { + "completion_length": 258.515625, + "epoch": 0.9710006373486297, + "grad_norm": 10.936978340148926, + "kl": 0.06884765625, + "learning_rate": 2.89993626513703e-08, + "loss": 0.0028, + "reward": 1.5791189670562744, + "reward_std": 0.16111625730991364, + "rewards/pad": 0.046875, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.5478689670562744, + "step": 3047 + }, + { + "completion_length": 157.703125, + "epoch": 0.97131931166348, + "grad_norm": 10.121866226196289, + "kl": 0.12451171875, + "learning_rate": 2.8680688336520072e-08, + "loss": 0.005, + "reward": 1.6469779014587402, + "reward_std": 0.06688619405031204, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5219780206680298, + "rewards/pad": 0.125, + "step": 3048 + }, + { + "completion_length": 265.21875, + "epoch": 0.9716379859783302, + "grad_norm": 12.310712814331055, + "kl": 0.07666015625, + "learning_rate": 2.8362014021669853e-08, + "loss": 0.0031, + "reward": 1.7152507305145264, + "reward_std": 0.13572877645492554, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6058759093284607, + "rewards/pad": 0.109375, + "step": 3049 + }, + { + "completion_length": 254.15625, + "epoch": 0.9719566602931804, + "grad_norm": 12.733508110046387, + "kl": 0.087890625, + "learning_rate": 2.804333970681963e-08, + "loss": 0.0035, + "reward": 1.4946439266204834, + "reward_std": 0.07657530903816223, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4946439862251282, + "rewards/pad": 0.0, + "step": 3050 + }, + { + "completion_length": 233.859375, + "epoch": 0.9722753346080306, + "grad_norm": 27.716503143310547, + "kl": 0.09228515625, + "learning_rate": 2.7724665391969407e-08, + "loss": 0.0037, + "reward": 1.657578945159912, + "reward_std": 0.07279810309410095, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5325790643692017, + "rewards/pad": 0.125, + "step": 3051 + }, + { + "completion_length": 167.03125, + "epoch": 0.9725940089228808, + "grad_norm": 10.394854545593262, + "kl": 0.09716796875, + "learning_rate": 2.740599107711918e-08, + "loss": 0.0039, + "reward": 1.4805113077163696, + "reward_std": 0.06541478633880615, + "rewards/answer_reward": 0.0, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.4805113673210144, + "step": 3052 + }, + { + "completion_length": 294.46875, + "epoch": 0.9729126832377311, + "grad_norm": 10.962387084960938, + "kl": 0.07958984375, + "learning_rate": 2.708731676226896e-08, + "loss": 0.0032, + "reward": 1.5387133359909058, + "reward_std": 0.0850515067577362, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5387133359909058, + "step": 3053 + }, + { + "completion_length": 350.5, + "epoch": 0.9732313575525813, + "grad_norm": 21.669363021850586, + "kl": 0.06591796875, + "learning_rate": 2.676864244741874e-08, + "loss": 0.0026, + "reward": 1.494605302810669, + "reward_std": 0.03655810281634331, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4946053624153137, + "rewards/pad": 0.0, + "step": 3054 + }, + { + "completion_length": 250.296875, + "epoch": 0.9735500318674315, + "grad_norm": 26.71428871154785, + "kl": 0.087890625, + "learning_rate": 2.6449968132568516e-08, + "loss": 0.0035, + "reward": 1.5646378993988037, + "reward_std": 0.06353458762168884, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5646378397941589, + "step": 3055 + }, + { + "completion_length": 207.828125, + "epoch": 0.9738687061822817, + "grad_norm": 7.8366780281066895, + "kl": 0.07373046875, + "learning_rate": 2.613129381771829e-08, + "loss": 0.003, + "reward": 1.8365015983581543, + "reward_std": 0.10568691045045853, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5865016579627991, + "step": 3056 + }, + { + "completion_length": 367.109375, + "epoch": 0.9741873804971319, + "grad_norm": 13.387369155883789, + "kl": 0.0654296875, + "learning_rate": 2.5812619502868067e-08, + "loss": 0.0026, + "reward": 1.498044729232788, + "reward_std": 0.09658186882734299, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.49804478883743286, + "rewards/pad": 0.0, + "step": 3057 + }, + { + "completion_length": 297.328125, + "epoch": 0.9745060548119822, + "grad_norm": 8.86507511138916, + "kl": 0.0859375, + "learning_rate": 2.5493945188017844e-08, + "loss": 0.0034, + "reward": 1.4849259853363037, + "reward_std": 0.0937286987900734, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3599260747432709, + "step": 3058 + }, + { + "completion_length": 233.90625, + "epoch": 0.9748247291268324, + "grad_norm": 14.907334327697754, + "kl": 0.06787109375, + "learning_rate": 2.517527087316762e-08, + "loss": 0.0027, + "reward": 1.9055280685424805, + "reward_std": 0.12614959478378296, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5617780685424805, + "rewards/pad": 0.34375, + "step": 3059 + }, + { + "completion_length": 259.359375, + "epoch": 0.9751434034416826, + "grad_norm": 7.794898509979248, + "kl": 0.08203125, + "learning_rate": 2.48565965583174e-08, + "loss": 0.0033, + "reward": 1.4615026712417603, + "reward_std": 0.10705969482660294, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.47712767124176025, + "step": 3060 + }, + { + "completion_length": 283.703125, + "epoch": 0.9754620777565328, + "grad_norm": 5.05447244644165, + "kl": 0.08740234375, + "learning_rate": 2.4537922243467176e-08, + "loss": 0.0035, + "reward": 1.5107604265213013, + "reward_std": 0.07834285497665405, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5107604265213013, + "rewards/pad": 0.0, + "step": 3061 + }, + { + "completion_length": 251.140625, + "epoch": 0.975780752071383, + "grad_norm": 30.65591049194336, + "kl": 0.08984375, + "learning_rate": 2.4219247928616953e-08, + "loss": 0.0036, + "reward": 1.40012526512146, + "reward_std": 0.03734934702515602, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4001252055168152, + "step": 3062 + }, + { + "completion_length": 236.359375, + "epoch": 0.9760994263862333, + "grad_norm": 5.879389762878418, + "kl": 0.091796875, + "learning_rate": 2.3900573613766727e-08, + "loss": 0.0037, + "reward": 1.4694182872772217, + "reward_std": 0.136412113904953, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.36004331707954407, + "step": 3063 + }, + { + "completion_length": 393.828125, + "epoch": 0.9764181007010835, + "grad_norm": 5.189061164855957, + "kl": 0.052734375, + "learning_rate": 2.3581899298916504e-08, + "loss": 0.0021, + "reward": 1.5653045177459717, + "reward_std": 0.03525523096323013, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5653046369552612, + "step": 3064 + }, + { + "completion_length": 206.09375, + "epoch": 0.9767367750159337, + "grad_norm": 17.348432540893555, + "kl": 0.0869140625, + "learning_rate": 2.3263224984066285e-08, + "loss": 0.0035, + "reward": 1.4528409242630005, + "reward_std": 0.08727478981018066, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46846598386764526, + "rewards/pad": 0.0, + "step": 3065 + }, + { + "completion_length": 207.40625, + "epoch": 0.9770554493307839, + "grad_norm": 23.430965423583984, + "kl": 0.09716796875, + "learning_rate": 2.2944550669216062e-08, + "loss": 0.0039, + "reward": 1.7889633178710938, + "reward_std": 0.06907358765602112, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.663963258266449, + "rewards/pad": 0.125, + "step": 3066 + }, + { + "completion_length": 257.234375, + "epoch": 0.9773741236456341, + "grad_norm": 34.405094146728516, + "kl": 0.0693359375, + "learning_rate": 2.2625876354365836e-08, + "loss": 0.0028, + "reward": 1.8198498487472534, + "reward_std": 0.0726766362786293, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5698498487472534, + "step": 3067 + }, + { + "completion_length": 270.6875, + "epoch": 0.9776927979604844, + "grad_norm": 14.171384811401367, + "kl": 0.08740234375, + "learning_rate": 2.2307202039515613e-08, + "loss": 0.0035, + "reward": 1.4805707931518555, + "reward_std": 0.15138691663742065, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.41807082295417786, + "rewards/pad": 0.078125, + "step": 3068 + }, + { + "completion_length": 296.125, + "epoch": 0.9780114722753346, + "grad_norm": 7.556200981140137, + "kl": 0.07080078125, + "learning_rate": 2.198852772466539e-08, + "loss": 0.0028, + "reward": 1.5037003755569458, + "reward_std": 0.07935502380132675, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.503700315952301, + "step": 3069 + }, + { + "completion_length": 268.703125, + "epoch": 0.9783301465901848, + "grad_norm": 9.175628662109375, + "kl": 0.08447265625, + "learning_rate": 2.1669853409815167e-08, + "loss": 0.0034, + "reward": 1.480644941329956, + "reward_std": 0.10172773152589798, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4962700605392456, + "step": 3070 + }, + { + "completion_length": 221.421875, + "epoch": 0.978648820905035, + "grad_norm": 12.232224464416504, + "kl": 0.09228515625, + "learning_rate": 2.1351179094964945e-08, + "loss": 0.0037, + "reward": 1.790949821472168, + "reward_std": 0.10301777720451355, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5409497618675232, + "rewards/pad": 0.25, + "step": 3071 + }, + { + "completion_length": 209.25, + "epoch": 0.9789674952198852, + "grad_norm": 19.196441650390625, + "kl": 0.08984375, + "learning_rate": 2.1032504780114722e-08, + "loss": 0.0036, + "reward": 1.5424752235412598, + "reward_std": 0.11825200170278549, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5581002831459045, + "rewards/pad": 0.0, + "step": 3072 + }, + { + "completion_length": 257.734375, + "epoch": 0.9792861695347355, + "grad_norm": 20.04810333251953, + "kl": 0.0791015625, + "learning_rate": 2.07138304652645e-08, + "loss": 0.0032, + "reward": 1.635162591934204, + "reward_std": 0.05953432619571686, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5101625919342041, + "step": 3073 + }, + { + "completion_length": 264.4375, + "epoch": 0.9796048438495857, + "grad_norm": 10.390314102172852, + "kl": 0.078125, + "learning_rate": 2.0395156150414276e-08, + "loss": 0.0031, + "reward": 1.5732051134109497, + "reward_std": 0.20447908341884613, + "rewards/pad": 0.15625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.4325801134109497, + "step": 3074 + }, + { + "completion_length": 303.15625, + "epoch": 0.9799235181644359, + "grad_norm": 9.053024291992188, + "kl": 0.07421875, + "learning_rate": 2.007648183556405e-08, + "loss": 0.003, + "reward": 1.5527631044387817, + "reward_std": 0.06574830412864685, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.552763044834137, + "step": 3075 + }, + { + "completion_length": 198.3125, + "epoch": 0.9802421924792861, + "grad_norm": 16.09470558166504, + "kl": 0.09912109375, + "learning_rate": 1.975780752071383e-08, + "loss": 0.004, + "reward": 1.5192358493804932, + "reward_std": 0.08530691266059875, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5192359089851379, + "rewards/pad": 0.0, + "step": 3076 + }, + { + "completion_length": 224.71875, + "epoch": 0.9805608667941363, + "grad_norm": 5.323574066162109, + "kl": 0.07421875, + "learning_rate": 1.9439133205863608e-08, + "loss": 0.003, + "reward": 1.782647967338562, + "reward_std": 0.07638280838727951, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6576479077339172, + "step": 3077 + }, + { + "completion_length": 124.421875, + "epoch": 0.9808795411089866, + "grad_norm": 51.17085266113281, + "kl": 0.11328125, + "learning_rate": 1.9120458891013385e-08, + "loss": 0.0045, + "reward": 1.8514604568481445, + "reward_std": 0.18483218550682068, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5077104568481445, + "rewards/pad": 0.359375, + "step": 3078 + }, + { + "completion_length": 324.921875, + "epoch": 0.9811982154238368, + "grad_norm": 9.296708106994629, + "kl": 0.060791015625, + "learning_rate": 1.880178457616316e-08, + "loss": 0.0024, + "reward": 1.5254148244857788, + "reward_std": 0.07078447937965393, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4004148840904236, + "rewards/pad": 0.125, + "step": 3079 + }, + { + "completion_length": 314.078125, + "epoch": 0.9815168897386871, + "grad_norm": 10.421858787536621, + "kl": 0.08349609375, + "learning_rate": 1.8483110261312936e-08, + "loss": 0.0033, + "reward": 1.4044888019561768, + "reward_std": 0.0950625091791153, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.42011380195617676, + "rewards/pad": 0.0, + "step": 3080 + }, + { + "completion_length": 165.375, + "epoch": 0.9818355640535373, + "grad_norm": 13.181941986083984, + "kl": 0.10986328125, + "learning_rate": 1.8164435946462717e-08, + "loss": 0.0044, + "reward": 1.4772001504898071, + "reward_std": 0.09890478849411011, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.32095012068748474, + "rewards/pad": 0.15625, + "step": 3081 + }, + { + "completion_length": 172.53125, + "epoch": 0.9821542383683876, + "grad_norm": 27.367578506469727, + "kl": 0.10888671875, + "learning_rate": 1.7845761631612494e-08, + "loss": 0.0044, + "reward": 1.5015852451324463, + "reward_std": 0.055210791528224945, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5015853643417358, + "rewards/pad": 0.0, + "step": 3082 + }, + { + "completion_length": 237.671875, + "epoch": 0.9824729126832378, + "grad_norm": 8.945556640625, + "kl": 0.083984375, + "learning_rate": 1.7527087316762268e-08, + "loss": 0.0034, + "reward": 1.4884425401687622, + "reward_std": 0.08792269229888916, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4884425401687622, + "step": 3083 + }, + { + "completion_length": 188.234375, + "epoch": 0.982791586998088, + "grad_norm": 8.917075157165527, + "kl": 0.10107421875, + "learning_rate": 1.7208413001912045e-08, + "loss": 0.004, + "reward": 1.3387261629104614, + "reward_std": 0.056616295129060745, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3387261927127838, + "step": 3084 + }, + { + "completion_length": 215.625, + "epoch": 0.9831102613129382, + "grad_norm": 20.064624786376953, + "kl": 0.1142578125, + "learning_rate": 1.6889738687061822e-08, + "loss": 0.0046, + "reward": 1.5038783550262451, + "reward_std": 0.1603861153125763, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5195032954216003, + "rewards/pad": 0.0, + "step": 3085 + }, + { + "completion_length": 381.375, + "epoch": 0.9834289356277884, + "grad_norm": 15.217743873596191, + "kl": 0.056640625, + "learning_rate": 1.65710643722116e-08, + "loss": 0.0023, + "reward": 1.4081106185913086, + "reward_std": 0.0775367021560669, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.40811046957969666, + "rewards/pad": 0.0, + "step": 3086 + }, + { + "completion_length": 238.875, + "epoch": 0.9837476099426387, + "grad_norm": 6.312625885009766, + "kl": 0.06591796875, + "learning_rate": 1.6252390057361376e-08, + "loss": 0.0026, + "reward": 1.6966602802276611, + "reward_std": 0.08635349571704865, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4466603398323059, + "step": 3087 + }, + { + "completion_length": 270.75, + "epoch": 0.9840662842574889, + "grad_norm": 31.017301559448242, + "kl": 0.08203125, + "learning_rate": 1.5933715742511154e-08, + "loss": 0.0033, + "reward": 1.5948489904403687, + "reward_std": 0.10828594118356705, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.46984899044036865, + "step": 3088 + }, + { + "completion_length": 261.640625, + "epoch": 0.9843849585723391, + "grad_norm": 9.238334655761719, + "kl": 0.07763671875, + "learning_rate": 1.561504142766093e-08, + "loss": 0.0031, + "reward": 1.5603516101837158, + "reward_std": 0.08074082434177399, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5603516101837158, + "rewards/pad": 0.0, + "step": 3089 + }, + { + "completion_length": 253.40625, + "epoch": 0.9847036328871893, + "grad_norm": 10.491616249084473, + "kl": 0.0869140625, + "learning_rate": 1.5296367112810705e-08, + "loss": 0.0035, + "reward": 1.5297356843948364, + "reward_std": 0.07021459937095642, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5297356843948364, + "step": 3090 + }, + { + "completion_length": 344.921875, + "epoch": 0.9850223072020395, + "grad_norm": 10.11392879486084, + "kl": 0.107421875, + "learning_rate": 1.4977692797960485e-08, + "loss": 0.0043, + "reward": 1.5372244119644165, + "reward_std": 0.07054002583026886, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5372244119644165, + "step": 3091 + }, + { + "completion_length": 331.859375, + "epoch": 0.9853409815168898, + "grad_norm": 6.936378002166748, + "kl": 0.048583984375, + "learning_rate": 1.465901848311026e-08, + "loss": 0.0019, + "reward": 1.771822214126587, + "reward_std": 0.057387061417102814, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5218220949172974, + "step": 3092 + }, + { + "completion_length": 309.125, + "epoch": 0.98565965583174, + "grad_norm": 9.3517484664917, + "kl": 0.09375, + "learning_rate": 1.4340344168260036e-08, + "loss": 0.0037, + "reward": 1.5548007488250732, + "reward_std": 0.12453365325927734, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5704257488250732, + "rewards/pad": 0.0, + "step": 3093 + }, + { + "completion_length": 255.140625, + "epoch": 0.9859783301465902, + "grad_norm": 20.259626388549805, + "kl": 0.09716796875, + "learning_rate": 1.4021669853409815e-08, + "loss": 0.0039, + "reward": 1.5752711296081543, + "reward_std": 0.1396198868751526, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4815211296081543, + "rewards/pad": 0.09375, + "step": 3094 + }, + { + "completion_length": 330.953125, + "epoch": 0.9862970044614404, + "grad_norm": 52.81623840332031, + "kl": 0.054931640625, + "learning_rate": 1.370299553855959e-08, + "loss": 0.0022, + "reward": 1.4935202598571777, + "reward_std": 0.06092238426208496, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.36852020025253296, + "step": 3095 + }, + { + "completion_length": 441.265625, + "epoch": 0.9866156787762906, + "grad_norm": 8.006575584411621, + "kl": 0.043701171875, + "learning_rate": 1.338432122370937e-08, + "loss": 0.0017, + "reward": 1.448917031288147, + "reward_std": 0.0884983167052269, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.43329206109046936, + "step": 3096 + }, + { + "completion_length": 317.15625, + "epoch": 0.9869343530911409, + "grad_norm": 10.468679428100586, + "kl": 0.06640625, + "learning_rate": 1.3065646908859145e-08, + "loss": 0.0027, + "reward": 1.6145520210266113, + "reward_std": 0.04079999029636383, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6145520806312561, + "step": 3097 + }, + { + "completion_length": 233.34375, + "epoch": 0.9872530274059911, + "grad_norm": 11.847618103027344, + "kl": 0.06640625, + "learning_rate": 1.2746972594008922e-08, + "loss": 0.0027, + "reward": 1.839026689529419, + "reward_std": 0.06752954423427582, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7140265703201294, + "rewards/pad": 0.125, + "step": 3098 + }, + { + "completion_length": 204.03125, + "epoch": 0.9875717017208413, + "grad_norm": 10.032548904418945, + "kl": 0.083984375, + "learning_rate": 1.24282982791587e-08, + "loss": 0.0034, + "reward": 1.5784478187561035, + "reward_std": 0.09089922904968262, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4534478187561035, + "step": 3099 + }, + { + "completion_length": 178.796875, + "epoch": 0.9878903760356915, + "grad_norm": 27.579824447631836, + "kl": 0.10009765625, + "learning_rate": 1.2109623964308477e-08, + "loss": 0.004, + "reward": 1.7521710395812988, + "reward_std": 0.11859433352947235, + "rewards/answer_reward": 0.375, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.3927960693836212, + "step": 3100 + }, + { + "completion_length": 272.109375, + "epoch": 0.9882090503505417, + "grad_norm": 9.311613082885742, + "kl": 0.09326171875, + "learning_rate": 1.1790949649458252e-08, + "loss": 0.0037, + "reward": 1.4809329509735107, + "reward_std": 0.15124361217021942, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.37155792117118835, + "step": 3101 + }, + { + "completion_length": 376.75, + "epoch": 0.988527724665392, + "grad_norm": 6.454984664916992, + "kl": 0.06982421875, + "learning_rate": 1.1472275334608031e-08, + "loss": 0.0028, + "reward": 1.4383108615875244, + "reward_std": 0.12465780973434448, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.45393580198287964, + "step": 3102 + }, + { + "completion_length": 228.46875, + "epoch": 0.9888463989802422, + "grad_norm": 4.873990058898926, + "kl": 0.10009765625, + "learning_rate": 1.1153601019757807e-08, + "loss": 0.004, + "reward": 1.5083374977111816, + "reward_std": 0.057083502411842346, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.3833376169204712, + "step": 3103 + }, + { + "completion_length": 273.109375, + "epoch": 0.9891650732950924, + "grad_norm": 12.395194053649902, + "kl": 0.08154296875, + "learning_rate": 1.0834926704907584e-08, + "loss": 0.0033, + "reward": 1.5387074947357178, + "reward_std": 0.042679332196712494, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5387074947357178, + "step": 3104 + }, + { + "completion_length": 422.8125, + "epoch": 0.9894837476099426, + "grad_norm": 8.831942558288574, + "kl": 0.04541015625, + "learning_rate": 1.0516252390057361e-08, + "loss": 0.0018, + "reward": 1.2390778064727783, + "reward_std": 0.12053517252206802, + "rewards/pad": 0.015625, + "rewards/tracking_format_reward": 0.984375, + "rewards/tracking_iou_reward": 0.23907773196697235, + "step": 3105 + }, + { + "completion_length": 231.875, + "epoch": 0.9898024219247928, + "grad_norm": 9.457585334777832, + "kl": 0.09716796875, + "learning_rate": 1.0197578075207138e-08, + "loss": 0.0039, + "reward": 1.7127227783203125, + "reward_std": 0.0834982842206955, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4627227187156677, + "step": 3106 + }, + { + "completion_length": 317.328125, + "epoch": 0.9901210962396431, + "grad_norm": 13.605870246887207, + "kl": 0.056884765625, + "learning_rate": 9.878903760356915e-09, + "loss": 0.0023, + "reward": 1.7096130847930908, + "reward_std": 0.13412657380104065, + "rewards/answer_reward": 0.03125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.6783630847930908, + "step": 3107 + }, + { + "completion_length": 106.90625, + "epoch": 0.9904397705544933, + "grad_norm": 14.606616973876953, + "kl": 0.125, + "learning_rate": 9.560229445506692e-09, + "loss": 0.005, + "reward": 1.7832053899765015, + "reward_std": 0.0934794694185257, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6582053899765015, + "rewards/pad": 0.125, + "step": 3108 + }, + { + "completion_length": 177.0, + "epoch": 0.9907584448693435, + "grad_norm": 15.736312866210938, + "kl": 0.10498046875, + "learning_rate": 9.241555130656468e-09, + "loss": 0.0042, + "reward": 1.4562592506408691, + "reward_std": 0.23588356375694275, + "rewards/answer_reward": 0.0625, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3937593996524811, + "step": 3109 + }, + { + "completion_length": 204.671875, + "epoch": 0.9910771191841937, + "grad_norm": 7.896600723266602, + "kl": 0.10498046875, + "learning_rate": 8.922880815806247e-09, + "loss": 0.0042, + "reward": 1.458459734916687, + "reward_std": 0.03806007280945778, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.458459734916687, + "rewards/pad": 0.0, + "step": 3110 + }, + { + "completion_length": 256.171875, + "epoch": 0.9913957934990439, + "grad_norm": 15.68986988067627, + "kl": 0.125, + "learning_rate": 8.604206500956022e-09, + "loss": 0.005, + "reward": 1.4623175859451294, + "reward_std": 0.0622483566403389, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.3373175263404846, + "step": 3111 + }, + { + "completion_length": 287.453125, + "epoch": 0.9917144678138942, + "grad_norm": 12.322236061096191, + "kl": 0.0517578125, + "learning_rate": 8.2855321861058e-09, + "loss": 0.0021, + "reward": 1.8157682418823242, + "reward_std": 0.13324689865112305, + "rewards/pad": 0.34375, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4720180630683899, + "step": 3112 + }, + { + "completion_length": 157.875, + "epoch": 0.9920331421287444, + "grad_norm": 33.36964416503906, + "kl": 0.1181640625, + "learning_rate": 7.966857871255577e-09, + "loss": 0.0047, + "reward": 1.681018590927124, + "reward_std": 0.08750802278518677, + "rewards/answer_reward": 0.125, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5560184717178345, + "step": 3113 + }, + { + "completion_length": 241.21875, + "epoch": 0.9923518164435946, + "grad_norm": 19.668622970581055, + "kl": 0.083984375, + "learning_rate": 7.648183556405352e-09, + "loss": 0.0034, + "reward": 1.6106711626052856, + "reward_std": 0.06073055416345596, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6106711626052856, + "rewards/pad": 0.0, + "step": 3114 + }, + { + "completion_length": 419.046875, + "epoch": 0.9926704907584448, + "grad_norm": 4.147368431091309, + "kl": 0.060791015625, + "learning_rate": 7.32950924155513e-09, + "loss": 0.0024, + "reward": 1.546280860900879, + "reward_std": 0.06868711113929749, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5462808012962341, + "step": 3115 + }, + { + "completion_length": 225.765625, + "epoch": 0.992989165073295, + "grad_norm": 12.72802448272705, + "kl": 0.09423828125, + "learning_rate": 7.0108349267049075e-09, + "loss": 0.0038, + "reward": 1.6859400272369385, + "reward_std": 0.1193547323346138, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.45156511664390564, + "rewards/pad": 0.234375, + "step": 3116 + }, + { + "completion_length": 222.84375, + "epoch": 0.9933078393881453, + "grad_norm": 23.26289176940918, + "kl": 0.0859375, + "learning_rate": 6.692160611854685e-09, + "loss": 0.0034, + "reward": 1.8061788082122803, + "reward_std": 0.09761648625135422, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.5561787486076355, + "step": 3117 + }, + { + "completion_length": 316.8125, + "epoch": 0.9936265137029955, + "grad_norm": 15.188164710998535, + "kl": 0.07568359375, + "learning_rate": 6.373486297004461e-09, + "loss": 0.003, + "reward": 1.4298902750015259, + "reward_std": 0.03217019885778427, + "rewards/pad": 0.0, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.4298902451992035, + "step": 3118 + }, + { + "completion_length": 168.53125, + "epoch": 0.9939451880178458, + "grad_norm": 22.4268741607666, + "kl": 0.1201171875, + "learning_rate": 6.054811982154238e-09, + "loss": 0.0048, + "reward": 1.6596311330795288, + "reward_std": 0.15989093482494354, + "rewards/answer_reward": 0.234375, + "rewards/format_reward_gqa": 1.0, + "rewards/iou_glue_reward": 0.42525607347488403, + "step": 3119 + }, + { + "completion_length": 156.65625, + "epoch": 0.994263862332696, + "grad_norm": 13.560260772705078, + "kl": 0.09765625, + "learning_rate": 5.7361376673040155e-09, + "loss": 0.0039, + "reward": 1.680901050567627, + "reward_std": 0.1962885558605194, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.602776050567627, + "rewards/pad": 0.078125, + "step": 3120 + }, + { + "completion_length": 408.453125, + "epoch": 0.9945825366475463, + "grad_norm": 3.027728796005249, + "kl": 0.041015625, + "learning_rate": 5.417463352453792e-09, + "loss": 0.0016, + "reward": 1.4375839233398438, + "reward_std": 0.12024590373039246, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 0.96875, + "rewards/tracking_iou_reward": 0.34383392333984375, + "step": 3121 + }, + { + "completion_length": 248.828125, + "epoch": 0.9949012109623965, + "grad_norm": 11.69973087310791, + "kl": 0.08984375, + "learning_rate": 5.098789037603569e-09, + "loss": 0.0036, + "reward": 1.7863315343856812, + "reward_std": 0.10297290980815887, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.6613315343856812, + "step": 3122 + }, + { + "completion_length": 172.765625, + "epoch": 0.9952198852772467, + "grad_norm": 18.64386558532715, + "kl": 0.12890625, + "learning_rate": 4.780114722753346e-09, + "loss": 0.0052, + "reward": 1.7994670867919922, + "reward_std": 0.09457459300756454, + "rewards/answer_reward": 0.25, + "rewards/format_reward_gqa": 0.984375, + "rewards/iou_glue_reward": 0.5650919675827026, + "step": 3123 + }, + { + "completion_length": 233.65625, + "epoch": 0.9955385595920969, + "grad_norm": 21.83030128479004, + "kl": 0.09228515625, + "learning_rate": 4.4614404079031234e-09, + "loss": 0.0037, + "reward": 1.4759409427642822, + "reward_std": 0.09514787793159485, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.49156588315963745, + "rewards/pad": 0.0, + "step": 3124 + }, + { + "completion_length": 261.5625, + "epoch": 0.9958572339069471, + "grad_norm": 9.203996658325195, + "kl": 0.08056640625, + "learning_rate": 4.1427660930529e-09, + "loss": 0.0032, + "reward": 1.6252367496490479, + "reward_std": 0.08587397634983063, + "rewards/pad": 0.125, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.5002367496490479, + "step": 3125 + }, + { + "completion_length": 216.03125, + "epoch": 0.9961759082217974, + "grad_norm": 20.595712661743164, + "kl": 0.1083984375, + "learning_rate": 3.824091778202676e-09, + "loss": 0.0043, + "reward": 1.822039008140564, + "reward_std": 0.12611322104930878, + "rewards/pad": 0.25, + "rewards/tracking_format_reward": 1.0, + "rewards/tracking_iou_reward": 0.572039008140564, + "step": 3126 + }, + { + "completion_length": 269.921875, + "epoch": 0.9964945825366476, + "grad_norm": 11.260319709777832, + "kl": 0.08984375, + "learning_rate": 3.5054174633524538e-09, + "loss": 0.0036, + "reward": 1.6543762683868408, + "reward_std": 0.18545392155647278, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5450013279914856, + "rewards/pad": 0.125, + "step": 3127 + }, + { + "completion_length": 251.5, + "epoch": 0.9968132568514978, + "grad_norm": 13.050055503845215, + "kl": 0.08349609375, + "learning_rate": 3.1867431485022305e-09, + "loss": 0.0033, + "reward": 1.8711285591125488, + "reward_std": 0.0955628752708435, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.6211286783218384, + "rewards/pad": 0.25, + "step": 3128 + }, + { + "completion_length": 306.6875, + "epoch": 0.997131931166348, + "grad_norm": 27.774784088134766, + "kl": 0.138671875, + "learning_rate": 2.8680688336520077e-09, + "loss": 0.0055, + "reward": 1.4905447959899902, + "reward_std": 0.11900576949119568, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.5061697959899902, + "rewards/pad": 0.0, + "step": 3129 + }, + { + "completion_length": 336.984375, + "epoch": 0.9974506054811982, + "grad_norm": 11.642563819885254, + "kl": 0.055419921875, + "learning_rate": 2.5493945188017845e-09, + "loss": 0.0022, + "reward": 1.4699349403381348, + "reward_std": 0.054827552288770676, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.34493494033813477, + "rewards/pad": 0.125, + "step": 3130 + }, + { + "completion_length": 191.234375, + "epoch": 0.9977692797960485, + "grad_norm": 12.029446601867676, + "kl": 0.0859375, + "learning_rate": 2.2307202039515617e-09, + "loss": 0.0034, + "reward": 1.5740249156951904, + "reward_std": 0.034564144909381866, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5740248560905457, + "rewards/pad": 0.0, + "step": 3131 + }, + { + "completion_length": 246.734375, + "epoch": 0.9980879541108987, + "grad_norm": 20.317331314086914, + "kl": 0.08447265625, + "learning_rate": 1.912045889101338e-09, + "loss": 0.0034, + "reward": 1.579896092414856, + "reward_std": 0.06583913415670395, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.579896092414856, + "rewards/pad": 0.0, + "step": 3132 + }, + { + "completion_length": 256.1875, + "epoch": 0.9984066284257489, + "grad_norm": 22.188108444213867, + "kl": 0.12060546875, + "learning_rate": 1.5933715742511153e-09, + "loss": 0.0048, + "reward": 1.6689432859420776, + "reward_std": 0.13535553216934204, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.5439432859420776, + "rewards/pad": 0.125, + "step": 3133 + }, + { + "completion_length": 168.90625, + "epoch": 0.9987253027405991, + "grad_norm": 14.834891319274902, + "kl": 0.08984375, + "learning_rate": 1.2746972594008923e-09, + "loss": 0.0036, + "reward": 1.698204517364502, + "reward_std": 0.10654359310865402, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.46382954716682434, + "rewards/pad": 0.25, + "step": 3134 + }, + { + "completion_length": 354.34375, + "epoch": 0.9990439770554493, + "grad_norm": 5.06793212890625, + "kl": 0.0546875, + "learning_rate": 9.56022944550669e-10, + "loss": 0.0022, + "reward": 1.5813429355621338, + "reward_std": 0.11093565076589584, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.4719679355621338, + "rewards/pad": 0.125, + "step": 3135 + }, + { + "completion_length": 214.9375, + "epoch": 0.9993626513702996, + "grad_norm": 11.776244163513184, + "kl": 0.11865234375, + "learning_rate": 6.373486297004461e-10, + "loss": 0.0047, + "reward": 1.6603457927703857, + "reward_std": 0.15173858404159546, + "rewards/format_reward_tg": 0.984375, + "rewards/iou_timestamp_reward": 0.550970733165741, + "rewards/pad": 0.125, + "step": 3136 + }, + { + "completion_length": 160.828125, + "epoch": 0.9996813256851498, + "grad_norm": 10.365925788879395, + "kl": 0.083984375, + "learning_rate": 3.1867431485022307e-10, + "loss": 0.0034, + "reward": 1.7423681020736694, + "reward_std": 0.10533448308706284, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.4923681616783142, + "rewards/pad": 0.25, + "step": 3137 + }, + { + "completion_length": 148.1999969482422, + "epoch": 1.0, + "grad_norm": 60.47707748413086, + "kl": 0.09228515625, + "learning_rate": 0.0, + "loss": 0.0033, + "reward": 1.7849794626235962, + "reward_std": 0.1385474056005478, + "rewards/format_reward_tg": 1.0, + "rewards/iou_timestamp_reward": 0.7849794030189514, + "rewards/pad": 0.0, + "step": 3138 + } + ], + "logging_steps": 1.0, + "max_steps": 3138, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}