{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 27, "global_step": 267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 92.16071701049805, "epoch": 0.003745318352059925, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.3455178663134575, "reward_std": 0.7725450992584229, "rewards/correctness_reward_func": 0.191964291036129, "rewards/int_reward_func": 0.2812500149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1276964358985424, "step": 1 }, { "completion_length": 99.06696891784668, "epoch": 0.00749063670411985, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.38210269808769226, "reward_std": 0.8393888622522354, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2600446492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0743705378845334, "step": 2 }, { "completion_length": 95.87054061889648, "epoch": 0.011235955056179775, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.24831698834896088, "reward_std": 0.7660860866308212, "rewards/correctness_reward_func": 0.12053571734577417, "rewards/int_reward_func": 0.2421875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11440625134855509, "step": 3 }, { "completion_length": 100.84152221679688, "epoch": 0.0149812734082397, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.3874709792435169, "reward_std": 0.8373937755823135, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08239509297709446, "step": 4 }, { "completion_length": 107.81696701049805, "epoch": 0.018726591760299626, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.3461473397910595, "reward_std": 0.8639847934246063, "rewards/correctness_reward_func": 0.2187500111758709, "rewards/int_reward_func": 0.2834821566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15608482621610165, "step": 5 }, { "completion_length": 88.96428871154785, "epoch": 0.02247191011235955, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.23790179379284382, "reward_std": 0.8017762005329132, "rewards/correctness_reward_func": 0.14285715389996767, "rewards/int_reward_func": 0.2310267947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13598215393722057, "step": 6 }, { "completion_length": 86.72768211364746, "epoch": 0.026217228464419477, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.33704913780093193, "reward_std": 0.786924734711647, "rewards/correctness_reward_func": 0.2053571492433548, "rewards/int_reward_func": 0.2667410895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1350491177290678, "step": 7 }, { "completion_length": 87.54687881469727, "epoch": 0.0299625468164794, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.390026792883873, "reward_std": 0.7708619683980942, "rewards/correctness_reward_func": 0.2053571492433548, "rewards/int_reward_func": 0.2354910857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.050821430049836636, "step": 8 }, { "completion_length": 87.82366561889648, "epoch": 0.033707865168539325, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.30632367357611656, "reward_std": 0.8439056426286697, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2477678656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11555134132504463, "step": 9 }, { "completion_length": 95.44196891784668, "epoch": 0.03745318352059925, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.36654020100831985, "reward_std": 0.7821808308362961, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.263392873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07988839386962354, "step": 10 }, { "completion_length": 88.40178871154785, "epoch": 0.04119850187265917, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.3682924136519432, "reward_std": 0.8412070125341415, "rewards/correctness_reward_func": 0.2098214440047741, "rewards/int_reward_func": 0.2600446604192257, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10157366236671805, "step": 11 }, { "completion_length": 96.40178871154785, "epoch": 0.0449438202247191, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.3570691980421543, "reward_std": 0.831629067659378, "rewards/correctness_reward_func": 0.20535715110599995, "rewards/int_reward_func": 0.2633928693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1116808035876602, "step": 12 }, { "completion_length": 98.67634582519531, "epoch": 0.04868913857677903, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.31250446662306786, "reward_std": 0.7651553750038147, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2343750074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09151339251548052, "step": 13 }, { "completion_length": 94.42634201049805, "epoch": 0.052434456928838954, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 0.35333259031176567, "reward_std": 0.8554573208093643, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2645089477300644, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13439063727855682, "step": 14 }, { "completion_length": 95.30580711364746, "epoch": 0.056179775280898875, "grad_norm": 0.7303056716918945, "kl": 0.0, "learning_rate": 1.8518518518518518e-07, "loss": 0.0, "reward": 0.3904196694493294, "reward_std": 0.8479138016700745, "rewards/correctness_reward_func": 0.2098214440047741, "rewards/int_reward_func": 0.2544642984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07386607304215431, "step": 15 }, { "completion_length": 98.70089721679688, "epoch": 0.0599250936329588, "grad_norm": 0.6679372191429138, "kl": 0.0, "learning_rate": 3.7037037037037036e-07, "loss": -0.0, "reward": 0.26223884522914886, "reward_std": 0.8470287472009659, "rewards/correctness_reward_func": 0.1830357201397419, "rewards/int_reward_func": 0.243303582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16410045325756073, "step": 16 }, { "completion_length": 111.6004524230957, "epoch": 0.06367041198501873, "grad_norm": 0.8377946019172668, "kl": 8.493661880493164e-07, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "reward": 0.27681921795010567, "reward_std": 0.8451116383075714, "rewards/correctness_reward_func": 0.1875000037252903, "rewards/int_reward_func": 0.2488839365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1595647381618619, "step": 17 }, { "completion_length": 99.96875381469727, "epoch": 0.06741573033707865, "grad_norm": 0.9236070513725281, "kl": 0.00010453164577484131, "learning_rate": 7.407407407407407e-07, "loss": 0.0, "reward": 0.31320536509156227, "reward_std": 0.8322850167751312, "rewards/correctness_reward_func": 0.191964291036129, "rewards/int_reward_func": 0.2455357275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12429465167224407, "step": 18 }, { "completion_length": 98.84598731994629, "epoch": 0.07116104868913857, "grad_norm": 0.9007355570793152, "kl": 0.0017764568328857422, "learning_rate": 9.259259259259259e-07, "loss": 0.0001, "reward": 0.2539866119623184, "reward_std": 0.8283544480800629, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.227678582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1477991035208106, "step": 19 }, { "completion_length": 89.91964721679688, "epoch": 0.0749063670411985, "grad_norm": 0.8131362199783325, "kl": 0.009290695190429688, "learning_rate": 1.111111111111111e-06, "loss": 0.0004, "reward": 0.4152901992201805, "reward_std": 0.7349574714899063, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0668526804074645, "step": 20 }, { "completion_length": 92.15402221679688, "epoch": 0.07865168539325842, "grad_norm": 0.8349559307098389, "kl": 0.055389404296875, "learning_rate": 1.2962962962962962e-06, "loss": 0.0022, "reward": 0.33069421350955963, "reward_std": 0.7918245047330856, "rewards/correctness_reward_func": 0.19196429289877415, "rewards/int_reward_func": 0.2656250074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12689509708434343, "step": 21 }, { "completion_length": 90.09821891784668, "epoch": 0.08239700374531835, "grad_norm": 1.059292197227478, "kl": 0.13580322265625, "learning_rate": 1.4814814814814815e-06, "loss": 0.0054, "reward": 0.3621741235256195, "reward_std": 0.8592714816331863, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.2533482164144516, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10992411337792873, "step": 22 }, { "completion_length": 90.31473541259766, "epoch": 0.08614232209737828, "grad_norm": 0.9306014776229858, "kl": 0.22802734375, "learning_rate": 1.6666666666666667e-06, "loss": 0.0091, "reward": 0.27021654695272446, "reward_std": 0.7604184001684189, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12375670112669468, "step": 23 }, { "completion_length": 85.87277221679688, "epoch": 0.0898876404494382, "grad_norm": 1.0507615804672241, "kl": 0.26690673828125, "learning_rate": 1.8518518518518519e-06, "loss": 0.0107, "reward": 0.2973214313387871, "reward_std": 0.72261543571949, "rewards/correctness_reward_func": 0.13839286379516125, "rewards/int_reward_func": 0.2455357275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08660715073347092, "step": 24 }, { "completion_length": 93.05580711364746, "epoch": 0.09363295880149813, "grad_norm": 1.3314857482910156, "kl": 0.27276611328125, "learning_rate": 2.037037037037037e-06, "loss": 0.0109, "reward": 0.27803125604987144, "reward_std": 0.80119389295578, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2310267947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1271026823669672, "step": 25 }, { "completion_length": 94.82589721679688, "epoch": 0.09737827715355805, "grad_norm": 1.0931949615478516, "kl": 0.29876708984375, "learning_rate": 2.222222222222222e-06, "loss": 0.012, "reward": 0.31898215785622597, "reward_std": 0.861026868224144, "rewards/correctness_reward_func": 0.1785714365541935, "rewards/int_reward_func": 0.2611607238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.12186607345938683, "step": 26 }, { "completion_length": 102.84821891784668, "epoch": 0.10112359550561797, "grad_norm": 0.9510552883148193, "kl": 0.35101318359375, "learning_rate": 2.4074074074074075e-06, "loss": 0.014, "reward": 0.2714241296052933, "reward_std": 0.7749656587839127, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2444196566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13817412219941616, "step": 27 }, { "completion_length": 112.87277412414551, "epoch": 0.10486891385767791, "grad_norm": 0.8143340945243835, "kl": 0.4764404296875, "learning_rate": 2.5925925925925925e-06, "loss": 0.0191, "reward": 0.23841295577585697, "reward_std": 0.7267381250858307, "rewards/correctness_reward_func": 0.12500000186264515, "rewards/int_reward_func": 0.2544642947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14105134829878807, "step": 28 }, { "completion_length": 91.91071891784668, "epoch": 0.10861423220973783, "grad_norm": 1.2296696901321411, "kl": 0.5228271484375, "learning_rate": 2.7777777777777783e-06, "loss": 0.0209, "reward": 0.33742189407348633, "reward_std": 0.8348551988601685, "rewards/correctness_reward_func": 0.1919642984867096, "rewards/int_reward_func": 0.2533482238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10789063014090061, "step": 29 }, { "completion_length": 88.34152030944824, "epoch": 0.11235955056179775, "grad_norm": 0.7870422601699829, "kl": 0.50323486328125, "learning_rate": 2.962962962962963e-06, "loss": 0.0201, "reward": 0.29978572577238083, "reward_std": 0.8016993254423141, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1254375111311674, "step": 30 }, { "completion_length": 103.2433090209961, "epoch": 0.11610486891385768, "grad_norm": 1.320225715637207, "kl": 0.71630859375, "learning_rate": 3.1481481481481483e-06, "loss": 0.0286, "reward": 0.31782814115285873, "reward_std": 0.8109631538391113, "rewards/correctness_reward_func": 0.191964291036129, "rewards/int_reward_func": 0.2477678693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12190402112901211, "step": 31 }, { "completion_length": 102.47321891784668, "epoch": 0.1198501872659176, "grad_norm": 1.1958893537521362, "kl": 0.6319580078125, "learning_rate": 3.3333333333333333e-06, "loss": 0.0253, "reward": 0.3368035778403282, "reward_std": 0.8891346454620361, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.2767857313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15873214416205883, "step": 32 }, { "completion_length": 110.43080711364746, "epoch": 0.12359550561797752, "grad_norm": 0.900262176990509, "kl": 0.5997314453125, "learning_rate": 3.5185185185185187e-06, "loss": 0.024, "reward": 0.2625982239842415, "reward_std": 0.814198911190033, "rewards/correctness_reward_func": 0.16071429196745157, "rewards/int_reward_func": 0.2321428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1302589364349842, "step": 33 }, { "completion_length": 96.2433090209961, "epoch": 0.12734082397003746, "grad_norm": 0.8053016662597656, "kl": 0.5283203125, "learning_rate": 3.7037037037037037e-06, "loss": 0.0211, "reward": 0.2832053676247597, "reward_std": 0.773906797170639, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.2421875111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14201787114143372, "step": 34 }, { "completion_length": 99.74107360839844, "epoch": 0.13108614232209737, "grad_norm": 0.7698966860771179, "kl": 0.5833740234375, "learning_rate": 3.88888888888889e-06, "loss": 0.0233, "reward": 0.3971473351120949, "reward_std": 0.8169043958187103, "rewards/correctness_reward_func": 0.2366071529686451, "rewards/int_reward_func": 0.2667410857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10620089713484049, "step": 35 }, { "completion_length": 91.65402030944824, "epoch": 0.1348314606741573, "grad_norm": 0.6963524222373962, "kl": 0.6768798828125, "learning_rate": 4.074074074074074e-06, "loss": 0.0271, "reward": 0.2977009005844593, "reward_std": 0.9012245386838913, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1710491143167019, "step": 36 }, { "completion_length": 86.2120590209961, "epoch": 0.13857677902621723, "grad_norm": 0.6502078771591187, "kl": 0.7315673828125, "learning_rate": 4.2592592592592596e-06, "loss": 0.0293, "reward": 0.4257053807377815, "reward_std": 0.7627889215946198, "rewards/correctness_reward_func": 0.2053571492433548, "rewards/int_reward_func": 0.2611607238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04081250121816993, "step": 37 }, { "completion_length": 88.02232551574707, "epoch": 0.14232209737827714, "grad_norm": 0.7598965764045715, "kl": 0.7764892578125, "learning_rate": 4.444444444444444e-06, "loss": 0.0311, "reward": 0.26222768798470497, "reward_std": 0.7311272174119949, "rewards/correctness_reward_func": 0.1383928656578064, "rewards/int_reward_func": 0.2466517947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12281697243452072, "step": 38 }, { "completion_length": 88.76116561889648, "epoch": 0.14606741573033707, "grad_norm": 0.9369046688079834, "kl": 0.779052734375, "learning_rate": 4.62962962962963e-06, "loss": 0.0312, "reward": 0.2584107182919979, "reward_std": 0.8221316933631897, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2466518022119999, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15453572571277618, "step": 39 }, { "completion_length": 88.61607551574707, "epoch": 0.149812734082397, "grad_norm": 0.6541325449943542, "kl": 0.7415771484375, "learning_rate": 4.814814814814815e-06, "loss": 0.0297, "reward": 0.338582631200552, "reward_std": 0.773887574672699, "rewards/correctness_reward_func": 0.191964291036129, "rewards/int_reward_func": 0.2522321529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10561384446918964, "step": 40 }, { "completion_length": 93.6473274230957, "epoch": 0.15355805243445692, "grad_norm": 0.7286244630813599, "kl": 0.772216796875, "learning_rate": 5e-06, "loss": 0.0309, "reward": 0.2527187615633011, "reward_std": 0.7823167890310287, "rewards/correctness_reward_func": 0.1473214328289032, "rewards/int_reward_func": 0.2533482275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14795089792460203, "step": 41 }, { "completion_length": 102.1004524230957, "epoch": 0.15730337078651685, "grad_norm": 0.6125639081001282, "kl": 0.7431640625, "learning_rate": 4.999785818935018e-06, "loss": 0.0297, "reward": 0.372944213449955, "reward_std": 0.8073680251836777, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.279017873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11589509434998035, "step": 42 }, { "completion_length": 95.78794860839844, "epoch": 0.16104868913857678, "grad_norm": 0.7778175473213196, "kl": 0.887451171875, "learning_rate": 4.999143312438893e-06, "loss": 0.0355, "reward": 0.3458884060382843, "reward_std": 0.8312461376190186, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2421875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09719196986407042, "step": 43 }, { "completion_length": 104.68080711364746, "epoch": 0.1647940074906367, "grad_norm": 0.9053827524185181, "kl": 0.83203125, "learning_rate": 4.998072590601808e-06, "loss": 0.0333, "reward": 0.28239064663648605, "reward_std": 0.7673767507076263, "rewards/correctness_reward_func": 0.16517857648432255, "rewards/int_reward_func": 0.2354910783469677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11827902123332024, "step": 44 }, { "completion_length": 91.47768211364746, "epoch": 0.16853932584269662, "grad_norm": 0.7150729894638062, "kl": 0.8095703125, "learning_rate": 4.9965738368864345e-06, "loss": 0.0324, "reward": 0.4368147626519203, "reward_std": 0.8043892681598663, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2879464365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09666741825640202, "step": 45 }, { "completion_length": 106.02009391784668, "epoch": 0.17228464419475656, "grad_norm": 0.7475388050079346, "kl": 0.9344482421875, "learning_rate": 4.994647308096509e-06, "loss": 0.0374, "reward": 0.2440937664359808, "reward_std": 0.7551652044057846, "rewards/correctness_reward_func": 0.1339285783469677, "rewards/int_reward_func": 0.2388392947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1286741215735674, "step": 46 }, { "completion_length": 94.04018211364746, "epoch": 0.1760299625468165, "grad_norm": 0.6236558556556702, "kl": 0.8037109375, "learning_rate": 4.992293334332821e-06, "loss": 0.0322, "reward": 0.37872322648763657, "reward_std": 0.8807232677936554, "rewards/correctness_reward_func": 0.2678571492433548, "rewards/int_reward_func": 0.2589285857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14806250110268593, "step": 47 }, { "completion_length": 92.28571701049805, "epoch": 0.1797752808988764, "grad_norm": 0.6654737591743469, "kl": 0.818115234375, "learning_rate": 4.989512318936654e-06, "loss": 0.0327, "reward": 0.39247100055217743, "reward_std": 0.743692010641098, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2578125149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.057305806782096624, "step": 48 }, { "completion_length": 93.03571891784668, "epoch": 0.18352059925093633, "grad_norm": 0.7210425734519958, "kl": 0.825927734375, "learning_rate": 4.986304738420684e-06, "loss": 0.033, "reward": 0.35212278366088867, "reward_std": 0.8072675913572311, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11551116220653057, "step": 49 }, { "completion_length": 94.02902221679688, "epoch": 0.18726591760299627, "grad_norm": 0.6812981963157654, "kl": 0.92724609375, "learning_rate": 4.982671142387316e-06, "loss": 0.0371, "reward": 0.2549062632024288, "reward_std": 0.91233891248703, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.2466517984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17478126287460327, "step": 50 }, { "completion_length": 98.83259391784668, "epoch": 0.19101123595505617, "grad_norm": 0.7230132818222046, "kl": 0.9453125, "learning_rate": 4.978612153434527e-06, "loss": 0.0378, "reward": 0.3085335083305836, "reward_std": 0.7170540690422058, "rewards/correctness_reward_func": 0.13392858020961285, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08543973276391625, "step": 51 }, { "completion_length": 92.35491371154785, "epoch": 0.1947565543071161, "grad_norm": 0.7230132818222046, "kl": 1.035888671875, "learning_rate": 4.978612153434527e-06, "loss": 0.0414, "reward": 0.39346206933259964, "reward_std": 0.7441791445016861, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2522321566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03287723264656961, "step": 52 }, { "completion_length": 89.34821701049805, "epoch": 0.19850187265917604, "grad_norm": 0.620324969291687, "kl": 0.957275390625, "learning_rate": 4.974128467049177e-06, "loss": 0.0383, "reward": 0.3490491136908531, "reward_std": 0.747399315237999, "rewards/correctness_reward_func": 0.1830357164144516, "rewards/int_reward_func": 0.2522321604192257, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.08733483403921127, "step": 53 }, { "completion_length": 98.08928871154785, "epoch": 0.20224719101123595, "grad_norm": 0.617904782295227, "kl": 1.1640625, "learning_rate": 4.9692208514878445e-06, "loss": 0.0466, "reward": 0.21967187896370888, "reward_std": 0.7784698009490967, "rewards/correctness_reward_func": 0.14285714738070965, "rewards/int_reward_func": 0.2232142984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14639955759048462, "step": 54 }, { "completion_length": 90.87946891784668, "epoch": 0.20599250936329588, "grad_norm": 0.6099480390548706, "kl": 1.119140625, "learning_rate": 4.963890147645195e-06, "loss": 0.0448, "reward": 0.3465201109647751, "reward_std": 0.8060361593961716, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11218527238816023, "step": 55 }, { "completion_length": 90.78795051574707, "epoch": 0.20973782771535582, "grad_norm": 0.6998101472854614, "kl": 1.171875, "learning_rate": 4.958137268909887e-06, "loss": 0.0469, "reward": 0.3585915267467499, "reward_std": 0.7672727555036545, "rewards/correctness_reward_func": 0.191964291036129, "rewards/int_reward_func": 0.2488839365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08225669572129846, "step": 56 }, { "completion_length": 91.67857551574707, "epoch": 0.21348314606741572, "grad_norm": 0.8839861154556274, "kl": 1.13525390625, "learning_rate": 4.9519632010080765e-06, "loss": 0.0454, "reward": 0.3334464356303215, "reward_std": 0.7685143500566483, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2578125074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09847322292625904, "step": 57 }, { "completion_length": 81.05804061889648, "epoch": 0.21722846441947566, "grad_norm": 0.5016757249832153, "kl": 1.03076171875, "learning_rate": 4.9453690018345144e-06, "loss": 0.0412, "reward": 0.4211518168449402, "reward_std": 0.8437229245901108, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.2779017947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0755000039935112, "step": 58 }, { "completion_length": 101.41518211364746, "epoch": 0.2209737827715356, "grad_norm": 0.6329123377799988, "kl": 1.089111328125, "learning_rate": 4.938355801271282e-06, "loss": 0.0436, "reward": 0.35140402615070343, "reward_std": 0.7762987017631531, "rewards/correctness_reward_func": 0.1785714402794838, "rewards/int_reward_func": 0.2656250149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09279241226613522, "step": 59 }, { "completion_length": 105.19420051574707, "epoch": 0.2247191011235955, "grad_norm": 0.6004884839057922, "kl": 1.02001953125, "learning_rate": 4.930924800994192e-06, "loss": 0.0408, "reward": 0.2473437450826168, "reward_std": 0.7411400526762009, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.2466517984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14216518588364124, "step": 60 }, { "completion_length": 98.32589530944824, "epoch": 0.22846441947565543, "grad_norm": 0.8213242292404175, "kl": 1.04638671875, "learning_rate": 4.923077274266886e-06, "loss": 0.0419, "reward": 0.29397991858422756, "reward_std": 0.7984266579151154, "rewards/correctness_reward_func": 0.17857143841683865, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13459152355790138, "step": 61 }, { "completion_length": 95.77009201049805, "epoch": 0.23220973782771537, "grad_norm": 0.7614482641220093, "kl": 0.9754638671875, "learning_rate": 4.914814565722671e-06, "loss": 0.039, "reward": 0.25892411917448044, "reward_std": 0.6874004900455475, "rewards/correctness_reward_func": 0.1383928582072258, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13058483134955168, "step": 62 }, { "completion_length": 88.84375381469727, "epoch": 0.23595505617977527, "grad_norm": 0.6162322759628296, "kl": 0.9012451171875, "learning_rate": 4.906138091134118e-06, "loss": 0.0361, "reward": 0.4282499924302101, "reward_std": 0.867719978094101, "rewards/correctness_reward_func": 0.2321428656578064, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08402678836137056, "step": 63 }, { "completion_length": 98.63393211364746, "epoch": 0.2397003745318352, "grad_norm": 0.7493047118186951, "kl": 0.9737548828125, "learning_rate": 4.897049337170483e-06, "loss": 0.0389, "reward": 0.31915403716266155, "reward_std": 0.7978127002716064, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1395513443276286, "step": 64 }, { "completion_length": 99.77232551574707, "epoch": 0.24344569288389514, "grad_norm": 0.55640709400177, "kl": 0.95849609375, "learning_rate": 4.887549861142967e-06, "loss": 0.0383, "reward": 0.25655804201960564, "reward_std": 0.746478259563446, "rewards/correctness_reward_func": 0.1339285783469677, "rewards/int_reward_func": 0.2767857313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1541562583297491, "step": 65 }, { "completion_length": 90.46205711364746, "epoch": 0.24719101123595505, "grad_norm": 0.5036749243736267, "kl": 0.875732421875, "learning_rate": 4.8776412907378845e-06, "loss": 0.035, "reward": 0.3458192050457001, "reward_std": 0.8146399855613708, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2578125111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13074330985546112, "step": 66 }, { "completion_length": 109.09821891784668, "epoch": 0.250936329588015, "grad_norm": 0.5405407547950745, "kl": 0.9388427734375, "learning_rate": 4.867325323737765e-06, "loss": 0.0376, "reward": 0.2163794655352831, "reward_std": 0.695435032248497, "rewards/correctness_reward_func": 0.1250000074505806, "rewards/int_reward_func": 0.2299107238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13853124901652336, "step": 67 }, { "completion_length": 96.1495590209961, "epoch": 0.2546816479400749, "grad_norm": 0.7321078777313232, "kl": 0.9552001953125, "learning_rate": 4.856603727730446e-06, "loss": 0.0382, "reward": 0.3762388601899147, "reward_std": 0.8388219773769379, "rewards/correctness_reward_func": 0.2366071492433548, "rewards/int_reward_func": 0.2600446566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12041295692324638, "step": 68 }, { "completion_length": 94.25893211364746, "epoch": 0.25842696629213485, "grad_norm": 0.5617873668670654, "kl": 0.9840087890625, "learning_rate": 4.845478339806211e-06, "loss": 0.0394, "reward": 0.3388616181910038, "reward_std": 0.8814976066350937, "rewards/correctness_reward_func": 0.2276785857975483, "rewards/int_reward_func": 0.2689732275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15779018122702837, "step": 69 }, { "completion_length": 100.38839721679688, "epoch": 0.26217228464419473, "grad_norm": 0.7381689548492432, "kl": 1.1729736328125, "learning_rate": 4.833951066243004e-06, "loss": 0.0469, "reward": 0.3259017989039421, "reward_std": 0.7590171247720718, "rewards/correctness_reward_func": 0.165178582072258, "rewards/int_reward_func": 0.2566964328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09597321972250938, "step": 70 }, { "completion_length": 108.24777412414551, "epoch": 0.26591760299625467, "grad_norm": 0.7399603724479675, "kl": 1.2216796875, "learning_rate": 4.822023882179811e-06, "loss": 0.0489, "reward": 0.138060272205621, "reward_std": 0.8277581036090851, "rewards/correctness_reward_func": 0.098214291036129, "rewards/int_reward_func": 0.2321428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19229689799249172, "step": 71 }, { "completion_length": 91.00893211364746, "epoch": 0.2696629213483146, "grad_norm": 0.49646005034446716, "kl": 0.997802734375, "learning_rate": 4.809698831278217e-06, "loss": 0.0399, "reward": 0.31739287078380585, "reward_std": 0.821430504322052, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.137964291498065, "step": 72 }, { "completion_length": 89.22545051574707, "epoch": 0.27340823970037453, "grad_norm": 0.5307531952857971, "kl": 0.9476318359375, "learning_rate": 4.796978025372247e-06, "loss": 0.0379, "reward": 0.3167254589498043, "reward_std": 0.8111777305603027, "rewards/correctness_reward_func": 0.16071429662406445, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11184598784893751, "step": 73 }, { "completion_length": 108.57813262939453, "epoch": 0.27715355805243447, "grad_norm": 0.9819021224975586, "kl": 1.1944580078125, "learning_rate": 4.783863644106502e-06, "loss": 0.0478, "reward": 0.43339288234710693, "reward_std": 0.805885374546051, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.2633928656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07107143104076385, "step": 74 }, { "completion_length": 97.66071891784668, "epoch": 0.2808988764044944, "grad_norm": 0.5349671244621277, "kl": 0.9276123046875, "learning_rate": 4.770357934562704e-06, "loss": 0.0371, "reward": 0.25291070714592934, "reward_std": 0.7776944190263748, "rewards/correctness_reward_func": 0.1517857201397419, "rewards/int_reward_func": 0.2321428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13101786747574806, "step": 75 }, { "completion_length": 99.77455711364746, "epoch": 0.2846441947565543, "grad_norm": 0.4924392104148865, "kl": 0.9676513671875, "learning_rate": 4.7564632108746524e-06, "loss": 0.0387, "reward": 0.29688840731978416, "reward_std": 0.7467798590660095, "rewards/correctness_reward_func": 0.1651785746216774, "rewards/int_reward_func": 0.2377232313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10601340420544147, "step": 76 }, { "completion_length": 108.47545051574707, "epoch": 0.2883895131086142, "grad_norm": 0.45954495668411255, "kl": 0.9312744140625, "learning_rate": 4.742181853831721e-06, "loss": 0.0372, "reward": 0.2200825996696949, "reward_std": 0.7668928056955338, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.2377232201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1604977697134018, "step": 77 }, { "completion_length": 108.40848731994629, "epoch": 0.29213483146067415, "grad_norm": 0.5183126330375671, "kl": 0.931640625, "learning_rate": 4.72751631047092e-06, "loss": 0.0373, "reward": 0.26953795552253723, "reward_std": 0.7980407774448395, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.2544642984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16796205937862396, "step": 78 }, { "completion_length": 92.30357551574707, "epoch": 0.2958801498127341, "grad_norm": 0.6169615983963013, "kl": 0.814453125, "learning_rate": 4.712469093657605e-06, "loss": 0.0326, "reward": 0.3473794758319855, "reward_std": 0.7407716810703278, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2455357313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08565625501796603, "step": 79 }, { "completion_length": 96.31920051574707, "epoch": 0.299625468164794, "grad_norm": 0.5736718773841858, "kl": 0.82373046875, "learning_rate": 4.697042781654913e-06, "loss": 0.0329, "reward": 0.3164888694882393, "reward_std": 0.8256205767393112, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2734375074505806, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11877902504056692, "step": 80 }, { "completion_length": 95.03348731994629, "epoch": 0.30337078651685395, "grad_norm": 0.6198201179504395, "kl": 0.7996826171875, "learning_rate": 4.681240017681994e-06, "loss": 0.032, "reward": 0.31822992861270905, "reward_std": 0.7278983741998672, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2500000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08802009373903275, "step": 81 }, { "completion_length": 94.79911041259766, "epoch": 0.30711610486891383, "grad_norm": 0.4967269003391266, "kl": 0.791748046875, "learning_rate": 4.665063509461098e-06, "loss": 0.0317, "reward": 0.37110715731978416, "reward_std": 0.8040148764848709, "rewards/correctness_reward_func": 0.2633928693830967, "rewards/int_reward_func": 0.2477678693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1400535786524415, "step": 82 }, { "completion_length": 93.06696891784668, "epoch": 0.31086142322097376, "grad_norm": 0.5414807796478271, "kl": 0.799072265625, "learning_rate": 4.648516028753632e-06, "loss": 0.032, "reward": 0.3070870563387871, "reward_std": 0.9167025238275528, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2500000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16166296042501926, "step": 83 }, { "completion_length": 85.48661231994629, "epoch": 0.3146067415730337, "grad_norm": 0.6058946251869202, "kl": 0.8023681640625, "learning_rate": 4.631600410885231e-06, "loss": 0.0321, "reward": 0.31676117703318596, "reward_std": 0.8016230016946793, "rewards/correctness_reward_func": 0.1785714365541935, "rewards/int_reward_func": 0.2232142947614193, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08614062378183007, "step": 84 }, { "completion_length": 98.87947082519531, "epoch": 0.31835205992509363, "grad_norm": 0.5493951439857483, "kl": 0.810302734375, "learning_rate": 4.614319554259934e-06, "loss": 0.0324, "reward": 0.26373885199427605, "reward_std": 0.7828188389539719, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.2421875149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12130581215023994, "step": 85 }, { "completion_length": 98.88170051574707, "epoch": 0.32209737827715357, "grad_norm": 0.5060502290725708, "kl": 0.8116455078125, "learning_rate": 4.596676419863561e-06, "loss": 0.0325, "reward": 0.37905358523130417, "reward_std": 0.7987204343080521, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08411608170717955, "step": 86 }, { "completion_length": 95.37723731994629, "epoch": 0.3258426966292135, "grad_norm": 0.45160311460494995, "kl": 0.814697265625, "learning_rate": 4.578674030756364e-06, "loss": 0.0326, "reward": 0.3752902075648308, "reward_std": 0.7861279100179672, "rewards/correctness_reward_func": 0.2142857238650322, "rewards/int_reward_func": 0.2566964402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09569196694064885, "step": 87 }, { "completion_length": 88.16964530944824, "epoch": 0.3295880149812734, "grad_norm": 0.4520312249660492, "kl": 0.8048095703125, "learning_rate": 4.560315471555039e-06, "loss": 0.0322, "reward": 0.40060270577669144, "reward_std": 0.827767089009285, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.2399553656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0804241057485342, "step": 88 }, { "completion_length": 90.47098541259766, "epoch": 0.3333333333333333, "grad_norm": 0.4482274651527405, "kl": 0.802490234375, "learning_rate": 4.541603887904198e-06, "loss": 0.0321, "reward": 0.46391965448856354, "reward_std": 0.8666775524616241, "rewards/correctness_reward_func": 0.2812500149011612, "rewards/int_reward_func": 0.2845982313156128, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1019285786896944, "step": 89 }, { "completion_length": 99.05357360839844, "epoch": 0.33707865168539325, "grad_norm": 0.48688769340515137, "kl": 0.8892822265625, "learning_rate": 4.522542485937369e-06, "loss": 0.0356, "reward": 0.32227010279893875, "reward_std": 0.7231635600328445, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09737277776002884, "step": 90 }, { "completion_length": 96.9062557220459, "epoch": 0.3408239700374532, "grad_norm": 0.713897168636322, "kl": 0.83984375, "learning_rate": 4.503134531727652e-06, "loss": 0.0336, "reward": 0.3822232261300087, "reward_std": 0.8144369274377823, "rewards/correctness_reward_func": 0.2455357275903225, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1222410760819912, "step": 91 }, { "completion_length": 92.32589530944824, "epoch": 0.3445692883895131, "grad_norm": 0.48127949237823486, "kl": 0.8839111328125, "learning_rate": 4.4833833507280884e-06, "loss": 0.0354, "reward": 0.2896517887711525, "reward_std": 0.8091708421707153, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11659822333604097, "step": 92 }, { "completion_length": 113.4062557220459, "epoch": 0.34831460674157305, "grad_norm": 0.5383365154266357, "kl": 1.0166015625, "learning_rate": 4.463292327201862e-06, "loss": 0.0407, "reward": 0.2778482399880886, "reward_std": 0.7528630048036575, "rewards/correctness_reward_func": 0.1607142947614193, "rewards/int_reward_func": 0.2388393022119999, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12170535884797573, "step": 93 }, { "completion_length": 90.40179061889648, "epoch": 0.352059925093633, "grad_norm": 0.47072696685791016, "kl": 0.8814697265625, "learning_rate": 4.442864903642428e-06, "loss": 0.0353, "reward": 0.3879285827279091, "reward_std": 0.7762220501899719, "rewards/correctness_reward_func": 0.2321428693830967, "rewards/int_reward_func": 0.2377232275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08193750027567148, "step": 94 }, { "completion_length": 100.53795051574707, "epoch": 0.35580524344569286, "grad_norm": 0.5055387020111084, "kl": 0.9866943359375, "learning_rate": 4.422104580183649e-06, "loss": 0.0395, "reward": 0.27919645234942436, "reward_std": 0.8693763017654419, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15383929014205933, "step": 95 }, { "completion_length": 98.15402221679688, "epoch": 0.3595505617977528, "grad_norm": 0.5205227732658386, "kl": 1.0927734375, "learning_rate": 4.401014914000078e-06, "loss": 0.0437, "reward": 0.30646876618266106, "reward_std": 0.8004807382822037, "rewards/correctness_reward_func": 0.1830357201397419, "rewards/int_reward_func": 0.238839291036129, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11540625896304846, "step": 96 }, { "completion_length": 92.7410774230957, "epoch": 0.36329588014981273, "grad_norm": 0.5007465481758118, "kl": 1.0205078125, "learning_rate": 4.379599518697444e-06, "loss": 0.0408, "reward": 0.4242701083421707, "reward_std": 0.900767520070076, "rewards/correctness_reward_func": 0.2321428693830967, "rewards/int_reward_func": 0.279017873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08689062856137753, "step": 97 }, { "completion_length": 95.44196891784668, "epoch": 0.36704119850187267, "grad_norm": 0.7685222029685974, "kl": 1.154541015625, "learning_rate": 4.357862063693486e-06, "loss": 0.0462, "reward": 0.3419933207333088, "reward_std": 0.7976544201374054, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2522321529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10220312792807817, "step": 98 }, { "completion_length": 97.05803871154785, "epoch": 0.3707865168539326, "grad_norm": 0.48678070306777954, "kl": 1.0458984375, "learning_rate": 4.335806273589214e-06, "loss": 0.0418, "reward": 0.32839956879615784, "reward_std": 0.7194608449935913, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07673437846824527, "step": 99 }, { "completion_length": 89.83259391784668, "epoch": 0.37453183520599254, "grad_norm": 0.4802840054035187, "kl": 0.9891357421875, "learning_rate": 4.313435927530719e-06, "loss": 0.0396, "reward": 0.323910728096962, "reward_std": 0.8017723858356476, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2377232238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08791964408010244, "step": 100 }, { "completion_length": 92.61607551574707, "epoch": 0.3782771535580524, "grad_norm": 0.6329229474067688, "kl": 0.9765625, "learning_rate": 4.290754858561636e-06, "loss": 0.0391, "reward": 0.3065357282757759, "reward_std": 0.7879298776388168, "rewards/correctness_reward_func": 0.165178582072258, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12650001049041748, "step": 101 }, { "completion_length": 92.51116371154785, "epoch": 0.38202247191011235, "grad_norm": 0.5261896848678589, "kl": 0.99658203125, "learning_rate": 4.267766952966369e-06, "loss": 0.0399, "reward": 0.3286317139863968, "reward_std": 0.7215069979429245, "rewards/correctness_reward_func": 0.1473214365541935, "rewards/int_reward_func": 0.2611607275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07985044876113534, "step": 102 }, { "completion_length": 104.85937881469727, "epoch": 0.3857677902621723, "grad_norm": 0.4765637516975403, "kl": 1.0252685546875, "learning_rate": 4.244476149604201e-06, "loss": 0.041, "reward": 0.3370089456439018, "reward_std": 0.7856772691011429, "rewards/correctness_reward_func": 0.1785714402794838, "rewards/int_reward_func": 0.2533482313156128, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09602678846567869, "step": 103 }, { "completion_length": 92.55804061889648, "epoch": 0.3895131086142322, "grad_norm": 0.5377345681190491, "kl": 0.9796142578125, "learning_rate": 4.220886439234385e-06, "loss": 0.0392, "reward": 0.3739665374159813, "reward_std": 0.8817652761936188, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2354910857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0847388431429863, "step": 104 }, { "completion_length": 102.57366561889648, "epoch": 0.39325842696629215, "grad_norm": 0.5319781303405762, "kl": 1.1024169921875, "learning_rate": 4.197001863832355e-06, "loss": 0.0441, "reward": 0.33513617515563965, "reward_std": 0.7641059011220932, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2544642984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11575670912861824, "step": 105 }, { "completion_length": 95.90625381469727, "epoch": 0.3970037453183521, "grad_norm": 0.5436156392097473, "kl": 1.011474609375, "learning_rate": 4.172826515897146e-06, "loss": 0.0405, "reward": 0.3867567144334316, "reward_std": 0.7985697090625763, "rewards/correctness_reward_func": 0.2142857275903225, "rewards/int_reward_func": 0.2700893059372902, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.09873438253998756, "step": 106 }, { "completion_length": 89.44866561889648, "epoch": 0.40074906367041196, "grad_norm": 0.47593066096305847, "kl": 1.074462890625, "learning_rate": 4.1483645377501726e-06, "loss": 0.043, "reward": 0.36116072721779346, "reward_std": 0.8109498172998428, "rewards/correctness_reward_func": 0.19642857648432255, "rewards/int_reward_func": 0.2332589365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06852679001167417, "step": 107 }, { "completion_length": 98.08482551574707, "epoch": 0.4044943820224719, "grad_norm": 0.47593066096305847, "kl": NaN, "learning_rate": 4.1483645377501726e-06, "loss": 0.042, "reward": 0.3247567042708397, "reward_std": 0.756167471408844, "rewards/correctness_reward_func": 0.165178582072258, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08930580969899893, "step": 108 }, { "completion_length": 96.78571701049805, "epoch": 0.40823970037453183, "grad_norm": 0.46640312671661377, "kl": 1.106689453125, "learning_rate": 4.123620120825459e-06, "loss": 0.0443, "reward": 0.3182366043329239, "reward_std": 0.8166698515415192, "rewards/correctness_reward_func": 0.20982143469154835, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.141584824770689, "step": 109 }, { "completion_length": 103.3504524230957, "epoch": 0.41198501872659177, "grad_norm": 0.4819093346595764, "kl": 1.209228515625, "learning_rate": 4.098597504951462e-06, "loss": 0.0484, "reward": 0.45162054151296616, "reward_std": 0.9206108599901199, "rewards/correctness_reward_func": 0.3080357313156128, "rewards/int_reward_func": 0.2801339402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1365491133183241, "step": 110 }, { "completion_length": 95.90402412414551, "epoch": 0.4157303370786517, "grad_norm": 0.48896247148513794, "kl": 1.0751953125, "learning_rate": 4.073300977624594e-06, "loss": 0.043, "reward": 0.2652589473873377, "reward_std": 0.7796717882156372, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2533482201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14880357310175896, "step": 111 }, { "completion_length": 90.68527221679688, "epoch": 0.41947565543071164, "grad_norm": 0.4631924033164978, "kl": 1.094970703125, "learning_rate": 4.047734873274586e-06, "loss": 0.0438, "reward": 0.35960714891552925, "reward_std": 0.7207369059324265, "rewards/correctness_reward_func": 0.17410715529695153, "rewards/int_reward_func": 0.2845982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09909821674227715, "step": 112 }, { "completion_length": 97.99330711364746, "epoch": 0.4232209737827715, "grad_norm": 0.5063531398773193, "kl": 1.1484375, "learning_rate": 4.021903572521802e-06, "loss": 0.0459, "reward": 0.41493305563926697, "reward_std": 0.8120662122964859, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.2566964365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.08395089209079742, "step": 113 }, { "completion_length": 85.47991561889648, "epoch": 0.42696629213483145, "grad_norm": 0.4622070789337158, "kl": 1.0574951171875, "learning_rate": 3.995811501426648e-06, "loss": 0.0423, "reward": 0.3168504536151886, "reward_std": 0.7642460912466049, "rewards/correctness_reward_func": 0.1875000149011612, "rewards/int_reward_func": 0.2633928656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1340424194931984, "step": 114 }, { "completion_length": 94.50000381469727, "epoch": 0.4307116104868914, "grad_norm": 0.4713800251483917, "kl": 1.068359375, "learning_rate": 3.969463130731183e-06, "loss": 0.0427, "reward": 0.3659776858985424, "reward_std": 0.891373872756958, "rewards/correctness_reward_func": 0.2678571492433548, "rewards/int_reward_func": 0.2444196566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14629911817610264, "step": 115 }, { "completion_length": 96.42187881469727, "epoch": 0.4344569288389513, "grad_norm": 0.527915894985199, "kl": 1.056884765625, "learning_rate": 3.942862975093085e-06, "loss": 0.0423, "reward": 0.36224332079291344, "reward_std": 0.8091815561056137, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2477678693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1087388452142477, "step": 116 }, { "completion_length": 95.6093807220459, "epoch": 0.43820224719101125, "grad_norm": 0.5208942294120789, "kl": 1.090576171875, "learning_rate": 3.916015592312083e-06, "loss": 0.0436, "reward": 0.27370089665055275, "reward_std": 0.8227901756763458, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.254464291036129, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15487053990364075, "step": 117 }, { "completion_length": 105.62054061889648, "epoch": 0.4419475655430712, "grad_norm": 0.47664541006088257, "kl": 1.18017578125, "learning_rate": 3.888925582549006e-06, "loss": 0.0472, "reward": 0.28062277287244797, "reward_std": 0.8103707134723663, "rewards/correctness_reward_func": 0.1830357201397419, "rewards/int_reward_func": 0.2455357201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14794866368174553, "step": 118 }, { "completion_length": 94.9598274230957, "epoch": 0.44569288389513106, "grad_norm": 0.5299546122550964, "kl": 1.0885009765625, "learning_rate": 3.861597587537568e-06, "loss": 0.0435, "reward": 0.2977410815656185, "reward_std": 0.7406027764081955, "rewards/correctness_reward_func": 0.1517857201397419, "rewards/int_reward_func": 0.2254464402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07949107605963945, "step": 119 }, { "completion_length": 98.04911422729492, "epoch": 0.449438202247191, "grad_norm": 0.4211517870426178, "kl": 1.2236328125, "learning_rate": 3.83403628978903e-06, "loss": 0.0489, "reward": 0.2801852785050869, "reward_std": 0.8073955476284027, "rewards/correctness_reward_func": 0.1607142947614193, "rewards/int_reward_func": 0.247767873108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12829688470810652, "step": 120 }, { "completion_length": 96.47098731994629, "epoch": 0.45318352059925093, "grad_norm": 0.4674068093299866, "kl": 1.210693359375, "learning_rate": 3.806246411789872e-06, "loss": 0.0484, "reward": 0.35595760494470596, "reward_std": 0.8347803801298141, "rewards/correctness_reward_func": 0.2098214402794838, "rewards/int_reward_func": 0.2544642947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10832812543958426, "step": 121 }, { "completion_length": 97.76116561889648, "epoch": 0.45692883895131087, "grad_norm": 0.7582751512527466, "kl": 1.22021484375, "learning_rate": 3.77823271519263e-06, "loss": 0.0488, "reward": 0.35320091247558594, "reward_std": 0.7386835068464279, "rewards/correctness_reward_func": 0.1964285746216774, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10327232570853084, "step": 122 }, { "completion_length": 95.66071891784668, "epoch": 0.4606741573033708, "grad_norm": 0.7582751512527466, "kl": NaN, "learning_rate": 3.77823271519263e-06, "loss": 0.046, "reward": 0.31566742807626724, "reward_std": 0.8195231109857559, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2366071492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09058259081211872, "step": 123 }, { "completion_length": 93.24330711364746, "epoch": 0.46441947565543074, "grad_norm": 0.49510565400123596, "kl": 1.078125, "learning_rate": 3.7500000000000005e-06, "loss": 0.0431, "reward": 0.3665379509329796, "reward_std": 0.9554053395986557, "rewards/correctness_reward_func": 0.2366071566939354, "rewards/int_reward_func": 0.2622767947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1323459828272462, "step": 124 }, { "completion_length": 97.34152221679688, "epoch": 0.4681647940074906, "grad_norm": 0.6952568888664246, "kl": 1.18701171875, "learning_rate": 3.721553103742388e-06, "loss": 0.0475, "reward": 0.34241294860839844, "reward_std": 0.8731215000152588, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.2678571566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13080134615302086, "step": 125 }, { "completion_length": 99.09821891784668, "epoch": 0.47191011235955055, "grad_norm": 0.46382632851600647, "kl": 1.170654296875, "learning_rate": 3.6928969006490212e-06, "loss": 0.0468, "reward": 0.31666965037584305, "reward_std": 0.7566796094179153, "rewards/correctness_reward_func": 0.1517857201397419, "rewards/int_reward_func": 0.2723214440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10743750259280205, "step": 126 }, { "completion_length": 91.62277412414551, "epoch": 0.4756554307116105, "grad_norm": 0.47625091671943665, "kl": 0.992431640625, "learning_rate": 3.664036300812779e-06, "loss": 0.0397, "reward": 0.3642299249768257, "reward_std": 0.951588049530983, "rewards/correctness_reward_func": 0.2410714402794838, "rewards/int_reward_func": 0.2712053656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14804688468575478, "step": 127 }, { "completion_length": 100.61607360839844, "epoch": 0.4794007490636704, "grad_norm": 0.5847791433334351, "kl": 1.225830078125, "learning_rate": 3.634976249348867e-06, "loss": 0.049, "reward": 0.42196429520845413, "reward_std": 0.8636786490678787, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2533482201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07691965182311833, "step": 128 }, { "completion_length": 98.10714530944824, "epoch": 0.48314606741573035, "grad_norm": 0.44887715578079224, "kl": 1.186767578125, "learning_rate": 3.6057217255475034e-06, "loss": 0.0475, "reward": 0.2306763455271721, "reward_std": 0.745768278837204, "rewards/correctness_reward_func": 0.1339285783469677, "rewards/int_reward_func": 0.2399553619325161, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14320759288966656, "step": 129 }, { "completion_length": 92.93750381469727, "epoch": 0.4868913857677903, "grad_norm": 0.44887715578079224, "kl": NaN, "learning_rate": 3.6057217255475034e-06, "loss": 0.0481, "reward": 0.2954799123108387, "reward_std": 0.7976376265287399, "rewards/correctness_reward_func": 0.1517857238650322, "rewards/int_reward_func": 0.2566964328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11300223972648382, "step": 130 }, { "completion_length": 102.15178871154785, "epoch": 0.49063670411985016, "grad_norm": 0.417877733707428, "kl": 1.196044921875, "learning_rate": 3.5762777420207382e-06, "loss": 0.0478, "reward": 0.29901787638664246, "reward_std": 0.7817905694246292, "rewards/correctness_reward_func": 0.1785714328289032, "rewards/int_reward_func": 0.2656250149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14517857134342194, "step": 131 }, { "completion_length": 99.67187881469727, "epoch": 0.4943820224719101, "grad_norm": 0.492396742105484, "kl": 1.19775390625, "learning_rate": 3.5466493438435707e-06, "loss": 0.0479, "reward": 0.2629486694931984, "reward_std": 0.8080793470144272, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1578102707862854, "step": 132 }, { "completion_length": 97.48214721679688, "epoch": 0.49812734082397003, "grad_norm": 0.5177117586135864, "kl": 1.177490234375, "learning_rate": 3.516841607689501e-06, "loss": 0.0471, "reward": 0.2645267955958843, "reward_std": 0.7447800785303116, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2455357275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13725892454385757, "step": 133 }, { "completion_length": 95.82366561889648, "epoch": 0.50187265917603, "grad_norm": 0.5745740532875061, "kl": 1.138916015625, "learning_rate": 3.486859640960668e-06, "loss": 0.0456, "reward": 0.2982388660311699, "reward_std": 0.8590549826622009, "rewards/correctness_reward_func": 0.1785714328289032, "rewards/int_reward_func": 0.2522321492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1325647421181202, "step": 134 }, { "completion_length": 91.2567024230957, "epoch": 0.5056179775280899, "grad_norm": 0.6395649313926697, "kl": 1.1143798828125, "learning_rate": 3.4567085809127247e-06, "loss": 0.0446, "reward": 0.3479754514992237, "reward_std": 0.8721490353345871, "rewards/correctness_reward_func": 0.2008928693830967, "rewards/int_reward_func": 0.2466517947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09956920135300606, "step": 135 }, { "completion_length": 92.41295051574707, "epoch": 0.5093632958801498, "grad_norm": 0.6169349551200867, "kl": 1.121826171875, "learning_rate": 3.426393593774591e-06, "loss": 0.0449, "reward": 0.26565179601311684, "reward_std": 0.7934366017580032, "rewards/correctness_reward_func": 0.1339285783469677, "rewards/int_reward_func": 0.2433035857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11158036068081856, "step": 136 }, { "completion_length": 96.50223541259766, "epoch": 0.5131086142322098, "grad_norm": 0.5668321251869202, "kl": 1.08935546875, "learning_rate": 3.39591987386325e-06, "loss": 0.0436, "reward": 0.32920314325019717, "reward_std": 0.7672218978404999, "rewards/correctness_reward_func": 0.20982143515720963, "rewards/int_reward_func": 0.2421875074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12280581146478653, "step": 137 }, { "completion_length": 94.98661231994629, "epoch": 0.5168539325842697, "grad_norm": 0.5555436611175537, "kl": 1.1796875, "learning_rate": 3.3652926426937327e-06, "loss": 0.0472, "reward": 0.3373348340392113, "reward_std": 0.8585019558668137, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2566964440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1157901817932725, "step": 138 }, { "completion_length": 100.43527221679688, "epoch": 0.5205992509363296, "grad_norm": 0.5975056290626526, "kl": 1.116455078125, "learning_rate": 3.3345171480844275e-06, "loss": 0.0447, "reward": 0.33340851217508316, "reward_std": 0.8432840257883072, "rewards/correctness_reward_func": 0.196428582072258, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11190402135252953, "step": 139 }, { "completion_length": 110.37946701049805, "epoch": 0.5243445692883895, "grad_norm": 0.5592173337936401, "kl": 1.2734375, "learning_rate": 3.303598663257904e-06, "loss": 0.0509, "reward": 0.3411696571856737, "reward_std": 0.8827303797006607, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14208929613232613, "step": 140 }, { "completion_length": 104.59375381469727, "epoch": 0.5280898876404494, "grad_norm": 0.5417644381523132, "kl": 1.274169921875, "learning_rate": 3.272542485937369e-06, "loss": 0.051, "reward": 0.367312528192997, "reward_std": 0.7450851798057556, "rewards/correctness_reward_func": 0.20089286752045155, "rewards/int_reward_func": 0.2578125149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09139286354184151, "step": 141 }, { "completion_length": 86.5714340209961, "epoch": 0.5318352059925093, "grad_norm": 0.5099084973335266, "kl": 1.106689453125, "learning_rate": 3.2413539374389275e-06, "loss": 0.0443, "reward": 0.41947099566459656, "reward_std": 0.7897130697965622, "rewards/correctness_reward_func": 0.2187500111758709, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05820759106427431, "step": 142 }, { "completion_length": 97.21875381469727, "epoch": 0.5355805243445693, "grad_norm": 0.41567039489746094, "kl": 1.190185546875, "learning_rate": 3.2100383617598075e-06, "loss": 0.0476, "reward": 0.2790111724752933, "reward_std": 0.7919286489486694, "rewards/correctness_reward_func": 0.16964286752045155, "rewards/int_reward_func": 0.2444196566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13505134359002113, "step": 143 }, { "completion_length": 92.24553871154785, "epoch": 0.5393258426966292, "grad_norm": 0.5126021504402161, "kl": 1.135009765625, "learning_rate": 3.1786011246626858e-06, "loss": 0.0454, "reward": 0.33713172376155853, "reward_std": 0.759757861495018, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2578125074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.09590401872992516, "step": 144 }, { "completion_length": 92.36384391784668, "epoch": 0.5430711610486891, "grad_norm": 0.4743898808956146, "kl": 1.125732421875, "learning_rate": 3.147047612756302e-06, "loss": 0.045, "reward": 0.3338058143854141, "reward_std": 0.7495080679655075, "rewards/correctness_reward_func": 0.1741071492433548, "rewards/int_reward_func": 0.2399553656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.0813727667555213, "step": 145 }, { "completion_length": 96.77902030944824, "epoch": 0.5468164794007491, "grad_norm": 0.5349624156951904, "kl": 1.19140625, "learning_rate": 3.115383232572483e-06, "loss": 0.0476, "reward": 0.3870870769023895, "reward_std": 0.8124971240758896, "rewards/correctness_reward_func": 0.2232142984867096, "rewards/int_reward_func": 0.2533482275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08947544917464256, "step": 146 }, { "completion_length": 100.25670051574707, "epoch": 0.550561797752809, "grad_norm": 0.3959732949733734, "kl": 1.1826171875, "learning_rate": 3.0836134096397642e-06, "loss": 0.0473, "reward": 0.3632299154996872, "reward_std": 0.8887846767902374, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10886830929666758, "step": 147 }, { "completion_length": 95.57366561889648, "epoch": 0.5543071161048689, "grad_norm": 0.5033745765686035, "kl": 1.179931640625, "learning_rate": 3.051743587553754e-06, "loss": 0.0472, "reward": 0.33835939317941666, "reward_std": 0.7949195951223373, "rewards/correctness_reward_func": 0.1607142947614193, "rewards/int_reward_func": 0.2466517947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.07012277067406103, "step": 148 }, { "completion_length": 88.57143211364746, "epoch": 0.5580524344569289, "grad_norm": 0.4301639795303345, "kl": 1.153564453125, "learning_rate": 3.019779227044398e-06, "loss": 0.0462, "reward": 0.26906250417232513, "reward_std": 0.7477044612169266, "rewards/correctness_reward_func": 0.1607142947614193, "rewards/int_reward_func": 0.2578125037252903, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14946428686380386, "step": 149 }, { "completion_length": 104.3437557220459, "epoch": 0.5617977528089888, "grad_norm": 0.49258914589881897, "kl": 1.2353515625, "learning_rate": 2.9877258050403214e-06, "loss": 0.0494, "reward": 0.3209241144359112, "reward_std": 0.821040615439415, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2444196529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11545982770621777, "step": 150 }, { "completion_length": 97.84821891784668, "epoch": 0.5655430711610487, "grad_norm": 0.6680523753166199, "kl": 1.210205078125, "learning_rate": 2.9555888137303695e-06, "loss": 0.0484, "reward": 0.4514397457242012, "reward_std": 0.8471043556928635, "rewards/correctness_reward_func": 0.2455357238650322, "rewards/int_reward_func": 0.2767857238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07088169828057289, "step": 151 }, { "completion_length": 95.99777221679688, "epoch": 0.5692883895131086, "grad_norm": 0.44427061080932617, "kl": 1.162353515625, "learning_rate": 2.9233737596225616e-06, "loss": 0.0465, "reward": 0.3017522394657135, "reward_std": 0.7615446895360947, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2600446566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12347099208272994, "step": 152 }, { "completion_length": 89.79687881469727, "epoch": 0.5730337078651685, "grad_norm": 0.3936914801597595, "kl": 1.1171875, "learning_rate": 2.8910861626005774e-06, "loss": 0.0447, "reward": 0.4248616322875023, "reward_std": 0.8152914345264435, "rewards/correctness_reward_func": 0.2410714328289032, "rewards/int_reward_func": 0.286830373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10304017923772335, "step": 153 }, { "completion_length": 90.57589530944824, "epoch": 0.5767790262172284, "grad_norm": 0.43302392959594727, "kl": 1.1453857421875, "learning_rate": 2.858731554977948e-06, "loss": 0.0458, "reward": 0.3075290396809578, "reward_std": 0.7411210238933563, "rewards/correctness_reward_func": 0.16071429289877415, "rewards/int_reward_func": 0.2410714402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09425670048221946, "step": 154 }, { "completion_length": 88.81920051574707, "epoch": 0.5805243445692884, "grad_norm": 0.4523748457431793, "kl": 1.1572265625, "learning_rate": 2.82631548055013e-06, "loss": 0.0463, "reward": 0.3362343907356262, "reward_std": 0.8344388753175735, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2220982238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10907812882214785, "step": 155 }, { "completion_length": 92.31920051574707, "epoch": 0.5842696629213483, "grad_norm": 0.7492924928665161, "kl": 1.13818359375, "learning_rate": 2.7938434936445946e-06, "loss": 0.0455, "reward": 0.2696942128241062, "reward_std": 0.7281434237957001, "rewards/correctness_reward_func": 0.1785714365541935, "rewards/int_reward_func": 0.2421875074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1510647376999259, "step": 156 }, { "completion_length": 91.05580711364746, "epoch": 0.5880149812734082, "grad_norm": 0.5227993130683899, "kl": 1.126220703125, "learning_rate": 2.761321158169134e-06, "loss": 0.045, "reward": 0.2759977802634239, "reward_std": 0.9335441738367081, "rewards/correctness_reward_func": 0.2098214402794838, "rewards/int_reward_func": 0.2388392984867096, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17377902194857597, "step": 157 }, { "completion_length": 95.73437881469727, "epoch": 0.5917602996254682, "grad_norm": 0.462187260389328, "kl": 1.1458740234375, "learning_rate": 2.7287540466585067e-06, "loss": 0.0458, "reward": 0.3727143071591854, "reward_std": 0.8857483267784119, "rewards/correctness_reward_func": 0.2544642947614193, "rewards/int_reward_func": 0.2656250111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14737500809133053, "step": 158 }, { "completion_length": 93.68750381469727, "epoch": 0.5955056179775281, "grad_norm": 0.5321673154830933, "kl": 1.202392578125, "learning_rate": 2.696147739319613e-06, "loss": 0.0481, "reward": 0.41518306732177734, "reward_std": 0.7417058497667313, "rewards/correctness_reward_func": 0.2276785783469677, "rewards/int_reward_func": 0.2678571492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08035267796367407, "step": 159 }, { "completion_length": 93.1093807220459, "epoch": 0.599250936329588, "grad_norm": 0.4620046615600586, "kl": 1.119873046875, "learning_rate": 2.663507823075358e-06, "loss": 0.0448, "reward": 0.32153796777129173, "reward_std": 0.8575054854154587, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.2500000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13381919264793396, "step": 160 }, { "completion_length": 100.61161041259766, "epoch": 0.602996254681648, "grad_norm": 0.4500925838947296, "kl": 1.206298828125, "learning_rate": 2.6308398906073603e-06, "loss": 0.0483, "reward": 0.3319799229502678, "reward_std": 0.7600451558828354, "rewards/correctness_reward_func": 0.1830357201397419, "rewards/int_reward_func": 0.2544642947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10552009008824825, "step": 161 }, { "completion_length": 97.59598731994629, "epoch": 0.6067415730337079, "grad_norm": 0.4500925838947296, "kl": NaN, "learning_rate": 2.6308398906073603e-06, "loss": 0.0447, "reward": 0.2094486728310585, "reward_std": 0.7504701465368271, "rewards/correctness_reward_func": 0.12500000558793545, "rewards/int_reward_func": 0.2321428693830967, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14881027303636074, "step": 162 }, { "completion_length": 94.28125381469727, "epoch": 0.6104868913857678, "grad_norm": 0.4558067321777344, "kl": 1.154296875, "learning_rate": 2.5981495393976718e-06, "loss": 0.0462, "reward": 0.26968081295490265, "reward_std": 0.8480332493782043, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2410714440047741, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1287567038089037, "step": 163 }, { "completion_length": 98.24107551574707, "epoch": 0.6142322097378277, "grad_norm": 0.4498756229877472, "kl": 1.177734375, "learning_rate": 2.5654423707696834e-06, "loss": 0.0471, "reward": 0.42699556052684784, "reward_std": 0.8207688927650452, "rewards/correctness_reward_func": 0.2544642984867096, "rewards/int_reward_func": 0.2667410783469677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09420982468873262, "step": 164 }, { "completion_length": 93.59152221679688, "epoch": 0.6179775280898876, "grad_norm": 0.40786251425743103, "kl": 1.1708984375, "learning_rate": 2.5327239889283613e-06, "loss": 0.0468, "reward": 0.28490403294563293, "reward_std": 0.7075008153915405, "rewards/correctness_reward_func": 0.1562500037252903, "rewards/int_reward_func": 0.2399553693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11130134668201208, "step": 165 }, { "completion_length": 97.06920051574707, "epoch": 0.6217228464419475, "grad_norm": 0.4559285640716553, "kl": 1.146484375, "learning_rate": 2.5e-06, "loss": 0.0459, "reward": 0.36037053912878036, "reward_std": 0.7710148096084595, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2488839328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09833482303656638, "step": 166 }, { "completion_length": 102.25223731994629, "epoch": 0.6254681647940075, "grad_norm": 0.4517359137535095, "kl": 1.250732421875, "learning_rate": 2.4672760110716395e-06, "loss": 0.05, "reward": 0.3416629731655121, "reward_std": 0.7625188678503036, "rewards/correctness_reward_func": 0.2098214402794838, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12820313312113285, "step": 167 }, { "completion_length": 106.65402412414551, "epoch": 0.6292134831460674, "grad_norm": 0.5974560379981995, "kl": 1.312255859375, "learning_rate": 2.434557629230318e-06, "loss": 0.0525, "reward": 0.3028549253940582, "reward_std": 0.6672599911689758, "rewards/correctness_reward_func": 0.12946429336443543, "rewards/int_reward_func": 0.2622767947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08888616785407066, "step": 168 }, { "completion_length": 101.56696701049805, "epoch": 0.6329588014981273, "grad_norm": 0.5514973402023315, "kl": 1.394287109375, "learning_rate": 2.4018504606023295e-06, "loss": 0.0558, "reward": 0.3376808315515518, "reward_std": 0.7231378108263016, "rewards/correctness_reward_func": 0.1696428693830967, "rewards/int_reward_func": 0.2555803619325161, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08754241955466568, "step": 169 }, { "completion_length": 96.87054061889648, "epoch": 0.6367041198501873, "grad_norm": 0.47777560353279114, "kl": 1.227783203125, "learning_rate": 2.3691601093926406e-06, "loss": 0.0491, "reward": 0.33633705973625183, "reward_std": 0.7033251821994781, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2500000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.08442188054323196, "step": 170 }, { "completion_length": 98.07366371154785, "epoch": 0.6404494382022472, "grad_norm": 0.47777560353279114, "kl": NaN, "learning_rate": 2.3691601093926406e-06, "loss": 0.0531, "reward": 0.35397323966026306, "reward_std": 0.8121795952320099, "rewards/correctness_reward_func": 0.196428582072258, "rewards/int_reward_func": 0.2600446492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10250000539235771, "step": 171 }, { "completion_length": 95.59821891784668, "epoch": 0.6441947565543071, "grad_norm": 0.47777560353279114, "kl": NaN, "learning_rate": 2.3691601093926406e-06, "loss": 0.053, "reward": 0.43456026911735535, "reward_std": 0.9222677648067474, "rewards/correctness_reward_func": 0.2857142984867096, "rewards/int_reward_func": 0.2633928768336773, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1145468857139349, "step": 172 }, { "completion_length": 90.60714721679688, "epoch": 0.6479400749063671, "grad_norm": 0.4840092957019806, "kl": 1.194580078125, "learning_rate": 2.3364921769246423e-06, "loss": 0.0478, "reward": 0.27813393622636795, "reward_std": 0.899843841791153, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17275893315672874, "step": 173 }, { "completion_length": 97.23214912414551, "epoch": 0.651685393258427, "grad_norm": 0.5476865172386169, "kl": 1.3369140625, "learning_rate": 2.3038522606803882e-06, "loss": 0.0535, "reward": 0.3434709906578064, "reward_std": 0.7968785911798477, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2566964402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10518973506987095, "step": 174 }, { "completion_length": 100.55357551574707, "epoch": 0.6554307116104869, "grad_norm": 0.44576528668403625, "kl": 1.319091796875, "learning_rate": 2.271245953341494e-06, "loss": 0.0528, "reward": 0.3407433070242405, "reward_std": 0.7602152675390244, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2488839440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0688549093902111, "step": 175 }, { "completion_length": 94.70089721679688, "epoch": 0.6591760299625468, "grad_norm": 0.4624294340610504, "kl": 1.206298828125, "learning_rate": 2.238678841830867e-06, "loss": 0.0483, "reward": 0.3417031392455101, "reward_std": 0.8133516311645508, "rewards/correctness_reward_func": 0.1919642984867096, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10918973386287689, "step": 176 }, { "completion_length": 90.06473541259766, "epoch": 0.6629213483146067, "grad_norm": 0.4517357349395752, "kl": 1.264404296875, "learning_rate": 2.2061565063554063e-06, "loss": 0.0506, "reward": 0.2323437575250864, "reward_std": 0.7963760495185852, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.2388392984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14935269264969975, "step": 177 }, { "completion_length": 95.54464721679688, "epoch": 0.6666666666666666, "grad_norm": 0.4517357349395752, "kl": NaN, "learning_rate": 2.2061565063554063e-06, "loss": 0.0465, "reward": 0.33185046166181564, "reward_std": 0.7673598080873489, "rewards/correctness_reward_func": 0.15625000558793545, "rewards/int_reward_func": 0.2633928693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08779241424053907, "step": 178 }, { "completion_length": 95.11830711364746, "epoch": 0.6704119850187266, "grad_norm": 0.4517357349395752, "kl": NaN, "learning_rate": 2.2061565063554063e-06, "loss": 0.0485, "reward": 0.3244776912033558, "reward_std": 0.7535363733768463, "rewards/correctness_reward_func": 0.15178572200238705, "rewards/int_reward_func": 0.2533482201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08065625140443444, "step": 179 }, { "completion_length": 92.10937881469727, "epoch": 0.6741573033707865, "grad_norm": 0.43252718448638916, "kl": 1.205078125, "learning_rate": 2.173684519449872e-06, "loss": 0.0482, "reward": 0.3435089588165283, "reward_std": 0.6813161820173264, "rewards/correctness_reward_func": 0.1919642984867096, "rewards/int_reward_func": 0.2354910895228386, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08394643478095531, "step": 180 }, { "completion_length": 92.46652221679688, "epoch": 0.6779026217228464, "grad_norm": 0.4764062166213989, "kl": 1.170166015625, "learning_rate": 2.1412684450220524e-06, "loss": 0.0468, "reward": 0.40853575617074966, "reward_std": 0.8349853605031967, "rewards/correctness_reward_func": 0.2500000111758709, "rewards/int_reward_func": 0.2600446566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10150893498212099, "step": 181 }, { "completion_length": 97.13616371154785, "epoch": 0.6816479400749064, "grad_norm": 0.4394303262233734, "kl": 1.2177734375, "learning_rate": 2.1089138373994226e-06, "loss": 0.0487, "reward": 0.3534955531358719, "reward_std": 0.7782185822725296, "rewards/correctness_reward_func": 0.2008928693830967, "rewards/int_reward_func": 0.271205373108387, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11860269121825695, "step": 182 }, { "completion_length": 93.8035774230957, "epoch": 0.6853932584269663, "grad_norm": 0.4350711405277252, "kl": 1.158935546875, "learning_rate": 2.0766262403774388e-06, "loss": 0.0464, "reward": 0.29088394716382027, "reward_std": 0.8285562247037888, "rewards/correctness_reward_func": 0.1741071455180645, "rewards/int_reward_func": 0.2321428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11536607332527637, "step": 183 }, { "completion_length": 102.04687881469727, "epoch": 0.6891385767790262, "grad_norm": 0.5000995993614197, "kl": 1.3017578125, "learning_rate": 2.0444111862696313e-06, "loss": 0.0521, "reward": 0.33771875873208046, "reward_std": 0.786424919962883, "rewards/correctness_reward_func": 0.1785714328289032, "rewards/int_reward_func": 0.2645089328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10536161065101624, "step": 184 }, { "completion_length": 84.60714530944824, "epoch": 0.6928838951310862, "grad_norm": 0.4194977283477783, "kl": 1.094482421875, "learning_rate": 2.01227419495968e-06, "loss": 0.0438, "reward": 0.2823236584663391, "reward_std": 0.8382576406002045, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2343750149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1261584870517254, "step": 185 }, { "completion_length": 101.93527221679688, "epoch": 0.6966292134831461, "grad_norm": 0.5226723551750183, "kl": 1.428466796875, "learning_rate": 1.9802207729556023e-06, "loss": 0.0571, "reward": 0.3646428808569908, "reward_std": 0.7940028458833694, "rewards/correctness_reward_func": 0.20089286752045155, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1007589353248477, "step": 186 }, { "completion_length": 89.75893211364746, "epoch": 0.700374531835206, "grad_norm": 0.5314286351203918, "kl": 1.265625, "learning_rate": 1.9482564124462478e-06, "loss": 0.0506, "reward": 0.34281474351882935, "reward_std": 0.7245265394449234, "rewards/correctness_reward_func": 0.1875000111758709, "rewards/int_reward_func": 0.2500000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09468527138233185, "step": 187 }, { "completion_length": 89.30357360839844, "epoch": 0.704119850187266, "grad_norm": 0.4597249925136566, "kl": 1.21875, "learning_rate": 1.9163865903602374e-06, "loss": 0.0488, "reward": 0.3371138572692871, "reward_std": 0.7971315979957581, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2410714328289032, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10038616601377726, "step": 188 }, { "completion_length": 98.16071891784668, "epoch": 0.7078651685393258, "grad_norm": 0.4473523497581482, "kl": 1.35888671875, "learning_rate": 1.8846167674275175e-06, "loss": 0.0544, "reward": 0.2626696489751339, "reward_std": 0.6907573491334915, "rewards/correctness_reward_func": 0.1383928619325161, "rewards/int_reward_func": 0.2555803619325161, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13130357582122087, "step": 189 }, { "completion_length": 102.61830711364746, "epoch": 0.7116104868913857, "grad_norm": 0.4473523497581482, "kl": NaN, "learning_rate": 1.8846167674275175e-06, "loss": 0.054, "reward": 0.3266384117305279, "reward_std": 0.8520393073558807, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2600446566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12537054996937513, "step": 190 }, { "completion_length": 98.22098731994629, "epoch": 0.7153558052434457, "grad_norm": 0.6242156624794006, "kl": 1.2861328125, "learning_rate": 1.852952387243698e-06, "loss": 0.0514, "reward": 0.2729821652173996, "reward_std": 0.7455658465623856, "rewards/correctness_reward_func": 0.1785714365541935, "rewards/int_reward_func": 0.2343750074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1399642862379551, "step": 191 }, { "completion_length": 95.50446701049805, "epoch": 0.7191011235955056, "grad_norm": 0.5410141348838806, "kl": 1.269287109375, "learning_rate": 1.8213988753373147e-06, "loss": 0.0508, "reward": 0.3193794898688793, "reward_std": 0.8921961933374405, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2488839402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14825446717441082, "step": 192 }, { "completion_length": 95.50893211364746, "epoch": 0.7228464419475655, "grad_norm": 0.5410141348838806, "kl": NaN, "learning_rate": 1.8213988753373147e-06, "loss": 0.0494, "reward": 0.4274576008319855, "reward_std": 0.8424459546804428, "rewards/correctness_reward_func": 0.2500000074505806, "rewards/int_reward_func": 0.2555803656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07812276761978865, "step": 193 }, { "completion_length": 100.1406307220459, "epoch": 0.7265917602996255, "grad_norm": 0.5630261301994324, "kl": 1.331298828125, "learning_rate": 1.7899616382401935e-06, "loss": 0.0532, "reward": 0.24447321146726608, "reward_std": 0.8442487269639969, "rewards/correctness_reward_func": 0.1517857201397419, "rewards/int_reward_func": 0.2455357275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15284822601825, "step": 194 }, { "completion_length": 98.96429061889648, "epoch": 0.7303370786516854, "grad_norm": 0.5630261301994324, "kl": NaN, "learning_rate": 1.7899616382401935e-06, "loss": 0.0485, "reward": 0.3635468855500221, "reward_std": 0.8013804405927658, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2544642984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07841741375159472, "step": 195 }, { "completion_length": 96.05804252624512, "epoch": 0.7340823970037453, "grad_norm": 0.4605935513973236, "kl": 1.296875, "learning_rate": 1.758646062561073e-06, "loss": 0.0519, "reward": 0.30311162024736404, "reward_std": 0.7245951294898987, "rewards/correctness_reward_func": 0.13839286379516125, "rewards/int_reward_func": 0.2433035783469677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07858482515439391, "step": 196 }, { "completion_length": 99.58036041259766, "epoch": 0.7378277153558053, "grad_norm": 0.5969142913818359, "kl": 1.305419921875, "learning_rate": 1.7274575140626318e-06, "loss": 0.0522, "reward": 0.26581921428442, "reward_std": 0.8090188354253769, "rewards/correctness_reward_func": 0.15625000931322575, "rewards/int_reward_func": 0.2310267984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12145759037230164, "step": 197 }, { "completion_length": 91.8995590209961, "epoch": 0.7415730337078652, "grad_norm": 0.527574360370636, "kl": 1.245849609375, "learning_rate": 1.6964013367420967e-06, "loss": 0.0498, "reward": 0.38672323897480965, "reward_std": 0.7834379523992538, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2689732238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10546428337693214, "step": 198 }, { "completion_length": 98.9375057220459, "epoch": 0.7453183520599251, "grad_norm": 0.5717042684555054, "kl": 1.277587890625, "learning_rate": 1.665482851915573e-06, "loss": 0.0511, "reward": 0.24759376049041748, "reward_std": 0.6963834911584854, "rewards/correctness_reward_func": 0.1116071492433548, "rewards/int_reward_func": 0.2343750111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09838839736767113, "step": 199 }, { "completion_length": 96.61830711364746, "epoch": 0.7490636704119851, "grad_norm": 0.5991680026054382, "kl": 1.228271484375, "learning_rate": 1.634707357306267e-06, "loss": 0.0491, "reward": 0.3660937622189522, "reward_std": 0.8573340475559235, "rewards/correctness_reward_func": 0.2366071492433548, "rewards/int_reward_func": 0.2488839365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11939733009785414, "step": 200 }, { "completion_length": 106.84152221679688, "epoch": 0.7528089887640449, "grad_norm": 0.5178794860839844, "kl": 1.34326171875, "learning_rate": 1.6040801261367494e-06, "loss": 0.0537, "reward": 0.3160022422671318, "reward_std": 0.8164055794477463, "rewards/correctness_reward_func": 0.196428582072258, "rewards/int_reward_func": 0.2622767984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14270313642919064, "step": 201 }, { "completion_length": 90.1875057220459, "epoch": 0.7565543071161048, "grad_norm": 0.5120040774345398, "kl": 1.171142578125, "learning_rate": 1.5736064062254094e-06, "loss": 0.0468, "reward": 0.2789843790233135, "reward_std": 0.8016841560602188, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12838170863687992, "step": 202 }, { "completion_length": 90.62277030944824, "epoch": 0.7602996254681648, "grad_norm": 0.4355852007865906, "kl": 1.2255859375, "learning_rate": 1.5432914190872757e-06, "loss": 0.049, "reward": 0.34145762026309967, "reward_std": 0.745452344417572, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2645089402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09269420150667429, "step": 203 }, { "completion_length": 96.64955711364746, "epoch": 0.7640449438202247, "grad_norm": 0.49031803011894226, "kl": 1.14404296875, "learning_rate": 1.5131403590393323e-06, "loss": 0.0458, "reward": 0.3124375157058239, "reward_std": 0.7880858033895493, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2343750074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10943750524893403, "step": 204 }, { "completion_length": 90.63616561889648, "epoch": 0.7677902621722846, "grad_norm": 0.48222535848617554, "kl": 1.188720703125, "learning_rate": 1.4831583923105e-06, "loss": 0.0475, "reward": 0.372580386698246, "reward_std": 0.8582592159509659, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2522321566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0984017881564796, "step": 205 }, { "completion_length": 99.03348922729492, "epoch": 0.7715355805243446, "grad_norm": 0.4410792589187622, "kl": 1.247314453125, "learning_rate": 1.4533506561564305e-06, "loss": 0.0499, "reward": 0.33057814463973045, "reward_std": 0.7788708359003067, "rewards/correctness_reward_func": 0.2142857238650322, "rewards/int_reward_func": 0.2433035857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1270111622288823, "step": 206 }, { "completion_length": 103.51786041259766, "epoch": 0.7752808988764045, "grad_norm": 0.5613678693771362, "kl": 1.181640625, "learning_rate": 1.4237222579792618e-06, "loss": 0.0473, "reward": 0.26221875846385956, "reward_std": 0.7947122156620026, "rewards/correctness_reward_func": 0.14732143469154835, "rewards/int_reward_func": 0.2421875074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12729018926620483, "step": 207 }, { "completion_length": 92.02232551574707, "epoch": 0.7790262172284644, "grad_norm": 0.6175336241722107, "kl": 1.016357421875, "learning_rate": 1.3942782744524974e-06, "loss": 0.0406, "reward": 0.2860134020447731, "reward_std": 0.7233386188745499, "rewards/correctness_reward_func": 0.1339285783469677, "rewards/int_reward_func": 0.2276785783469677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.075593750923872, "step": 208 }, { "completion_length": 103.3125057220459, "epoch": 0.7827715355805244, "grad_norm": 0.4679234027862549, "kl": 1.18408203125, "learning_rate": 1.3650237506511333e-06, "loss": 0.0474, "reward": 0.28701116889715195, "reward_std": 0.8643300235271454, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2578125111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18062277510762215, "step": 209 }, { "completion_length": 102.34598541259766, "epoch": 0.7865168539325843, "grad_norm": 0.5165115594863892, "kl": 1.170654296875, "learning_rate": 1.3359636991872215e-06, "loss": 0.0468, "reward": 0.30597545951604843, "reward_std": 0.6952601373195648, "rewards/correctness_reward_func": 0.1428571492433548, "rewards/int_reward_func": 0.2500000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08688170462846756, "step": 210 }, { "completion_length": 114.28572082519531, "epoch": 0.7902621722846442, "grad_norm": 0.4335125982761383, "kl": 1.242919921875, "learning_rate": 1.307103099350979e-06, "loss": 0.0497, "reward": 0.2512656319886446, "reward_std": 0.8131757378578186, "rewards/correctness_reward_func": 0.1517857201397419, "rewards/int_reward_func": 0.2377232238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1382433008402586, "step": 211 }, { "completion_length": 101.02009391784668, "epoch": 0.7940074906367042, "grad_norm": 0.5146499276161194, "kl": 1.20947265625, "learning_rate": 1.2784468962576136e-06, "loss": 0.0484, "reward": 0.27424776926636696, "reward_std": 0.76705102622509, "rewards/correctness_reward_func": 0.13392858020961285, "rewards/int_reward_func": 0.2377232201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09740401990711689, "step": 212 }, { "completion_length": 108.00000381469727, "epoch": 0.797752808988764, "grad_norm": 0.46153557300567627, "kl": 1.201416015625, "learning_rate": 1.2500000000000007e-06, "loss": 0.048, "reward": 0.2503928691148758, "reward_std": 0.8307089358568192, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2410714365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1781785748898983, "step": 213 }, { "completion_length": 93.59598731994629, "epoch": 0.8014981273408239, "grad_norm": 0.49990415573120117, "kl": 1.1107177734375, "learning_rate": 1.2217672848073702e-06, "loss": 0.0444, "reward": 0.3425290137529373, "reward_std": 0.8552941530942917, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2645089477300644, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1418459787964821, "step": 214 }, { "completion_length": 100.52009201049805, "epoch": 0.8052434456928839, "grad_norm": 0.46211493015289307, "kl": 1.141845703125, "learning_rate": 1.193753588210128e-06, "loss": 0.0457, "reward": 0.2656339444220066, "reward_std": 0.7493345886468887, "rewards/correctness_reward_func": 0.15625000931322575, "rewards/int_reward_func": 0.2767857275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1674017831683159, "step": 215 }, { "completion_length": 90.06473541259766, "epoch": 0.8089887640449438, "grad_norm": 0.4925229549407959, "kl": 1.171630859375, "learning_rate": 1.1659637102109713e-06, "loss": 0.0469, "reward": 0.31793973594903946, "reward_std": 0.8032208532094955, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2433035857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09947098419070244, "step": 216 }, { "completion_length": 93.21428871154785, "epoch": 0.8127340823970037, "grad_norm": 0.4107387363910675, "kl": 1.096923828125, "learning_rate": 1.1384024124624324e-06, "loss": 0.0439, "reward": 0.2808660827577114, "reward_std": 0.7595269531011581, "rewards/correctness_reward_func": 0.19642857648432255, "rewards/int_reward_func": 0.243303582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15886607952415943, "step": 217 }, { "completion_length": 107.45982551574707, "epoch": 0.8164794007490637, "grad_norm": 0.4463358521461487, "kl": 1.247802734375, "learning_rate": 1.1110744174509952e-06, "loss": 0.0499, "reward": 0.27611831203103065, "reward_std": 0.8640467375516891, "rewards/correctness_reward_func": 0.2187500074505806, "rewards/int_reward_func": 0.2444196492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18705134466290474, "step": 218 }, { "completion_length": 96.86384582519531, "epoch": 0.8202247191011236, "grad_norm": 0.5301110148429871, "kl": 1.199951171875, "learning_rate": 1.0839844076879186e-06, "loss": 0.048, "reward": 0.31224555149674416, "reward_std": 0.7878952473402023, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.258928582072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15204017609357834, "step": 219 }, { "completion_length": 91.19643211364746, "epoch": 0.8239700374531835, "grad_norm": 0.4537685811519623, "kl": 1.101806640625, "learning_rate": 1.0571370249069163e-06, "loss": 0.0441, "reward": 0.3926495686173439, "reward_std": 0.8612103760242462, "rewards/correctness_reward_func": 0.2008928693830967, "rewards/int_reward_func": 0.2611607313156128, "rewards/soft_format_reward_func": 0.0011160714784637094, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07052009226754308, "step": 220 }, { "completion_length": 90.3125057220459, "epoch": 0.8277153558052435, "grad_norm": 0.4537685811519623, "kl": NaN, "learning_rate": 1.0571370249069163e-06, "loss": 0.0418, "reward": 0.28333261236548424, "reward_std": 0.7591045498847961, "rewards/correctness_reward_func": 0.1562500074505806, "rewards/int_reward_func": 0.2455357275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11845313012599945, "step": 221 }, { "completion_length": 90.84152030944824, "epoch": 0.8314606741573034, "grad_norm": 0.5334520936012268, "kl": 1.140625, "learning_rate": 1.0305368692688175e-06, "loss": 0.0456, "reward": 0.305205374956131, "reward_std": 0.7375971227884293, "rewards/correctness_reward_func": 0.1830357164144516, "rewards/int_reward_func": 0.2310267984867096, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10885715018957853, "step": 222 }, { "completion_length": 93.97098541259766, "epoch": 0.8352059925093633, "grad_norm": 0.4208020269870758, "kl": 1.108154296875, "learning_rate": 1.0041884985733524e-06, "loss": 0.0443, "reward": 0.339029036462307, "reward_std": 0.7930542379617691, "rewards/correctness_reward_func": 0.2053571492433548, "rewards/int_reward_func": 0.2287946529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09512277226895094, "step": 223 }, { "completion_length": 102.49107551574707, "epoch": 0.8389513108614233, "grad_norm": 0.41976016759872437, "kl": 1.1943359375, "learning_rate": 9.780964274781984e-07, "loss": 0.0478, "reward": 0.23499107360839844, "reward_std": 0.7919187396764755, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.219866082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15005357982590795, "step": 224 }, { "completion_length": 88.04464912414551, "epoch": 0.8426966292134831, "grad_norm": 0.41976016759872437, "kl": NaN, "learning_rate": 9.780964274781984e-07, "loss": 0.0464, "reward": 0.20726785995066166, "reward_std": 0.7513840273022652, "rewards/correctness_reward_func": 0.10714286379516125, "rewards/int_reward_func": 0.219866082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11974107846617699, "step": 225 }, { "completion_length": 99.45536422729492, "epoch": 0.846441947565543, "grad_norm": 0.6243520975112915, "kl": 1.137939453125, "learning_rate": 9.522651267254149e-07, "loss": 0.0455, "reward": 0.3012098353356123, "reward_std": 0.7535159438848495, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2287946566939354, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09722768981009722, "step": 226 }, { "completion_length": 88.08928871154785, "epoch": 0.850187265917603, "grad_norm": 0.5370275974273682, "kl": 1.02734375, "learning_rate": 9.266990223754069e-07, "loss": 0.0411, "reward": 0.3909241184592247, "reward_std": 0.7717972099781036, "rewards/correctness_reward_func": 0.1964285783469677, "rewards/int_reward_func": 0.2511160857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05662053730338812, "step": 227 }, { "completion_length": 92.60937881469727, "epoch": 0.8539325842696629, "grad_norm": 0.4681946337223053, "kl": 1.1201171875, "learning_rate": 9.014024950485384e-07, "loss": 0.0448, "reward": 0.3636852651834488, "reward_std": 0.8009557723999023, "rewards/correctness_reward_func": 0.2053571566939354, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09278795216232538, "step": 228 }, { "completion_length": 105.85491371154785, "epoch": 0.8576779026217228, "grad_norm": 0.4945945143699646, "kl": 1.174072265625, "learning_rate": 8.763798791745413e-07, "loss": 0.047, "reward": 0.3037031330168247, "reward_std": 0.8670637309551239, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2745535857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17174331843852997, "step": 229 }, { "completion_length": 94.59152221679688, "epoch": 0.8614232209737828, "grad_norm": 0.5289459228515625, "kl": 1.0704345703125, "learning_rate": 8.516354622498279e-07, "loss": 0.0428, "reward": 0.3705156408250332, "reward_std": 0.8742925226688385, "rewards/correctness_reward_func": 0.223214291036129, "rewards/int_reward_func": 0.2488839365541935, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10158259607851505, "step": 230 }, { "completion_length": 86.8683090209961, "epoch": 0.8651685393258427, "grad_norm": 0.4612804353237152, "kl": 1.0635986328125, "learning_rate": 8.271734841028553e-07, "loss": 0.0425, "reward": 0.32227009534835815, "reward_std": 0.7606519907712936, "rewards/correctness_reward_func": 0.14285715110599995, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08063170197419822, "step": 231 }, { "completion_length": 109.55357551574707, "epoch": 0.8689138576779026, "grad_norm": 0.5144924521446228, "kl": 1.314208984375, "learning_rate": 8.029981361676456e-07, "loss": 0.0526, "reward": 0.28595758974552155, "reward_std": 0.8525267541408539, "rewards/correctness_reward_func": 0.2098214440047741, "rewards/int_reward_func": 0.2611607238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.185024568811059, "step": 232 }, { "completion_length": 98.18303871154785, "epoch": 0.8726591760299626, "grad_norm": 0.5144924521446228, "kl": NaN, "learning_rate": 8.029981361676456e-07, "loss": 0.0476, "reward": 0.2972254566848278, "reward_std": 0.7380311787128448, "rewards/correctness_reward_func": 0.1473214365541935, "rewards/int_reward_func": 0.2433035857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09339955315226689, "step": 233 }, { "completion_length": 98.16741371154785, "epoch": 0.8764044943820225, "grad_norm": 0.5009276270866394, "kl": 1.244384765625, "learning_rate": 7.791135607656147e-07, "loss": 0.0498, "reward": 0.3269129544496536, "reward_std": 0.7167427837848663, "rewards/correctness_reward_func": 0.16071429662406445, "rewards/int_reward_func": 0.2500000111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08380134031176567, "step": 234 }, { "completion_length": 97.88616371154785, "epoch": 0.8801498127340824, "grad_norm": 0.5268076062202454, "kl": 1.235107421875, "learning_rate": 7.555238503958001e-07, "loss": 0.0494, "reward": 0.3055223375558853, "reward_std": 0.8659389615058899, "rewards/correctness_reward_func": 0.1919642947614193, "rewards/int_reward_func": 0.2500000074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13644197303801775, "step": 235 }, { "completion_length": 88.23661041259766, "epoch": 0.8838951310861424, "grad_norm": 0.5268076062202454, "kl": NaN, "learning_rate": 7.555238503958001e-07, "loss": 0.0497, "reward": 0.3386116325855255, "reward_std": 0.7405965030193329, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2500000186264515, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07656696811318398, "step": 236 }, { "completion_length": 92.50000381469727, "epoch": 0.8876404494382022, "grad_norm": 0.49278637766838074, "kl": 1.129150390625, "learning_rate": 7.322330470336314e-07, "loss": 0.0452, "reward": 0.265178584959358, "reward_std": 0.866941437125206, "rewards/correctness_reward_func": 0.16517857648432255, "rewards/int_reward_func": 0.2477678693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14776786230504513, "step": 237 }, { "completion_length": 95.01339912414551, "epoch": 0.8913857677902621, "grad_norm": 0.46395623683929443, "kl": 1.181640625, "learning_rate": 7.092451414383644e-07, "loss": 0.0473, "reward": 0.2845067009329796, "reward_std": 0.8498467355966568, "rewards/correctness_reward_func": 0.1696428693830967, "rewards/int_reward_func": 0.2555803656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.14183259941637516, "step": 238 }, { "completion_length": 99.3593807220459, "epoch": 0.8951310861423221, "grad_norm": 0.6616207957267761, "kl": 1.17919921875, "learning_rate": 6.865640724692815e-07, "loss": 0.0472, "reward": 0.3268973380327225, "reward_std": 0.8011666536331177, "rewards/correctness_reward_func": 0.2008928656578064, "rewards/int_reward_func": 0.2500000149011612, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12399554438889027, "step": 239 }, { "completion_length": 104.7745590209961, "epoch": 0.898876404494382, "grad_norm": 0.5387265086174011, "kl": 1.183349609375, "learning_rate": 6.641937264107868e-07, "loss": 0.0473, "reward": 0.4128861799836159, "reward_std": 0.850861206650734, "rewards/correctness_reward_func": 0.2232142947614193, "rewards/int_reward_func": 0.2734375074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08376562781631947, "step": 240 }, { "completion_length": 92.34152221679688, "epoch": 0.9026217228464419, "grad_norm": 0.5197759866714478, "kl": 1.0531005859375, "learning_rate": 6.421379363065142e-07, "loss": 0.0421, "reward": 0.35036832839250565, "reward_std": 0.8599109500646591, "rewards/correctness_reward_func": 0.2187500149011612, "rewards/int_reward_func": 0.2645089440047741, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13289063051342964, "step": 241 }, { "completion_length": 111.65402030944824, "epoch": 0.9063670411985019, "grad_norm": 0.46235111355781555, "kl": 1.236572265625, "learning_rate": 6.204004813025569e-07, "loss": 0.0495, "reward": 0.3619754686951637, "reward_std": 0.7729392051696777, "rewards/correctness_reward_func": 0.2187500037252903, "rewards/int_reward_func": 0.2589285857975483, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11570313014090061, "step": 242 }, { "completion_length": 103.99107551574707, "epoch": 0.9101123595505618, "grad_norm": 0.5195502638816833, "kl": 1.19384765625, "learning_rate": 5.989850859999227e-07, "loss": 0.0477, "reward": 0.2594174239784479, "reward_std": 0.7237197905778885, "rewards/correctness_reward_func": 0.1473214328289032, "rewards/int_reward_func": 0.2522321529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.14013616926968098, "step": 243 }, { "completion_length": 93.28571891784668, "epoch": 0.9138576779026217, "grad_norm": 0.4491368234157562, "kl": 1.1376953125, "learning_rate": 5.778954198163514e-07, "loss": 0.0455, "reward": 0.2785803731530905, "reward_std": 0.6952795684337616, "rewards/correctness_reward_func": 0.160714291036129, "rewards/int_reward_func": 0.2533482275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13548214174807072, "step": 244 }, { "completion_length": 95.37500381469727, "epoch": 0.9176029962546817, "grad_norm": 0.4656703770160675, "kl": 1.113037109375, "learning_rate": 5.571350963575728e-07, "loss": 0.0445, "reward": 0.3266986757516861, "reward_std": 0.7986108660697937, "rewards/correctness_reward_func": 0.1830357238650322, "rewards/int_reward_func": 0.2421875111758709, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.09964063111692667, "step": 245 }, { "completion_length": 100.66964912414551, "epoch": 0.9213483146067416, "grad_norm": 0.48842382431030273, "kl": 1.119873046875, "learning_rate": 5.367076727981383e-07, "loss": 0.0448, "reward": 0.21152456477284431, "reward_std": 0.7304975092411041, "rewards/correctness_reward_func": 0.12053571827709675, "rewards/int_reward_func": 0.2064732275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11548437923192978, "step": 246 }, { "completion_length": 95.72991561889648, "epoch": 0.9250936329588015, "grad_norm": 0.48145803809165955, "kl": 1.124755859375, "learning_rate": 5.166166492719124e-07, "loss": 0.045, "reward": 0.2994174249470234, "reward_std": 0.7895640283823013, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2377232275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10348437912762165, "step": 247 }, { "completion_length": 101.8192024230957, "epoch": 0.9288389513108615, "grad_norm": 0.4706651568412781, "kl": 1.165283203125, "learning_rate": 4.968654682723487e-07, "loss": 0.0466, "reward": 0.28485044091939926, "reward_std": 0.9133375287055969, "rewards/correctness_reward_func": 0.2098214365541935, "rewards/int_reward_func": 0.2321428656578064, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15711384266614914, "step": 248 }, { "completion_length": 91.77902030944824, "epoch": 0.9325842696629213, "grad_norm": 0.9746862053871155, "kl": 1.063232421875, "learning_rate": 4.774575140626317e-07, "loss": 0.0425, "reward": 0.3371250182390213, "reward_std": 0.775251716375351, "rewards/correctness_reward_func": 0.1696428656578064, "rewards/int_reward_func": 0.2433035783469677, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07582143507897854, "step": 249 }, { "completion_length": 93.1718807220459, "epoch": 0.9363295880149812, "grad_norm": 0.4796634316444397, "kl": 1.09228515625, "learning_rate": 4.5839611209580277e-07, "loss": 0.0437, "reward": 0.42918528616428375, "reward_std": 0.7719597369432449, "rewards/correctness_reward_func": 0.2008928693830967, "rewards/int_reward_func": 0.2767857201397419, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04849330266006291, "step": 250 }, { "completion_length": 86.16964721679688, "epoch": 0.9400749063670412, "grad_norm": 0.4254949986934662, "kl": 1.08740234375, "learning_rate": 4.396845284449608e-07, "loss": 0.0435, "reward": 0.27672769874334335, "reward_std": 0.7562145739793777, "rewards/correctness_reward_func": 0.1383928619325161, "rewards/int_reward_func": 0.251116082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11278124991804361, "step": 251 }, { "completion_length": 101.90402221679688, "epoch": 0.9438202247191011, "grad_norm": 0.4744266867637634, "kl": 1.144775390625, "learning_rate": 4.2132596924363666e-07, "loss": 0.0458, "reward": 0.2469821460545063, "reward_std": 0.840282753109932, "rewards/correctness_reward_func": 0.1785714365541935, "rewards/int_reward_func": 0.2410714402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17266072891652584, "step": 252 }, { "completion_length": 118.7723274230957, "epoch": 0.947565543071161, "grad_norm": 0.4241223633289337, "kl": 1.21875, "learning_rate": 4.033235801364402e-07, "loss": 0.0488, "reward": 0.15625000838190317, "reward_std": 0.7448219954967499, "rewards/correctness_reward_func": 0.1205357238650322, "rewards/int_reward_func": 0.2209821529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.1863839291036129, "step": 253 }, { "completion_length": 101.55580711364746, "epoch": 0.951310861423221, "grad_norm": 0.5570555925369263, "kl": 1.070068359375, "learning_rate": 3.85680445740067e-07, "loss": 0.0428, "reward": 0.23607589676976204, "reward_std": 0.7750666290521622, "rewards/correctness_reward_func": 0.1116071492433548, "rewards/int_reward_func": 0.2287946529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10432589706033468, "step": 254 }, { "completion_length": 95.45089721679688, "epoch": 0.9550561797752809, "grad_norm": 0.511661946773529, "kl": 1.1142578125, "learning_rate": 3.683995891147696e-07, "loss": 0.0446, "reward": 0.36682143807411194, "reward_std": 0.7813057452440262, "rewards/correctness_reward_func": 0.2053571529686451, "rewards/int_reward_func": 0.2600446529686451, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0985803622752428, "step": 255 }, { "completion_length": 101.62277030944824, "epoch": 0.9588014981273408, "grad_norm": 0.4439486563205719, "kl": 1.134765625, "learning_rate": 3.514839712463683e-07, "loss": 0.0454, "reward": 0.30028797313570976, "reward_std": 0.8320818990468979, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2455357238650322, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11042634584009647, "step": 256 }, { "completion_length": 93.14062881469727, "epoch": 0.9625468164794008, "grad_norm": 0.5691079497337341, "kl": 1.0438232421875, "learning_rate": 3.3493649053890325e-07, "loss": 0.0418, "reward": 0.31014733761548996, "reward_std": 0.7862526774406433, "rewards/correctness_reward_func": 0.1741071529686451, "rewards/int_reward_func": 0.2399553693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10391518194228411, "step": 257 }, { "completion_length": 102.33705711364746, "epoch": 0.9662921348314607, "grad_norm": 0.5333299040794373, "kl": 1.126220703125, "learning_rate": 3.187599823180071e-07, "loss": 0.045, "reward": 0.32864734157919884, "reward_std": 0.7325298935174942, "rewards/correctness_reward_func": 0.1473214328289032, "rewards/int_reward_func": 0.2578125074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0764866080135107, "step": 258 }, { "completion_length": 93.04911231994629, "epoch": 0.9700374531835206, "grad_norm": 0.5367720127105713, "kl": 1.045654296875, "learning_rate": 3.0295721834508686e-07, "loss": 0.0418, "reward": 0.39304019510746, "reward_std": 0.9003488570451736, "rewards/correctness_reward_func": 0.254464291036129, "rewards/int_reward_func": 0.2678571492433548, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12928125727921724, "step": 259 }, { "completion_length": 91.97545051574707, "epoch": 0.9737827715355806, "grad_norm": 0.4951762557029724, "kl": 1.07470703125, "learning_rate": 2.875309063423956e-07, "loss": 0.043, "reward": 0.21667636185884476, "reward_std": 0.8131074160337448, "rewards/correctness_reward_func": 0.1294642947614193, "rewards/int_reward_func": 0.2220982275903225, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.13600223883986473, "step": 260 }, { "completion_length": 92.67634391784668, "epoch": 0.9775280898876404, "grad_norm": 0.5502758622169495, "kl": 1.0794677734375, "learning_rate": 2.7248368952908055e-07, "loss": 0.0432, "reward": 0.32996875420212746, "reward_std": 0.8287549465894699, "rewards/correctness_reward_func": 0.1785714328289032, "rewards/int_reward_func": 0.2723214402794838, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0011160714784637094, "rewards/xmlcount_reward_func": -0.12204018794000149, "step": 261 }, { "completion_length": 100.24777221679688, "epoch": 0.9812734082397003, "grad_norm": 0.5502758622169495, "kl": NaN, "learning_rate": 2.7248368952908055e-07, "loss": 0.0487, "reward": 0.3751874938607216, "reward_std": 0.7956403493881226, "rewards/correctness_reward_func": 0.2276785857975483, "rewards/int_reward_func": 0.266741082072258, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11923214513808489, "step": 262 }, { "completion_length": 94.25000381469727, "epoch": 0.9850187265917603, "grad_norm": 0.5013113617897034, "kl": 1.0750732421875, "learning_rate": 2.5781814616827936e-07, "loss": 0.043, "reward": 0.30204688012599945, "reward_std": 0.8205768465995789, "rewards/correctness_reward_func": 0.1875000074505806, "rewards/int_reward_func": 0.2466517947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13210491463541985, "step": 263 }, { "completion_length": 88.36830711364746, "epoch": 0.9887640449438202, "grad_norm": 0.55992192029953, "kl": 1.045166015625, "learning_rate": 2.43536789125349e-07, "loss": 0.0418, "reward": 0.27569420635700226, "reward_std": 0.8564379215240479, "rewards/correctness_reward_func": 0.1651785783469677, "rewards/int_reward_func": 0.2466517947614193, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13613616861402988, "step": 264 }, { "completion_length": 89.41071701049805, "epoch": 0.9925093632958801, "grad_norm": 0.55992192029953, "kl": NaN, "learning_rate": 2.43536789125349e-07, "loss": 0.0415, "reward": 0.3237812668085098, "reward_std": 0.8230260014533997, "rewards/correctness_reward_func": 0.1785714402794838, "rewards/int_reward_func": 0.2656250074505806, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12041518278419971, "step": 265 }, { "completion_length": 105.87500381469727, "epoch": 0.9962546816479401, "grad_norm": 0.6678707003593445, "kl": 1.279541015625, "learning_rate": 2.2964206543729662e-07, "loss": 0.0512, "reward": 0.34079688787460327, "reward_std": 0.805058628320694, "rewards/correctness_reward_func": 0.196428582072258, "rewards/int_reward_func": 0.2399553693830967, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09558706358075142, "step": 266 }, { "completion_length": 102.1875, "epoch": 1.0, "grad_norm": 0.5808861255645752, "kl": 1.051025390625, "learning_rate": 2.1613635589349756e-07, "loss": 0.0441, "reward": 0.27787497639656067, "reward_std": 0.8605497926473618, "rewards/correctness_reward_func": 0.25, "rewards/int_reward_func": 0.28125, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2533750110305846, "step": 267 }, { "epoch": 1.0, "step": 267, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 1.6097, "train_samples_per_second": 4642.518, "train_steps_per_second": 165.871 } ], "logging_steps": 1, "max_steps": 267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 54, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }