{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998183799491464, "eval_steps": 500, "global_step": 1835, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 835.8389282226562, "epoch": 0.0005448601525608427, "grad_norm": 0.11362240463495255, "kl": 0.0, "learning_rate": 1.0869565217391305e-07, "loss": 0.0, "reward": 0.08333333643774192, "reward_std": 0.08660254192849, "rewards/accuracy_reward": 0.08333333643774192, "step": 1 }, { "completion_length": 832.3395566587095, "epoch": 0.005448601525608427, "grad_norm": 0.15013852715492249, "kl": 0.000282216955114294, "learning_rate": 1.0869565217391306e-06, "loss": 0.0, "reward": 0.09629630283625037, "reward_std": 0.11440088896563759, "rewards/accuracy_reward": 0.09629630283625037, "step": 10 }, { "completion_length": 836.4605936686198, "epoch": 0.010897203051216855, "grad_norm": 0.1638469398021698, "kl": 0.0003055572509765625, "learning_rate": 2.173913043478261e-06, "loss": 0.0, "reward": 0.10555556149532398, "reward_std": 0.11643230679134528, "rewards/accuracy_reward": 0.10555556149532398, "step": 20 }, { "completion_length": 861.1105936686198, "epoch": 0.01634580457682528, "grad_norm": 0.13830411434173584, "kl": 0.0004775683085123698, "learning_rate": 3.2608695652173914e-06, "loss": 0.0, "reward": 0.10111111688117186, "reward_std": 0.11354555537303289, "rewards/accuracy_reward": 0.10111111688117186, "step": 30 }, { "completion_length": 836.8517130533854, "epoch": 0.02179440610243371, "grad_norm": 0.14476309716701508, "kl": 0.0010023752848307291, "learning_rate": 4.347826086956522e-06, "loss": 0.0, "reward": 0.09555556103587151, "reward_std": 0.12220580745488405, "rewards/accuracy_reward": 0.09555556103587151, "step": 40 }, { "completion_length": 860.2317138671875, "epoch": 0.027243007628042135, "grad_norm": 0.2523486614227295, "kl": 0.00617218017578125, "learning_rate": 5.4347826086956525e-06, "loss": 0.0002, "reward": 0.1100000058611234, "reward_std": 0.13567731541891892, "rewards/accuracy_reward": 0.1100000058611234, "step": 50 }, { "completion_length": 824.0989339192708, "epoch": 0.03269160915365056, "grad_norm": 0.7656663060188293, "kl": 0.2400385538736979, "learning_rate": 6.521739130434783e-06, "loss": 0.0096, "reward": 0.12222222927957774, "reward_std": 0.13086606257905561, "rewards/accuracy_reward": 0.12222222927957774, "step": 60 }, { "completion_length": 804.6511555989583, "epoch": 0.03814021067925899, "grad_norm": 0.1682807207107544, "kl": 2.9306009928385417, "learning_rate": 7.608695652173914e-06, "loss": 0.1172, "reward": 0.12055556221554677, "reward_std": 0.13086606518675883, "rewards/accuracy_reward": 0.12055556221554677, "step": 70 }, { "completion_length": 809.376708984375, "epoch": 0.04358881220486742, "grad_norm": 0.18487082421779633, "kl": 0.050923665364583336, "learning_rate": 8.695652173913044e-06, "loss": 0.002, "reward": 0.11666667331010103, "reward_std": 0.13471506604303915, "rewards/accuracy_reward": 0.11666667331010103, "step": 80 }, { "completion_length": 845.8022705078125, "epoch": 0.04903741373047584, "grad_norm": 0.11119978129863739, "kl": 0.06029459635416667, "learning_rate": 9.782608695652175e-06, "loss": 0.0024, "reward": 0.10888889469206334, "reward_std": 0.13375281381110352, "rewards/accuracy_reward": 0.10888889469206334, "step": 90 }, { "completion_length": 889.8622701009115, "epoch": 0.05448601525608427, "grad_norm": 0.7447437644004822, "kl": 0.060872395833333336, "learning_rate": 1.0869565217391305e-05, "loss": 0.0024, "reward": 0.1194444514811039, "reward_std": 0.14914882288624842, "rewards/accuracy_reward": 0.1194444514811039, "step": 100 }, { "completion_length": 934.887266031901, "epoch": 0.0599346167816927, "grad_norm": 0.13841398060321808, "kl": 0.059228515625, "learning_rate": 1.1956521739130435e-05, "loss": 0.0024, "reward": 0.1444444514811039, "reward_std": 0.15588457534710567, "rewards/accuracy_reward": 0.1444444514811039, "step": 110 }, { "completion_length": 847.1794860839843, "epoch": 0.06538321830730112, "grad_norm": 0.20260043442249298, "kl": 0.058121744791666666, "learning_rate": 1.3043478260869566e-05, "loss": 0.0023, "reward": 0.12444445031384627, "reward_std": 0.1414508189385136, "rewards/accuracy_reward": 0.12444445031384627, "step": 120 }, { "completion_length": 782.5839294433594, "epoch": 0.07083181983290955, "grad_norm": 0.16823574900627136, "kl": 0.049943033854166666, "learning_rate": 1.4130434782608698e-05, "loss": 0.002, "reward": 0.131666674092412, "reward_std": 0.14529982060194016, "rewards/accuracy_reward": 0.131666674092412, "step": 130 }, { "completion_length": 677.8644755045573, "epoch": 0.07628042135851798, "grad_norm": 1.262801170349121, "kl": 0.04938557942708333, "learning_rate": 1.5217391304347828e-05, "loss": 0.002, "reward": 0.13833334061006705, "reward_std": 0.12990381121635436, "rewards/accuracy_reward": 0.13833334061006705, "step": 140 }, { "completion_length": 754.23115234375, "epoch": 0.08172902288412641, "grad_norm": 0.15313167870044708, "kl": 0.06473795572916667, "learning_rate": 1.630434782608696e-05, "loss": 0.0026, "reward": 0.13055556224038203, "reward_std": 0.11931905665745338, "rewards/accuracy_reward": 0.13055556224038203, "step": 150 }, { "completion_length": 941.7539388020833, "epoch": 0.08717762440973484, "grad_norm": 0.17565220594406128, "kl": 0.09111328125, "learning_rate": 1.739130434782609e-05, "loss": 0.0036, "reward": 0.13944445109615725, "reward_std": 0.1462620739514629, "rewards/accuracy_reward": 0.13944445109615725, "step": 160 }, { "completion_length": 877.140049235026, "epoch": 0.09262622593534327, "grad_norm": 2.699877977371216, "kl": 0.09803059895833334, "learning_rate": 1.847826086956522e-05, "loss": 0.0039, "reward": 0.15722223110496997, "reward_std": 0.15396007529149452, "rewards/accuracy_reward": 0.15722223110496997, "step": 170 }, { "completion_length": 789.8333821614583, "epoch": 0.09807482746095168, "grad_norm": 0.14496812224388123, "kl": 0.11103515625, "learning_rate": 1.956521739130435e-05, "loss": 0.0044, "reward": 0.1372222289443016, "reward_std": 0.12797931178162494, "rewards/accuracy_reward": 0.1372222289443016, "step": 180 }, { "completion_length": 901.2533813476563, "epoch": 0.10352342898656011, "grad_norm": 0.11738000065088272, "kl": 0.10592447916666667, "learning_rate": 1.999941343533685e-05, "loss": 0.0042, "reward": 0.1400000070532163, "reward_std": 0.1299038126443823, "rewards/accuracy_reward": 0.1400000070532163, "step": 190 }, { "completion_length": 772.8806030273438, "epoch": 0.10897203051216854, "grad_norm": 0.1379402130842209, "kl": 0.0951416015625, "learning_rate": 1.9995829150385352e-05, "loss": 0.0038, "reward": 0.16722223026057084, "reward_std": 0.1549223259712259, "rewards/accuracy_reward": 0.16722223026057084, "step": 200 }, { "completion_length": 796.1578206380208, "epoch": 0.11442063203777697, "grad_norm": 0.18366481363773346, "kl": 0.10343424479166667, "learning_rate": 1.9988987745928353e-05, "loss": 0.0041, "reward": 0.15388889685273172, "reward_std": 0.15492232621957858, "rewards/accuracy_reward": 0.15388889685273172, "step": 210 }, { "completion_length": 753.2955932617188, "epoch": 0.1198692335633854, "grad_norm": 0.34986189007759094, "kl": 0.12716471354166667, "learning_rate": 1.9978891699032894e-05, "loss": 0.0051, "reward": 0.15000000757475693, "reward_std": 0.1414508193110426, "rewards/accuracy_reward": 0.15000000757475693, "step": 220 }, { "completion_length": 792.331709798177, "epoch": 0.12531783508899383, "grad_norm": 0.18868593871593475, "kl": 0.13155924479166667, "learning_rate": 1.9965544665174273e-05, "loss": 0.0053, "reward": 0.15333334108193716, "reward_std": 0.15011107164124649, "rewards/accuracy_reward": 0.15333334108193716, "step": 230 }, { "completion_length": 705.5455963134766, "epoch": 0.13076643661460224, "grad_norm": 0.15661919116973877, "kl": 0.12745768229166668, "learning_rate": 1.9948951476912513e-05, "loss": 0.0051, "reward": 0.16388889650503793, "reward_std": 0.16165807868043583, "rewards/accuracy_reward": 0.16388889650503793, "step": 240 }, { "completion_length": 822.7744852701823, "epoch": 0.1362150381402107, "grad_norm": 0.13379545509815216, "kl": 0.1236328125, "learning_rate": 1.992911814214264e-05, "loss": 0.0049, "reward": 0.16888889744877816, "reward_std": 0.15492232702672482, "rewards/accuracy_reward": 0.16888889744877816, "step": 250 }, { "completion_length": 847.4389363606771, "epoch": 0.1416636396658191, "grad_norm": 0.2325723022222519, "kl": 0.111767578125, "learning_rate": 1.9906051841919387e-05, "loss": 0.0045, "reward": 0.17277778666466476, "reward_std": 0.13663956591238577, "rewards/accuracy_reward": 0.17277778666466476, "step": 260 }, { "completion_length": 716.1711486816406, "epoch": 0.14711224119142755, "grad_norm": 0.13704079389572144, "kl": 0.10948893229166666, "learning_rate": 1.9879760927857163e-05, "loss": 0.0044, "reward": 0.20000001067916554, "reward_std": 0.16454482972621917, "rewards/accuracy_reward": 0.20000001067916554, "step": 270 }, { "completion_length": 690.4217041015625, "epoch": 0.15256084271703596, "grad_norm": 0.1831006407737732, "kl": 3.4497395833333333, "learning_rate": 1.9850254919106187e-05, "loss": 0.1378, "reward": 0.18500001194576424, "reward_std": 0.1433753212292989, "rewards/accuracy_reward": 0.18500001194576424, "step": 280 }, { "completion_length": 709.8889221191406, "epoch": 0.15800944424264438, "grad_norm": 0.21904754638671875, "kl": 0.13380533854166668, "learning_rate": 1.9817544498905884e-05, "loss": 0.0054, "reward": 0.19722223381201426, "reward_std": 0.1626203325887521, "rewards/accuracy_reward": 0.19722223381201426, "step": 290 }, { "completion_length": 637.940586344401, "epoch": 0.16345804576825282, "grad_norm": 0.17953413724899292, "kl": 0.19720052083333334, "learning_rate": 1.9781641510716804e-05, "loss": 0.0079, "reward": 0.1355555637429158, "reward_std": 0.1645448302850127, "rewards/accuracy_reward": 0.1355555637429158, "step": 300 }, { "completion_length": 750.0444763183593, "epoch": 0.16890664729386123, "grad_norm": 0.15298530459403992, "kl": 0.135498046875, "learning_rate": 1.974255895393247e-05, "loss": 0.0054, "reward": 0.16833334360271693, "reward_std": 0.14722432115425665, "rewards/accuracy_reward": 0.16833334360271693, "step": 310 }, { "completion_length": 778.2539347330729, "epoch": 0.17435524881946968, "grad_norm": 0.17462554574012756, "kl": 0.54716796875, "learning_rate": 1.9700310979172664e-05, "loss": 0.0219, "reward": 0.23444445828596752, "reward_std": 0.1770540880660216, "rewards/accuracy_reward": 0.23444445828596752, "step": 320 }, { "completion_length": 649.7972544352214, "epoch": 0.1798038503450781, "grad_norm": 0.1918572634458542, "kl": 0.2255859375, "learning_rate": 1.9654912883159946e-05, "loss": 0.009, "reward": 0.18888889998197556, "reward_std": 0.1635825766871373, "rewards/accuracy_reward": 0.18888889998197556, "step": 330 }, { "completion_length": 608.611142985026, "epoch": 0.18525245187068654, "grad_norm": 0.17178894579410553, "kl": 0.19505208333333332, "learning_rate": 1.960638110318115e-05, "loss": 0.0078, "reward": 0.18111112142602603, "reward_std": 0.1578090752164523, "rewards/accuracy_reward": 0.18111112142602603, "step": 340 }, { "completion_length": 745.2217081705729, "epoch": 0.19070105339629495, "grad_norm": 0.28473737835884094, "kl": 0.20130208333333333, "learning_rate": 1.9554733211135933e-05, "loss": 0.0081, "reward": 0.1650000085433324, "reward_std": 0.1347150657325983, "rewards/accuracy_reward": 0.1650000085433324, "step": 350 }, { "completion_length": 622.1950337727865, "epoch": 0.19614965492190337, "grad_norm": 0.1968754678964615, "kl": 0.25048828125, "learning_rate": 1.949998790717453e-05, "loss": 0.01, "reward": 0.1911111223200957, "reward_std": 0.15973357918361822, "rewards/accuracy_reward": 0.1911111223200957, "step": 360 }, { "completion_length": 794.3217102050781, "epoch": 0.2015982564475118, "grad_norm": 0.29343000054359436, "kl": 0.23291015625, "learning_rate": 1.9442165012926996e-05, "loss": 0.0093, "reward": 0.19166667585571606, "reward_std": 0.1606958294287324, "rewards/accuracy_reward": 0.19166667585571606, "step": 370 }, { "completion_length": 850.5955993652344, "epoch": 0.20704685797312022, "grad_norm": 1.0496876239776611, "kl": 0.192041015625, "learning_rate": 1.938128546432635e-05, "loss": 0.0077, "reward": 0.182777788490057, "reward_std": 0.16454483040918907, "rewards/accuracy_reward": 0.182777788490057, "step": 380 }, { "completion_length": 759.0372619628906, "epoch": 0.21249545949872867, "grad_norm": 0.1948990523815155, "kl": 0.28180338541666666, "learning_rate": 1.9317371304028375e-05, "loss": 0.0113, "reward": 0.15111111948887507, "reward_std": 0.15877132850388687, "rewards/accuracy_reward": 0.15111111948887507, "step": 390 }, { "completion_length": 608.8128092447917, "epoch": 0.21794406102433708, "grad_norm": 0.20184774696826935, "kl": 0.22998046875, "learning_rate": 1.925044567343055e-05, "loss": 0.0092, "reward": 0.1822222317258517, "reward_std": 0.16358258153001468, "rewards/accuracy_reward": 0.1822222317258517, "step": 400 }, { "completion_length": 566.6578104654948, "epoch": 0.22339266254994553, "grad_norm": 0.1724410057067871, "kl": 0.24383138020833334, "learning_rate": 1.918053280429331e-05, "loss": 0.0098, "reward": 0.22944445833563804, "reward_std": 0.16262032861510914, "rewards/accuracy_reward": 0.22944445833563804, "step": 410 }, { "completion_length": 681.8222615559896, "epoch": 0.22884126407555394, "grad_norm": 0.23551391065120697, "kl": 0.28759765625, "learning_rate": 1.9107658009966425e-05, "loss": 0.0115, "reward": 0.21888890117406845, "reward_std": 0.1693560838699341, "rewards/accuracy_reward": 0.21888890117406845, "step": 420 }, { "completion_length": 784.9755940755208, "epoch": 0.23428986560116236, "grad_norm": 0.13546472787857056, "kl": 0.16551106770833332, "learning_rate": 1.903184767622381e-05, "loss": 0.0066, "reward": 0.1783333433791995, "reward_std": 0.1385640668993195, "rewards/accuracy_reward": 0.1783333433791995, "step": 430 }, { "completion_length": 990.0589497884115, "epoch": 0.2397384671267708, "grad_norm": 0.13672798871994019, "kl": 0.19807942708333334, "learning_rate": 1.895312925170999e-05, "loss": 0.0079, "reward": 0.17055556488533816, "reward_std": 0.14337532110512258, "rewards/accuracy_reward": 0.17055556488533816, "step": 440 }, { "completion_length": 780.6261494954427, "epoch": 0.24518706865237921, "grad_norm": 0.14529773592948914, "kl": 0.25830078125, "learning_rate": 1.8871531238001844e-05, "loss": 0.0103, "reward": 0.18833334452162187, "reward_std": 0.12990381382405758, "rewards/accuracy_reward": 0.18833334452162187, "step": 450 }, { "completion_length": 701.3894856770834, "epoch": 0.25063567017798766, "grad_norm": 0.15654024481773376, "kl": 0.26097005208333335, "learning_rate": 1.878708317928897e-05, "loss": 0.0104, "reward": 0.20277778804302216, "reward_std": 0.14241306868692238, "rewards/accuracy_reward": 0.20277778804302216, "step": 460 }, { "completion_length": 723.2167114257812, "epoch": 0.2560842717035961, "grad_norm": 0.2003524899482727, "kl": 0.21272786458333334, "learning_rate": 1.8699815651676644e-05, "loss": 0.0085, "reward": 0.21444445674618084, "reward_std": 0.1655070828894774, "rewards/accuracy_reward": 0.21444445674618084, "step": 470 }, { "completion_length": 722.7867065429688, "epoch": 0.2615328732292045, "grad_norm": 0.20536291599273682, "kl": 0.32444661458333335, "learning_rate": 1.8609760252115155e-05, "loss": 0.013, "reward": 0.19000001152356466, "reward_std": 0.12894156333059073, "rewards/accuracy_reward": 0.19000001152356466, "step": 480 }, { "completion_length": 634.9461405436198, "epoch": 0.26698147475481293, "grad_norm": 0.23255935311317444, "kl": 0.24235026041666666, "learning_rate": 1.8516949586959466e-05, "loss": 0.0097, "reward": 0.2166666788359483, "reward_std": 0.15011107257256906, "rewards/accuracy_reward": 0.2166666788359483, "step": 490 }, { "completion_length": 667.915586344401, "epoch": 0.2724300762804214, "grad_norm": 0.2208521068096161, "kl": 0.2091796875, "learning_rate": 1.842141726016345e-05, "loss": 0.0084, "reward": 0.23944445500771205, "reward_std": 0.1433753207946817, "rewards/accuracy_reward": 0.23944445500771205, "step": 500 }, { "completion_length": 588.1133616129557, "epoch": 0.27787867780602976, "grad_norm": 0.2693271040916443, "kl": 0.32734375, "learning_rate": 1.8323197861112894e-05, "loss": 0.0131, "reward": 0.20666667719682058, "reward_std": 0.16262032836675644, "rewards/accuracy_reward": 0.20666667719682058, "step": 510 }, { "completion_length": 523.7733591715495, "epoch": 0.2833272793316382, "grad_norm": 0.33560505509376526, "kl": 0.34182942708333336, "learning_rate": 1.8222326952101744e-05, "loss": 0.0137, "reward": 0.22888890157143274, "reward_std": 0.15107332300394774, "rewards/accuracy_reward": 0.22888890157143274, "step": 520 }, { "completion_length": 583.4628112792968, "epoch": 0.28877588085724665, "grad_norm": 0.15240980684757233, "kl": 0.33274739583333335, "learning_rate": 1.811884105545604e-05, "loss": 0.0133, "reward": 0.18444445381561916, "reward_std": 0.1395263155301412, "rewards/accuracy_reward": 0.18444445381561916, "step": 530 }, { "completion_length": 663.6050384521484, "epoch": 0.2942244823828551, "grad_norm": 0.1361735761165619, "kl": 0.20159505208333334, "learning_rate": 1.8012777640310296e-05, "loss": 0.0081, "reward": 0.21000001281499864, "reward_std": 0.15684682689607143, "rewards/accuracy_reward": 0.21000001281499864, "step": 540 }, { "completion_length": 698.3655944824219, "epoch": 0.2996730839084635, "grad_norm": 0.14391390979290009, "kl": 0.20192057291666668, "learning_rate": 1.7904175109041054e-05, "loss": 0.0081, "reward": 0.21666667864968378, "reward_std": 0.15973357955614725, "rewards/accuracy_reward": 0.21666667864968378, "step": 550 }, { "completion_length": 734.030039469401, "epoch": 0.3051216854340719, "grad_norm": 0.2076151818037033, "kl": 0.2923502604166667, "learning_rate": 1.7793072783362518e-05, "loss": 0.0117, "reward": 0.1938888981938362, "reward_std": 0.15299782399088144, "rewards/accuracy_reward": 0.1938888981938362, "step": 560 }, { "completion_length": 753.886153157552, "epoch": 0.31057028695968036, "grad_norm": 0.7906116247177124, "kl": 0.3443359375, "learning_rate": 1.767951089008937e-05, "loss": 0.0138, "reward": 0.21722223460674286, "reward_std": 0.1780163371314605, "rewards/accuracy_reward": 0.21722223460674286, "step": 570 }, { "completion_length": 892.1883809407552, "epoch": 0.31601888848528875, "grad_norm": 0.1703496277332306, "kl": 0.168896484375, "learning_rate": 1.7563530546571822e-05, "loss": 0.0068, "reward": 0.20111112122734387, "reward_std": 0.14337531936665376, "rewards/accuracy_reward": 0.20111112122734387, "step": 580 }, { "completion_length": 986.2056030273437, "epoch": 0.3214674900108972, "grad_norm": 0.11671660840511322, "kl": 0.17861328125, "learning_rate": 1.7445173745808278e-05, "loss": 0.0071, "reward": 0.20777778898676236, "reward_std": 0.14433756973594428, "rewards/accuracy_reward": 0.20777778898676236, "step": 590 }, { "completion_length": 952.4278279622396, "epoch": 0.32691609153650564, "grad_norm": 0.15954096615314484, "kl": 0.23352864583333333, "learning_rate": 1.732448334124091e-05, "loss": 0.0093, "reward": 0.18222223098079363, "reward_std": 0.13375281617045404, "rewards/accuracy_reward": 0.18222223098079363, "step": 600 }, { "completion_length": 763.7900431315104, "epoch": 0.3323646930621141, "grad_norm": 0.19060583412647247, "kl": 0.24853515625, "learning_rate": 1.7201503031239754e-05, "loss": 0.0099, "reward": 0.22222223455707232, "reward_std": 0.15684682416419188, "rewards/accuracy_reward": 0.22222223455707232, "step": 610 }, { "completion_length": 710.0178181966146, "epoch": 0.33781329458772247, "grad_norm": 0.2616126239299774, "kl": 0.38987630208333335, "learning_rate": 1.707627734328078e-05, "loss": 0.0156, "reward": 0.23444445679585138, "reward_std": 0.16935608368366956, "rewards/accuracy_reward": 0.23444445679585138, "step": 620 }, { "completion_length": 695.4672566731771, "epoch": 0.3432618961133309, "grad_norm": 0.11850817501544952, "kl": 0.30657552083333334, "learning_rate": 1.6948851617823903e-05, "loss": 0.0123, "reward": 0.22777779201666515, "reward_std": 0.145299820539852, "rewards/accuracy_reward": 0.22777779201666515, "step": 630 }, { "completion_length": 784.5917073567708, "epoch": 0.34871049763893935, "grad_norm": 0.19705404341220856, "kl": 0.15953776041666667, "learning_rate": 1.68192719918965e-05, "loss": 0.0064, "reward": 0.23388890276352564, "reward_std": 0.16743158052364984, "rewards/accuracy_reward": 0.23388890276352564, "step": 640 }, { "completion_length": 864.4089375813802, "epoch": 0.35415909916454774, "grad_norm": 0.18156002461910248, "kl": 0.194921875, "learning_rate": 1.6687585382388628e-05, "loss": 0.0078, "reward": 0.25666668290893235, "reward_std": 0.1587713286280632, "rewards/accuracy_reward": 0.25666668290893235, "step": 650 }, { "completion_length": 863.7578247070312, "epoch": 0.3596077006901562, "grad_norm": 0.19501180946826935, "kl": 0.3328450520833333, "learning_rate": 1.6553839469065783e-05, "loss": 0.0133, "reward": 0.1950000097354253, "reward_std": 0.1683938330039382, "rewards/accuracy_reward": 0.1950000097354253, "step": 660 }, { "completion_length": 687.0161417643229, "epoch": 0.3650563022157646, "grad_norm": 0.20623794198036194, "kl": 0.23199869791666666, "learning_rate": 1.6418082677305527e-05, "loss": 0.0093, "reward": 0.24166667858759563, "reward_std": 0.15588457559545835, "rewards/accuracy_reward": 0.24166667858759563, "step": 670 }, { "completion_length": 735.1483723958333, "epoch": 0.37050490374137307, "grad_norm": 0.21932479739189148, "kl": 0.27737630208333336, "learning_rate": 1.6280364160564103e-05, "loss": 0.0111, "reward": 0.23277779333293439, "reward_std": 0.1481865718960762, "rewards/accuracy_reward": 0.23277779333293439, "step": 680 }, { "completion_length": 798.261153157552, "epoch": 0.37595350526698146, "grad_norm": 0.17951412498950958, "kl": 0.2635416666666667, "learning_rate": 1.614073378257942e-05, "loss": 0.0105, "reward": 0.20944445555408794, "reward_std": 0.14818657226860524, "rewards/accuracy_reward": 0.20944445555408794, "step": 690 }, { "completion_length": 767.4433715820312, "epoch": 0.3814021067925899, "grad_norm": 0.17271050810813904, "kl": 0.30048828125, "learning_rate": 1.599924209931694e-05, "loss": 0.012, "reward": 0.22222223412245512, "reward_std": 0.1664693317686518, "rewards/accuracy_reward": 0.22222223412245512, "step": 700 }, { "completion_length": 735.5094848632813, "epoch": 0.38685070831819834, "grad_norm": 0.40028366446495056, "kl": 0.36337890625, "learning_rate": 1.585594034066483e-05, "loss": 0.0145, "reward": 0.22722223674257597, "reward_std": 0.16646933387964963, "rewards/accuracy_reward": 0.22722223674257597, "step": 710 }, { "completion_length": 706.8139241536459, "epoch": 0.39229930984380673, "grad_norm": 0.17000922560691833, "kl": 0.37998046875, "learning_rate": 1.5710880391885198e-05, "loss": 0.0152, "reward": 0.24166668156782786, "reward_std": 0.15684682751695314, "rewards/accuracy_reward": 0.24166668156782786, "step": 720 }, { "completion_length": 790.7633728027344, "epoch": 0.3977479113694152, "grad_norm": 0.20689846575260162, "kl": 0.20625, "learning_rate": 1.556411477482796e-05, "loss": 0.0083, "reward": 0.25777779122193656, "reward_std": 0.1722428339223067, "rewards/accuracy_reward": 0.25777779122193656, "step": 730 }, { "completion_length": 908.2522684733073, "epoch": 0.4031965128950236, "grad_norm": 0.21421608328819275, "kl": 0.22008463541666667, "learning_rate": 1.5415696628914304e-05, "loss": 0.0088, "reward": 0.230000010629495, "reward_std": 0.15973357744514943, "rewards/accuracy_reward": 0.230000010629495, "step": 740 }, { "completion_length": 938.4033833821615, "epoch": 0.40864511442063206, "grad_norm": 0.1750338226556778, "kl": 0.19651692708333332, "learning_rate": 1.526567969189646e-05, "loss": 0.0079, "reward": 0.21500001127521198, "reward_std": 0.1433753190562129, "rewards/accuracy_reward": 0.21500001127521198, "step": 750 }, { "completion_length": 833.197265625, "epoch": 0.41409371594624045, "grad_norm": 0.22522857785224915, "kl": 0.3521484375, "learning_rate": 1.51141182804009e-05, "loss": 0.0141, "reward": 0.23222223197420439, "reward_std": 0.17512958478182555, "rewards/accuracy_reward": 0.23222223197420439, "step": 760 }, { "completion_length": 827.607265218099, "epoch": 0.4195423174718489, "grad_norm": 0.3123396933078766, "kl": 0.34469401041666664, "learning_rate": 1.4961067270261896e-05, "loss": 0.0138, "reward": 0.2605555700759093, "reward_std": 0.16454483016083638, "rewards/accuracy_reward": 0.2605555700759093, "step": 770 }, { "completion_length": 800.3728210449219, "epoch": 0.42499091899745733, "grad_norm": 0.18968704342842102, "kl": 0.28020833333333334, "learning_rate": 1.4806582076652654e-05, "loss": 0.0112, "reward": 0.24055556803941727, "reward_std": 0.1606958304842313, "rewards/accuracy_reward": 0.24055556803941727, "step": 780 }, { "completion_length": 672.053369140625, "epoch": 0.4304395205230657, "grad_norm": 0.14060722291469574, "kl": 0.268359375, "learning_rate": 1.4650718634021126e-05, "loss": 0.0107, "reward": 0.24555557022492092, "reward_std": 0.14626207016408443, "rewards/accuracy_reward": 0.24555557022492092, "step": 790 }, { "completion_length": 692.2233723958333, "epoch": 0.43588812204867416, "grad_norm": 0.24256359040737152, "kl": 0.20579427083333332, "learning_rate": 1.449353337583784e-05, "loss": 0.0082, "reward": 0.26833335012197496, "reward_std": 0.15011107188959916, "rewards/accuracy_reward": 0.26833335012197496, "step": 800 }, { "completion_length": 659.2050364176432, "epoch": 0.4413367235742826, "grad_norm": 0.3057202398777008, "kl": 0.2607421875, "learning_rate": 1.4335083214163017e-05, "loss": 0.0104, "reward": 0.2588889040052891, "reward_std": 0.15299782380461693, "rewards/accuracy_reward": 0.2588889040052891, "step": 810 }, { "completion_length": 693.3505920410156, "epoch": 0.44678532509989105, "grad_norm": 0.32940661907196045, "kl": 0.24700520833333334, "learning_rate": 1.4175425519040448e-05, "loss": 0.0099, "reward": 0.24666668102145195, "reward_std": 0.13182831400384507, "rewards/accuracy_reward": 0.24666668102145195, "step": 820 }, { "completion_length": 717.9733723958333, "epoch": 0.45223392662549944, "grad_norm": 0.32940706610679626, "kl": 0.30914713541666666, "learning_rate": 1.40146180977255e-05, "loss": 0.0124, "reward": 0.2550000126163165, "reward_std": 0.1433753201117118, "rewards/accuracy_reward": 0.2550000126163165, "step": 830 }, { "completion_length": 758.3606018066406, "epoch": 0.4576825281511079, "grad_norm": 0.15635548532009125, "kl": 0.2708658854166667, "learning_rate": 1.3852719173754868e-05, "loss": 0.0108, "reward": 0.25444446007410687, "reward_std": 0.14914882220327855, "rewards/accuracy_reward": 0.25444446007410687, "step": 840 }, { "completion_length": 662.8244750976562, "epoch": 0.4631311296767163, "grad_norm": 0.2794467806816101, "kl": 0.34111328125, "learning_rate": 1.3689787365865563e-05, "loss": 0.0136, "reward": 0.23777779067556062, "reward_std": 0.1645448292295138, "rewards/accuracy_reward": 0.23777779067556062, "step": 850 }, { "completion_length": 546.4494689941406, "epoch": 0.4685797312023247, "grad_norm": 0.13812310993671417, "kl": 0.37347005208333334, "learning_rate": 1.352588166677084e-05, "loss": 0.0149, "reward": 0.24888890236616135, "reward_std": 0.16839383120338122, "rewards/accuracy_reward": 0.24888890236616135, "step": 860 }, { "completion_length": 651.5000345865885, "epoch": 0.47402833272793315, "grad_norm": 0.2395883947610855, "kl": 0.364453125, "learning_rate": 1.3361061421800698e-05, "loss": 0.0146, "reward": 0.26833334714174273, "reward_std": 0.1462620700399081, "rewards/accuracy_reward": 0.26833334714174273, "step": 870 }, { "completion_length": 666.6955993652343, "epoch": 0.4794769342535416, "grad_norm": 0.16950438916683197, "kl": 0.26858723958333336, "learning_rate": 1.3195386307414737e-05, "loss": 0.0107, "reward": 0.2777777915199598, "reward_std": 0.15588457497457664, "rewards/accuracy_reward": 0.2777777915199598, "step": 880 }, { "completion_length": 669.873368326823, "epoch": 0.48492553577915004, "grad_norm": 0.2770698666572571, "kl": 0.22216796875, "learning_rate": 1.302891630959508e-05, "loss": 0.0089, "reward": 0.2555555703739325, "reward_std": 0.15299782355626423, "rewards/accuracy_reward": 0.2555555703739325, "step": 890 }, { "completion_length": 779.9567077636718, "epoch": 0.49037413730475843, "grad_norm": 0.2388666868209839, "kl": 0.3577799479166667, "learning_rate": 1.2861711702127265e-05, "loss": 0.0143, "reward": 0.2883333474397659, "reward_std": 0.16839383393526078, "rewards/accuracy_reward": 0.2883333474397659, "step": 900 }, { "completion_length": 774.1683715820312, "epoch": 0.49582273883036687, "grad_norm": 0.1927744299173355, "kl": 0.29733072916666664, "learning_rate": 1.2693833024776929e-05, "loss": 0.0119, "reward": 0.2605555698275566, "reward_std": 0.14914882288624842, "rewards/accuracy_reward": 0.2605555698275566, "step": 910 }, { "completion_length": 787.7378194173177, "epoch": 0.5012713403559753, "grad_norm": 0.20473827421665192, "kl": 0.30960286458333336, "learning_rate": 1.2525341061370148e-05, "loss": 0.0124, "reward": 0.2844444582859675, "reward_std": 0.16069582694520554, "rewards/accuracy_reward": 0.2844444582859675, "step": 920 }, { "completion_length": 733.2739217122396, "epoch": 0.5067199418815838, "grad_norm": 0.35506564378738403, "kl": 0.28668619791666666, "learning_rate": 1.2356296817785467e-05, "loss": 0.0115, "reward": 0.264444458236297, "reward_std": 0.14722432208557923, "rewards/accuracy_reward": 0.264444458236297, "step": 930 }, { "completion_length": 616.1361419677735, "epoch": 0.5121685434071922, "grad_norm": 0.5822309255599976, "kl": 0.44453125, "learning_rate": 1.2186761499865496e-05, "loss": 0.0178, "reward": 0.2672222351034482, "reward_std": 0.14048856620987257, "rewards/accuracy_reward": 0.2672222351034482, "step": 940 }, { "completion_length": 489.3589111328125, "epoch": 0.5176171449328005, "grad_norm": 0.4598633348941803, "kl": 0.2633138020833333, "learning_rate": 1.2016796491256093e-05, "loss": 0.0105, "reward": 0.2905555710196495, "reward_std": 0.1376018171509107, "rewards/accuracy_reward": 0.2905555710196495, "step": 950 }, { "completion_length": 564.8978047688802, "epoch": 0.523065746458409, "grad_norm": 0.19443756341934204, "kl": 0.27320963541666665, "learning_rate": 1.1846463331181225e-05, "loss": 0.0109, "reward": 0.2822222386797269, "reward_std": 0.15492232621957858, "rewards/accuracy_reward": 0.2822222386797269, "step": 960 }, { "completion_length": 856.1006001790364, "epoch": 0.5285143479840174, "grad_norm": 0.18930432200431824, "kl": 0.25869140625, "learning_rate": 1.167582369216144e-05, "loss": 0.0104, "reward": 0.26555557002623875, "reward_std": 0.15011107164124649, "rewards/accuracy_reward": 0.26555557002623875, "step": 970 }, { "completion_length": 860.3039306640625, "epoch": 0.5339629495096259, "grad_norm": 0.36553633213043213, "kl": 0.22236328125, "learning_rate": 1.1504939357684101e-05, "loss": 0.0089, "reward": 0.2933333491285642, "reward_std": 0.13567731585353612, "rewards/accuracy_reward": 0.2933333491285642, "step": 980 }, { "completion_length": 795.6067057291667, "epoch": 0.5394115510352343, "grad_norm": 0.293014258146286, "kl": 0.2884114583333333, "learning_rate": 1.1333872199833446e-05, "loss": 0.0115, "reward": 0.29333335210879646, "reward_std": 0.14241306918362776, "rewards/accuracy_reward": 0.29333335210879646, "step": 990 }, { "completion_length": 759.9989318847656, "epoch": 0.5448601525608427, "grad_norm": 0.20459088683128357, "kl": 0.31344401041666664, "learning_rate": 1.116268415688858e-05, "loss": 0.0125, "reward": 0.27166668126980464, "reward_std": 0.14626206954320273, "rewards/accuracy_reward": 0.27166668126980464, "step": 1000 }, { "completion_length": 756.9611572265625, "epoch": 0.5503087540864512, "grad_norm": 0.23321633040905, "kl": 0.2380859375, "learning_rate": 1.0991437210897447e-05, "loss": 0.0095, "reward": 0.2522222379843394, "reward_std": 0.17128058516730865, "rewards/accuracy_reward": 0.2522222379843394, "step": 1010 }, { "completion_length": 830.0883768717448, "epoch": 0.5557573556120595, "grad_norm": 0.24410440027713776, "kl": 0.26292317708333335, "learning_rate": 1.0820193365235021e-05, "loss": 0.0105, "reward": 0.24277778988083204, "reward_std": 0.15684682658563057, "rewards/accuracy_reward": 0.24277778988083204, "step": 1020 }, { "completion_length": 823.7517110188802, "epoch": 0.561205957137668, "grad_norm": 0.2050526887178421, "kl": 0.3375, "learning_rate": 1.0649014622153752e-05, "loss": 0.0135, "reward": 0.24444445793827374, "reward_std": 0.17705408905943235, "rewards/accuracy_reward": 0.24444445793827374, "step": 1030 }, { "completion_length": 694.7839192708333, "epoch": 0.5666545586632764, "grad_norm": 0.222402885556221, "kl": 0.33203125, "learning_rate": 1.0477962960334393e-05, "loss": 0.0133, "reward": 0.25444445659716924, "reward_std": 0.16262032973269622, "rewards/accuracy_reward": 0.25444445659716924, "step": 1040 }, { "completion_length": 672.986142985026, "epoch": 0.5721031601888849, "grad_norm": 0.35129454731941223, "kl": 0.30276692708333336, "learning_rate": 1.0307100312445382e-05, "loss": 0.0121, "reward": 0.2438889041543007, "reward_std": 0.18667659064133962, "rewards/accuracy_reward": 0.2438889041543007, "step": 1050 }, { "completion_length": 787.0778137207031, "epoch": 0.5775517617144933, "grad_norm": 0.24582481384277344, "kl": 0.2996419270833333, "learning_rate": 1.0136488542718903e-05, "loss": 0.012, "reward": 0.24277779137094815, "reward_std": 0.19245009658237297, "rewards/accuracy_reward": 0.24277779137094815, "step": 1060 }, { "completion_length": 767.2783711751302, "epoch": 0.5830003632401017, "grad_norm": 0.26614880561828613, "kl": 0.37802734375, "learning_rate": 9.966189424551691e-06, "loss": 0.0151, "reward": 0.24500001346071562, "reward_std": 0.15780907503018776, "rewards/accuracy_reward": 0.24500001346071562, "step": 1070 }, { "completion_length": 656.1172627766927, "epoch": 0.5884489647657102, "grad_norm": 0.24548599123954773, "kl": 0.45107421875, "learning_rate": 9.79626461813873e-06, "loss": 0.018, "reward": 0.26277779092391335, "reward_std": 0.18282758990923564, "rewards/accuracy_reward": 0.26277779092391335, "step": 1080 }, { "completion_length": 562.6839223225911, "epoch": 0.5938975662913185, "grad_norm": 0.2709478735923767, "kl": 0.4180989583333333, "learning_rate": 9.626775648147986e-06, "loss": 0.0167, "reward": 0.26055557032426196, "reward_std": 0.15973357893526555, "rewards/accuracy_reward": 0.26055557032426196, "step": 1090 }, { "completion_length": 569.2955790201823, "epoch": 0.599346167816927, "grad_norm": 0.3677089810371399, "kl": 0.2918619791666667, "learning_rate": 9.45778388144413e-06, "loss": 0.0117, "reward": 0.2905555749932925, "reward_std": 0.1626203284288446, "rewards/accuracy_reward": 0.2905555749932925, "step": 1100 }, { "completion_length": 754.6422627766927, "epoch": 0.6047947693425354, "grad_norm": 0.24720929563045502, "kl": 0.3712565104166667, "learning_rate": 9.289350504869456e-06, "loss": 0.0149, "reward": 0.24833334535360335, "reward_std": 0.12990381047129632, "rewards/accuracy_reward": 0.24833334535360335, "step": 1110 }, { "completion_length": 865.498378499349, "epoch": 0.6102433708681438, "grad_norm": 0.2687451243400574, "kl": 0.23932291666666666, "learning_rate": 9.121536503089985e-06, "loss": 0.0096, "reward": 0.2783333480358124, "reward_std": 0.15011107133080562, "rewards/accuracy_reward": 0.2783333480358124, "step": 1120 }, { "completion_length": 755.5978169759114, "epoch": 0.6156919723937523, "grad_norm": 0.503818154335022, "kl": 0.29983723958333336, "learning_rate": 8.954402636514718e-06, "loss": 0.012, "reward": 0.2866666833559672, "reward_std": 0.152997824922204, "rewards/accuracy_reward": 0.2866666833559672, "step": 1130 }, { "completion_length": 676.5350382486979, "epoch": 0.6211405739193607, "grad_norm": 0.285356730222702, "kl": 0.32939453125, "learning_rate": 8.788009419296124e-06, "loss": 0.0132, "reward": 0.28888890544573465, "reward_std": 0.1645448329548041, "rewards/accuracy_reward": 0.28888890544573465, "step": 1140 }, { "completion_length": 671.5789245605469, "epoch": 0.6265891754449692, "grad_norm": 0.30285096168518066, "kl": 0.34625651041666666, "learning_rate": 8.622417097419803e-06, "loss": 0.0139, "reward": 0.2738889041046301, "reward_std": 0.16550708003342152, "rewards/accuracy_reward": 0.2738889041046301, "step": 1150 }, { "completion_length": 649.0000305175781, "epoch": 0.6320377769705775, "grad_norm": 0.24322140216827393, "kl": 0.3473307291666667, "learning_rate": 8.457685626891201e-06, "loss": 0.0139, "reward": 0.29777779430150986, "reward_std": 0.18090308991571266, "rewards/accuracy_reward": 0.29777779430150986, "step": 1160 }, { "completion_length": 648.3328084309895, "epoch": 0.637486378496186, "grad_norm": 0.2571386396884918, "kl": 0.32721354166666666, "learning_rate": 8.293874652027343e-06, "loss": 0.0131, "reward": 0.28777779390414554, "reward_std": 0.15492232665419578, "rewards/accuracy_reward": 0.28777779390414554, "step": 1170 }, { "completion_length": 667.4994791666667, "epoch": 0.6429349800217944, "grad_norm": 0.30658867955207825, "kl": 0.43190104166666665, "learning_rate": 8.131043483861447e-06, "loss": 0.0173, "reward": 0.266111126045386, "reward_std": 0.18282759015758832, "rewards/accuracy_reward": 0.266111126045386, "step": 1180 }, { "completion_length": 670.6622578938802, "epoch": 0.6483835815474028, "grad_norm": 0.18500077724456787, "kl": 0.39547526041666664, "learning_rate": 7.969251078668139e-06, "loss": 0.0158, "reward": 0.28888890544573465, "reward_std": 0.17224283615748087, "rewards/accuracy_reward": 0.28888890544573465, "step": 1190 }, { "completion_length": 653.103369140625, "epoch": 0.6538321830730113, "grad_norm": 0.23634323477745056, "kl": 0.3292317708333333, "learning_rate": 7.808556016617178e-06, "loss": 0.0132, "reward": 0.2494444581369559, "reward_std": 0.15299782305955886, "rewards/accuracy_reward": 0.2494444581369559, "step": 1200 }, { "completion_length": 625.3967000325521, "epoch": 0.6592807845986197, "grad_norm": 0.28967514634132385, "kl": 0.2942708333333333, "learning_rate": 7.649016480563351e-06, "loss": 0.0118, "reward": 0.2700000142057737, "reward_std": 0.1645448298503955, "rewards/accuracy_reward": 0.2700000142057737, "step": 1210 }, { "completion_length": 617.2761474609375, "epoch": 0.6647293861242282, "grad_norm": 0.20342856645584106, "kl": 0.2562174479166667, "learning_rate": 7.490690234980176e-06, "loss": 0.0103, "reward": 0.27333334734042486, "reward_std": 0.1472243204091986, "rewards/accuracy_reward": 0.27333334734042486, "step": 1220 }, { "completion_length": 673.5050374348958, "epoch": 0.6701779876498365, "grad_norm": 0.20131824910640717, "kl": 0.34033203125, "learning_rate": 7.333634605045139e-06, "loss": 0.0136, "reward": 0.30000001688798267, "reward_std": 0.15203557560841244, "rewards/accuracy_reward": 0.30000001688798267, "step": 1230 }, { "completion_length": 687.0967020670573, "epoch": 0.6756265891754449, "grad_norm": 0.42918625473976135, "kl": 0.4288411458333333, "learning_rate": 7.177906455883983e-06, "loss": 0.0171, "reward": 0.2800000141064326, "reward_std": 0.1760918361445268, "rewards/accuracy_reward": 0.2800000141064326, "step": 1240 }, { "completion_length": 682.3494812011719, "epoch": 0.6810751907010534, "grad_norm": 0.46371960639953613, "kl": 0.4420572916666667, "learning_rate": 7.0235621719815215e-06, "loss": 0.0177, "reward": 0.2955555707216263, "reward_std": 0.18378984071314336, "rewards/accuracy_reward": 0.2955555707216263, "step": 1250 }, { "completion_length": 689.0428141276042, "epoch": 0.6865237922266618, "grad_norm": 0.27757933735847473, "kl": 0.9980794270833333, "learning_rate": 6.870657636766538e-06, "loss": 0.04, "reward": 0.25222223773598673, "reward_std": 0.1616580789287885, "rewards/accuracy_reward": 0.25222223773598673, "step": 1260 }, { "completion_length": 600.6483652750651, "epoch": 0.6919723937522703, "grad_norm": 0.26269108057022095, "kl": 0.33671875, "learning_rate": 6.719248212378069e-06, "loss": 0.0135, "reward": 0.28833334843317665, "reward_std": 0.17224283466736476, "rewards/accuracy_reward": 0.28833334843317665, "step": 1270 }, { "completion_length": 591.9605824788412, "epoch": 0.6974209952778787, "grad_norm": 0.2704699635505676, "kl": 0.4122395833333333, "learning_rate": 6.569388719620422e-06, "loss": 0.0165, "reward": 0.28611112932364147, "reward_std": 0.15780907788624365, "rewards/accuracy_reward": 0.28611112932364147, "step": 1280 }, { "completion_length": 652.0617004394531, "epoch": 0.7028695968034872, "grad_norm": 0.32126060128211975, "kl": 0.37216796875, "learning_rate": 6.421133418114227e-06, "loss": 0.0149, "reward": 0.3072222376863162, "reward_std": 0.161658079115053, "rewards/accuracy_reward": 0.3072222376863162, "step": 1290 }, { "completion_length": 678.7733723958333, "epoch": 0.7083181983290955, "grad_norm": 0.46061229705810547, "kl": 0.38616536458333334, "learning_rate": 6.274535986650658e-06, "loss": 0.0154, "reward": 0.2583333467443784, "reward_std": 0.16069582800070445, "rewards/accuracy_reward": 0.2583333467443784, "step": 1300 }, { "completion_length": 666.0994801839192, "epoch": 0.7137667998547039, "grad_norm": 0.4305565357208252, "kl": 0.39619140625, "learning_rate": 6.129649503755929e-06, "loss": 0.0158, "reward": 0.3116666863361994, "reward_std": 0.15107332405944665, "rewards/accuracy_reward": 0.3116666863361994, "step": 1310 }, { "completion_length": 687.0272521972656, "epoch": 0.7192154013803124, "grad_norm": 0.19472011923789978, "kl": 0.30361328125, "learning_rate": 5.9865264284731915e-06, "loss": 0.0121, "reward": 0.3050000168383121, "reward_std": 0.13952631975213686, "rewards/accuracy_reward": 0.3050000168383121, "step": 1320 }, { "completion_length": 653.4939249674479, "epoch": 0.7246640029059208, "grad_norm": 1.0582702159881592, "kl": 0.27610677083333335, "learning_rate": 5.845218581368666e-06, "loss": 0.011, "reward": 0.30611113011837005, "reward_std": 0.14626207165420055, "rewards/accuracy_reward": 0.30611113011837005, "step": 1330 }, { "completion_length": 644.4517049153645, "epoch": 0.7301126044315293, "grad_norm": 0.21517866849899292, "kl": 0.33470052083333335, "learning_rate": 5.705777125768972e-06, "loss": 0.0134, "reward": 0.24833334932724635, "reward_std": 0.14722432146469752, "rewards/accuracy_reward": 0.24833334932724635, "step": 1340 }, { "completion_length": 608.8955820719401, "epoch": 0.7355612059571377, "grad_norm": 0.1876259446144104, "kl": 0.22027994791666666, "learning_rate": 5.568252549236439e-06, "loss": 0.0088, "reward": 0.27611112395922344, "reward_std": 0.15396007585028806, "rewards/accuracy_reward": 0.27611112395922344, "step": 1350 }, { "completion_length": 592.6044738769531, "epoch": 0.7410098074827461, "grad_norm": 0.29081982374191284, "kl": 0.37962239583333335, "learning_rate": 5.432694645289069e-06, "loss": 0.0152, "reward": 0.27111112674077353, "reward_std": 0.15396007498105366, "rewards/accuracy_reward": 0.27111112674077353, "step": 1360 }, { "completion_length": 616.8950327555339, "epoch": 0.7464584090083545, "grad_norm": 0.5649622678756714, "kl": 0.4361653645833333, "learning_rate": 5.299152495371789e-06, "loss": 0.0174, "reward": 0.26722223808368045, "reward_std": 0.15011107449730238, "rewards/accuracy_reward": 0.26722223808368045, "step": 1370 }, { "completion_length": 616.1950297037761, "epoch": 0.7519070105339629, "grad_norm": 0.36272719502449036, "kl": 0.29889322916666666, "learning_rate": 5.167674451085554e-06, "loss": 0.012, "reward": 0.29055557151635486, "reward_std": 0.1491488220791022, "rewards/accuracy_reward": 0.29055557151635486, "step": 1380 }, { "completion_length": 663.9317016601562, "epoch": 0.7573556120595714, "grad_norm": 0.20201607048511505, "kl": 0.24339192708333332, "learning_rate": 5.0383081166806705e-06, "loss": 0.0097, "reward": 0.3166666860381762, "reward_std": 0.1385640662784378, "rewards/accuracy_reward": 0.3166666860381762, "step": 1390 }, { "completion_length": 691.6333679199219, "epoch": 0.7628042135851798, "grad_norm": 0.3258429169654846, "kl": 0.3690104166666667, "learning_rate": 4.911100331820729e-06, "loss": 0.0148, "reward": 0.27166668126980464, "reward_std": 0.14914882518351077, "rewards/accuracy_reward": 0.27166668126980464, "step": 1400 }, { "completion_length": 704.6622599283854, "epoch": 0.7682528151107882, "grad_norm": 0.3449976146221161, "kl": 0.45970052083333335, "learning_rate": 4.786097154623375e-06, "loss": 0.0184, "reward": 0.2383333461980025, "reward_std": 0.14818657202025254, "rewards/accuracy_reward": 0.2383333461980025, "step": 1410 }, { "completion_length": 716.421152750651, "epoch": 0.7737014166363967, "grad_norm": 0.33388417959213257, "kl": 0.34104817708333335, "learning_rate": 4.6633438449840466e-06, "loss": 0.0137, "reward": 0.277777794500192, "reward_std": 0.17609183564782144, "rewards/accuracy_reward": 0.277777794500192, "step": 1420 }, { "completion_length": 735.9500406901042, "epoch": 0.7791500181620051, "grad_norm": 0.2249433994293213, "kl": 0.35625, "learning_rate": 4.542884848188716e-06, "loss": 0.0142, "reward": 0.25166667948166527, "reward_std": 0.13471506567051014, "rewards/accuracy_reward": 0.25166667948166527, "step": 1430 }, { "completion_length": 730.6028177897135, "epoch": 0.7845986196876135, "grad_norm": 0.278963178396225, "kl": 0.35934244791666664, "learning_rate": 4.424763778821603e-06, "loss": 0.0144, "reward": 0.27333334982395174, "reward_std": 0.15203557362159092, "rewards/accuracy_reward": 0.27333334982395174, "step": 1440 }, { "completion_length": 746.3478190104166, "epoch": 0.7900472212132219, "grad_norm": 0.2404111623764038, "kl": 0.33258463541666666, "learning_rate": 4.309023404973634e-06, "loss": 0.0133, "reward": 0.27166668126980464, "reward_std": 0.1703183315694332, "rewards/accuracy_reward": 0.27166668126980464, "step": 1450 }, { "completion_length": 755.9189331054688, "epoch": 0.7954958227388303, "grad_norm": 0.2977342903614044, "kl": 0.33782552083333334, "learning_rate": 4.195705632757396e-06, "loss": 0.0135, "reward": 0.29166668554147085, "reward_std": 0.1385640653471152, "rewards/accuracy_reward": 0.29166668554147085, "step": 1460 }, { "completion_length": 778.9239298502604, "epoch": 0.8009444242644388, "grad_norm": 0.5865334272384644, "kl": 0.39085286458333335, "learning_rate": 4.0848514911342055e-06, "loss": 0.0156, "reward": 0.26500001549720764, "reward_std": 0.15588457323610783, "rewards/accuracy_reward": 0.26500001549720764, "step": 1470 }, { "completion_length": 771.6928141276042, "epoch": 0.8063930257900472, "grad_norm": 0.23458582162857056, "kl": 0.33600260416666666, "learning_rate": 3.97650111705875e-06, "loss": 0.0134, "reward": 0.28222223520278933, "reward_std": 0.15299782579143842, "rewards/accuracy_reward": 0.28222223520278933, "step": 1480 }, { "completion_length": 757.2444864908854, "epoch": 0.8118416273156557, "grad_norm": 0.30406418442726135, "kl": 0.35198567708333334, "learning_rate": 3.8706937409466896e-06, "loss": 0.0141, "reward": 0.27500001738468804, "reward_std": 0.1674315786610047, "rewards/accuracy_reward": 0.27500001738468804, "step": 1490 }, { "completion_length": 714.279482014974, "epoch": 0.8172902288412641, "grad_norm": 0.4123045802116394, "kl": 0.48854166666666665, "learning_rate": 3.7674676724705183e-06, "loss": 0.0195, "reward": 0.2783333505193392, "reward_std": 0.16935608318696418, "rewards/accuracy_reward": 0.2783333505193392, "step": 1500 }, { "completion_length": 663.7144816080729, "epoch": 0.8227388303668725, "grad_norm": 0.36809244751930237, "kl": 0.44947916666666665, "learning_rate": 3.6668602866887815e-06, "loss": 0.018, "reward": 0.2938889066378276, "reward_std": 0.17031833299746116, "rewards/accuracy_reward": 0.2938889066378276, "step": 1510 }, { "completion_length": 648.4411397298177, "epoch": 0.8281874318924809, "grad_norm": 0.24402077496051788, "kl": 0.39036458333333335, "learning_rate": 3.568908010513674e-06, "loss": 0.0156, "reward": 0.27055557270844777, "reward_std": 0.18090308730800947, "rewards/accuracy_reward": 0.27055557270844777, "step": 1520 }, { "completion_length": 610.6050333658854, "epoch": 0.8336360334180893, "grad_norm": 0.28675535321235657, "kl": 0.42470703125, "learning_rate": 3.4736463095219665e-06, "loss": 0.017, "reward": 0.2861111253499985, "reward_std": 0.16743158201376598, "rewards/accuracy_reward": 0.2861111253499985, "step": 1530 }, { "completion_length": 607.9528137207031, "epoch": 0.8390846349436978, "grad_norm": 1.13923978805542, "kl": 0.38356119791666665, "learning_rate": 3.3811096751139803e-06, "loss": 0.0153, "reward": 0.2838889037569364, "reward_std": 0.15299782355626423, "rewards/accuracy_reward": 0.2838889037569364, "step": 1540 }, { "completion_length": 609.931142171224, "epoch": 0.8445332364693062, "grad_norm": 0.3999364674091339, "kl": 0.3766276041666667, "learning_rate": 3.29133161202528e-06, "loss": 0.0151, "reward": 0.27666668271025024, "reward_std": 0.14914882220327855, "rewards/accuracy_reward": 0.27666668271025024, "step": 1550 }, { "completion_length": 627.4083679199218, "epoch": 0.8499818379949147, "grad_norm": 0.24890734255313873, "kl": 0.47555338541666664, "learning_rate": 3.20434462619563e-06, "loss": 0.019, "reward": 0.29388890564441683, "reward_std": 0.17897858731448651, "rewards/accuracy_reward": 0.29388890564441683, "step": 1560 }, { "completion_length": 664.3505879720052, "epoch": 0.8554304395205231, "grad_norm": 0.4761791527271271, "kl": 0.7012369791666667, "learning_rate": 3.120180212999554e-06, "loss": 0.0281, "reward": 0.2994444578886032, "reward_std": 0.18282758841911953, "rewards/accuracy_reward": 0.2994444578886032, "step": 1570 }, { "completion_length": 633.1378051757813, "epoch": 0.8608790410461314, "grad_norm": 0.37956613302230835, "kl": 0.43369140625, "learning_rate": 3.038868845842816e-06, "loss": 0.0173, "reward": 0.3172222410639127, "reward_std": 0.1770540863275528, "rewards/accuracy_reward": 0.3172222410639127, "step": 1580 }, { "completion_length": 641.6078043619792, "epoch": 0.8663276425717399, "grad_norm": 0.3695982098579407, "kl": 0.3548177083333333, "learning_rate": 2.9604399651289172e-06, "loss": 0.0142, "reward": 0.27111112674077353, "reward_std": 0.1491488215823968, "rewards/accuracy_reward": 0.27111112674077353, "step": 1590 }, { "completion_length": 652.6572580973308, "epoch": 0.8717762440973483, "grad_norm": 0.3029833734035492, "kl": 0.45009765625, "learning_rate": 2.8849219675995975e-06, "loss": 0.018, "reward": 0.29888890584309896, "reward_std": 0.14722432121634482, "rewards/accuracy_reward": 0.29888890584309896, "step": 1600 }, { "completion_length": 677.012255859375, "epoch": 0.8772248456229568, "grad_norm": 0.29206570982933044, "kl": 0.53984375, "learning_rate": 2.81234219605325e-06, "loss": 0.0216, "reward": 0.29333334863185884, "reward_std": 0.1597335776935021, "rewards/accuracy_reward": 0.29333334863185884, "step": 1610 }, { "completion_length": 637.7455932617188, "epoch": 0.8826734471485652, "grad_norm": 0.40751364827156067, "kl": 0.4090494791666667, "learning_rate": 2.74272692944489e-06, "loss": 0.0164, "reward": 0.3127777968843778, "reward_std": 0.17224283305307228, "rewards/accuracy_reward": 0.3127777968843778, "step": 1620 }, { "completion_length": 657.5944803873698, "epoch": 0.8881220486741737, "grad_norm": 0.4413086473941803, "kl": 0.4694010416666667, "learning_rate": 2.676101373371348e-06, "loss": 0.0188, "reward": 0.29000001524885494, "reward_std": 0.15203557269026835, "rewards/accuracy_reward": 0.29000001524885494, "step": 1630 }, { "completion_length": 667.4072591145833, "epoch": 0.8935706501997821, "grad_norm": 0.4548060894012451, "kl": 0.44264322916666665, "learning_rate": 2.6124896509450905e-06, "loss": 0.0177, "reward": 0.29500001668930054, "reward_std": 0.17128058609863123, "rewards/accuracy_reward": 0.29500001668930054, "step": 1640 }, { "completion_length": 650.3867004394531, "epoch": 0.8990192517253904, "grad_norm": 0.407611608505249, "kl": 0.49342447916666665, "learning_rate": 2.5519147940599396e-06, "loss": 0.0197, "reward": 0.32111112972100575, "reward_std": 0.15684682484716178, "rewards/accuracy_reward": 0.32111112972100575, "step": 1650 }, { "completion_length": 678.8439229329427, "epoch": 0.9044678532509989, "grad_norm": 0.27222785353660583, "kl": 0.4901041666666667, "learning_rate": 2.4943987350519396e-06, "loss": 0.0196, "reward": 0.28388890127340954, "reward_std": 0.1616580783079068, "rewards/accuracy_reward": 0.28388890127340954, "step": 1660 }, { "completion_length": 677.3072611490885, "epoch": 0.9099164547766073, "grad_norm": 0.4685862362384796, "kl": 0.409375, "learning_rate": 2.4399622987583077e-06, "loss": 0.0164, "reward": 0.32777779499689735, "reward_std": 0.1732050855954488, "rewards/accuracy_reward": 0.32777779499689735, "step": 1670 }, { "completion_length": 660.9905904134115, "epoch": 0.9153650563022158, "grad_norm": 0.3277052938938141, "kl": 0.4407552083333333, "learning_rate": 2.3886251949773824e-06, "loss": 0.0176, "reward": 0.3100000177820524, "reward_std": 0.1587713255236546, "rewards/accuracy_reward": 0.3100000177820524, "step": 1680 }, { "completion_length": 667.7617045084636, "epoch": 0.9208136578278242, "grad_norm": 0.46443241834640503, "kl": 0.43193359375, "learning_rate": 2.340406011332303e-06, "loss": 0.0173, "reward": 0.28388890499869984, "reward_std": 0.15877132589618365, "rewards/accuracy_reward": 0.28388890499869984, "step": 1690 }, { "completion_length": 654.0983683268229, "epoch": 0.9262622593534326, "grad_norm": 0.5611951351165771, "kl": 0.539453125, "learning_rate": 2.2953222065409925e-06, "loss": 0.0216, "reward": 0.3122222418586413, "reward_std": 0.16262033060193062, "rewards/accuracy_reward": 0.3122222418586413, "step": 1700 }, { "completion_length": 666.0250386555989, "epoch": 0.9317108608790411, "grad_norm": 0.6241040229797363, "kl": 0.4469401041666667, "learning_rate": 2.2533901040948702e-06, "loss": 0.0179, "reward": 0.28611112634340924, "reward_std": 0.1655070791641871, "rewards/accuracy_reward": 0.28611112634340924, "step": 1710 }, { "completion_length": 691.7555908203125, "epoch": 0.9371594624046494, "grad_norm": 0.3166793882846832, "kl": 0.41158854166666664, "learning_rate": 2.214624886348614e-06, "loss": 0.0165, "reward": 0.29611112624406816, "reward_std": 0.1539600760365526, "rewards/accuracy_reward": 0.29611112624406816, "step": 1720 }, { "completion_length": 704.2828125, "epoch": 0.9426080639302579, "grad_norm": 0.39701592922210693, "kl": 0.37626953125, "learning_rate": 2.1790405890230854e-06, "loss": 0.0151, "reward": 0.30388890504837035, "reward_std": 0.15396007473270099, "rewards/accuracy_reward": 0.30388890504837035, "step": 1730 }, { "completion_length": 692.5489278157552, "epoch": 0.9480566654558663, "grad_norm": 0.33331432938575745, "kl": 0.41998697916666666, "learning_rate": 2.1466500961234122e-06, "loss": 0.0168, "reward": 0.29388890638947485, "reward_std": 0.151073324183623, "rewards/accuracy_reward": 0.29388890638947485, "step": 1740 }, { "completion_length": 705.2533732096355, "epoch": 0.9535052669814748, "grad_norm": 0.39180633425712585, "kl": 0.47578125, "learning_rate": 2.117465135274078e-06, "loss": 0.019, "reward": 0.3127777944008509, "reward_std": 0.167431582386295, "rewards/accuracy_reward": 0.3127777944008509, "step": 1750 }, { "completion_length": 684.3900370279948, "epoch": 0.9589538685070832, "grad_norm": 1.983847737312317, "kl": 0.4730143229166667, "learning_rate": 2.0914962734727106e-06, "loss": 0.0189, "reward": 0.30888890773057937, "reward_std": 0.16165807681779068, "rewards/accuracy_reward": 0.30888890773057937, "step": 1760 }, { "completion_length": 663.7994812011718, "epoch": 0.9644024700326916, "grad_norm": 1.055616021156311, "kl": 0.42532552083333336, "learning_rate": 2.068752913264074e-06, "loss": 0.017, "reward": 0.3155555707712968, "reward_std": 0.15492232541243237, "rewards/accuracy_reward": 0.3155555707712968, "step": 1770 }, { "completion_length": 668.6583719889323, "epoch": 0.9698510715583001, "grad_norm": 0.4602191746234894, "kl": 0.447265625, "learning_rate": 2.0492432893357008e-06, "loss": 0.0179, "reward": 0.29722223927577335, "reward_std": 0.15973357930779458, "rewards/accuracy_reward": 0.29722223927577335, "step": 1780 }, { "completion_length": 667.3239237467448, "epoch": 0.9752996730839084, "grad_norm": 0.2418515384197235, "kl": 0.45286458333333335, "learning_rate": 2.032974465536361e-06, "loss": 0.0181, "reward": 0.3111111268401146, "reward_std": 0.1520355723798275, "rewards/accuracy_reward": 0.3111111268401146, "step": 1790 }, { "completion_length": 713.8155985514323, "epoch": 0.9807482746095169, "grad_norm": 0.5044481158256531, "kl": 0.3831705729166667, "learning_rate": 2.0199523323184425e-06, "loss": 0.0153, "reward": 0.2966666837533315, "reward_std": 0.15973357719679673, "rewards/accuracy_reward": 0.2966666837533315, "step": 1800 }, { "completion_length": 703.5500345865886, "epoch": 0.9861968761351253, "grad_norm": 0.2937750220298767, "kl": 0.417578125, "learning_rate": 2.010181604605206e-06, "loss": 0.0167, "reward": 0.3172222393254439, "reward_std": 0.16165807812164226, "rewards/accuracy_reward": 0.3172222393254439, "step": 1810 }, { "completion_length": 713.2139221191406, "epoch": 0.9916454776607337, "grad_norm": 0.3408723771572113, "kl": 0.48447265625, "learning_rate": 2.0036658200836492e-06, "loss": 0.0194, "reward": 0.33277779320875805, "reward_std": 0.16550707891583444, "rewards/accuracy_reward": 0.33277779320875805, "step": 1820 }, { "completion_length": 712.1100321451823, "epoch": 0.9970940791863422, "grad_norm": 0.3224766254425049, "kl": 0.5034505208333333, "learning_rate": 2.0004073379236176e-06, "loss": 0.0201, "reward": 0.2988889073332151, "reward_std": 0.17512958732744058, "rewards/accuracy_reward": 0.2988889073332151, "step": 1830 }, { "completion_length": 708.7444864908854, "epoch": 0.9998183799491464, "kl": 0.4979166666666667, "reward": 0.29666668176651, "reward_std": 0.15396007398764291, "rewards/accuracy_reward": 0.29666668176651, "step": 1835, "total_flos": 0.0, "train_loss": 0.013265103175479612, "train_runtime": 96749.3074, "train_samples_per_second": 1.138, "train_steps_per_second": 0.019 } ], "logging_steps": 10, "max_steps": 1835, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 20, "trial_name": null, "trial_params": null }