{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 12059, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8.292561572269674e-05, "grad_norm": 4.255775451660156, "learning_rate": 0.0, "loss": 1.7068, "mean_token_accuracy": 0.6242363229393959, "num_tokens": 32768.0, "step": 1 }, { "epoch": 0.0004146280786134837, "grad_norm": 4.439743995666504, "learning_rate": 3.316749585406302e-07, "loss": 1.6681, "mean_token_accuracy": 0.6322473715990782, "num_tokens": 163840.0, "step": 5 }, { "epoch": 0.0008292561572269675, "grad_norm": 4.379396915435791, "learning_rate": 7.462686567164179e-07, "loss": 1.6459, "mean_token_accuracy": 0.6347201891243458, "num_tokens": 327680.0, "step": 10 }, { "epoch": 0.0012438842358404511, "grad_norm": 3.9361932277679443, "learning_rate": 1.1608623548922056e-06, "loss": 1.6593, "mean_token_accuracy": 0.6337243393063545, "num_tokens": 491520.0, "step": 15 }, { "epoch": 0.001658512314453935, "grad_norm": 3.2223989963531494, "learning_rate": 1.5754560530679936e-06, "loss": 1.6504, "mean_token_accuracy": 0.6342436477541924, "num_tokens": 655360.0, "step": 20 }, { "epoch": 0.0020731403930674187, "grad_norm": 3.0505387783050537, "learning_rate": 1.9900497512437813e-06, "loss": 1.6054, "mean_token_accuracy": 0.642295940220356, "num_tokens": 819200.0, "step": 25 }, { "epoch": 0.0024877684716809022, "grad_norm": 2.7383947372436523, "learning_rate": 2.404643449419569e-06, "loss": 1.5684, "mean_token_accuracy": 0.641458946466446, "num_tokens": 983040.0, "step": 30 }, { "epoch": 0.002902396550294386, "grad_norm": 2.6609301567077637, "learning_rate": 2.819237147595357e-06, "loss": 1.5282, "mean_token_accuracy": 0.6450574293732643, "num_tokens": 1146880.0, "step": 35 }, { "epoch": 0.00331702462890787, "grad_norm": 2.531510829925537, "learning_rate": 3.233830845771144e-06, "loss": 1.5194, "mean_token_accuracy": 0.6455400750041008, "num_tokens": 1310720.0, "step": 40 }, { "epoch": 0.0037316527075213534, "grad_norm": 2.3891067504882812, "learning_rate": 3.6484245439469323e-06, "loss": 1.5524, "mean_token_accuracy": 0.6400598779320716, "num_tokens": 1474560.0, "step": 45 }, { "epoch": 0.004146280786134837, "grad_norm": 2.4584314823150635, "learning_rate": 4.0630182421227205e-06, "loss": 1.5161, "mean_token_accuracy": 0.6443792775273323, "num_tokens": 1638400.0, "step": 50 }, { "epoch": 0.0045609088647483205, "grad_norm": 2.7202205657958984, "learning_rate": 4.477611940298508e-06, "loss": 1.451, "mean_token_accuracy": 0.6507514685392379, "num_tokens": 1802240.0, "step": 55 }, { "epoch": 0.0049755369433618045, "grad_norm": 2.1469480991363525, "learning_rate": 4.892205638474295e-06, "loss": 1.4507, "mean_token_accuracy": 0.6539161786437034, "num_tokens": 1966080.0, "step": 60 }, { "epoch": 0.0053901650219752885, "grad_norm": 2.370760679244995, "learning_rate": 5.306799336650083e-06, "loss": 1.4258, "mean_token_accuracy": 0.6556940346956253, "num_tokens": 2129920.0, "step": 65 }, { "epoch": 0.005804793100588772, "grad_norm": 2.206936836242676, "learning_rate": 5.7213930348258714e-06, "loss": 1.3915, "mean_token_accuracy": 0.665120966732502, "num_tokens": 2293760.0, "step": 70 }, { "epoch": 0.006219421179202256, "grad_norm": 2.31195330619812, "learning_rate": 6.135986733001659e-06, "loss": 1.4497, "mean_token_accuracy": 0.6530730664730072, "num_tokens": 2457600.0, "step": 75 }, { "epoch": 0.00663404925781574, "grad_norm": 2.3220083713531494, "learning_rate": 6.550580431177446e-06, "loss": 1.3777, "mean_token_accuracy": 0.6623583763837815, "num_tokens": 2621056.0, "step": 80 }, { "epoch": 0.007048677336429223, "grad_norm": 2.3705880641937256, "learning_rate": 6.965174129353234e-06, "loss": 1.4263, "mean_token_accuracy": 0.6539833784103394, "num_tokens": 2784896.0, "step": 85 }, { "epoch": 0.007463305415042707, "grad_norm": 2.4472298622131348, "learning_rate": 7.3797678275290215e-06, "loss": 1.4116, "mean_token_accuracy": 0.6581011697649956, "num_tokens": 2948736.0, "step": 90 }, { "epoch": 0.00787793349365619, "grad_norm": 2.323025941848755, "learning_rate": 7.79436152570481e-06, "loss": 1.3931, "mean_token_accuracy": 0.6656957253813743, "num_tokens": 3111601.0, "step": 95 }, { "epoch": 0.008292561572269675, "grad_norm": 2.3700666427612305, "learning_rate": 8.208955223880597e-06, "loss": 1.4302, "mean_token_accuracy": 0.6556268364191056, "num_tokens": 3275441.0, "step": 100 }, { "epoch": 0.008707189650883157, "grad_norm": 2.247762441635132, "learning_rate": 8.623548922056384e-06, "loss": 1.3676, "mean_token_accuracy": 0.6675586551427841, "num_tokens": 3439281.0, "step": 105 }, { "epoch": 0.009121817729496641, "grad_norm": 2.2979612350463867, "learning_rate": 9.038142620232173e-06, "loss": 1.3495, "mean_token_accuracy": 0.6681207224726677, "num_tokens": 3603121.0, "step": 110 }, { "epoch": 0.009536445808110125, "grad_norm": 2.7338132858276367, "learning_rate": 9.45273631840796e-06, "loss": 1.3666, "mean_token_accuracy": 0.6656109973788261, "num_tokens": 3766936.0, "step": 115 }, { "epoch": 0.009951073886723609, "grad_norm": 2.2777035236358643, "learning_rate": 9.867330016583748e-06, "loss": 1.3427, "mean_token_accuracy": 0.6667766377329827, "num_tokens": 3930776.0, "step": 120 }, { "epoch": 0.010365701965337093, "grad_norm": 2.358052968978882, "learning_rate": 1.0281923714759537e-05, "loss": 1.3618, "mean_token_accuracy": 0.6643939435482025, "num_tokens": 4094616.0, "step": 125 }, { "epoch": 0.010780330043950577, "grad_norm": 2.2697947025299072, "learning_rate": 1.0696517412935324e-05, "loss": 1.3022, "mean_token_accuracy": 0.6709677398204803, "num_tokens": 4258456.0, "step": 130 }, { "epoch": 0.01119495812256406, "grad_norm": 2.2408878803253174, "learning_rate": 1.1111111111111112e-05, "loss": 1.3369, "mean_token_accuracy": 0.6672592878341674, "num_tokens": 4422296.0, "step": 135 }, { "epoch": 0.011609586201177543, "grad_norm": 2.094745397567749, "learning_rate": 1.1525704809286899e-05, "loss": 1.3387, "mean_token_accuracy": 0.6627565950155259, "num_tokens": 4586136.0, "step": 140 }, { "epoch": 0.012024214279791027, "grad_norm": 2.171077251434326, "learning_rate": 1.1940298507462686e-05, "loss": 1.3199, "mean_token_accuracy": 0.6675586476922035, "num_tokens": 4749976.0, "step": 145 }, { "epoch": 0.012438842358404511, "grad_norm": 2.0367488861083984, "learning_rate": 1.2354892205638475e-05, "loss": 1.2787, "mean_token_accuracy": 0.6763868525624275, "num_tokens": 4913816.0, "step": 150 }, { "epoch": 0.012853470437017995, "grad_norm": 2.0931756496429443, "learning_rate": 1.2769485903814263e-05, "loss": 1.2735, "mean_token_accuracy": 0.6738025419414043, "num_tokens": 5077656.0, "step": 155 }, { "epoch": 0.01326809851563148, "grad_norm": 2.0900051593780518, "learning_rate": 1.3184079601990052e-05, "loss": 1.3317, "mean_token_accuracy": 0.6669599190354347, "num_tokens": 5241496.0, "step": 160 }, { "epoch": 0.013682726594244961, "grad_norm": 2.108306407928467, "learning_rate": 1.3598673300165837e-05, "loss": 1.4021, "mean_token_accuracy": 0.6558712109923363, "num_tokens": 5405336.0, "step": 165 }, { "epoch": 0.014097354672858445, "grad_norm": 2.012080669403076, "learning_rate": 1.4013266998341626e-05, "loss": 1.2891, "mean_token_accuracy": 0.6758186653256416, "num_tokens": 5569176.0, "step": 170 }, { "epoch": 0.01451198275147193, "grad_norm": 2.0759711265563965, "learning_rate": 1.4427860696517415e-05, "loss": 1.2816, "mean_token_accuracy": 0.6724951103329658, "num_tokens": 5733016.0, "step": 175 }, { "epoch": 0.014926610830085413, "grad_norm": 1.8789936304092407, "learning_rate": 1.4842454394693201e-05, "loss": 1.2954, "mean_token_accuracy": 0.677889783680439, "num_tokens": 5896856.0, "step": 180 }, { "epoch": 0.015341238908698897, "grad_norm": 1.902726650238037, "learning_rate": 1.525704809286899e-05, "loss": 1.3149, "mean_token_accuracy": 0.6745784431695938, "num_tokens": 6060696.0, "step": 185 }, { "epoch": 0.01575586698731238, "grad_norm": 2.1739706993103027, "learning_rate": 1.5671641791044777e-05, "loss": 1.2684, "mean_token_accuracy": 0.6763257533311844, "num_tokens": 6224536.0, "step": 190 }, { "epoch": 0.016170495065925865, "grad_norm": 1.9013311862945557, "learning_rate": 1.6086235489220563e-05, "loss": 1.3092, "mean_token_accuracy": 0.6740285888314247, "num_tokens": 6388376.0, "step": 195 }, { "epoch": 0.01658512314453935, "grad_norm": 1.9905920028686523, "learning_rate": 1.6500829187396352e-05, "loss": 1.2969, "mean_token_accuracy": 0.6714259505271911, "num_tokens": 6552216.0, "step": 200 }, { "epoch": 0.016999751223152833, "grad_norm": 2.1580097675323486, "learning_rate": 1.691542288557214e-05, "loss": 1.2599, "mean_token_accuracy": 0.6779753148555756, "num_tokens": 6716056.0, "step": 205 }, { "epoch": 0.017414379301766314, "grad_norm": 2.1075289249420166, "learning_rate": 1.7330016583747926e-05, "loss": 1.3148, "mean_token_accuracy": 0.6719086065888404, "num_tokens": 6879896.0, "step": 210 }, { "epoch": 0.017829007380379798, "grad_norm": 2.0454044342041016, "learning_rate": 1.7744610281923716e-05, "loss": 1.3219, "mean_token_accuracy": 0.6674425765872002, "num_tokens": 7043736.0, "step": 215 }, { "epoch": 0.018243635458993282, "grad_norm": 1.9608289003372192, "learning_rate": 1.8159203980099505e-05, "loss": 1.2862, "mean_token_accuracy": 0.6722079649567604, "num_tokens": 7207576.0, "step": 220 }, { "epoch": 0.018658263537606766, "grad_norm": 1.9292097091674805, "learning_rate": 1.857379767827529e-05, "loss": 1.3089, "mean_token_accuracy": 0.6710471615195275, "num_tokens": 7371416.0, "step": 225 }, { "epoch": 0.01907289161622025, "grad_norm": 1.9487251043319702, "learning_rate": 1.898839137645108e-05, "loss": 1.268, "mean_token_accuracy": 0.6755315259099006, "num_tokens": 7535256.0, "step": 230 }, { "epoch": 0.019487519694833734, "grad_norm": 1.9679298400878906, "learning_rate": 1.9402985074626868e-05, "loss": 1.2515, "mean_token_accuracy": 0.6827223837375641, "num_tokens": 7699096.0, "step": 235 }, { "epoch": 0.019902147773447218, "grad_norm": 2.0813894271850586, "learning_rate": 1.9817578772802657e-05, "loss": 1.2744, "mean_token_accuracy": 0.6776331901550293, "num_tokens": 7862936.0, "step": 240 }, { "epoch": 0.020316775852060702, "grad_norm": 1.8269994258880615, "learning_rate": 2.0232172470978443e-05, "loss": 1.2968, "mean_token_accuracy": 0.6754276618361473, "num_tokens": 8026776.0, "step": 245 }, { "epoch": 0.020731403930674186, "grad_norm": 1.8057606220245361, "learning_rate": 2.0646766169154232e-05, "loss": 1.329, "mean_token_accuracy": 0.668071848154068, "num_tokens": 8190616.0, "step": 250 }, { "epoch": 0.02114603200928767, "grad_norm": 1.9886025190353394, "learning_rate": 2.1061359867330017e-05, "loss": 1.2449, "mean_token_accuracy": 0.6797592893242836, "num_tokens": 8354456.0, "step": 255 }, { "epoch": 0.021560660087901154, "grad_norm": 1.895953893661499, "learning_rate": 2.1475953565505803e-05, "loss": 1.2596, "mean_token_accuracy": 0.6817754164338112, "num_tokens": 8518296.0, "step": 260 }, { "epoch": 0.021975288166514638, "grad_norm": 1.98335599899292, "learning_rate": 2.1890547263681592e-05, "loss": 1.2828, "mean_token_accuracy": 0.6719269320368767, "num_tokens": 8682136.0, "step": 265 }, { "epoch": 0.02238991624512812, "grad_norm": 2.0442817211151123, "learning_rate": 2.230514096185738e-05, "loss": 1.2699, "mean_token_accuracy": 0.6768572837114334, "num_tokens": 8845976.0, "step": 270 }, { "epoch": 0.022804544323741602, "grad_norm": 1.8405953645706177, "learning_rate": 2.2719734660033167e-05, "loss": 1.2689, "mean_token_accuracy": 0.6766678869724274, "num_tokens": 9009816.0, "step": 275 }, { "epoch": 0.023219172402355086, "grad_norm": 1.8921599388122559, "learning_rate": 2.3134328358208956e-05, "loss": 1.2757, "mean_token_accuracy": 0.6725439906120301, "num_tokens": 9173656.0, "step": 280 }, { "epoch": 0.02363380048096857, "grad_norm": 1.9423072338104248, "learning_rate": 2.3548922056384745e-05, "loss": 1.2914, "mean_token_accuracy": 0.6712365537881851, "num_tokens": 9337496.0, "step": 285 }, { "epoch": 0.024048428559582054, "grad_norm": 1.901416540145874, "learning_rate": 2.396351575456053e-05, "loss": 1.2725, "mean_token_accuracy": 0.6756964817643165, "num_tokens": 9501336.0, "step": 290 }, { "epoch": 0.02446305663819554, "grad_norm": 1.8767505884170532, "learning_rate": 2.437810945273632e-05, "loss": 1.2286, "mean_token_accuracy": 0.6843169555068016, "num_tokens": 9665176.0, "step": 295 }, { "epoch": 0.024877684716809022, "grad_norm": 1.8018405437469482, "learning_rate": 2.479270315091211e-05, "loss": 1.3213, "mean_token_accuracy": 0.6664894863963127, "num_tokens": 9829016.0, "step": 300 }, { "epoch": 0.025292312795422506, "grad_norm": 1.8564913272857666, "learning_rate": 2.5207296849087897e-05, "loss": 1.2648, "mean_token_accuracy": 0.6779142245650291, "num_tokens": 9992856.0, "step": 305 }, { "epoch": 0.02570694087403599, "grad_norm": 1.8388084173202515, "learning_rate": 2.5621890547263683e-05, "loss": 1.2181, "mean_token_accuracy": 0.6881781488656997, "num_tokens": 10156696.0, "step": 310 }, { "epoch": 0.026121568952649474, "grad_norm": 1.929056167602539, "learning_rate": 2.603648424543947e-05, "loss": 1.353, "mean_token_accuracy": 0.6692035496234894, "num_tokens": 10319872.0, "step": 315 }, { "epoch": 0.02653619703126296, "grad_norm": 1.7754048109054565, "learning_rate": 2.645107794361526e-05, "loss": 1.2762, "mean_token_accuracy": 0.6776442378759384, "num_tokens": 10482899.0, "step": 320 }, { "epoch": 0.026950825109876442, "grad_norm": 1.938942313194275, "learning_rate": 2.6865671641791047e-05, "loss": 1.2157, "mean_token_accuracy": 0.6928702339529991, "num_tokens": 10646739.0, "step": 325 }, { "epoch": 0.027365453188489923, "grad_norm": 2.127412796020508, "learning_rate": 2.7280265339966832e-05, "loss": 1.3557, "mean_token_accuracy": 0.6633675456047058, "num_tokens": 10810579.0, "step": 330 }, { "epoch": 0.027780081267103407, "grad_norm": 1.875630497932434, "learning_rate": 2.7694859038142625e-05, "loss": 1.2634, "mean_token_accuracy": 0.6779936462640762, "num_tokens": 10974419.0, "step": 335 }, { "epoch": 0.02819470934571689, "grad_norm": 1.8406985998153687, "learning_rate": 2.810945273631841e-05, "loss": 1.2822, "mean_token_accuracy": 0.6770466819405556, "num_tokens": 11138259.0, "step": 340 }, { "epoch": 0.028609337424330375, "grad_norm": 1.8575828075408936, "learning_rate": 2.8524046434494196e-05, "loss": 1.256, "mean_token_accuracy": 0.6802602678537368, "num_tokens": 11302099.0, "step": 345 }, { "epoch": 0.02902396550294386, "grad_norm": 1.8851755857467651, "learning_rate": 2.8938640132669985e-05, "loss": 1.3098, "mean_token_accuracy": 0.6703384682536125, "num_tokens": 11465939.0, "step": 350 }, { "epoch": 0.029438593581557343, "grad_norm": 1.8496495485305786, "learning_rate": 2.935323383084577e-05, "loss": 1.279, "mean_token_accuracy": 0.6763624161481857, "num_tokens": 11629779.0, "step": 355 }, { "epoch": 0.029853221660170827, "grad_norm": 1.8364508152008057, "learning_rate": 2.976782752902156e-05, "loss": 1.2982, "mean_token_accuracy": 0.6732404701411724, "num_tokens": 11793619.0, "step": 360 }, { "epoch": 0.03026784973878431, "grad_norm": 1.8799529075622559, "learning_rate": 3.018242122719735e-05, "loss": 1.2816, "mean_token_accuracy": 0.6801197469234467, "num_tokens": 11957459.0, "step": 365 }, { "epoch": 0.030682477817397795, "grad_norm": 1.8871667385101318, "learning_rate": 3.059701492537314e-05, "loss": 1.2468, "mean_token_accuracy": 0.6795088008046151, "num_tokens": 12121299.0, "step": 370 }, { "epoch": 0.03109710589601128, "grad_norm": 1.9109357595443726, "learning_rate": 3.101160862354892e-05, "loss": 1.3207, "mean_token_accuracy": 0.6695869967341423, "num_tokens": 12285139.0, "step": 375 }, { "epoch": 0.03151173397462476, "grad_norm": 1.798118233680725, "learning_rate": 3.1426202321724716e-05, "loss": 1.3022, "mean_token_accuracy": 0.6710410594940186, "num_tokens": 12448979.0, "step": 380 }, { "epoch": 0.03192636205323825, "grad_norm": 1.9242483377456665, "learning_rate": 3.18407960199005e-05, "loss": 1.2831, "mean_token_accuracy": 0.674749507009983, "num_tokens": 12612819.0, "step": 385 }, { "epoch": 0.03234099013185173, "grad_norm": 1.8487025499343872, "learning_rate": 3.225538971807629e-05, "loss": 1.2911, "mean_token_accuracy": 0.6759653016924858, "num_tokens": 12776659.0, "step": 390 }, { "epoch": 0.032755618210465215, "grad_norm": 1.8383572101593018, "learning_rate": 3.266998341625207e-05, "loss": 1.283, "mean_token_accuracy": 0.6767656400799751, "num_tokens": 12940499.0, "step": 395 }, { "epoch": 0.0331702462890787, "grad_norm": 1.8775842189788818, "learning_rate": 3.3084577114427865e-05, "loss": 1.2318, "mean_token_accuracy": 0.6872495085000991, "num_tokens": 13104339.0, "step": 400 }, { "epoch": 0.03358487436769218, "grad_norm": 1.8343756198883057, "learning_rate": 3.349917081260365e-05, "loss": 1.2764, "mean_token_accuracy": 0.6762924410402775, "num_tokens": 13267633.0, "step": 405 }, { "epoch": 0.03399950244630567, "grad_norm": 1.866855263710022, "learning_rate": 3.3913764510779436e-05, "loss": 1.2361, "mean_token_accuracy": 0.6795393496751785, "num_tokens": 13431473.0, "step": 410 }, { "epoch": 0.03441413052491915, "grad_norm": 2.01704478263855, "learning_rate": 3.432835820895522e-05, "loss": 1.2635, "mean_token_accuracy": 0.6738086491823196, "num_tokens": 13595313.0, "step": 415 }, { "epoch": 0.03482875860353263, "grad_norm": 2.0374269485473633, "learning_rate": 3.474295190713101e-05, "loss": 1.3077, "mean_token_accuracy": 0.6713587492704391, "num_tokens": 13759153.0, "step": 420 }, { "epoch": 0.03524338668214611, "grad_norm": 1.7698193788528442, "learning_rate": 3.51575456053068e-05, "loss": 1.2407, "mean_token_accuracy": 0.6825207769870758, "num_tokens": 13922993.0, "step": 425 }, { "epoch": 0.035658014760759596, "grad_norm": 1.7802177667617798, "learning_rate": 3.5572139303482585e-05, "loss": 1.2346, "mean_token_accuracy": 0.6825635403394699, "num_tokens": 14086833.0, "step": 430 }, { "epoch": 0.03607264283937308, "grad_norm": 1.8050049543380737, "learning_rate": 3.598673300165838e-05, "loss": 1.2194, "mean_token_accuracy": 0.6875488728284835, "num_tokens": 14250673.0, "step": 435 }, { "epoch": 0.036487270917986564, "grad_norm": 1.7095775604248047, "learning_rate": 3.6401326699834163e-05, "loss": 1.2356, "mean_token_accuracy": 0.6833883225917816, "num_tokens": 14414513.0, "step": 440 }, { "epoch": 0.03690189899660005, "grad_norm": 1.748965859413147, "learning_rate": 3.681592039800995e-05, "loss": 1.2422, "mean_token_accuracy": 0.6845124676823616, "num_tokens": 14578353.0, "step": 445 }, { "epoch": 0.03731652707521353, "grad_norm": 1.8167507648468018, "learning_rate": 3.723051409618574e-05, "loss": 1.3053, "mean_token_accuracy": 0.6701429590582848, "num_tokens": 14742193.0, "step": 450 }, { "epoch": 0.037731155153827016, "grad_norm": 1.798095464706421, "learning_rate": 3.764510779436153e-05, "loss": 1.3032, "mean_token_accuracy": 0.6737719923257828, "num_tokens": 14906033.0, "step": 455 }, { "epoch": 0.0381457832324405, "grad_norm": 1.8302721977233887, "learning_rate": 3.805970149253731e-05, "loss": 1.3093, "mean_token_accuracy": 0.6685470998287201, "num_tokens": 15069125.0, "step": 460 }, { "epoch": 0.038560411311053984, "grad_norm": 1.8639119863510132, "learning_rate": 3.8474295190713105e-05, "loss": 1.3691, "mean_token_accuracy": 0.6577162772417069, "num_tokens": 15232965.0, "step": 465 }, { "epoch": 0.03897503938966747, "grad_norm": 1.7261921167373657, "learning_rate": 3.888888888888889e-05, "loss": 1.2696, "mean_token_accuracy": 0.6784152060747146, "num_tokens": 15396805.0, "step": 470 }, { "epoch": 0.03938966746828095, "grad_norm": 1.8879308700561523, "learning_rate": 3.9303482587064676e-05, "loss": 1.2801, "mean_token_accuracy": 0.6760569319128991, "num_tokens": 15560645.0, "step": 475 }, { "epoch": 0.039804295546894436, "grad_norm": 1.7602691650390625, "learning_rate": 3.971807628524047e-05, "loss": 1.3729, "mean_token_accuracy": 0.6612170025706291, "num_tokens": 15724485.0, "step": 480 }, { "epoch": 0.04021892362550792, "grad_norm": 1.8454163074493408, "learning_rate": 4.0132669983416254e-05, "loss": 1.3278, "mean_token_accuracy": 0.6671309858560562, "num_tokens": 15888325.0, "step": 485 }, { "epoch": 0.040633551704121404, "grad_norm": 1.7032504081726074, "learning_rate": 4.054726368159204e-05, "loss": 1.235, "mean_token_accuracy": 0.6836302936077118, "num_tokens": 16051794.0, "step": 490 }, { "epoch": 0.04104817978273489, "grad_norm": 1.6901429891586304, "learning_rate": 4.096185737976783e-05, "loss": 1.2892, "mean_token_accuracy": 0.673714742064476, "num_tokens": 16215039.0, "step": 495 }, { "epoch": 0.04146280786134837, "grad_norm": 1.7194161415100098, "learning_rate": 4.137645107794362e-05, "loss": 1.3475, "mean_token_accuracy": 0.6696969702839851, "num_tokens": 16378879.0, "step": 500 }, { "epoch": 0.041877435939961856, "grad_norm": 1.787503719329834, "learning_rate": 4.1791044776119404e-05, "loss": 1.2836, "mean_token_accuracy": 0.6777657330036163, "num_tokens": 16541966.0, "step": 505 }, { "epoch": 0.04229206401857534, "grad_norm": 1.6372019052505493, "learning_rate": 4.2205638474295196e-05, "loss": 1.2472, "mean_token_accuracy": 0.6835716024041176, "num_tokens": 16705806.0, "step": 510 }, { "epoch": 0.042706692097188824, "grad_norm": 1.7618612051010132, "learning_rate": 4.262023217247098e-05, "loss": 1.3188, "mean_token_accuracy": 0.6690554767847061, "num_tokens": 16869646.0, "step": 515 }, { "epoch": 0.04312132017580231, "grad_norm": 2.250345468521118, "learning_rate": 4.303482587064677e-05, "loss": 1.38, "mean_token_accuracy": 0.6570992156863212, "num_tokens": 17033486.0, "step": 520 }, { "epoch": 0.04353594825441579, "grad_norm": 1.7955857515335083, "learning_rate": 4.344941956882256e-05, "loss": 1.2819, "mean_token_accuracy": 0.6765945747494697, "num_tokens": 17197326.0, "step": 525 }, { "epoch": 0.043950576333029276, "grad_norm": 1.7054221630096436, "learning_rate": 4.3864013266998345e-05, "loss": 1.2634, "mean_token_accuracy": 0.6766556695103645, "num_tokens": 17361166.0, "step": 530 }, { "epoch": 0.04436520441164276, "grad_norm": 1.677278757095337, "learning_rate": 4.427860696517413e-05, "loss": 1.2543, "mean_token_accuracy": 0.6794110462069511, "num_tokens": 17525006.0, "step": 535 }, { "epoch": 0.04477983249025624, "grad_norm": 1.789715051651001, "learning_rate": 4.469320066334992e-05, "loss": 1.27, "mean_token_accuracy": 0.6728861182928085, "num_tokens": 17688846.0, "step": 540 }, { "epoch": 0.04519446056886972, "grad_norm": 1.7444549798965454, "learning_rate": 4.510779436152571e-05, "loss": 1.2851, "mean_token_accuracy": 0.6725989744067192, "num_tokens": 17852686.0, "step": 545 }, { "epoch": 0.045609088647483205, "grad_norm": 1.7071665525436401, "learning_rate": 4.5522388059701495e-05, "loss": 1.3174, "mean_token_accuracy": 0.6670454546809197, "num_tokens": 18016526.0, "step": 550 }, { "epoch": 0.04602371672609669, "grad_norm": 1.6631416082382202, "learning_rate": 4.593698175787729e-05, "loss": 1.3353, "mean_token_accuracy": 0.6682978942990303, "num_tokens": 18180366.0, "step": 555 }, { "epoch": 0.04643834480471017, "grad_norm": 1.747498869895935, "learning_rate": 4.635157545605307e-05, "loss": 1.2341, "mean_token_accuracy": 0.6825391009449959, "num_tokens": 18344206.0, "step": 560 }, { "epoch": 0.04685297288332366, "grad_norm": 1.735395908355713, "learning_rate": 4.676616915422886e-05, "loss": 1.2797, "mean_token_accuracy": 0.6739538997411728, "num_tokens": 18507084.0, "step": 565 }, { "epoch": 0.04726760096193714, "grad_norm": 1.7586103677749634, "learning_rate": 4.718076285240465e-05, "loss": 1.4349, "mean_token_accuracy": 0.6478922292590141, "num_tokens": 18670924.0, "step": 570 }, { "epoch": 0.047682229040550625, "grad_norm": 1.7093908786773682, "learning_rate": 4.7595356550580436e-05, "loss": 1.3027, "mean_token_accuracy": 0.6697977557778358, "num_tokens": 18834639.0, "step": 575 }, { "epoch": 0.04809685711916411, "grad_norm": 1.7219830751419067, "learning_rate": 4.800995024875622e-05, "loss": 1.3248, "mean_token_accuracy": 0.6675891973078251, "num_tokens": 18998479.0, "step": 580 }, { "epoch": 0.04851148519777759, "grad_norm": 1.7310324907302856, "learning_rate": 4.842454394693201e-05, "loss": 1.2503, "mean_token_accuracy": 0.6795957133173942, "num_tokens": 19162276.0, "step": 585 }, { "epoch": 0.04892611327639108, "grad_norm": 1.7623659372329712, "learning_rate": 4.883913764510779e-05, "loss": 1.2349, "mean_token_accuracy": 0.6812561109662056, "num_tokens": 19326116.0, "step": 590 }, { "epoch": 0.04934074135500456, "grad_norm": 1.6814851760864258, "learning_rate": 4.9253731343283586e-05, "loss": 1.3514, "mean_token_accuracy": 0.6650232136249542, "num_tokens": 19489956.0, "step": 595 }, { "epoch": 0.049755369433618045, "grad_norm": 1.9025373458862305, "learning_rate": 4.966832504145937e-05, "loss": 1.3641, "mean_token_accuracy": 0.6606243863701821, "num_tokens": 19653796.0, "step": 600 }, { "epoch": 0.05016999751223153, "grad_norm": 1.6884503364562988, "learning_rate": 5.0082918739635164e-05, "loss": 1.3043, "mean_token_accuracy": 0.6688844054937363, "num_tokens": 19817636.0, "step": 605 }, { "epoch": 0.05058462559084501, "grad_norm": 1.7554152011871338, "learning_rate": 5.049751243781094e-05, "loss": 1.2706, "mean_token_accuracy": 0.6801991656422615, "num_tokens": 19981476.0, "step": 610 }, { "epoch": 0.0509992536694585, "grad_norm": 1.6515464782714844, "learning_rate": 5.0912106135986735e-05, "loss": 1.3007, "mean_token_accuracy": 0.6692387565970421, "num_tokens": 20145316.0, "step": 615 }, { "epoch": 0.05141388174807198, "grad_norm": 1.6192206144332886, "learning_rate": 5.132669983416253e-05, "loss": 1.2865, "mean_token_accuracy": 0.6751038610935212, "num_tokens": 20309156.0, "step": 620 }, { "epoch": 0.051828509826685465, "grad_norm": 1.6686915159225464, "learning_rate": 5.1741293532338306e-05, "loss": 1.2957, "mean_token_accuracy": 0.6767473071813583, "num_tokens": 20472996.0, "step": 625 }, { "epoch": 0.05224313790529895, "grad_norm": 1.667669653892517, "learning_rate": 5.21558872305141e-05, "loss": 1.2578, "mean_token_accuracy": 0.679050587117672, "num_tokens": 20636836.0, "step": 630 }, { "epoch": 0.05265776598391243, "grad_norm": 1.6532917022705078, "learning_rate": 5.257048092868989e-05, "loss": 1.2807, "mean_token_accuracy": 0.6721774160861969, "num_tokens": 20800676.0, "step": 635 }, { "epoch": 0.05307239406252592, "grad_norm": 1.7098535299301147, "learning_rate": 5.298507462686567e-05, "loss": 1.3167, "mean_token_accuracy": 0.6653653442859649, "num_tokens": 20964516.0, "step": 640 }, { "epoch": 0.0534870221411394, "grad_norm": 1.7239949703216553, "learning_rate": 5.339966832504146e-05, "loss": 1.313, "mean_token_accuracy": 0.671446581184864, "num_tokens": 21127831.0, "step": 645 }, { "epoch": 0.053901650219752885, "grad_norm": 1.6677320003509521, "learning_rate": 5.3814262023217255e-05, "loss": 1.2705, "mean_token_accuracy": 0.6744562536478043, "num_tokens": 21291671.0, "step": 650 }, { "epoch": 0.05431627829836636, "grad_norm": 1.6505001783370972, "learning_rate": 5.422885572139303e-05, "loss": 1.3399, "mean_token_accuracy": 0.6627688139677048, "num_tokens": 21455511.0, "step": 655 }, { "epoch": 0.054730906376979846, "grad_norm": 1.6301279067993164, "learning_rate": 5.4643449419568826e-05, "loss": 1.2612, "mean_token_accuracy": 0.6786168172955513, "num_tokens": 21619351.0, "step": 660 }, { "epoch": 0.05514553445559333, "grad_norm": 1.6679112911224365, "learning_rate": 5.505804311774462e-05, "loss": 1.2431, "mean_token_accuracy": 0.682655180990696, "num_tokens": 21783191.0, "step": 665 }, { "epoch": 0.055560162534206814, "grad_norm": 1.6891405582427979, "learning_rate": 5.54726368159204e-05, "loss": 1.3149, "mean_token_accuracy": 0.6664772793650627, "num_tokens": 21947031.0, "step": 670 }, { "epoch": 0.0559747906128203, "grad_norm": 1.7086944580078125, "learning_rate": 5.588723051409619e-05, "loss": 1.2677, "mean_token_accuracy": 0.6777737095952034, "num_tokens": 22110871.0, "step": 675 }, { "epoch": 0.05638941869143378, "grad_norm": 1.6492773294448853, "learning_rate": 5.630182421227198e-05, "loss": 1.2703, "mean_token_accuracy": 0.6764601692557335, "num_tokens": 22274711.0, "step": 680 }, { "epoch": 0.056804046770047266, "grad_norm": 1.843117356300354, "learning_rate": 5.671641791044776e-05, "loss": 1.2863, "mean_token_accuracy": 0.6737047895789147, "num_tokens": 22438551.0, "step": 685 }, { "epoch": 0.05721867484866075, "grad_norm": 1.6733990907669067, "learning_rate": 5.713101160862355e-05, "loss": 1.1973, "mean_token_accuracy": 0.6911718010902405, "num_tokens": 22602391.0, "step": 690 }, { "epoch": 0.057633302927274234, "grad_norm": 1.5508854389190674, "learning_rate": 5.7545605306799345e-05, "loss": 1.3249, "mean_token_accuracy": 0.6713893011212348, "num_tokens": 22766231.0, "step": 695 }, { "epoch": 0.05804793100588772, "grad_norm": 1.62466299533844, "learning_rate": 5.7960199004975124e-05, "loss": 1.3918, "mean_token_accuracy": 0.6578445717692375, "num_tokens": 22930071.0, "step": 700 }, { "epoch": 0.0584625590845012, "grad_norm": 1.4997344017028809, "learning_rate": 5.837479270315092e-05, "loss": 1.3206, "mean_token_accuracy": 0.6705156370997429, "num_tokens": 23093911.0, "step": 705 }, { "epoch": 0.058877187163114686, "grad_norm": 1.654841423034668, "learning_rate": 5.878938640132671e-05, "loss": 1.2856, "mean_token_accuracy": 0.6701063022017479, "num_tokens": 23257751.0, "step": 710 }, { "epoch": 0.05929181524172817, "grad_norm": 1.640665888786316, "learning_rate": 5.920398009950249e-05, "loss": 1.3436, "mean_token_accuracy": 0.6653836756944657, "num_tokens": 23421591.0, "step": 715 }, { "epoch": 0.059706443320341654, "grad_norm": 1.6757763624191284, "learning_rate": 5.961857379767828e-05, "loss": 1.3558, "mean_token_accuracy": 0.6623839154839516, "num_tokens": 23585431.0, "step": 720 }, { "epoch": 0.06012107139895514, "grad_norm": 1.6322523355484009, "learning_rate": 6.003316749585407e-05, "loss": 1.3635, "mean_token_accuracy": 0.6597018599510193, "num_tokens": 23749271.0, "step": 725 }, { "epoch": 0.06053569947756862, "grad_norm": 1.6678824424743652, "learning_rate": 6.044776119402985e-05, "loss": 1.3659, "mean_token_accuracy": 0.6591336786746979, "num_tokens": 23913111.0, "step": 730 }, { "epoch": 0.060950327556182106, "grad_norm": 1.4809238910675049, "learning_rate": 6.0862354892205644e-05, "loss": 1.2666, "mean_token_accuracy": 0.6795271247625351, "num_tokens": 24076951.0, "step": 735 }, { "epoch": 0.06136495563479559, "grad_norm": 1.533839225769043, "learning_rate": 6.127694859038143e-05, "loss": 1.2866, "mean_token_accuracy": 0.6745417892932892, "num_tokens": 24240791.0, "step": 740 }, { "epoch": 0.061779583713409074, "grad_norm": 1.6793161630630493, "learning_rate": 6.169154228855722e-05, "loss": 1.2913, "mean_token_accuracy": 0.6742424249649048, "num_tokens": 24404631.0, "step": 745 }, { "epoch": 0.06219421179202256, "grad_norm": 2.060741424560547, "learning_rate": 6.2106135986733e-05, "loss": 1.3387, "mean_token_accuracy": 0.6664833813905716, "num_tokens": 24568471.0, "step": 750 }, { "epoch": 0.06260883987063603, "grad_norm": 1.731775164604187, "learning_rate": 6.25207296849088e-05, "loss": 1.2995, "mean_token_accuracy": 0.6714076220989227, "num_tokens": 24732311.0, "step": 755 }, { "epoch": 0.06302346794924953, "grad_norm": 1.576099157333374, "learning_rate": 6.293532338308457e-05, "loss": 1.3214, "mean_token_accuracy": 0.6706133902072906, "num_tokens": 24896151.0, "step": 760 }, { "epoch": 0.063438096027863, "grad_norm": 1.6460028886795044, "learning_rate": 6.334991708126037e-05, "loss": 1.2541, "mean_token_accuracy": 0.6741141244769097, "num_tokens": 25059991.0, "step": 765 }, { "epoch": 0.0638527241064765, "grad_norm": 1.728865146636963, "learning_rate": 6.376451077943616e-05, "loss": 1.3508, "mean_token_accuracy": 0.6589809343218803, "num_tokens": 25223831.0, "step": 770 }, { "epoch": 0.06426735218508997, "grad_norm": 1.5584203004837036, "learning_rate": 6.417910447761194e-05, "loss": 1.2995, "mean_token_accuracy": 0.6665994688868523, "num_tokens": 25387671.0, "step": 775 }, { "epoch": 0.06468198026370346, "grad_norm": 1.5956830978393555, "learning_rate": 6.459369817578773e-05, "loss": 1.2532, "mean_token_accuracy": 0.6797776117920875, "num_tokens": 25551511.0, "step": 780 }, { "epoch": 0.06509660834231694, "grad_norm": 1.5926156044006348, "learning_rate": 6.500829187396353e-05, "loss": 1.3293, "mean_token_accuracy": 0.6667155444622039, "num_tokens": 25715351.0, "step": 785 }, { "epoch": 0.06551123642093043, "grad_norm": 1.5243552923202515, "learning_rate": 6.54228855721393e-05, "loss": 1.3105, "mean_token_accuracy": 0.6727578222751618, "num_tokens": 25879191.0, "step": 790 }, { "epoch": 0.0659258644995439, "grad_norm": 1.619012475013733, "learning_rate": 6.58374792703151e-05, "loss": 1.3561, "mean_token_accuracy": 0.658205033838749, "num_tokens": 26043031.0, "step": 795 }, { "epoch": 0.0663404925781574, "grad_norm": 1.593612790107727, "learning_rate": 6.625207296849088e-05, "loss": 1.2844, "mean_token_accuracy": 0.673313781619072, "num_tokens": 26206871.0, "step": 800 }, { "epoch": 0.06675512065677087, "grad_norm": 1.5031630992889404, "learning_rate": 6.666666666666667e-05, "loss": 1.2926, "mean_token_accuracy": 0.6722629576921463, "num_tokens": 26370711.0, "step": 805 }, { "epoch": 0.06716974873538437, "grad_norm": 1.5605660676956177, "learning_rate": 6.708126036484246e-05, "loss": 1.2939, "mean_token_accuracy": 0.6696425586938858, "num_tokens": 26533925.0, "step": 810 }, { "epoch": 0.06758437681399784, "grad_norm": 1.4447792768478394, "learning_rate": 6.749585406301825e-05, "loss": 1.2417, "mean_token_accuracy": 0.6836693555116653, "num_tokens": 26697765.0, "step": 815 }, { "epoch": 0.06799900489261133, "grad_norm": 1.5722301006317139, "learning_rate": 6.791044776119403e-05, "loss": 1.2685, "mean_token_accuracy": 0.6784824058413506, "num_tokens": 26861605.0, "step": 820 }, { "epoch": 0.06841363297122481, "grad_norm": 1.5380804538726807, "learning_rate": 6.832504145936983e-05, "loss": 1.3755, "mean_token_accuracy": 0.6574107989668846, "num_tokens": 27025445.0, "step": 825 }, { "epoch": 0.0688282610498383, "grad_norm": 1.5342949628829956, "learning_rate": 6.873963515754561e-05, "loss": 1.2748, "mean_token_accuracy": 0.6746456444263458, "num_tokens": 27189285.0, "step": 830 }, { "epoch": 0.06924288912845178, "grad_norm": 1.448546290397644, "learning_rate": 6.91542288557214e-05, "loss": 1.3013, "mean_token_accuracy": 0.6716947659850121, "num_tokens": 27353125.0, "step": 835 }, { "epoch": 0.06965751720706526, "grad_norm": 1.6198995113372803, "learning_rate": 6.956882255389718e-05, "loss": 1.2998, "mean_token_accuracy": 0.6699169114232063, "num_tokens": 27516965.0, "step": 840 }, { "epoch": 0.07007214528567875, "grad_norm": 1.6602299213409424, "learning_rate": 6.998341625207298e-05, "loss": 1.2599, "mean_token_accuracy": 0.6774743407964706, "num_tokens": 27680805.0, "step": 845 }, { "epoch": 0.07048677336429222, "grad_norm": 1.5138729810714722, "learning_rate": 7.039800995024875e-05, "loss": 1.3361, "mean_token_accuracy": 0.6645833343267441, "num_tokens": 27844645.0, "step": 850 }, { "epoch": 0.07090140144290571, "grad_norm": 1.5389212369918823, "learning_rate": 7.081260364842455e-05, "loss": 1.2749, "mean_token_accuracy": 0.6735459432005882, "num_tokens": 28008485.0, "step": 855 }, { "epoch": 0.07131602952151919, "grad_norm": 1.3996397256851196, "learning_rate": 7.122719734660034e-05, "loss": 1.2249, "mean_token_accuracy": 0.682807919383049, "num_tokens": 28172325.0, "step": 860 }, { "epoch": 0.07173065760013268, "grad_norm": 1.5532928705215454, "learning_rate": 7.164179104477612e-05, "loss": 1.3725, "mean_token_accuracy": 0.6573436006903648, "num_tokens": 28336165.0, "step": 865 }, { "epoch": 0.07214528567874616, "grad_norm": 1.6004377603530884, "learning_rate": 7.205638474295191e-05, "loss": 1.4084, "mean_token_accuracy": 0.6534824058413505, "num_tokens": 28500005.0, "step": 870 }, { "epoch": 0.07255991375735965, "grad_norm": 1.4392030239105225, "learning_rate": 7.24709784411277e-05, "loss": 1.3117, "mean_token_accuracy": 0.6682306960225105, "num_tokens": 28663845.0, "step": 875 }, { "epoch": 0.07297454183597313, "grad_norm": 1.471176266670227, "learning_rate": 7.288557213930348e-05, "loss": 1.3104, "mean_token_accuracy": 0.6698924705386162, "num_tokens": 28827685.0, "step": 880 }, { "epoch": 0.07338916991458662, "grad_norm": 1.5112273693084717, "learning_rate": 7.330016583747927e-05, "loss": 1.3454, "mean_token_accuracy": 0.668340665102005, "num_tokens": 28991525.0, "step": 885 }, { "epoch": 0.0738037979932001, "grad_norm": 1.4726061820983887, "learning_rate": 7.371475953565507e-05, "loss": 1.29, "mean_token_accuracy": 0.6737292274832726, "num_tokens": 29155365.0, "step": 890 }, { "epoch": 0.07421842607181359, "grad_norm": 1.5442107915878296, "learning_rate": 7.412935323383084e-05, "loss": 1.2527, "mean_token_accuracy": 0.6790017127990723, "num_tokens": 29319205.0, "step": 895 }, { "epoch": 0.07463305415042706, "grad_norm": 1.7195465564727783, "learning_rate": 7.454394693200664e-05, "loss": 1.3696, "mean_token_accuracy": 0.6660129517316818, "num_tokens": 29483045.0, "step": 900 }, { "epoch": 0.07504768222904055, "grad_norm": 1.5677791833877563, "learning_rate": 7.495854063018242e-05, "loss": 1.3951, "mean_token_accuracy": 0.6553331576287746, "num_tokens": 29645952.0, "step": 905 }, { "epoch": 0.07546231030765403, "grad_norm": 1.5033553838729858, "learning_rate": 7.537313432835821e-05, "loss": 1.3318, "mean_token_accuracy": 0.666281770169735, "num_tokens": 29809792.0, "step": 910 }, { "epoch": 0.07587693838626752, "grad_norm": 1.4631794691085815, "learning_rate": 7.5787728026534e-05, "loss": 1.3955, "mean_token_accuracy": 0.6548264935612679, "num_tokens": 29973632.0, "step": 915 }, { "epoch": 0.076291566464881, "grad_norm": 1.6084941625595093, "learning_rate": 7.62023217247098e-05, "loss": 1.361, "mean_token_accuracy": 0.6567937433719635, "num_tokens": 30137472.0, "step": 920 }, { "epoch": 0.07670619454349449, "grad_norm": 1.395077109336853, "learning_rate": 7.661691542288557e-05, "loss": 1.3181, "mean_token_accuracy": 0.668242909014225, "num_tokens": 30301312.0, "step": 925 }, { "epoch": 0.07712082262210797, "grad_norm": 1.754188895225525, "learning_rate": 7.703150912106136e-05, "loss": 1.3594, "mean_token_accuracy": 0.6625899627804757, "num_tokens": 30465139.0, "step": 930 }, { "epoch": 0.07753545070072146, "grad_norm": 1.4886492490768433, "learning_rate": 7.744610281923715e-05, "loss": 1.3002, "mean_token_accuracy": 0.6706049293279648, "num_tokens": 30628200.0, "step": 935 }, { "epoch": 0.07795007877933494, "grad_norm": 1.5473731756210327, "learning_rate": 7.786069651741294e-05, "loss": 1.33, "mean_token_accuracy": 0.6673020482063293, "num_tokens": 30792040.0, "step": 940 }, { "epoch": 0.07836470685794843, "grad_norm": 1.4740445613861084, "learning_rate": 7.827529021558872e-05, "loss": 1.2766, "mean_token_accuracy": 0.6745784506201744, "num_tokens": 30955880.0, "step": 945 }, { "epoch": 0.0787793349365619, "grad_norm": 1.6856474876403809, "learning_rate": 7.868988391376452e-05, "loss": 1.3747, "mean_token_accuracy": 0.6588622033596039, "num_tokens": 31119579.0, "step": 950 }, { "epoch": 0.07919396301517538, "grad_norm": 2.418442964553833, "learning_rate": 7.910447761194029e-05, "loss": 1.3778, "mean_token_accuracy": 0.6559934005141258, "num_tokens": 31283419.0, "step": 955 }, { "epoch": 0.07960859109378887, "grad_norm": 1.7516529560089111, "learning_rate": 7.951907131011609e-05, "loss": 1.4087, "mean_token_accuracy": 0.6597629532217979, "num_tokens": 31447259.0, "step": 960 }, { "epoch": 0.08002321917240235, "grad_norm": 1.4395791292190552, "learning_rate": 7.993366500829188e-05, "loss": 1.2647, "mean_token_accuracy": 0.6777813985943795, "num_tokens": 31610323.0, "step": 965 }, { "epoch": 0.08043784725101584, "grad_norm": 1.554725170135498, "learning_rate": 8.034825870646766e-05, "loss": 1.3945, "mean_token_accuracy": 0.6536779120564461, "num_tokens": 31774163.0, "step": 970 }, { "epoch": 0.08085247532962932, "grad_norm": 1.4651646614074707, "learning_rate": 8.076285240464345e-05, "loss": 1.3478, "mean_token_accuracy": 0.666422289609909, "num_tokens": 31938003.0, "step": 975 }, { "epoch": 0.08126710340824281, "grad_norm": 1.5143450498580933, "learning_rate": 8.117744610281925e-05, "loss": 1.3087, "mean_token_accuracy": 0.6691593304276466, "num_tokens": 32101843.0, "step": 980 }, { "epoch": 0.08168173148685628, "grad_norm": 1.4389920234680176, "learning_rate": 8.159203980099502e-05, "loss": 1.3589, "mean_token_accuracy": 0.662383921444416, "num_tokens": 32265683.0, "step": 985 }, { "epoch": 0.08209635956546978, "grad_norm": 1.4907101392745972, "learning_rate": 8.200663349917082e-05, "loss": 1.3373, "mean_token_accuracy": 0.6661595821380615, "num_tokens": 32429523.0, "step": 990 }, { "epoch": 0.08251098764408325, "grad_norm": 1.498844861984253, "learning_rate": 8.24212271973466e-05, "loss": 1.4749, "mean_token_accuracy": 0.6424853324890136, "num_tokens": 32593363.0, "step": 995 }, { "epoch": 0.08292561572269674, "grad_norm": 1.4503613710403442, "learning_rate": 8.283582089552239e-05, "loss": 1.3067, "mean_token_accuracy": 0.6691892817616463, "num_tokens": 32756973.0, "step": 1000 }, { "epoch": 0.08334024380131022, "grad_norm": 1.4826053380966187, "learning_rate": 8.325041459369818e-05, "loss": 1.3312, "mean_token_accuracy": 0.6646444290876389, "num_tokens": 32920813.0, "step": 1005 }, { "epoch": 0.08375487187992371, "grad_norm": 1.4513267278671265, "learning_rate": 8.366500829187398e-05, "loss": 1.2827, "mean_token_accuracy": 0.6734543010592461, "num_tokens": 33084653.0, "step": 1010 }, { "epoch": 0.08416949995853719, "grad_norm": 1.3998719453811646, "learning_rate": 8.407960199004975e-05, "loss": 1.3475, "mean_token_accuracy": 0.6636377297341823, "num_tokens": 33247540.0, "step": 1015 }, { "epoch": 0.08458412803715068, "grad_norm": 1.4850621223449707, "learning_rate": 8.449419568822555e-05, "loss": 1.3413, "mean_token_accuracy": 0.6632209196686745, "num_tokens": 33411380.0, "step": 1020 }, { "epoch": 0.08499875611576416, "grad_norm": 1.4333192110061646, "learning_rate": 8.490878938640133e-05, "loss": 1.3847, "mean_token_accuracy": 0.6557184763252735, "num_tokens": 33575220.0, "step": 1025 }, { "epoch": 0.08541338419437765, "grad_norm": 1.4944761991500854, "learning_rate": 8.532338308457712e-05, "loss": 1.3523, "mean_token_accuracy": 0.6646868199110031, "num_tokens": 33738658.0, "step": 1030 }, { "epoch": 0.08582801227299112, "grad_norm": 1.388965129852295, "learning_rate": 8.57379767827529e-05, "loss": 1.3129, "mean_token_accuracy": 0.6730571836233139, "num_tokens": 33902498.0, "step": 1035 }, { "epoch": 0.08624264035160462, "grad_norm": 1.4851020574569702, "learning_rate": 8.61525704809287e-05, "loss": 1.2922, "mean_token_accuracy": 0.6700879752635955, "num_tokens": 34066338.0, "step": 1040 }, { "epoch": 0.08665726843021809, "grad_norm": 1.4334027767181396, "learning_rate": 8.656716417910447e-05, "loss": 1.307, "mean_token_accuracy": 0.673802538216114, "num_tokens": 34230178.0, "step": 1045 }, { "epoch": 0.08707189650883158, "grad_norm": 1.4220467805862427, "learning_rate": 8.698175787728027e-05, "loss": 1.4381, "mean_token_accuracy": 0.6489491671323776, "num_tokens": 34394018.0, "step": 1050 }, { "epoch": 0.08748652458744506, "grad_norm": 1.341352105140686, "learning_rate": 8.739635157545606e-05, "loss": 1.215, "mean_token_accuracy": 0.6873839199542999, "num_tokens": 34557858.0, "step": 1055 }, { "epoch": 0.08790115266605855, "grad_norm": 1.4932255744934082, "learning_rate": 8.781094527363185e-05, "loss": 1.299, "mean_token_accuracy": 0.6708152651786804, "num_tokens": 34720730.0, "step": 1060 }, { "epoch": 0.08831578074467203, "grad_norm": 1.3251454830169678, "learning_rate": 8.822553897180763e-05, "loss": 1.3161, "mean_token_accuracy": 0.6739186197519302, "num_tokens": 34884570.0, "step": 1065 }, { "epoch": 0.08873040882328552, "grad_norm": 1.4369324445724487, "learning_rate": 8.864013266998342e-05, "loss": 1.3105, "mean_token_accuracy": 0.6701320111751556, "num_tokens": 35047445.0, "step": 1070 }, { "epoch": 0.089145036901899, "grad_norm": 1.4724633693695068, "learning_rate": 8.905472636815922e-05, "loss": 1.3482, "mean_token_accuracy": 0.6623900294303894, "num_tokens": 35211285.0, "step": 1075 }, { "epoch": 0.08955966498051247, "grad_norm": 1.3175029754638672, "learning_rate": 8.946932006633499e-05, "loss": 1.3482, "mean_token_accuracy": 0.6598729252815246, "num_tokens": 35375125.0, "step": 1080 }, { "epoch": 0.08997429305912596, "grad_norm": 1.4000898599624634, "learning_rate": 8.988391376451079e-05, "loss": 1.3913, "mean_token_accuracy": 0.6548206344246864, "num_tokens": 35538041.0, "step": 1085 }, { "epoch": 0.09038892113773944, "grad_norm": 1.4432710409164429, "learning_rate": 9.029850746268657e-05, "loss": 1.3746, "mean_token_accuracy": 0.6613330885767936, "num_tokens": 35701881.0, "step": 1090 }, { "epoch": 0.09080354921635293, "grad_norm": 1.5282469987869263, "learning_rate": 9.071310116086236e-05, "loss": 1.4372, "mean_token_accuracy": 0.6472564682364463, "num_tokens": 35864728.0, "step": 1095 }, { "epoch": 0.09121817729496641, "grad_norm": 1.3563272953033447, "learning_rate": 9.112769485903814e-05, "loss": 1.3101, "mean_token_accuracy": 0.6677113883197308, "num_tokens": 36028568.0, "step": 1100 }, { "epoch": 0.0916328053735799, "grad_norm": 1.4359475374221802, "learning_rate": 9.154228855721394e-05, "loss": 1.2544, "mean_token_accuracy": 0.6783113405108452, "num_tokens": 36192408.0, "step": 1105 }, { "epoch": 0.09204743345219338, "grad_norm": 1.4146751165390015, "learning_rate": 9.195688225538971e-05, "loss": 1.4059, "mean_token_accuracy": 0.6540689110755921, "num_tokens": 36356248.0, "step": 1110 }, { "epoch": 0.09246206153080687, "grad_norm": 1.501935362815857, "learning_rate": 9.237147595356551e-05, "loss": 1.3623, "mean_token_accuracy": 0.6604533240199089, "num_tokens": 36520088.0, "step": 1115 }, { "epoch": 0.09287668960942035, "grad_norm": 1.3974504470825195, "learning_rate": 9.27860696517413e-05, "loss": 1.3582, "mean_token_accuracy": 0.6621945217251778, "num_tokens": 36683928.0, "step": 1120 }, { "epoch": 0.09329131768803384, "grad_norm": 1.366337537765503, "learning_rate": 9.320066334991709e-05, "loss": 1.2823, "mean_token_accuracy": 0.6725623145699501, "num_tokens": 36847768.0, "step": 1125 }, { "epoch": 0.09370594576664731, "grad_norm": 1.3947283029556274, "learning_rate": 9.361525704809287e-05, "loss": 1.4181, "mean_token_accuracy": 0.6555474132299424, "num_tokens": 37011608.0, "step": 1130 }, { "epoch": 0.0941205738452608, "grad_norm": 1.4116472005844116, "learning_rate": 9.402985074626867e-05, "loss": 1.465, "mean_token_accuracy": 0.6451433047652244, "num_tokens": 37175231.0, "step": 1135 }, { "epoch": 0.09453520192387428, "grad_norm": 1.4221644401550293, "learning_rate": 9.444444444444444e-05, "loss": 1.3925, "mean_token_accuracy": 0.6539161786437034, "num_tokens": 37339071.0, "step": 1140 }, { "epoch": 0.09494983000248777, "grad_norm": 1.4130382537841797, "learning_rate": 9.485903814262024e-05, "loss": 1.3436, "mean_token_accuracy": 0.6620478987693786, "num_tokens": 37502911.0, "step": 1145 }, { "epoch": 0.09536445808110125, "grad_norm": 1.3947120904922485, "learning_rate": 9.527363184079603e-05, "loss": 1.4183, "mean_token_accuracy": 0.6529630959033966, "num_tokens": 37666751.0, "step": 1150 }, { "epoch": 0.09577908615971474, "grad_norm": 1.4517793655395508, "learning_rate": 9.568822553897181e-05, "loss": 1.4008, "mean_token_accuracy": 0.6511601343750953, "num_tokens": 37830361.0, "step": 1155 }, { "epoch": 0.09619371423832822, "grad_norm": 1.4411286115646362, "learning_rate": 9.61028192371476e-05, "loss": 1.4914, "mean_token_accuracy": 0.6399132460355759, "num_tokens": 37994201.0, "step": 1160 }, { "epoch": 0.09660834231694171, "grad_norm": 1.7347967624664307, "learning_rate": 9.65174129353234e-05, "loss": 1.2925, "mean_token_accuracy": 0.6705400794744492, "num_tokens": 38158041.0, "step": 1165 }, { "epoch": 0.09702297039555519, "grad_norm": 1.3912692070007324, "learning_rate": 9.693200663349917e-05, "loss": 1.3408, "mean_token_accuracy": 0.6626405164599418, "num_tokens": 38321881.0, "step": 1170 }, { "epoch": 0.09743759847416868, "grad_norm": 1.3747230768203735, "learning_rate": 9.734660033167497e-05, "loss": 1.3228, "mean_token_accuracy": 0.6728983402252198, "num_tokens": 38485721.0, "step": 1175 }, { "epoch": 0.09785222655278215, "grad_norm": 1.4716606140136719, "learning_rate": 9.776119402985075e-05, "loss": 1.3058, "mean_token_accuracy": 0.6680290788412094, "num_tokens": 38649561.0, "step": 1180 }, { "epoch": 0.09826685463139564, "grad_norm": 1.3675380945205688, "learning_rate": 9.817578772802654e-05, "loss": 1.2764, "mean_token_accuracy": 0.672256837785244, "num_tokens": 38813401.0, "step": 1185 }, { "epoch": 0.09868148271000912, "grad_norm": 1.4142637252807617, "learning_rate": 9.859038142620233e-05, "loss": 1.401, "mean_token_accuracy": 0.6586057350039483, "num_tokens": 38977105.0, "step": 1190 }, { "epoch": 0.0990961107886226, "grad_norm": 1.3965309858322144, "learning_rate": 9.900497512437812e-05, "loss": 1.358, "mean_token_accuracy": 0.6610459417104722, "num_tokens": 39140945.0, "step": 1195 }, { "epoch": 0.09951073886723609, "grad_norm": 1.3496406078338623, "learning_rate": 9.94195688225539e-05, "loss": 1.3519, "mean_token_accuracy": 0.6601967245340348, "num_tokens": 39304785.0, "step": 1200 }, { "epoch": 0.09992536694584957, "grad_norm": 1.3891043663024902, "learning_rate": 9.98341625207297e-05, "loss": 1.3577, "mean_token_accuracy": 0.6609848454594612, "num_tokens": 39468625.0, "step": 1205 }, { "epoch": 0.10033999502446306, "grad_norm": 1.4117522239685059, "learning_rate": 9.999998114690611e-05, "loss": 1.376, "mean_token_accuracy": 0.659420820325613, "num_tokens": 39632465.0, "step": 1210 }, { "epoch": 0.10075462310307653, "grad_norm": 1.3661329746246338, "learning_rate": 9.999986593360611e-05, "loss": 1.3479, "mean_token_accuracy": 0.6634530752897263, "num_tokens": 39796305.0, "step": 1215 }, { "epoch": 0.10116925118169003, "grad_norm": 1.3807592391967773, "learning_rate": 9.999964598118817e-05, "loss": 1.3104, "mean_token_accuracy": 0.6695197895169258, "num_tokens": 39960145.0, "step": 1220 }, { "epoch": 0.1015838792603035, "grad_norm": 1.3176120519638062, "learning_rate": 9.999932129011307e-05, "loss": 1.3002, "mean_token_accuracy": 0.6680840656161309, "num_tokens": 40123985.0, "step": 1225 }, { "epoch": 0.101998507338917, "grad_norm": 2.028810501098633, "learning_rate": 9.999889186106097e-05, "loss": 1.3252, "mean_token_accuracy": 0.6665505826473236, "num_tokens": 40287825.0, "step": 1230 }, { "epoch": 0.10241313541753047, "grad_norm": 1.3492093086242676, "learning_rate": 9.999835769493143e-05, "loss": 1.4348, "mean_token_accuracy": 0.6501893937587738, "num_tokens": 40451665.0, "step": 1235 }, { "epoch": 0.10282776349614396, "grad_norm": 1.3383077383041382, "learning_rate": 9.999771879284341e-05, "loss": 1.3338, "mean_token_accuracy": 0.6664115741848946, "num_tokens": 40614717.0, "step": 1240 }, { "epoch": 0.10324239157475744, "grad_norm": 1.4968199729919434, "learning_rate": 9.999697515613528e-05, "loss": 1.3937, "mean_token_accuracy": 0.6575207717716693, "num_tokens": 40778557.0, "step": 1245 }, { "epoch": 0.10365701965337093, "grad_norm": 1.3148664236068726, "learning_rate": 9.999612678636478e-05, "loss": 1.27, "mean_token_accuracy": 0.6778225794434547, "num_tokens": 40942397.0, "step": 1250 }, { "epoch": 0.1040716477319844, "grad_norm": 1.4289436340332031, "learning_rate": 9.99951736853091e-05, "loss": 1.3333, "mean_token_accuracy": 0.6689454987645149, "num_tokens": 41106237.0, "step": 1255 }, { "epoch": 0.1044862758105979, "grad_norm": 1.3502683639526367, "learning_rate": 9.999411585496479e-05, "loss": 1.3966, "mean_token_accuracy": 0.6577101618051528, "num_tokens": 41270077.0, "step": 1260 }, { "epoch": 0.10490090388921137, "grad_norm": 1.4543397426605225, "learning_rate": 9.999295329754773e-05, "loss": 1.3918, "mean_token_accuracy": 0.6539956003427505, "num_tokens": 41433917.0, "step": 1265 }, { "epoch": 0.10531553196782487, "grad_norm": 1.4151115417480469, "learning_rate": 9.999168601549327e-05, "loss": 1.3828, "mean_token_accuracy": 0.6566471174359322, "num_tokens": 41597757.0, "step": 1270 }, { "epoch": 0.10573016004643834, "grad_norm": 1.315963864326477, "learning_rate": 9.999031401145609e-05, "loss": 1.3439, "mean_token_accuracy": 0.6630020245909691, "num_tokens": 41760926.0, "step": 1275 }, { "epoch": 0.10614478812505183, "grad_norm": 1.3273677825927734, "learning_rate": 9.998883728831024e-05, "loss": 1.3293, "mean_token_accuracy": 0.6641373381018638, "num_tokens": 41924766.0, "step": 1280 }, { "epoch": 0.10655941620366531, "grad_norm": 1.3122780323028564, "learning_rate": 9.998725584914915e-05, "loss": 1.3364, "mean_token_accuracy": 0.6650659814476967, "num_tokens": 42088606.0, "step": 1285 }, { "epoch": 0.1069740442822788, "grad_norm": 1.3338128328323364, "learning_rate": 9.998556969728559e-05, "loss": 1.3336, "mean_token_accuracy": 0.6651763558387757, "num_tokens": 42251613.0, "step": 1290 }, { "epoch": 0.10738867236089228, "grad_norm": 1.5022046566009521, "learning_rate": 9.99837788362517e-05, "loss": 1.38, "mean_token_accuracy": 0.6555290788412094, "num_tokens": 42415453.0, "step": 1295 }, { "epoch": 0.10780330043950577, "grad_norm": 1.3532664775848389, "learning_rate": 9.998188326979895e-05, "loss": 1.3507, "mean_token_accuracy": 0.6577895864844322, "num_tokens": 42579293.0, "step": 1300 }, { "epoch": 0.10821792851811925, "grad_norm": 1.3476784229278564, "learning_rate": 9.997988300189816e-05, "loss": 1.3537, "mean_token_accuracy": 0.6619501486420631, "num_tokens": 42743133.0, "step": 1305 }, { "epoch": 0.10863255659673272, "grad_norm": 1.2422707080841064, "learning_rate": 9.997777803673944e-05, "loss": 1.3581, "mean_token_accuracy": 0.6607506588101387, "num_tokens": 42905995.0, "step": 1310 }, { "epoch": 0.10904718467534621, "grad_norm": 1.3391062021255493, "learning_rate": 9.997556837873228e-05, "loss": 1.3127, "mean_token_accuracy": 0.668255127966404, "num_tokens": 43069835.0, "step": 1315 }, { "epoch": 0.10946181275395969, "grad_norm": 1.22450852394104, "learning_rate": 9.997325403250541e-05, "loss": 1.3347, "mean_token_accuracy": 0.6667644187808037, "num_tokens": 43233675.0, "step": 1320 }, { "epoch": 0.10987644083257318, "grad_norm": 1.3081496953964233, "learning_rate": 9.997083500290694e-05, "loss": 1.3801, "mean_token_accuracy": 0.6562072336673737, "num_tokens": 43397515.0, "step": 1325 }, { "epoch": 0.11029106891118666, "grad_norm": 1.230015516281128, "learning_rate": 9.99683112950042e-05, "loss": 1.3381, "mean_token_accuracy": 0.6677419349551201, "num_tokens": 43561355.0, "step": 1330 }, { "epoch": 0.11070569698980015, "grad_norm": 1.3189239501953125, "learning_rate": 9.996568291408379e-05, "loss": 1.3946, "mean_token_accuracy": 0.6497922793030739, "num_tokens": 43725195.0, "step": 1335 }, { "epoch": 0.11112032506841363, "grad_norm": 1.2687530517578125, "learning_rate": 9.996294986565166e-05, "loss": 1.3682, "mean_token_accuracy": 0.6570442348718644, "num_tokens": 43889035.0, "step": 1340 }, { "epoch": 0.11153495314702712, "grad_norm": 1.2718192338943481, "learning_rate": 9.996011215543296e-05, "loss": 1.426, "mean_token_accuracy": 0.6486681327223778, "num_tokens": 44052875.0, "step": 1345 }, { "epoch": 0.1119495812256406, "grad_norm": 1.4578088521957397, "learning_rate": 9.995716978937203e-05, "loss": 1.3986, "mean_token_accuracy": 0.6559811875224113, "num_tokens": 44216715.0, "step": 1350 }, { "epoch": 0.11236420930425409, "grad_norm": 1.1890809535980225, "learning_rate": 9.995412277363261e-05, "loss": 1.336, "mean_token_accuracy": 0.6617302060127258, "num_tokens": 44380555.0, "step": 1355 }, { "epoch": 0.11277883738286756, "grad_norm": 1.243876338005066, "learning_rate": 9.995097111459747e-05, "loss": 1.3838, "mean_token_accuracy": 0.6524254620075226, "num_tokens": 44544395.0, "step": 1360 }, { "epoch": 0.11319346546148105, "grad_norm": 1.3206053972244263, "learning_rate": 9.994771481886869e-05, "loss": 1.3321, "mean_token_accuracy": 0.6657380282878875, "num_tokens": 44708235.0, "step": 1365 }, { "epoch": 0.11360809354009453, "grad_norm": 1.2924857139587402, "learning_rate": 9.994435389326753e-05, "loss": 1.3351, "mean_token_accuracy": 0.6672470673918725, "num_tokens": 44872075.0, "step": 1370 }, { "epoch": 0.11402272161870802, "grad_norm": 1.2311909198760986, "learning_rate": 9.99408883448344e-05, "loss": 1.3773, "mean_token_accuracy": 0.6586265876889229, "num_tokens": 45035915.0, "step": 1375 }, { "epoch": 0.1144373496973215, "grad_norm": 1.2170907258987427, "learning_rate": 9.99373181808289e-05, "loss": 1.3426, "mean_token_accuracy": 0.659329180419445, "num_tokens": 45199755.0, "step": 1380 }, { "epoch": 0.11485197777593499, "grad_norm": 1.245579481124878, "learning_rate": 9.993364340872977e-05, "loss": 1.3514, "mean_token_accuracy": 0.6624366760253906, "num_tokens": 45362745.0, "step": 1385 }, { "epoch": 0.11526660585454847, "grad_norm": 1.2931832075119019, "learning_rate": 9.992986403623487e-05, "loss": 1.3698, "mean_token_accuracy": 0.6562740832567215, "num_tokens": 45525978.0, "step": 1390 }, { "epoch": 0.11568123393316196, "grad_norm": 1.2172212600708008, "learning_rate": 9.992598007126117e-05, "loss": 1.3982, "mean_token_accuracy": 0.6571908593177795, "num_tokens": 45689818.0, "step": 1395 }, { "epoch": 0.11609586201177544, "grad_norm": 1.2430282831192017, "learning_rate": 9.99219915219448e-05, "loss": 1.3635, "mean_token_accuracy": 0.6602211631834507, "num_tokens": 45853658.0, "step": 1400 }, { "epoch": 0.11651049009038893, "grad_norm": 1.2654681205749512, "learning_rate": 9.991789839664087e-05, "loss": 1.3851, "mean_token_accuracy": 0.655498529970646, "num_tokens": 46017498.0, "step": 1405 }, { "epoch": 0.1169251181690024, "grad_norm": 1.4644423723220825, "learning_rate": 9.991370070392363e-05, "loss": 1.3919, "mean_token_accuracy": 0.6564149558544159, "num_tokens": 46181338.0, "step": 1410 }, { "epoch": 0.1173397462476159, "grad_norm": 1.237156629562378, "learning_rate": 9.990939845258638e-05, "loss": 1.3377, "mean_token_accuracy": 0.6658663243055344, "num_tokens": 46345178.0, "step": 1415 }, { "epoch": 0.11775437432622937, "grad_norm": 1.3756873607635498, "learning_rate": 9.990499165164139e-05, "loss": 1.3564, "mean_token_accuracy": 0.6600928664207458, "num_tokens": 46509018.0, "step": 1420 }, { "epoch": 0.11816900240484286, "grad_norm": 1.257973551750183, "learning_rate": 9.990048031031999e-05, "loss": 1.3685, "mean_token_accuracy": 0.6592802986502647, "num_tokens": 46672858.0, "step": 1425 }, { "epoch": 0.11858363048345634, "grad_norm": 1.257059097290039, "learning_rate": 9.989586443807248e-05, "loss": 1.307, "mean_token_accuracy": 0.6718108534812928, "num_tokens": 46836698.0, "step": 1430 }, { "epoch": 0.11899825856206982, "grad_norm": 1.324131965637207, "learning_rate": 9.989114404456814e-05, "loss": 1.4071, "mean_token_accuracy": 0.6508858755230904, "num_tokens": 47000538.0, "step": 1435 }, { "epoch": 0.11941288664068331, "grad_norm": 1.2344211339950562, "learning_rate": 9.988631913969519e-05, "loss": 1.334, "mean_token_accuracy": 0.6659274190664292, "num_tokens": 47164378.0, "step": 1440 }, { "epoch": 0.11982751471929678, "grad_norm": 1.2423447370529175, "learning_rate": 9.988138973356079e-05, "loss": 1.3058, "mean_token_accuracy": 0.671522231400013, "num_tokens": 47327560.0, "step": 1445 }, { "epoch": 0.12024214279791028, "grad_norm": 1.4536687135696411, "learning_rate": 9.987635583649097e-05, "loss": 1.4061, "mean_token_accuracy": 0.6506109446287155, "num_tokens": 47491400.0, "step": 1450 }, { "epoch": 0.12065677087652375, "grad_norm": 1.1907180547714233, "learning_rate": 9.987121745903072e-05, "loss": 1.378, "mean_token_accuracy": 0.657317741215229, "num_tokens": 47654463.0, "step": 1455 }, { "epoch": 0.12107139895513724, "grad_norm": 1.2786400318145752, "learning_rate": 9.986597461194382e-05, "loss": 1.3535, "mean_token_accuracy": 0.660117307305336, "num_tokens": 47818303.0, "step": 1460 }, { "epoch": 0.12148602703375072, "grad_norm": 1.2278717756271362, "learning_rate": 9.986062730621294e-05, "loss": 1.3666, "mean_token_accuracy": 0.6596590921282768, "num_tokens": 47982143.0, "step": 1465 }, { "epoch": 0.12190065511236421, "grad_norm": 1.2639421224594116, "learning_rate": 9.985517555303954e-05, "loss": 1.366, "mean_token_accuracy": 0.6585043981671334, "num_tokens": 48145983.0, "step": 1470 }, { "epoch": 0.12231528319097769, "grad_norm": 1.2990895509719849, "learning_rate": 9.984961936384389e-05, "loss": 1.2915, "mean_token_accuracy": 0.6701307401061058, "num_tokens": 48309823.0, "step": 1475 }, { "epoch": 0.12272991126959118, "grad_norm": 1.217031717300415, "learning_rate": 9.984395875026504e-05, "loss": 1.4054, "mean_token_accuracy": 0.6530821815133094, "num_tokens": 48473287.0, "step": 1480 }, { "epoch": 0.12314453934820466, "grad_norm": 1.3979650735855103, "learning_rate": 9.983819372416077e-05, "loss": 1.3625, "mean_token_accuracy": 0.661717988550663, "num_tokens": 48637127.0, "step": 1485 }, { "epoch": 0.12355916742681815, "grad_norm": 1.1983777284622192, "learning_rate": 9.983232429760756e-05, "loss": 1.3352, "mean_token_accuracy": 0.6667155444622039, "num_tokens": 48800967.0, "step": 1490 }, { "epoch": 0.12397379550543162, "grad_norm": 1.2083327770233154, "learning_rate": 9.982635048290065e-05, "loss": 1.3212, "mean_token_accuracy": 0.6723729208111763, "num_tokens": 48964807.0, "step": 1495 }, { "epoch": 0.12438842358404512, "grad_norm": 1.2176861763000488, "learning_rate": 9.98202722925539e-05, "loss": 1.3341, "mean_token_accuracy": 0.6651077300310135, "num_tokens": 49128200.0, "step": 1500 }, { "epoch": 0.12480305166265859, "grad_norm": 1.248131513595581, "learning_rate": 9.981408973929984e-05, "loss": 1.3594, "mean_token_accuracy": 0.6569226816296577, "num_tokens": 49291711.0, "step": 1505 }, { "epoch": 0.12521767974127207, "grad_norm": 1.2244327068328857, "learning_rate": 9.980780283608962e-05, "loss": 1.3562, "mean_token_accuracy": 0.6625122189521789, "num_tokens": 49455551.0, "step": 1510 }, { "epoch": 0.12563230781988557, "grad_norm": 1.1873289346694946, "learning_rate": 9.980141159609292e-05, "loss": 1.318, "mean_token_accuracy": 0.6686216980218888, "num_tokens": 49619391.0, "step": 1515 }, { "epoch": 0.12604693589849905, "grad_norm": 1.2468986511230469, "learning_rate": 9.979491603269807e-05, "loss": 1.3678, "mean_token_accuracy": 0.6572947204113007, "num_tokens": 49783231.0, "step": 1520 }, { "epoch": 0.12646156397711253, "grad_norm": 1.2199991941452026, "learning_rate": 9.97883161595119e-05, "loss": 1.3319, "mean_token_accuracy": 0.661113141477108, "num_tokens": 49947071.0, "step": 1525 }, { "epoch": 0.126876192055726, "grad_norm": 1.2399156093597412, "learning_rate": 9.978161199035973e-05, "loss": 1.3715, "mean_token_accuracy": 0.6594170108437538, "num_tokens": 50110092.0, "step": 1530 }, { "epoch": 0.1272908201343395, "grad_norm": 1.2146192789077759, "learning_rate": 9.977480353928537e-05, "loss": 1.3009, "mean_token_accuracy": 0.6711876869201661, "num_tokens": 50273932.0, "step": 1535 }, { "epoch": 0.127705448212953, "grad_norm": 1.263193964958191, "learning_rate": 9.97678908205511e-05, "loss": 1.4342, "mean_token_accuracy": 0.6495478972792625, "num_tokens": 50437772.0, "step": 1540 }, { "epoch": 0.12812007629156646, "grad_norm": 1.2597016096115112, "learning_rate": 9.97608738486376e-05, "loss": 1.37, "mean_token_accuracy": 0.6623414978384972, "num_tokens": 50601325.0, "step": 1545 }, { "epoch": 0.12853470437017994, "grad_norm": 1.2344940900802612, "learning_rate": 9.975375263824392e-05, "loss": 1.3619, "mean_token_accuracy": 0.6563782960176467, "num_tokens": 50765165.0, "step": 1550 }, { "epoch": 0.12894933244879342, "grad_norm": 1.2537332773208618, "learning_rate": 9.974652720428747e-05, "loss": 1.3234, "mean_token_accuracy": 0.6657013684511185, "num_tokens": 50929005.0, "step": 1555 }, { "epoch": 0.12936396052740692, "grad_norm": 1.1453742980957031, "learning_rate": 9.973919756190407e-05, "loss": 1.3031, "mean_token_accuracy": 0.6742241024971009, "num_tokens": 51092845.0, "step": 1560 }, { "epoch": 0.1297785886060204, "grad_norm": 1.1941858530044556, "learning_rate": 9.973176372644771e-05, "loss": 1.3448, "mean_token_accuracy": 0.6630070850253105, "num_tokens": 51256685.0, "step": 1565 }, { "epoch": 0.13019321668463388, "grad_norm": 1.1861906051635742, "learning_rate": 9.97242257134907e-05, "loss": 1.3187, "mean_token_accuracy": 0.6665872409939766, "num_tokens": 51420525.0, "step": 1570 }, { "epoch": 0.13060784476324735, "grad_norm": 1.1945332288742065, "learning_rate": 9.971658353882359e-05, "loss": 1.3328, "mean_token_accuracy": 0.6672470673918725, "num_tokens": 51584365.0, "step": 1575 }, { "epoch": 0.13102247284186086, "grad_norm": 1.3252277374267578, "learning_rate": 9.970883721845513e-05, "loss": 1.3837, "mean_token_accuracy": 0.6575146600604057, "num_tokens": 51748205.0, "step": 1580 }, { "epoch": 0.13143710092047434, "grad_norm": 1.281628966331482, "learning_rate": 9.97009867686122e-05, "loss": 1.3115, "mean_token_accuracy": 0.6694892480969429, "num_tokens": 51912045.0, "step": 1585 }, { "epoch": 0.1318517289990878, "grad_norm": 1.1837189197540283, "learning_rate": 9.969303220573985e-05, "loss": 1.2931, "mean_token_accuracy": 0.6696419835090637, "num_tokens": 52075885.0, "step": 1590 }, { "epoch": 0.1322663570777013, "grad_norm": 1.196635365486145, "learning_rate": 9.968497354650116e-05, "loss": 1.344, "mean_token_accuracy": 0.6630742907524109, "num_tokens": 52239725.0, "step": 1595 }, { "epoch": 0.1326809851563148, "grad_norm": 1.1934854984283447, "learning_rate": 9.967681080777735e-05, "loss": 1.4029, "mean_token_accuracy": 0.6529447734355927, "num_tokens": 52403565.0, "step": 1600 }, { "epoch": 0.13309561323492827, "grad_norm": 1.2019221782684326, "learning_rate": 9.966854400666762e-05, "loss": 1.4377, "mean_token_accuracy": 0.6488391980528831, "num_tokens": 52567405.0, "step": 1605 }, { "epoch": 0.13351024131354175, "grad_norm": 1.150558352470398, "learning_rate": 9.966017316048917e-05, "loss": 1.3579, "mean_token_accuracy": 0.663917401432991, "num_tokens": 52731245.0, "step": 1610 }, { "epoch": 0.13392486939215523, "grad_norm": 1.1604512929916382, "learning_rate": 9.965169828677711e-05, "loss": 1.316, "mean_token_accuracy": 0.6674120202660561, "num_tokens": 52895085.0, "step": 1615 }, { "epoch": 0.13433949747076873, "grad_norm": 1.2017782926559448, "learning_rate": 9.964311940328456e-05, "loss": 1.2988, "mean_token_accuracy": 0.6694525897502899, "num_tokens": 53058925.0, "step": 1620 }, { "epoch": 0.1347541255493822, "grad_norm": 1.1345432996749878, "learning_rate": 9.963443652798244e-05, "loss": 1.3817, "mean_token_accuracy": 0.6574510850012303, "num_tokens": 53222243.0, "step": 1625 }, { "epoch": 0.13516875362799569, "grad_norm": 1.1749430894851685, "learning_rate": 9.96256496790595e-05, "loss": 1.3316, "mean_token_accuracy": 0.6657746791839599, "num_tokens": 53386083.0, "step": 1630 }, { "epoch": 0.13558338170660916, "grad_norm": 1.196176528930664, "learning_rate": 9.961675887492236e-05, "loss": 1.3431, "mean_token_accuracy": 0.6665762454271317, "num_tokens": 53548930.0, "step": 1635 }, { "epoch": 0.13599800978522267, "grad_norm": 1.1467273235321045, "learning_rate": 9.96077641341954e-05, "loss": 1.3235, "mean_token_accuracy": 0.6667259722948075, "num_tokens": 53711382.0, "step": 1640 }, { "epoch": 0.13641263786383614, "grad_norm": 1.1617478132247925, "learning_rate": 9.959866547572061e-05, "loss": 1.3364, "mean_token_accuracy": 0.6599401280283927, "num_tokens": 53875222.0, "step": 1645 }, { "epoch": 0.13682726594244962, "grad_norm": 1.2925304174423218, "learning_rate": 9.958946291855781e-05, "loss": 1.4382, "mean_token_accuracy": 0.6486009255051612, "num_tokens": 54039062.0, "step": 1650 }, { "epoch": 0.1372418940210631, "grad_norm": 1.1700193881988525, "learning_rate": 9.958015648198441e-05, "loss": 1.2947, "mean_token_accuracy": 0.6765765935182572, "num_tokens": 54202882.0, "step": 1655 }, { "epoch": 0.1376565220996766, "grad_norm": 1.3198378086090088, "learning_rate": 9.95707461854954e-05, "loss": 1.3828, "mean_token_accuracy": 0.6575268775224685, "num_tokens": 54366722.0, "step": 1660 }, { "epoch": 0.13807115017829008, "grad_norm": 1.2401063442230225, "learning_rate": 9.956123204880335e-05, "loss": 1.4184, "mean_token_accuracy": 0.6541998594999313, "num_tokens": 54530471.0, "step": 1665 }, { "epoch": 0.13848577825690356, "grad_norm": 1.2013182640075684, "learning_rate": 9.955161409183838e-05, "loss": 1.426, "mean_token_accuracy": 0.6528225809335708, "num_tokens": 54694311.0, "step": 1670 }, { "epoch": 0.13890040633551703, "grad_norm": 1.1475543975830078, "learning_rate": 9.954189233474807e-05, "loss": 1.3758, "mean_token_accuracy": 0.6577040582895279, "num_tokens": 54858151.0, "step": 1675 }, { "epoch": 0.1393150344141305, "grad_norm": 1.1979186534881592, "learning_rate": 9.953206679789742e-05, "loss": 1.3477, "mean_token_accuracy": 0.6635973781347275, "num_tokens": 55020989.0, "step": 1680 }, { "epoch": 0.13972966249274402, "grad_norm": 1.1845098733901978, "learning_rate": 9.952213750186885e-05, "loss": 1.3626, "mean_token_accuracy": 0.6599279105663299, "num_tokens": 55184829.0, "step": 1685 }, { "epoch": 0.1401442905713575, "grad_norm": 1.218155026435852, "learning_rate": 9.951210446746215e-05, "loss": 1.3181, "mean_token_accuracy": 0.6701835080981254, "num_tokens": 55348280.0, "step": 1690 }, { "epoch": 0.14055891864997097, "grad_norm": 1.219197154045105, "learning_rate": 9.950196771569438e-05, "loss": 1.3093, "mean_token_accuracy": 0.6754842758178711, "num_tokens": 55511537.0, "step": 1695 }, { "epoch": 0.14097354672858445, "grad_norm": 1.195906400680542, "learning_rate": 9.94917272677999e-05, "loss": 1.3788, "mean_token_accuracy": 0.6552663698792458, "num_tokens": 55675377.0, "step": 1700 }, { "epoch": 0.14138817480719795, "grad_norm": 1.2281585931777954, "learning_rate": 9.948138314523026e-05, "loss": 1.3823, "mean_token_accuracy": 0.6584310859441758, "num_tokens": 55839217.0, "step": 1705 }, { "epoch": 0.14180280288581143, "grad_norm": 1.2239000797271729, "learning_rate": 9.947093536965422e-05, "loss": 1.294, "mean_token_accuracy": 0.6741141244769097, "num_tokens": 56003057.0, "step": 1710 }, { "epoch": 0.1422174309644249, "grad_norm": 1.1687766313552856, "learning_rate": 9.946038396295765e-05, "loss": 1.3398, "mean_token_accuracy": 0.6643632367253304, "num_tokens": 56166360.0, "step": 1715 }, { "epoch": 0.14263205904303838, "grad_norm": 1.099410891532898, "learning_rate": 9.94497289472435e-05, "loss": 1.3683, "mean_token_accuracy": 0.6596774220466614, "num_tokens": 56330200.0, "step": 1720 }, { "epoch": 0.1430466871216519, "grad_norm": 1.2177159786224365, "learning_rate": 9.943897034483178e-05, "loss": 1.3449, "mean_token_accuracy": 0.6664161801338195, "num_tokens": 56494040.0, "step": 1725 }, { "epoch": 0.14346131520026537, "grad_norm": 1.15543532371521, "learning_rate": 9.942810817825948e-05, "loss": 1.3912, "mean_token_accuracy": 0.656199187040329, "num_tokens": 56657123.0, "step": 1730 }, { "epoch": 0.14387594327887884, "grad_norm": 1.1122572422027588, "learning_rate": 9.941714247028053e-05, "loss": 1.2874, "mean_token_accuracy": 0.6738514184951783, "num_tokens": 56820963.0, "step": 1735 }, { "epoch": 0.14429057135749232, "grad_norm": 1.126460075378418, "learning_rate": 9.940607324386577e-05, "loss": 1.3429, "mean_token_accuracy": 0.6653286918997765, "num_tokens": 56984803.0, "step": 1740 }, { "epoch": 0.14470519943610582, "grad_norm": 1.163468360900879, "learning_rate": 9.939490052220289e-05, "loss": 1.3763, "mean_token_accuracy": 0.6573924794793129, "num_tokens": 57148643.0, "step": 1745 }, { "epoch": 0.1451198275147193, "grad_norm": 1.1244533061981201, "learning_rate": 9.938362432869635e-05, "loss": 1.2433, "mean_token_accuracy": 0.6807140037417412, "num_tokens": 57311792.0, "step": 1750 }, { "epoch": 0.14553445559333278, "grad_norm": 1.2314460277557373, "learning_rate": 9.93722446869674e-05, "loss": 1.3881, "mean_token_accuracy": 0.657117547094822, "num_tokens": 57475632.0, "step": 1755 }, { "epoch": 0.14594908367194626, "grad_norm": 1.1483550071716309, "learning_rate": 9.936076162085397e-05, "loss": 1.3635, "mean_token_accuracy": 0.6604166641831398, "num_tokens": 57639472.0, "step": 1760 }, { "epoch": 0.14636371175055976, "grad_norm": 1.1495261192321777, "learning_rate": 9.934917515441066e-05, "loss": 1.3488, "mean_token_accuracy": 0.6656769335269928, "num_tokens": 57803312.0, "step": 1765 }, { "epoch": 0.14677833982917324, "grad_norm": 1.1304582357406616, "learning_rate": 9.933748531190865e-05, "loss": 1.4188, "mean_token_accuracy": 0.6531586021184921, "num_tokens": 57967152.0, "step": 1770 }, { "epoch": 0.14719296790778671, "grad_norm": 1.192624807357788, "learning_rate": 9.932569211783567e-05, "loss": 1.3548, "mean_token_accuracy": 0.6607099235057831, "num_tokens": 58130992.0, "step": 1775 }, { "epoch": 0.1476075959864002, "grad_norm": 1.160388469696045, "learning_rate": 9.9313795596896e-05, "loss": 1.3282, "mean_token_accuracy": 0.6654569894075394, "num_tokens": 58294832.0, "step": 1780 }, { "epoch": 0.14802222406501367, "grad_norm": 1.136030912399292, "learning_rate": 9.930179577401029e-05, "loss": 1.3372, "mean_token_accuracy": 0.6637654319405556, "num_tokens": 58458483.0, "step": 1785 }, { "epoch": 0.14843685214362717, "grad_norm": 1.1632835865020752, "learning_rate": 9.928969267431564e-05, "loss": 1.4023, "mean_token_accuracy": 0.6540853947401046, "num_tokens": 58621575.0, "step": 1790 }, { "epoch": 0.14885148022224065, "grad_norm": 1.1856722831726074, "learning_rate": 9.927748632316549e-05, "loss": 1.3775, "mean_token_accuracy": 0.6556512728333473, "num_tokens": 58785415.0, "step": 1795 }, { "epoch": 0.14926610830085413, "grad_norm": 1.1614913940429688, "learning_rate": 9.926517674612952e-05, "loss": 1.2629, "mean_token_accuracy": 0.6784029841423035, "num_tokens": 58949255.0, "step": 1800 }, { "epoch": 0.1496807363794676, "grad_norm": 1.094176173210144, "learning_rate": 9.925276396899372e-05, "loss": 1.3127, "mean_token_accuracy": 0.6692265391349792, "num_tokens": 59113095.0, "step": 1805 }, { "epoch": 0.1500953644580811, "grad_norm": 1.1450884342193604, "learning_rate": 9.924024801776022e-05, "loss": 1.4249, "mean_token_accuracy": 0.6477822616696358, "num_tokens": 59276935.0, "step": 1810 }, { "epoch": 0.1505099925366946, "grad_norm": 1.1223996877670288, "learning_rate": 9.922762891864728e-05, "loss": 1.3828, "mean_token_accuracy": 0.6619623631238938, "num_tokens": 59440775.0, "step": 1815 }, { "epoch": 0.15092462061530806, "grad_norm": 1.2567800283432007, "learning_rate": 9.921490669808924e-05, "loss": 1.3815, "mean_token_accuracy": 0.656121701002121, "num_tokens": 59604615.0, "step": 1820 }, { "epoch": 0.15133924869392154, "grad_norm": 1.17715322971344, "learning_rate": 9.920208138273644e-05, "loss": 1.3506, "mean_token_accuracy": 0.6625794246792793, "num_tokens": 59768455.0, "step": 1825 }, { "epoch": 0.15175387677253505, "grad_norm": 1.162170171737671, "learning_rate": 9.91891529994552e-05, "loss": 1.3107, "mean_token_accuracy": 0.6716727122664452, "num_tokens": 59931303.0, "step": 1830 }, { "epoch": 0.15216850485114852, "grad_norm": 6.644440174102783, "learning_rate": 9.917612157532777e-05, "loss": 1.3636, "mean_token_accuracy": 0.6615713581442833, "num_tokens": 60095143.0, "step": 1835 }, { "epoch": 0.152583132929762, "grad_norm": 1.140966534614563, "learning_rate": 9.916298713765219e-05, "loss": 1.3936, "mean_token_accuracy": 0.6620845571160316, "num_tokens": 60258983.0, "step": 1840 }, { "epoch": 0.15299776100837548, "grad_norm": 1.184509515762329, "learning_rate": 9.914974971394233e-05, "loss": 1.3678, "mean_token_accuracy": 0.6639601692557335, "num_tokens": 60422823.0, "step": 1845 }, { "epoch": 0.15341238908698898, "grad_norm": 1.1792467832565308, "learning_rate": 9.913640933192778e-05, "loss": 1.3296, "mean_token_accuracy": 0.6620417848229408, "num_tokens": 60586663.0, "step": 1850 }, { "epoch": 0.15382701716560246, "grad_norm": 1.1453526020050049, "learning_rate": 9.912296601955384e-05, "loss": 1.3378, "mean_token_accuracy": 0.6599584549665451, "num_tokens": 60750503.0, "step": 1855 }, { "epoch": 0.15424164524421594, "grad_norm": 1.1483008861541748, "learning_rate": 9.910941980498136e-05, "loss": 1.3822, "mean_token_accuracy": 0.6558467775583268, "num_tokens": 60914343.0, "step": 1860 }, { "epoch": 0.1546562733228294, "grad_norm": 1.1345916986465454, "learning_rate": 9.90957707165868e-05, "loss": 1.2497, "mean_token_accuracy": 0.678806209564209, "num_tokens": 61078183.0, "step": 1865 }, { "epoch": 0.15507090140144292, "grad_norm": 1.1539981365203857, "learning_rate": 9.908201878296212e-05, "loss": 1.2889, "mean_token_accuracy": 0.6768389582633972, "num_tokens": 61242023.0, "step": 1870 }, { "epoch": 0.1554855294800564, "grad_norm": 1.1856356859207153, "learning_rate": 9.906816403291471e-05, "loss": 1.4199, "mean_token_accuracy": 0.6503726780414582, "num_tokens": 61405863.0, "step": 1875 }, { "epoch": 0.15590015755866987, "grad_norm": 1.1519221067428589, "learning_rate": 9.905420649546731e-05, "loss": 1.3275, "mean_token_accuracy": 0.6667111247777939, "num_tokens": 61568850.0, "step": 1880 }, { "epoch": 0.15631478563728335, "grad_norm": 1.2015529870986938, "learning_rate": 9.904014619985802e-05, "loss": 1.3551, "mean_token_accuracy": 0.6642350882291794, "num_tokens": 61732690.0, "step": 1885 }, { "epoch": 0.15672941371589685, "grad_norm": 1.0787674188613892, "learning_rate": 9.902598317554018e-05, "loss": 1.3456, "mean_token_accuracy": 0.6618436604738236, "num_tokens": 61895552.0, "step": 1890 }, { "epoch": 0.15714404179451033, "grad_norm": 1.1786774396896362, "learning_rate": 9.901171745218229e-05, "loss": 1.3969, "mean_token_accuracy": 0.6578751221299172, "num_tokens": 62059392.0, "step": 1895 }, { "epoch": 0.1575586698731238, "grad_norm": 1.1005985736846924, "learning_rate": 9.899734905966804e-05, "loss": 1.3296, "mean_token_accuracy": 0.6645833313465118, "num_tokens": 62223232.0, "step": 1900 }, { "epoch": 0.15797329795173728, "grad_norm": 1.1051074266433716, "learning_rate": 9.898287802809619e-05, "loss": 1.4501, "mean_token_accuracy": 0.6514662764966488, "num_tokens": 62387072.0, "step": 1905 }, { "epoch": 0.15838792603035076, "grad_norm": 1.10177481174469, "learning_rate": 9.896830438778043e-05, "loss": 1.3444, "mean_token_accuracy": 0.6648582607507706, "num_tokens": 62550912.0, "step": 1910 }, { "epoch": 0.15880255410896427, "grad_norm": 1.9008870124816895, "learning_rate": 9.895362816924949e-05, "loss": 1.3342, "mean_token_accuracy": 0.6668680176138878, "num_tokens": 62714058.0, "step": 1915 }, { "epoch": 0.15921718218757774, "grad_norm": 1.183345913887024, "learning_rate": 9.893884940324691e-05, "loss": 1.3417, "mean_token_accuracy": 0.6646994158625603, "num_tokens": 62877898.0, "step": 1920 }, { "epoch": 0.15963181026619122, "grad_norm": 1.2330020666122437, "learning_rate": 9.89239681207311e-05, "loss": 1.4101, "mean_token_accuracy": 0.6546187698841095, "num_tokens": 63041738.0, "step": 1925 }, { "epoch": 0.1600464383448047, "grad_norm": 1.7111624479293823, "learning_rate": 9.890898435287517e-05, "loss": 1.3232, "mean_token_accuracy": 0.6709066450595855, "num_tokens": 63205578.0, "step": 1930 }, { "epoch": 0.1604610664234182, "grad_norm": 1.1247528791427612, "learning_rate": 9.889389813106693e-05, "loss": 1.3438, "mean_token_accuracy": 0.6666116818785668, "num_tokens": 63369418.0, "step": 1935 }, { "epoch": 0.16087569450203168, "grad_norm": 1.138061761856079, "learning_rate": 9.887870948690885e-05, "loss": 1.2895, "mean_token_accuracy": 0.674154357612133, "num_tokens": 63532687.0, "step": 1940 }, { "epoch": 0.16129032258064516, "grad_norm": 1.2452406883239746, "learning_rate": 9.886341845221787e-05, "loss": 1.3505, "mean_token_accuracy": 0.6643633976578712, "num_tokens": 63696527.0, "step": 1945 }, { "epoch": 0.16170495065925863, "grad_norm": 1.1234358549118042, "learning_rate": 9.88480250590255e-05, "loss": 1.3152, "mean_token_accuracy": 0.6677847012877465, "num_tokens": 63860367.0, "step": 1950 }, { "epoch": 0.16211957873787214, "grad_norm": 1.212026834487915, "learning_rate": 9.883252933957763e-05, "loss": 1.3354, "mean_token_accuracy": 0.663086511194706, "num_tokens": 64024207.0, "step": 1955 }, { "epoch": 0.16253420681648562, "grad_norm": 1.1730372905731201, "learning_rate": 9.881693132633449e-05, "loss": 1.4111, "mean_token_accuracy": 0.650942749530077, "num_tokens": 64187687.0, "step": 1960 }, { "epoch": 0.1629488348950991, "grad_norm": 1.2520666122436523, "learning_rate": 9.880123105197065e-05, "loss": 1.4476, "mean_token_accuracy": 0.6472551025450229, "num_tokens": 64351075.0, "step": 1965 }, { "epoch": 0.16336346297371257, "grad_norm": 1.1247053146362305, "learning_rate": 9.878542854937482e-05, "loss": 1.3724, "mean_token_accuracy": 0.6632514670491219, "num_tokens": 64514915.0, "step": 1970 }, { "epoch": 0.16377809105232607, "grad_norm": 1.1914619207382202, "learning_rate": 9.876952385164989e-05, "loss": 1.4081, "mean_token_accuracy": 0.6540505856275558, "num_tokens": 64678755.0, "step": 1975 }, { "epoch": 0.16419271913093955, "grad_norm": 1.2277207374572754, "learning_rate": 9.875351699211285e-05, "loss": 1.2936, "mean_token_accuracy": 0.6741715192794799, "num_tokens": 64842396.0, "step": 1980 }, { "epoch": 0.16460734720955303, "grad_norm": 1.0599324703216553, "learning_rate": 9.873740800429467e-05, "loss": 1.2859, "mean_token_accuracy": 0.6765945732593537, "num_tokens": 65006236.0, "step": 1985 }, { "epoch": 0.1650219752881665, "grad_norm": 1.1088241338729858, "learning_rate": 9.872119692194027e-05, "loss": 1.3476, "mean_token_accuracy": 0.6675403237342834, "num_tokens": 65170076.0, "step": 1990 }, { "epoch": 0.16543660336678, "grad_norm": 1.1403690576553345, "learning_rate": 9.87048837790084e-05, "loss": 1.3365, "mean_token_accuracy": 0.6665994673967361, "num_tokens": 65333916.0, "step": 1995 }, { "epoch": 0.1658512314453935, "grad_norm": 1.0955932140350342, "learning_rate": 9.868846860967167e-05, "loss": 1.3646, "mean_token_accuracy": 0.662988756597042, "num_tokens": 65497756.0, "step": 2000 }, { "epoch": 0.16626585952400696, "grad_norm": 1.1397483348846436, "learning_rate": 9.867195144831636e-05, "loss": 1.435, "mean_token_accuracy": 0.6478783056139946, "num_tokens": 65660803.0, "step": 2005 }, { "epoch": 0.16668048760262044, "grad_norm": 1.0877341032028198, "learning_rate": 9.865533232954245e-05, "loss": 1.3153, "mean_token_accuracy": 0.6733687669038773, "num_tokens": 65824643.0, "step": 2010 }, { "epoch": 0.16709511568123395, "grad_norm": 1.2004203796386719, "learning_rate": 9.863861128816344e-05, "loss": 1.3619, "mean_token_accuracy": 0.6670149102807045, "num_tokens": 65988483.0, "step": 2015 }, { "epoch": 0.16750974375984742, "grad_norm": 1.2210606336593628, "learning_rate": 9.862178835920637e-05, "loss": 1.359, "mean_token_accuracy": 0.6611253671348095, "num_tokens": 66152323.0, "step": 2020 }, { "epoch": 0.1679243718384609, "grad_norm": 1.1666066646575928, "learning_rate": 9.860486357791172e-05, "loss": 1.3124, "mean_token_accuracy": 0.6706989288330079, "num_tokens": 66316163.0, "step": 2025 }, { "epoch": 0.16833899991707438, "grad_norm": 1.147369623184204, "learning_rate": 9.85878369797333e-05, "loss": 1.369, "mean_token_accuracy": 0.6631170578300953, "num_tokens": 66480003.0, "step": 2030 }, { "epoch": 0.16875362799568785, "grad_norm": 1.1762323379516602, "learning_rate": 9.857070860033826e-05, "loss": 1.3241, "mean_token_accuracy": 0.6665383711457252, "num_tokens": 66643843.0, "step": 2035 }, { "epoch": 0.16916825607430136, "grad_norm": 1.178361177444458, "learning_rate": 9.855347847560689e-05, "loss": 1.3434, "mean_token_accuracy": 0.6594941362738609, "num_tokens": 66807683.0, "step": 2040 }, { "epoch": 0.16958288415291484, "grad_norm": 1.143797516822815, "learning_rate": 9.853614664163265e-05, "loss": 1.2285, "mean_token_accuracy": 0.6801197454333305, "num_tokens": 66971523.0, "step": 2045 }, { "epoch": 0.1699975122315283, "grad_norm": 1.1586716175079346, "learning_rate": 9.851871313472207e-05, "loss": 1.3523, "mean_token_accuracy": 0.6616935506463051, "num_tokens": 67135363.0, "step": 2050 }, { "epoch": 0.1704121403101418, "grad_norm": 1.1401805877685547, "learning_rate": 9.850117799139464e-05, "loss": 1.3736, "mean_token_accuracy": 0.6606549307703972, "num_tokens": 67299203.0, "step": 2055 }, { "epoch": 0.1708267683887553, "grad_norm": 1.1220128536224365, "learning_rate": 9.84835412483828e-05, "loss": 1.2944, "mean_token_accuracy": 0.675079420208931, "num_tokens": 67463043.0, "step": 2060 }, { "epoch": 0.17124139646736877, "grad_norm": 1.1114490032196045, "learning_rate": 9.846580294263172e-05, "loss": 1.3091, "mean_token_accuracy": 0.6686273604631424, "num_tokens": 67626374.0, "step": 2065 }, { "epoch": 0.17165602454598225, "grad_norm": 1.176193118095398, "learning_rate": 9.844796311129944e-05, "loss": 1.3406, "mean_token_accuracy": 0.6674609020352363, "num_tokens": 67790214.0, "step": 2070 }, { "epoch": 0.17207065262459573, "grad_norm": 1.157579779624939, "learning_rate": 9.843002179175665e-05, "loss": 1.3304, "mean_token_accuracy": 0.6642900764942169, "num_tokens": 67954054.0, "step": 2075 }, { "epoch": 0.17248528070320923, "grad_norm": 1.1543059349060059, "learning_rate": 9.841197902158653e-05, "loss": 1.3111, "mean_token_accuracy": 0.6732191652059555, "num_tokens": 68117791.0, "step": 2080 }, { "epoch": 0.1728999087818227, "grad_norm": 1.2211623191833496, "learning_rate": 9.839383483858492e-05, "loss": 1.3807, "mean_token_accuracy": 0.6542844593524932, "num_tokens": 68280791.0, "step": 2085 }, { "epoch": 0.17331453686043619, "grad_norm": 1.1212056875228882, "learning_rate": 9.837558928076003e-05, "loss": 1.2909, "mean_token_accuracy": 0.6729288831353187, "num_tokens": 68444631.0, "step": 2090 }, { "epoch": 0.17372916493904966, "grad_norm": 1.0925419330596924, "learning_rate": 9.83572423863324e-05, "loss": 1.2163, "mean_token_accuracy": 0.6872311800718307, "num_tokens": 68608471.0, "step": 2095 }, { "epoch": 0.17414379301766317, "grad_norm": 1.1620479822158813, "learning_rate": 9.833879419373493e-05, "loss": 1.2925, "mean_token_accuracy": 0.6768633931875229, "num_tokens": 68772311.0, "step": 2100 }, { "epoch": 0.17455842109627664, "grad_norm": 1.1429002285003662, "learning_rate": 9.832024474161263e-05, "loss": 1.3457, "mean_token_accuracy": 0.6651820585131645, "num_tokens": 68936151.0, "step": 2105 }, { "epoch": 0.17497304917489012, "grad_norm": 1.1326366662979126, "learning_rate": 9.83015940688227e-05, "loss": 1.3674, "mean_token_accuracy": 0.6595857813954353, "num_tokens": 69099991.0, "step": 2110 }, { "epoch": 0.1753876772535036, "grad_norm": 1.0936850309371948, "learning_rate": 9.828284221443433e-05, "loss": 1.3882, "mean_token_accuracy": 0.6589015170931816, "num_tokens": 69263831.0, "step": 2115 }, { "epoch": 0.1758023053321171, "grad_norm": 1.0790454149246216, "learning_rate": 9.826398921772868e-05, "loss": 1.3828, "mean_token_accuracy": 0.6558589950203896, "num_tokens": 69427671.0, "step": 2120 }, { "epoch": 0.17621693341073058, "grad_norm": 1.0382895469665527, "learning_rate": 9.82450351181988e-05, "loss": 1.3515, "mean_token_accuracy": 0.6637157872319221, "num_tokens": 69591511.0, "step": 2125 }, { "epoch": 0.17663156148934406, "grad_norm": 1.116068959236145, "learning_rate": 9.822597995554948e-05, "loss": 1.336, "mean_token_accuracy": 0.6685972645878792, "num_tokens": 69755351.0, "step": 2130 }, { "epoch": 0.17704618956795753, "grad_norm": 1.1191227436065674, "learning_rate": 9.820682376969726e-05, "loss": 1.3248, "mean_token_accuracy": 0.6705806404352188, "num_tokens": 69919049.0, "step": 2135 }, { "epoch": 0.17746081764657104, "grad_norm": 1.1419486999511719, "learning_rate": 9.818756660077029e-05, "loss": 1.5134, "mean_token_accuracy": 0.6402981460094452, "num_tokens": 70082889.0, "step": 2140 }, { "epoch": 0.17787544572518452, "grad_norm": 1.7518163919448853, "learning_rate": 9.816820848910826e-05, "loss": 1.3682, "mean_token_accuracy": 0.6609827965497971, "num_tokens": 70246624.0, "step": 2145 }, { "epoch": 0.178290073803798, "grad_norm": 1.0493342876434326, "learning_rate": 9.81487494752623e-05, "loss": 1.2596, "mean_token_accuracy": 0.678048625588417, "num_tokens": 70410464.0, "step": 2150 }, { "epoch": 0.17870470188241147, "grad_norm": 1.1223058700561523, "learning_rate": 9.81291895999949e-05, "loss": 1.3444, "mean_token_accuracy": 0.6666483402252197, "num_tokens": 70574304.0, "step": 2155 }, { "epoch": 0.17911932996102495, "grad_norm": 1.0618618726730347, "learning_rate": 9.810952890427989e-05, "loss": 1.3212, "mean_token_accuracy": 0.6679924234747887, "num_tokens": 70738144.0, "step": 2160 }, { "epoch": 0.17953395803963845, "grad_norm": 1.1819339990615845, "learning_rate": 9.808976742930224e-05, "loss": 1.429, "mean_token_accuracy": 0.6516617774963379, "num_tokens": 70901984.0, "step": 2165 }, { "epoch": 0.17994858611825193, "grad_norm": 1.232933521270752, "learning_rate": 9.806990521645805e-05, "loss": 1.3836, "mean_token_accuracy": 0.6575513169169426, "num_tokens": 71065824.0, "step": 2170 }, { "epoch": 0.1803632141968654, "grad_norm": 1.0888854265213013, "learning_rate": 9.804994230735444e-05, "loss": 1.3215, "mean_token_accuracy": 0.6692053899168968, "num_tokens": 71229122.0, "step": 2175 }, { "epoch": 0.18077784227547888, "grad_norm": 1.100090503692627, "learning_rate": 9.80298787438095e-05, "loss": 1.3636, "mean_token_accuracy": 0.6626466304063797, "num_tokens": 71392962.0, "step": 2180 }, { "epoch": 0.1811924703540924, "grad_norm": 1.1078429222106934, "learning_rate": 9.800971456785209e-05, "loss": 1.3774, "mean_token_accuracy": 0.6605205282568931, "num_tokens": 71556802.0, "step": 2185 }, { "epoch": 0.18160709843270587, "grad_norm": 1.0692031383514404, "learning_rate": 9.798944982172193e-05, "loss": 1.3401, "mean_token_accuracy": 0.665487539768219, "num_tokens": 71720642.0, "step": 2190 }, { "epoch": 0.18202172651131934, "grad_norm": 1.105131983757019, "learning_rate": 9.796908454786935e-05, "loss": 1.4053, "mean_token_accuracy": 0.6570259012281895, "num_tokens": 71884482.0, "step": 2195 }, { "epoch": 0.18243635458993282, "grad_norm": 1.1255451440811157, "learning_rate": 9.794861878895527e-05, "loss": 1.3658, "mean_token_accuracy": 0.6593963801860809, "num_tokens": 72048322.0, "step": 2200 }, { "epoch": 0.18285098266854632, "grad_norm": 1.0730068683624268, "learning_rate": 9.792805258785114e-05, "loss": 1.4008, "mean_token_accuracy": 0.6534824058413505, "num_tokens": 72212162.0, "step": 2205 }, { "epoch": 0.1832656107471598, "grad_norm": 1.0972033739089966, "learning_rate": 9.790738598763875e-05, "loss": 1.3791, "mean_token_accuracy": 0.6571542024612427, "num_tokens": 72376002.0, "step": 2210 }, { "epoch": 0.18368023882577328, "grad_norm": 1.0816172361373901, "learning_rate": 9.78866190316103e-05, "loss": 1.3325, "mean_token_accuracy": 0.6688905179500579, "num_tokens": 72539842.0, "step": 2215 }, { "epoch": 0.18409486690438676, "grad_norm": 1.131365180015564, "learning_rate": 9.786575176326813e-05, "loss": 1.3574, "mean_token_accuracy": 0.6613880753517151, "num_tokens": 72703682.0, "step": 2220 }, { "epoch": 0.18450949498300026, "grad_norm": 1.1486268043518066, "learning_rate": 9.784478422632473e-05, "loss": 1.377, "mean_token_accuracy": 0.6570772603154182, "num_tokens": 72866724.0, "step": 2225 }, { "epoch": 0.18492412306161374, "grad_norm": 1.094726324081421, "learning_rate": 9.782371646470267e-05, "loss": 1.4077, "mean_token_accuracy": 0.6569770231842995, "num_tokens": 73030564.0, "step": 2230 }, { "epoch": 0.18533875114022721, "grad_norm": 1.1273894309997559, "learning_rate": 9.780254852253444e-05, "loss": 1.3788, "mean_token_accuracy": 0.6608993165194988, "num_tokens": 73194404.0, "step": 2235 }, { "epoch": 0.1857533792188407, "grad_norm": 1.1158084869384766, "learning_rate": 9.778128044416236e-05, "loss": 1.3406, "mean_token_accuracy": 0.6637341201305389, "num_tokens": 73358244.0, "step": 2240 }, { "epoch": 0.1861680072974542, "grad_norm": 1.0741921663284302, "learning_rate": 9.77599122741386e-05, "loss": 1.2999, "mean_token_accuracy": 0.6719269290566444, "num_tokens": 73522084.0, "step": 2245 }, { "epoch": 0.18658263537606767, "grad_norm": 1.1117743253707886, "learning_rate": 9.773844405722487e-05, "loss": 1.3018, "mean_token_accuracy": 0.6700600832700729, "num_tokens": 73685472.0, "step": 2250 }, { "epoch": 0.18699726345468115, "grad_norm": 1.135411024093628, "learning_rate": 9.771687583839261e-05, "loss": 1.285, "mean_token_accuracy": 0.6762952029705047, "num_tokens": 73849312.0, "step": 2255 }, { "epoch": 0.18741189153329463, "grad_norm": 1.0903639793395996, "learning_rate": 9.769520766282263e-05, "loss": 1.3271, "mean_token_accuracy": 0.6650048926472664, "num_tokens": 74013152.0, "step": 2260 }, { "epoch": 0.1878265196119081, "grad_norm": 1.1023638248443604, "learning_rate": 9.767343957590516e-05, "loss": 1.2548, "mean_token_accuracy": 0.6818181812763214, "num_tokens": 74176992.0, "step": 2265 }, { "epoch": 0.1882411476905216, "grad_norm": 1.0778052806854248, "learning_rate": 9.765157162323973e-05, "loss": 1.3384, "mean_token_accuracy": 0.663483626395464, "num_tokens": 74340832.0, "step": 2270 }, { "epoch": 0.1886557757691351, "grad_norm": 1.110249400138855, "learning_rate": 9.762960385063506e-05, "loss": 1.3886, "mean_token_accuracy": 0.6580768913030625, "num_tokens": 74504100.0, "step": 2275 }, { "epoch": 0.18907040384774856, "grad_norm": 1.146621823310852, "learning_rate": 9.7607536304109e-05, "loss": 1.3387, "mean_token_accuracy": 0.6679115161299706, "num_tokens": 74667795.0, "step": 2280 }, { "epoch": 0.18948503192636204, "grad_norm": 1.2143663167953491, "learning_rate": 9.758536902988835e-05, "loss": 1.3847, "mean_token_accuracy": 0.658492174744606, "num_tokens": 74831635.0, "step": 2285 }, { "epoch": 0.18989966000497555, "grad_norm": 1.1127694845199585, "learning_rate": 9.756310207440886e-05, "loss": 1.3139, "mean_token_accuracy": 0.6696419835090637, "num_tokens": 74995475.0, "step": 2290 }, { "epoch": 0.19031428808358902, "grad_norm": 1.1862319707870483, "learning_rate": 9.75407354843151e-05, "loss": 1.2876, "mean_token_accuracy": 0.6754197210073472, "num_tokens": 75158878.0, "step": 2295 }, { "epoch": 0.1907289161622025, "grad_norm": 1.1235371828079224, "learning_rate": 9.751826930646031e-05, "loss": 1.3592, "mean_token_accuracy": 0.6648093804717063, "num_tokens": 75322718.0, "step": 2300 }, { "epoch": 0.19114354424081598, "grad_norm": 1.1142855882644653, "learning_rate": 9.749570358790638e-05, "loss": 1.2707, "mean_token_accuracy": 0.6775232195854187, "num_tokens": 75486558.0, "step": 2305 }, { "epoch": 0.19155817231942948, "grad_norm": 1.0357214212417603, "learning_rate": 9.74730383759237e-05, "loss": 1.2893, "mean_token_accuracy": 0.6758553296327591, "num_tokens": 75650398.0, "step": 2310 }, { "epoch": 0.19197280039804296, "grad_norm": 1.2712405920028687, "learning_rate": 9.745027371799107e-05, "loss": 1.3268, "mean_token_accuracy": 0.6664589449763298, "num_tokens": 75814238.0, "step": 2315 }, { "epoch": 0.19238742847665644, "grad_norm": 1.0738105773925781, "learning_rate": 9.742740966179567e-05, "loss": 1.3649, "mean_token_accuracy": 0.6583088934421539, "num_tokens": 75978078.0, "step": 2320 }, { "epoch": 0.1928020565552699, "grad_norm": 1.1622228622436523, "learning_rate": 9.740444625523279e-05, "loss": 1.3111, "mean_token_accuracy": 0.6688497617840767, "num_tokens": 76141604.0, "step": 2325 }, { "epoch": 0.19321668463388342, "grad_norm": 1.0559049844741821, "learning_rate": 9.738138354640593e-05, "loss": 1.2614, "mean_token_accuracy": 0.6786290287971497, "num_tokens": 76305444.0, "step": 2330 }, { "epoch": 0.1936313127124969, "grad_norm": 1.1456258296966553, "learning_rate": 9.735822158362657e-05, "loss": 1.3067, "mean_token_accuracy": 0.6677236080169677, "num_tokens": 76469284.0, "step": 2335 }, { "epoch": 0.19404594079111037, "grad_norm": 1.1015242338180542, "learning_rate": 9.733496041541414e-05, "loss": 1.4018, "mean_token_accuracy": 0.6553274631500244, "num_tokens": 76633124.0, "step": 2340 }, { "epoch": 0.19446056886972385, "grad_norm": 1.0512968301773071, "learning_rate": 9.73116000904958e-05, "loss": 1.2869, "mean_token_accuracy": 0.6766739979386329, "num_tokens": 76796964.0, "step": 2345 }, { "epoch": 0.19487519694833735, "grad_norm": 1.1637961864471436, "learning_rate": 9.728814065780651e-05, "loss": 1.3192, "mean_token_accuracy": 0.6687282115221024, "num_tokens": 76959932.0, "step": 2350 }, { "epoch": 0.19528982502695083, "grad_norm": 1.1040971279144287, "learning_rate": 9.72645821664888e-05, "loss": 1.378, "mean_token_accuracy": 0.6587915405631065, "num_tokens": 77123772.0, "step": 2355 }, { "epoch": 0.1957044531055643, "grad_norm": 1.0787874460220337, "learning_rate": 9.724092466589273e-05, "loss": 1.3312, "mean_token_accuracy": 0.6668265044689179, "num_tokens": 77287521.0, "step": 2360 }, { "epoch": 0.19611908118417778, "grad_norm": 1.105224609375, "learning_rate": 9.721716820557573e-05, "loss": 1.2537, "mean_token_accuracy": 0.6787023469805717, "num_tokens": 77451361.0, "step": 2365 }, { "epoch": 0.1965337092627913, "grad_norm": 1.1023021936416626, "learning_rate": 9.719331283530255e-05, "loss": 1.2594, "mean_token_accuracy": 0.6774978116154671, "num_tokens": 77615046.0, "step": 2370 }, { "epoch": 0.19694833734140477, "grad_norm": 1.169885516166687, "learning_rate": 9.716935860504512e-05, "loss": 1.3832, "mean_token_accuracy": 0.6575268790125847, "num_tokens": 77778886.0, "step": 2375 }, { "epoch": 0.19736296542001824, "grad_norm": 1.3298619985580444, "learning_rate": 9.714530556498252e-05, "loss": 1.3489, "mean_token_accuracy": 0.661094817519188, "num_tokens": 77942726.0, "step": 2380 }, { "epoch": 0.19777759349863172, "grad_norm": 1.0819296836853027, "learning_rate": 9.712115376550072e-05, "loss": 1.3445, "mean_token_accuracy": 0.6615713611245155, "num_tokens": 78106566.0, "step": 2385 }, { "epoch": 0.1981922215772452, "grad_norm": 1.1341317892074585, "learning_rate": 9.709690325719263e-05, "loss": 1.3962, "mean_token_accuracy": 0.6577895864844322, "num_tokens": 78270406.0, "step": 2390 }, { "epoch": 0.1986068496558587, "grad_norm": 1.5062912702560425, "learning_rate": 9.707255409085793e-05, "loss": 1.3149, "mean_token_accuracy": 0.6675708696246148, "num_tokens": 78434246.0, "step": 2395 }, { "epoch": 0.19902147773447218, "grad_norm": 1.6830103397369385, "learning_rate": 9.704810631750299e-05, "loss": 1.1792, "mean_token_accuracy": 0.6939577236771584, "num_tokens": 78598086.0, "step": 2400 }, { "epoch": 0.19943610581308566, "grad_norm": 1.3983255624771118, "learning_rate": 9.702355998834065e-05, "loss": 1.3945, "mean_token_accuracy": 0.6583511427044868, "num_tokens": 78761846.0, "step": 2405 }, { "epoch": 0.19985073389169913, "grad_norm": 1.0804715156555176, "learning_rate": 9.699891515479031e-05, "loss": 1.3258, "mean_token_accuracy": 0.6670332357287407, "num_tokens": 78925686.0, "step": 2410 }, { "epoch": 0.20026536197031264, "grad_norm": 1.0900949239730835, "learning_rate": 9.697417186847766e-05, "loss": 1.3864, "mean_token_accuracy": 0.6587426692247391, "num_tokens": 79089526.0, "step": 2415 }, { "epoch": 0.20067999004892612, "grad_norm": 1.089453101158142, "learning_rate": 9.694933018123464e-05, "loss": 1.396, "mean_token_accuracy": 0.6545210152864456, "num_tokens": 79253366.0, "step": 2420 }, { "epoch": 0.2010946181275396, "grad_norm": 1.11849045753479, "learning_rate": 9.692439014509931e-05, "loss": 1.3114, "mean_token_accuracy": 0.6667460948228836, "num_tokens": 79417206.0, "step": 2425 }, { "epoch": 0.20150924620615307, "grad_norm": 1.2045278549194336, "learning_rate": 9.689935181231575e-05, "loss": 1.3497, "mean_token_accuracy": 0.6638746336102486, "num_tokens": 79581046.0, "step": 2430 }, { "epoch": 0.20192387428476657, "grad_norm": 1.1044609546661377, "learning_rate": 9.6874215235334e-05, "loss": 1.3491, "mean_token_accuracy": 0.6666239008307457, "num_tokens": 79744886.0, "step": 2435 }, { "epoch": 0.20233850236338005, "grad_norm": 1.1200894117355347, "learning_rate": 9.684898046680981e-05, "loss": 1.3401, "mean_token_accuracy": 0.6662483721971512, "num_tokens": 79908147.0, "step": 2440 }, { "epoch": 0.20275313044199353, "grad_norm": 1.1056984663009644, "learning_rate": 9.682364755960468e-05, "loss": 1.2718, "mean_token_accuracy": 0.6802785888314247, "num_tokens": 80071987.0, "step": 2445 }, { "epoch": 0.203167758520607, "grad_norm": 1.1164171695709229, "learning_rate": 9.679821656678572e-05, "loss": 1.2684, "mean_token_accuracy": 0.6782441362738609, "num_tokens": 80235827.0, "step": 2450 }, { "epoch": 0.2035823865992205, "grad_norm": 1.1353195905685425, "learning_rate": 9.677268754162541e-05, "loss": 1.3475, "mean_token_accuracy": 0.6620967760682106, "num_tokens": 80399667.0, "step": 2455 }, { "epoch": 0.203997014677834, "grad_norm": 1.1183967590332031, "learning_rate": 9.674706053760169e-05, "loss": 1.3117, "mean_token_accuracy": 0.6705278620123863, "num_tokens": 80563507.0, "step": 2460 }, { "epoch": 0.20441164275644746, "grad_norm": 1.030859112739563, "learning_rate": 9.672133560839768e-05, "loss": 1.3011, "mean_token_accuracy": 0.6704667627811431, "num_tokens": 80727347.0, "step": 2465 }, { "epoch": 0.20482627083506094, "grad_norm": 1.0588932037353516, "learning_rate": 9.669551280790166e-05, "loss": 1.3074, "mean_token_accuracy": 0.6711326971650123, "num_tokens": 80891187.0, "step": 2470 }, { "epoch": 0.20524089891367445, "grad_norm": 1.0952879190444946, "learning_rate": 9.66695921902069e-05, "loss": 1.3323, "mean_token_accuracy": 0.6656402736902237, "num_tokens": 81055027.0, "step": 2475 }, { "epoch": 0.20565552699228792, "grad_norm": 1.1158757209777832, "learning_rate": 9.664357380961162e-05, "loss": 1.3251, "mean_token_accuracy": 0.6664972737431526, "num_tokens": 81217910.0, "step": 2480 }, { "epoch": 0.2060701550709014, "grad_norm": 0.9855961203575134, "learning_rate": 9.661745772061881e-05, "loss": 1.2601, "mean_token_accuracy": 0.68031525015831, "num_tokens": 81381750.0, "step": 2485 }, { "epoch": 0.20648478314951488, "grad_norm": 1.1344683170318604, "learning_rate": 9.659124397793613e-05, "loss": 1.4021, "mean_token_accuracy": 0.6581928178668022, "num_tokens": 81545590.0, "step": 2490 }, { "epoch": 0.20689941122812838, "grad_norm": 1.0772168636322021, "learning_rate": 9.656493263647581e-05, "loss": 1.3634, "mean_token_accuracy": 0.6626588478684425, "num_tokens": 81709430.0, "step": 2495 }, { "epoch": 0.20731403930674186, "grad_norm": 1.090943455696106, "learning_rate": 9.653852375135456e-05, "loss": 1.3196, "mean_token_accuracy": 0.6680290848016739, "num_tokens": 81873270.0, "step": 2500 }, { "epoch": 0.20772866738535534, "grad_norm": 1.1841354370117188, "learning_rate": 9.651201737789335e-05, "loss": 1.4031, "mean_token_accuracy": 0.6540872484445572, "num_tokens": 82037110.0, "step": 2505 }, { "epoch": 0.2081432954639688, "grad_norm": 1.040313959121704, "learning_rate": 9.648541357161747e-05, "loss": 1.2831, "mean_token_accuracy": 0.6743523925542831, "num_tokens": 82200950.0, "step": 2510 }, { "epoch": 0.2085579235425823, "grad_norm": 1.0221846103668213, "learning_rate": 9.645871238825619e-05, "loss": 1.3091, "mean_token_accuracy": 0.6719657763838768, "num_tokens": 82364428.0, "step": 2515 }, { "epoch": 0.2089725516211958, "grad_norm": 1.0882335901260376, "learning_rate": 9.643191388374288e-05, "loss": 1.3205, "mean_token_accuracy": 0.6690249308943749, "num_tokens": 82528268.0, "step": 2520 }, { "epoch": 0.20938717969980927, "grad_norm": 1.0770103931427002, "learning_rate": 9.640501811421469e-05, "loss": 1.3315, "mean_token_accuracy": 0.6688233181834221, "num_tokens": 82692108.0, "step": 2525 }, { "epoch": 0.20980180777842275, "grad_norm": 1.097158670425415, "learning_rate": 9.637802513601258e-05, "loss": 1.3196, "mean_token_accuracy": 0.6666116788983345, "num_tokens": 82855948.0, "step": 2530 }, { "epoch": 0.21021643585703623, "grad_norm": 1.102793574333191, "learning_rate": 9.635093500568109e-05, "loss": 1.3891, "mean_token_accuracy": 0.6579056680202484, "num_tokens": 83019788.0, "step": 2535 }, { "epoch": 0.21063106393564973, "grad_norm": 1.058576226234436, "learning_rate": 9.632374777996831e-05, "loss": 1.3382, "mean_token_accuracy": 0.6638813644647599, "num_tokens": 83183076.0, "step": 2540 }, { "epoch": 0.2110456920142632, "grad_norm": 1.150504231452942, "learning_rate": 9.629646351582573e-05, "loss": 1.3318, "mean_token_accuracy": 0.6660740464925766, "num_tokens": 83346916.0, "step": 2545 }, { "epoch": 0.21146032009287669, "grad_norm": 1.1130492687225342, "learning_rate": 9.626908227040808e-05, "loss": 1.3191, "mean_token_accuracy": 0.66956866979599, "num_tokens": 83510756.0, "step": 2550 }, { "epoch": 0.21187494817149016, "grad_norm": 1.1053194999694824, "learning_rate": 9.624160410107326e-05, "loss": 1.3424, "mean_token_accuracy": 0.6644733622670174, "num_tokens": 83674596.0, "step": 2555 }, { "epoch": 0.21228957625010367, "grad_norm": 1.0200395584106445, "learning_rate": 9.621402906538222e-05, "loss": 1.2404, "mean_token_accuracy": 0.6816626936197281, "num_tokens": 83838094.0, "step": 2560 }, { "epoch": 0.21270420432871714, "grad_norm": 1.1047829389572144, "learning_rate": 9.618635722109881e-05, "loss": 1.4284, "mean_token_accuracy": 0.6508206129074097, "num_tokens": 84001749.0, "step": 2565 }, { "epoch": 0.21311883240733062, "grad_norm": 0.9829691648483276, "learning_rate": 9.61585886261897e-05, "loss": 1.3187, "mean_token_accuracy": 0.6698619246482849, "num_tokens": 84165589.0, "step": 2570 }, { "epoch": 0.2135334604859441, "grad_norm": 1.0752907991409302, "learning_rate": 9.613072333882416e-05, "loss": 1.2852, "mean_token_accuracy": 0.6733260005712509, "num_tokens": 84329429.0, "step": 2575 }, { "epoch": 0.2139480885645576, "grad_norm": 1.1305135488510132, "learning_rate": 9.610276141737409e-05, "loss": 1.3866, "mean_token_accuracy": 0.6594574764370918, "num_tokens": 84493269.0, "step": 2580 }, { "epoch": 0.21436271664317108, "grad_norm": 1.0816925764083862, "learning_rate": 9.607470292041379e-05, "loss": 1.3104, "mean_token_accuracy": 0.6695931047201157, "num_tokens": 84657109.0, "step": 2585 }, { "epoch": 0.21477734472178456, "grad_norm": 1.08124840259552, "learning_rate": 9.604654790671985e-05, "loss": 1.2459, "mean_token_accuracy": 0.6807917907834053, "num_tokens": 84820949.0, "step": 2590 }, { "epoch": 0.21519197280039803, "grad_norm": 1.0594547986984253, "learning_rate": 9.601829643527105e-05, "loss": 1.286, "mean_token_accuracy": 0.673258799314499, "num_tokens": 84984789.0, "step": 2595 }, { "epoch": 0.21560660087901154, "grad_norm": 0.9991068243980408, "learning_rate": 9.598994856524826e-05, "loss": 1.3501, "mean_token_accuracy": 0.664106796681881, "num_tokens": 85148629.0, "step": 2600 }, { "epoch": 0.21602122895762502, "grad_norm": 1.0182642936706543, "learning_rate": 9.596150435603422e-05, "loss": 1.305, "mean_token_accuracy": 0.6696542009711266, "num_tokens": 85312469.0, "step": 2605 }, { "epoch": 0.2164358570362385, "grad_norm": 1.0205860137939453, "learning_rate": 9.593296386721353e-05, "loss": 1.3007, "mean_token_accuracy": 0.6687988772988319, "num_tokens": 85476309.0, "step": 2610 }, { "epoch": 0.21685048511485197, "grad_norm": 1.0811766386032104, "learning_rate": 9.59043271585725e-05, "loss": 1.339, "mean_token_accuracy": 0.6658479943871498, "num_tokens": 85640149.0, "step": 2615 }, { "epoch": 0.21726511319346545, "grad_norm": 1.0677396059036255, "learning_rate": 9.587559429009889e-05, "loss": 1.331, "mean_token_accuracy": 0.6626160770654679, "num_tokens": 85803989.0, "step": 2620 }, { "epoch": 0.21767974127207895, "grad_norm": 1.0135524272918701, "learning_rate": 9.584676532198202e-05, "loss": 1.3114, "mean_token_accuracy": 0.6692815214395523, "num_tokens": 85967829.0, "step": 2625 }, { "epoch": 0.21809436935069243, "grad_norm": 1.0285613536834717, "learning_rate": 9.581784031461247e-05, "loss": 1.2702, "mean_token_accuracy": 0.6740224823355675, "num_tokens": 86131669.0, "step": 2630 }, { "epoch": 0.2185089974293059, "grad_norm": 1.0852622985839844, "learning_rate": 9.578881932858198e-05, "loss": 1.3504, "mean_token_accuracy": 0.6628107890486717, "num_tokens": 86294505.0, "step": 2635 }, { "epoch": 0.21892362550791938, "grad_norm": 1.1283034086227417, "learning_rate": 9.575970242468335e-05, "loss": 1.3912, "mean_token_accuracy": 0.6552175015211106, "num_tokens": 86458345.0, "step": 2640 }, { "epoch": 0.2193382535865329, "grad_norm": 1.1609395742416382, "learning_rate": 9.573048966391034e-05, "loss": 1.362, "mean_token_accuracy": 0.6610581636428833, "num_tokens": 86622185.0, "step": 2645 }, { "epoch": 0.21975288166514637, "grad_norm": 1.007416009902954, "learning_rate": 9.570118110745749e-05, "loss": 1.2972, "mean_token_accuracy": 0.6743523970246315, "num_tokens": 86786025.0, "step": 2650 }, { "epoch": 0.22016750974375984, "grad_norm": 1.0748475790023804, "learning_rate": 9.567177681672e-05, "loss": 1.3899, "mean_token_accuracy": 0.6591829985380173, "num_tokens": 86948955.0, "step": 2655 }, { "epoch": 0.22058213782237332, "grad_norm": 1.1083078384399414, "learning_rate": 9.564227685329363e-05, "loss": 1.3111, "mean_token_accuracy": 0.6674165606498719, "num_tokens": 87112703.0, "step": 2660 }, { "epoch": 0.22099676590098682, "grad_norm": 1.0616549253463745, "learning_rate": 9.561268127897457e-05, "loss": 1.343, "mean_token_accuracy": 0.6640853062272072, "num_tokens": 87275623.0, "step": 2665 }, { "epoch": 0.2214113939796003, "grad_norm": 1.0404539108276367, "learning_rate": 9.558299015575922e-05, "loss": 1.3441, "mean_token_accuracy": 0.6618218451738358, "num_tokens": 87439463.0, "step": 2670 }, { "epoch": 0.22182602205821378, "grad_norm": 1.0714560747146606, "learning_rate": 9.555320354584423e-05, "loss": 1.2658, "mean_token_accuracy": 0.6778225809335708, "num_tokens": 87603303.0, "step": 2675 }, { "epoch": 0.22224065013682726, "grad_norm": 1.0824095010757446, "learning_rate": 9.552332151162623e-05, "loss": 1.2986, "mean_token_accuracy": 0.671291546523571, "num_tokens": 87767143.0, "step": 2680 }, { "epoch": 0.22265527821544076, "grad_norm": 1.081459403038025, "learning_rate": 9.549334411570174e-05, "loss": 1.4601, "mean_token_accuracy": 0.6446869522333145, "num_tokens": 87930536.0, "step": 2685 }, { "epoch": 0.22306990629405424, "grad_norm": 1.0536558628082275, "learning_rate": 9.546327142086704e-05, "loss": 1.3329, "mean_token_accuracy": 0.6650904163718223, "num_tokens": 88094376.0, "step": 2690 }, { "epoch": 0.22348453437266771, "grad_norm": 1.0550700426101685, "learning_rate": 9.543310349011805e-05, "loss": 1.3128, "mean_token_accuracy": 0.6701490715146065, "num_tokens": 88258216.0, "step": 2695 }, { "epoch": 0.2238991624512812, "grad_norm": 1.101529598236084, "learning_rate": 9.540284038665022e-05, "loss": 1.2555, "mean_token_accuracy": 0.6795943312346935, "num_tokens": 88422056.0, "step": 2700 }, { "epoch": 0.2243137905298947, "grad_norm": 1.0878149271011353, "learning_rate": 9.537248217385828e-05, "loss": 1.3549, "mean_token_accuracy": 0.668823316693306, "num_tokens": 88585896.0, "step": 2705 }, { "epoch": 0.22472841860850817, "grad_norm": 1.0655194520950317, "learning_rate": 9.53420289153363e-05, "loss": 1.343, "mean_token_accuracy": 0.6660740479826928, "num_tokens": 88749736.0, "step": 2710 }, { "epoch": 0.22514304668712165, "grad_norm": 1.0821983814239502, "learning_rate": 9.531148067487738e-05, "loss": 1.3206, "mean_token_accuracy": 0.6682368010282517, "num_tokens": 88913576.0, "step": 2715 }, { "epoch": 0.22555767476573513, "grad_norm": 1.0168461799621582, "learning_rate": 9.528083751647358e-05, "loss": 1.2642, "mean_token_accuracy": 0.6809506341814995, "num_tokens": 89077416.0, "step": 2720 }, { "epoch": 0.22597230284434863, "grad_norm": 1.0486526489257812, "learning_rate": 9.525009950431588e-05, "loss": 1.2823, "mean_token_accuracy": 0.6794293776154519, "num_tokens": 89241256.0, "step": 2725 }, { "epoch": 0.2263869309229621, "grad_norm": 1.0182121992111206, "learning_rate": 9.521926670279384e-05, "loss": 1.3659, "mean_token_accuracy": 0.6572886124253273, "num_tokens": 89405096.0, "step": 2730 }, { "epoch": 0.2268015590015756, "grad_norm": 1.030746579170227, "learning_rate": 9.518833917649568e-05, "loss": 1.3167, "mean_token_accuracy": 0.6725348204374313, "num_tokens": 89568712.0, "step": 2735 }, { "epoch": 0.22721618708018906, "grad_norm": 1.0402779579162598, "learning_rate": 9.5157316990208e-05, "loss": 1.3271, "mean_token_accuracy": 0.6652859270572662, "num_tokens": 89732552.0, "step": 2740 }, { "epoch": 0.22763081515880254, "grad_norm": 1.0385247468948364, "learning_rate": 9.512620020891569e-05, "loss": 1.3584, "mean_token_accuracy": 0.6640579134225846, "num_tokens": 89896392.0, "step": 2745 }, { "epoch": 0.22804544323741605, "grad_norm": 1.0232688188552856, "learning_rate": 9.509498889780182e-05, "loss": 1.3352, "mean_token_accuracy": 0.6671554207801819, "num_tokens": 90060232.0, "step": 2750 }, { "epoch": 0.22846007131602952, "grad_norm": 1.022635817527771, "learning_rate": 9.506368312224746e-05, "loss": 1.3824, "mean_token_accuracy": 0.6553824588656425, "num_tokens": 90224072.0, "step": 2755 }, { "epoch": 0.228874699394643, "grad_norm": 1.0279450416564941, "learning_rate": 9.503228294783158e-05, "loss": 1.2934, "mean_token_accuracy": 0.6734054297208786, "num_tokens": 90387912.0, "step": 2760 }, { "epoch": 0.22928932747325648, "grad_norm": 0.9623901844024658, "learning_rate": 9.500078844033089e-05, "loss": 1.2324, "mean_token_accuracy": 0.6841581121087075, "num_tokens": 90551752.0, "step": 2765 }, { "epoch": 0.22970395555186998, "grad_norm": 1.0440038442611694, "learning_rate": 9.496919966571971e-05, "loss": 1.2384, "mean_token_accuracy": 0.680700147151947, "num_tokens": 90715592.0, "step": 2770 }, { "epoch": 0.23011858363048346, "grad_norm": 1.0675630569458008, "learning_rate": 9.493751669016982e-05, "loss": 1.3354, "mean_token_accuracy": 0.6638172417879105, "num_tokens": 90879155.0, "step": 2775 }, { "epoch": 0.23053321170909694, "grad_norm": 1.002764344215393, "learning_rate": 9.490573958005032e-05, "loss": 1.2405, "mean_token_accuracy": 0.6804007798433304, "num_tokens": 91042995.0, "step": 2780 }, { "epoch": 0.2309478397877104, "grad_norm": 1.21285080909729, "learning_rate": 9.487386840192754e-05, "loss": 1.312, "mean_token_accuracy": 0.6705889537930488, "num_tokens": 91206835.0, "step": 2785 }, { "epoch": 0.23136246786632392, "grad_norm": 1.0286260843276978, "learning_rate": 9.484190322256484e-05, "loss": 1.3475, "mean_token_accuracy": 0.6628112107515335, "num_tokens": 91369694.0, "step": 2790 }, { "epoch": 0.2317770959449374, "grad_norm": 1.0780329704284668, "learning_rate": 9.480984410892247e-05, "loss": 1.2979, "mean_token_accuracy": 0.6744745880365371, "num_tokens": 91533534.0, "step": 2795 }, { "epoch": 0.23219172402355087, "grad_norm": 1.0212982892990112, "learning_rate": 9.47776911281575e-05, "loss": 1.3197, "mean_token_accuracy": 0.672837245464325, "num_tokens": 91697374.0, "step": 2800 }, { "epoch": 0.23260635210216435, "grad_norm": 1.0043659210205078, "learning_rate": 9.47454443476236e-05, "loss": 1.4051, "mean_token_accuracy": 0.657325267791748, "num_tokens": 91861214.0, "step": 2805 }, { "epoch": 0.23302098018077785, "grad_norm": 1.0502917766571045, "learning_rate": 9.471310383487096e-05, "loss": 1.3021, "mean_token_accuracy": 0.668786658346653, "num_tokens": 92025054.0, "step": 2810 }, { "epoch": 0.23343560825939133, "grad_norm": 1.4406880140304565, "learning_rate": 9.468066965764603e-05, "loss": 1.2822, "mean_token_accuracy": 0.6733932077884675, "num_tokens": 92188894.0, "step": 2815 }, { "epoch": 0.2338502363380048, "grad_norm": 1.0190492868423462, "learning_rate": 9.464814188389162e-05, "loss": 1.3089, "mean_token_accuracy": 0.6734237536787987, "num_tokens": 92352734.0, "step": 2820 }, { "epoch": 0.23426486441661828, "grad_norm": 0.9841130971908569, "learning_rate": 9.461552058174647e-05, "loss": 1.3578, "mean_token_accuracy": 0.6651637375354766, "num_tokens": 92516574.0, "step": 2825 }, { "epoch": 0.2346794924952318, "grad_norm": 1.0100873708724976, "learning_rate": 9.458280581954528e-05, "loss": 1.2901, "mean_token_accuracy": 0.6766862124204636, "num_tokens": 92680414.0, "step": 2830 }, { "epoch": 0.23509412057384527, "grad_norm": 1.1190541982650757, "learning_rate": 9.454999766581858e-05, "loss": 1.3889, "mean_token_accuracy": 0.6534491300582885, "num_tokens": 92843564.0, "step": 2835 }, { "epoch": 0.23550874865245874, "grad_norm": 1.0348269939422607, "learning_rate": 9.451709618929247e-05, "loss": 1.288, "mean_token_accuracy": 0.6731304943561554, "num_tokens": 93007404.0, "step": 2840 }, { "epoch": 0.23592337673107222, "grad_norm": 1.0778911113739014, "learning_rate": 9.448410145888857e-05, "loss": 1.3457, "mean_token_accuracy": 0.6646410763263703, "num_tokens": 93171029.0, "step": 2845 }, { "epoch": 0.23633800480968573, "grad_norm": 1.0126421451568604, "learning_rate": 9.445101354372385e-05, "loss": 1.3107, "mean_token_accuracy": 0.6731671556830406, "num_tokens": 93334869.0, "step": 2850 }, { "epoch": 0.2367526328882992, "grad_norm": 1.005475640296936, "learning_rate": 9.441783251311049e-05, "loss": 1.3616, "mean_token_accuracy": 0.6659518577158451, "num_tokens": 93498709.0, "step": 2855 }, { "epoch": 0.23716726096691268, "grad_norm": 0.9984427690505981, "learning_rate": 9.438455843655569e-05, "loss": 1.3184, "mean_token_accuracy": 0.6687435671687126, "num_tokens": 93662185.0, "step": 2860 }, { "epoch": 0.23758188904552616, "grad_norm": 1.0320781469345093, "learning_rate": 9.435119138376159e-05, "loss": 1.2562, "mean_token_accuracy": 0.6770711153745651, "num_tokens": 93826025.0, "step": 2865 }, { "epoch": 0.23799651712413963, "grad_norm": 1.0141587257385254, "learning_rate": 9.43177314246251e-05, "loss": 1.2928, "mean_token_accuracy": 0.6795943334698678, "num_tokens": 93989865.0, "step": 2870 }, { "epoch": 0.23841114520275314, "grad_norm": 1.0589091777801514, "learning_rate": 9.428417862923772e-05, "loss": 1.3665, "mean_token_accuracy": 0.6603066936135292, "num_tokens": 94153705.0, "step": 2875 }, { "epoch": 0.23882577328136662, "grad_norm": 1.1072170734405518, "learning_rate": 9.425053306788549e-05, "loss": 1.2944, "mean_token_accuracy": 0.6692937418818474, "num_tokens": 94317545.0, "step": 2880 }, { "epoch": 0.2392404013599801, "grad_norm": 1.028734803199768, "learning_rate": 9.421679481104868e-05, "loss": 1.3394, "mean_token_accuracy": 0.6677543595433235, "num_tokens": 94480139.0, "step": 2885 }, { "epoch": 0.23965502943859357, "grad_norm": 1.0725563764572144, "learning_rate": 9.41829639294018e-05, "loss": 1.3677, "mean_token_accuracy": 0.6636730194091797, "num_tokens": 94643979.0, "step": 2890 }, { "epoch": 0.24006965751720707, "grad_norm": 1.009470820426941, "learning_rate": 9.414904049381336e-05, "loss": 1.2996, "mean_token_accuracy": 0.6729166626930236, "num_tokens": 94807819.0, "step": 2895 }, { "epoch": 0.24048428559582055, "grad_norm": 1.0591769218444824, "learning_rate": 9.41150245753458e-05, "loss": 1.3459, "mean_token_accuracy": 0.6637157902121544, "num_tokens": 94971659.0, "step": 2900 }, { "epoch": 0.24089891367443403, "grad_norm": 1.0332934856414795, "learning_rate": 9.408091624525522e-05, "loss": 1.2765, "mean_token_accuracy": 0.677138315141201, "num_tokens": 95135499.0, "step": 2905 }, { "epoch": 0.2413135417530475, "grad_norm": 1.0035442113876343, "learning_rate": 9.404671557499137e-05, "loss": 1.2972, "mean_token_accuracy": 0.6727082923054695, "num_tokens": 95299132.0, "step": 2910 }, { "epoch": 0.241728169831661, "grad_norm": 1.0331671237945557, "learning_rate": 9.401242263619738e-05, "loss": 1.4168, "mean_token_accuracy": 0.6534823991358281, "num_tokens": 95462972.0, "step": 2915 }, { "epoch": 0.2421427979102745, "grad_norm": 1.0751533508300781, "learning_rate": 9.39780375007097e-05, "loss": 1.3258, "mean_token_accuracy": 0.6660618290305138, "num_tokens": 95626812.0, "step": 2920 }, { "epoch": 0.24255742598888796, "grad_norm": 1.0071254968643188, "learning_rate": 9.394356024055788e-05, "loss": 1.3109, "mean_token_accuracy": 0.6700085535645485, "num_tokens": 95790652.0, "step": 2925 }, { "epoch": 0.24297205406750144, "grad_norm": 1.0864005088806152, "learning_rate": 9.39089909279645e-05, "loss": 1.2887, "mean_token_accuracy": 0.6743523955345154, "num_tokens": 95954492.0, "step": 2930 }, { "epoch": 0.24338668214611495, "grad_norm": 1.0048301219940186, "learning_rate": 9.387432963534492e-05, "loss": 1.2817, "mean_token_accuracy": 0.6728433534502983, "num_tokens": 96118332.0, "step": 2935 }, { "epoch": 0.24380131022472842, "grad_norm": 1.0312083959579468, "learning_rate": 9.383957643530718e-05, "loss": 1.3182, "mean_token_accuracy": 0.6667404159903526, "num_tokens": 96282117.0, "step": 2940 }, { "epoch": 0.2442159383033419, "grad_norm": 1.0427964925765991, "learning_rate": 9.380473140065191e-05, "loss": 1.3905, "mean_token_accuracy": 0.6564271792769432, "num_tokens": 96445957.0, "step": 2945 }, { "epoch": 0.24463056638195538, "grad_norm": 1.0681729316711426, "learning_rate": 9.376979460437205e-05, "loss": 1.3662, "mean_token_accuracy": 0.6643389567732811, "num_tokens": 96609797.0, "step": 2950 }, { "epoch": 0.24504519446056888, "grad_norm": 1.115302324295044, "learning_rate": 9.373476611965278e-05, "loss": 1.3389, "mean_token_accuracy": 0.6667399793863297, "num_tokens": 96773637.0, "step": 2955 }, { "epoch": 0.24545982253918236, "grad_norm": 1.0743083953857422, "learning_rate": 9.369964601987132e-05, "loss": 1.2964, "mean_token_accuracy": 0.6747861742973328, "num_tokens": 96937477.0, "step": 2960 }, { "epoch": 0.24587445061779584, "grad_norm": 1.0740010738372803, "learning_rate": 9.366443437859688e-05, "loss": 1.3728, "mean_token_accuracy": 0.660416665673256, "num_tokens": 97101317.0, "step": 2965 }, { "epoch": 0.2462890786964093, "grad_norm": 1.0379053354263306, "learning_rate": 9.362913126959037e-05, "loss": 1.3167, "mean_token_accuracy": 0.6735581636428833, "num_tokens": 97265157.0, "step": 2970 }, { "epoch": 0.2467037067750228, "grad_norm": 0.9872788190841675, "learning_rate": 9.359373676680429e-05, "loss": 1.2447, "mean_token_accuracy": 0.683156156539917, "num_tokens": 97428997.0, "step": 2975 }, { "epoch": 0.2471183348536363, "grad_norm": 0.994804859161377, "learning_rate": 9.355825094438264e-05, "loss": 1.3235, "mean_token_accuracy": 0.6704239964485168, "num_tokens": 97592837.0, "step": 2980 }, { "epoch": 0.24753296293224977, "grad_norm": 0.9986344575881958, "learning_rate": 9.352267387666071e-05, "loss": 1.3173, "mean_token_accuracy": 0.6646505400538445, "num_tokens": 97756677.0, "step": 2985 }, { "epoch": 0.24794759101086325, "grad_norm": 1.0404119491577148, "learning_rate": 9.348700563816488e-05, "loss": 1.3425, "mean_token_accuracy": 0.6644794717431068, "num_tokens": 97920517.0, "step": 2990 }, { "epoch": 0.24836221908947673, "grad_norm": 1.0144007205963135, "learning_rate": 9.345124630361257e-05, "loss": 1.3814, "mean_token_accuracy": 0.6638990670442582, "num_tokens": 98084357.0, "step": 2995 }, { "epoch": 0.24877684716809023, "grad_norm": 1.1392951011657715, "learning_rate": 9.3415395947912e-05, "loss": 1.3271, "mean_token_accuracy": 0.6704850926995277, "num_tokens": 98248197.0, "step": 3000 }, { "epoch": 0.2491914752467037, "grad_norm": 1.0668189525604248, "learning_rate": 9.337945464616207e-05, "loss": 1.3355, "mean_token_accuracy": 0.6689821600914001, "num_tokens": 98412037.0, "step": 3005 }, { "epoch": 0.24960610332531719, "grad_norm": 1.0075867176055908, "learning_rate": 9.334342247365216e-05, "loss": 1.3259, "mean_token_accuracy": 0.6669354841113091, "num_tokens": 98575877.0, "step": 3010 }, { "epoch": 0.25002073140393066, "grad_norm": 0.9960988759994507, "learning_rate": 9.330729950586207e-05, "loss": 1.3354, "mean_token_accuracy": 0.6663807734847069, "num_tokens": 98739529.0, "step": 3015 }, { "epoch": 0.25043535948254414, "grad_norm": 1.0258535146713257, "learning_rate": 9.327108581846172e-05, "loss": 1.2835, "mean_token_accuracy": 0.6787328958511353, "num_tokens": 98903369.0, "step": 3020 }, { "epoch": 0.2508499875611576, "grad_norm": 1.0559964179992676, "learning_rate": 9.323478148731112e-05, "loss": 1.3956, "mean_token_accuracy": 0.6548631489276886, "num_tokens": 99067209.0, "step": 3025 }, { "epoch": 0.25126461563977115, "grad_norm": 1.0208572149276733, "learning_rate": 9.319838658846019e-05, "loss": 1.3445, "mean_token_accuracy": 0.6640823528170585, "num_tokens": 99231049.0, "step": 3030 }, { "epoch": 0.2516792437183846, "grad_norm": 1.0227625370025635, "learning_rate": 9.316190119814847e-05, "loss": 1.2566, "mean_token_accuracy": 0.6830950632691384, "num_tokens": 99394889.0, "step": 3035 }, { "epoch": 0.2520938717969981, "grad_norm": 0.9901267290115356, "learning_rate": 9.312532539280512e-05, "loss": 1.3832, "mean_token_accuracy": 0.656995353102684, "num_tokens": 99558729.0, "step": 3040 }, { "epoch": 0.2525084998756116, "grad_norm": 1.0293430089950562, "learning_rate": 9.308865924904873e-05, "loss": 1.3067, "mean_token_accuracy": 0.6714381769299507, "num_tokens": 99722569.0, "step": 3045 }, { "epoch": 0.25292312795422506, "grad_norm": 1.020733118057251, "learning_rate": 9.305190284368706e-05, "loss": 1.3645, "mean_token_accuracy": 0.66188904941082, "num_tokens": 99886409.0, "step": 3050 }, { "epoch": 0.25333775603283853, "grad_norm": 1.0120285749435425, "learning_rate": 9.301505625371702e-05, "loss": 1.3091, "mean_token_accuracy": 0.6673692539334297, "num_tokens": 100050249.0, "step": 3055 }, { "epoch": 0.253752384111452, "grad_norm": 1.0258290767669678, "learning_rate": 9.29781195563244e-05, "loss": 1.3079, "mean_token_accuracy": 0.6742057658731937, "num_tokens": 100214089.0, "step": 3060 }, { "epoch": 0.2541670121900655, "grad_norm": 0.9953669905662537, "learning_rate": 9.294109282888373e-05, "loss": 1.2476, "mean_token_accuracy": 0.6814332813024521, "num_tokens": 100377929.0, "step": 3065 }, { "epoch": 0.254581640268679, "grad_norm": 1.0049127340316772, "learning_rate": 9.290397614895815e-05, "loss": 1.3179, "mean_token_accuracy": 0.6699474558234215, "num_tokens": 100541769.0, "step": 3070 }, { "epoch": 0.2549962683472925, "grad_norm": 1.0290277004241943, "learning_rate": 9.286676959429926e-05, "loss": 1.275, "mean_token_accuracy": 0.6747800588607789, "num_tokens": 100705609.0, "step": 3075 }, { "epoch": 0.255410896425906, "grad_norm": 0.9995718002319336, "learning_rate": 9.282947324284689e-05, "loss": 1.2793, "mean_token_accuracy": 0.6762157842516899, "num_tokens": 100869449.0, "step": 3080 }, { "epoch": 0.25582552450451945, "grad_norm": 0.9663996696472168, "learning_rate": 9.279208717272898e-05, "loss": 1.2812, "mean_token_accuracy": 0.6771077737212181, "num_tokens": 101033289.0, "step": 3085 }, { "epoch": 0.25624015258313293, "grad_norm": 1.0330816507339478, "learning_rate": 9.275461146226143e-05, "loss": 1.3458, "mean_token_accuracy": 0.6675525397062302, "num_tokens": 101197129.0, "step": 3090 }, { "epoch": 0.2566547806617464, "grad_norm": 0.9948139786720276, "learning_rate": 9.271704618994792e-05, "loss": 1.2211, "mean_token_accuracy": 0.6863330885767936, "num_tokens": 101360969.0, "step": 3095 }, { "epoch": 0.2570694087403599, "grad_norm": 1.0910817384719849, "learning_rate": 9.26793914344797e-05, "loss": 1.347, "mean_token_accuracy": 0.665854100883007, "num_tokens": 101524809.0, "step": 3100 }, { "epoch": 0.25748403681897336, "grad_norm": 1.057950735092163, "learning_rate": 9.264164727473553e-05, "loss": 1.2894, "mean_token_accuracy": 0.67501832395792, "num_tokens": 101688649.0, "step": 3105 }, { "epoch": 0.25789866489758684, "grad_norm": 0.997083306312561, "learning_rate": 9.26038137897814e-05, "loss": 1.3129, "mean_token_accuracy": 0.6704789832234382, "num_tokens": 101852489.0, "step": 3110 }, { "epoch": 0.25831329297620037, "grad_norm": 1.046116590499878, "learning_rate": 9.256589105887045e-05, "loss": 1.3677, "mean_token_accuracy": 0.659909576177597, "num_tokens": 102016329.0, "step": 3115 }, { "epoch": 0.25872792105481385, "grad_norm": 1.0335099697113037, "learning_rate": 9.252787916144276e-05, "loss": 1.3387, "mean_token_accuracy": 0.670869991183281, "num_tokens": 102180169.0, "step": 3120 }, { "epoch": 0.2591425491334273, "grad_norm": 1.0001356601715088, "learning_rate": 9.248977817712521e-05, "loss": 1.2951, "mean_token_accuracy": 0.6754032284021377, "num_tokens": 102344009.0, "step": 3125 }, { "epoch": 0.2595571772120408, "grad_norm": 1.0196658372879028, "learning_rate": 9.245158818573124e-05, "loss": 1.3248, "mean_token_accuracy": 0.6689027354121209, "num_tokens": 102507849.0, "step": 3130 }, { "epoch": 0.2599718052906543, "grad_norm": 1.027083158493042, "learning_rate": 9.241330926726082e-05, "loss": 1.3356, "mean_token_accuracy": 0.6643267348408699, "num_tokens": 102671689.0, "step": 3135 }, { "epoch": 0.26038643336926776, "grad_norm": 1.0170602798461914, "learning_rate": 9.237494150190017e-05, "loss": 1.2662, "mean_token_accuracy": 0.6814943760633468, "num_tokens": 102835529.0, "step": 3140 }, { "epoch": 0.26080106144788123, "grad_norm": 0.9635075330734253, "learning_rate": 9.233648497002161e-05, "loss": 1.2548, "mean_token_accuracy": 0.68298509567976, "num_tokens": 102999369.0, "step": 3145 }, { "epoch": 0.2612156895264947, "grad_norm": 1.0100306272506714, "learning_rate": 9.229793975218342e-05, "loss": 1.4041, "mean_token_accuracy": 0.6569037184119224, "num_tokens": 103163209.0, "step": 3150 }, { "epoch": 0.26163031760510824, "grad_norm": 1.0459941625595093, "learning_rate": 9.225930592912966e-05, "loss": 1.3257, "mean_token_accuracy": 0.6676625102758408, "num_tokens": 103327049.0, "step": 3155 }, { "epoch": 0.2620449456837217, "grad_norm": 1.121048927307129, "learning_rate": 9.222058358179002e-05, "loss": 1.3567, "mean_token_accuracy": 0.6637218981981278, "num_tokens": 103490889.0, "step": 3160 }, { "epoch": 0.2624595737623352, "grad_norm": 1.0036065578460693, "learning_rate": 9.218177279127958e-05, "loss": 1.2773, "mean_token_accuracy": 0.674205768108368, "num_tokens": 103654729.0, "step": 3165 }, { "epoch": 0.2628742018409487, "grad_norm": 1.020117163658142, "learning_rate": 9.214287363889872e-05, "loss": 1.3656, "mean_token_accuracy": 0.6625549823045731, "num_tokens": 103818569.0, "step": 3170 }, { "epoch": 0.26328882991956215, "grad_norm": 0.9855335354804993, "learning_rate": 9.210388620613293e-05, "loss": 1.2552, "mean_token_accuracy": 0.6789345040917396, "num_tokens": 103982409.0, "step": 3175 }, { "epoch": 0.2637034579981756, "grad_norm": 0.9548041224479675, "learning_rate": 9.20648105746526e-05, "loss": 1.328, "mean_token_accuracy": 0.6681791037321091, "num_tokens": 104145721.0, "step": 3180 }, { "epoch": 0.2641180860767891, "grad_norm": 1.0116145610809326, "learning_rate": 9.202564682631289e-05, "loss": 1.2607, "mean_token_accuracy": 0.6777675986289978, "num_tokens": 104309561.0, "step": 3185 }, { "epoch": 0.2645327141554026, "grad_norm": 1.0734248161315918, "learning_rate": 9.198639504315358e-05, "loss": 1.2948, "mean_token_accuracy": 0.6718597277998924, "num_tokens": 104473401.0, "step": 3190 }, { "epoch": 0.2649473422340161, "grad_norm": 1.166406512260437, "learning_rate": 9.194705530739882e-05, "loss": 1.3162, "mean_token_accuracy": 0.6695503443479538, "num_tokens": 104637241.0, "step": 3195 }, { "epoch": 0.2653619703126296, "grad_norm": 1.0191946029663086, "learning_rate": 9.1907627701457e-05, "loss": 1.3405, "mean_token_accuracy": 0.6675891995429992, "num_tokens": 104801081.0, "step": 3200 }, { "epoch": 0.26577659839124307, "grad_norm": 1.0574842691421509, "learning_rate": 9.186811230792061e-05, "loss": 1.3374, "mean_token_accuracy": 0.6671737521886826, "num_tokens": 104964921.0, "step": 3205 }, { "epoch": 0.26619122646985655, "grad_norm": 1.0240511894226074, "learning_rate": 9.182850920956601e-05, "loss": 1.2516, "mean_token_accuracy": 0.6806186750531197, "num_tokens": 105128442.0, "step": 3210 }, { "epoch": 0.26660585454847, "grad_norm": 0.9983618855476379, "learning_rate": 9.178881848935329e-05, "loss": 1.2835, "mean_token_accuracy": 0.6728005856275558, "num_tokens": 105292282.0, "step": 3215 }, { "epoch": 0.2670204826270835, "grad_norm": 1.0017285346984863, "learning_rate": 9.17490402304261e-05, "loss": 1.28, "mean_token_accuracy": 0.6750406816601753, "num_tokens": 105455380.0, "step": 3220 }, { "epoch": 0.267435110705697, "grad_norm": 1.013919711112976, "learning_rate": 9.170917451611147e-05, "loss": 1.3516, "mean_token_accuracy": 0.6645039066672325, "num_tokens": 105619220.0, "step": 3225 }, { "epoch": 0.26784973878431045, "grad_norm": 0.9826188087463379, "learning_rate": 9.166922142991963e-05, "loss": 1.3478, "mean_token_accuracy": 0.6666116788983345, "num_tokens": 105783060.0, "step": 3230 }, { "epoch": 0.26826436686292393, "grad_norm": 1.062046766281128, "learning_rate": 9.162918105554378e-05, "loss": 1.3156, "mean_token_accuracy": 0.6701210156083107, "num_tokens": 105945803.0, "step": 3235 }, { "epoch": 0.26867899494153746, "grad_norm": 1.0021882057189941, "learning_rate": 9.158905347686005e-05, "loss": 1.363, "mean_token_accuracy": 0.6650781989097595, "num_tokens": 106109643.0, "step": 3240 }, { "epoch": 0.26909362302015094, "grad_norm": 0.9552791714668274, "learning_rate": 9.15488387779272e-05, "loss": 1.3426, "mean_token_accuracy": 0.66844452470541, "num_tokens": 106273483.0, "step": 3245 }, { "epoch": 0.2695082510987644, "grad_norm": 1.0006051063537598, "learning_rate": 9.150853704298648e-05, "loss": 1.3249, "mean_token_accuracy": 0.6659518554806709, "num_tokens": 106437323.0, "step": 3250 }, { "epoch": 0.2699228791773779, "grad_norm": 1.0177334547042847, "learning_rate": 9.146814835646151e-05, "loss": 1.4038, "mean_token_accuracy": 0.6590929798781872, "num_tokens": 106600363.0, "step": 3255 }, { "epoch": 0.27033750725599137, "grad_norm": 0.9710380434989929, "learning_rate": 9.1427672802958e-05, "loss": 1.2932, "mean_token_accuracy": 0.6759469673037529, "num_tokens": 106764203.0, "step": 3260 }, { "epoch": 0.27075213533460485, "grad_norm": 1.0061321258544922, "learning_rate": 9.138711046726367e-05, "loss": 1.2612, "mean_token_accuracy": 0.6762341171503067, "num_tokens": 106928043.0, "step": 3265 }, { "epoch": 0.2711667634132183, "grad_norm": 1.4211668968200684, "learning_rate": 9.134646143434802e-05, "loss": 1.2949, "mean_token_accuracy": 0.6718108490109443, "num_tokens": 107091883.0, "step": 3270 }, { "epoch": 0.2715813914918318, "grad_norm": 0.9751541614532471, "learning_rate": 9.130572578936213e-05, "loss": 1.2502, "mean_token_accuracy": 0.6819098234176636, "num_tokens": 107255723.0, "step": 3275 }, { "epoch": 0.27199601957044534, "grad_norm": 0.992137610912323, "learning_rate": 9.126490361763856e-05, "loss": 1.3316, "mean_token_accuracy": 0.6688416451215744, "num_tokens": 107419563.0, "step": 3280 }, { "epoch": 0.2724106476490588, "grad_norm": 0.9852685928344727, "learning_rate": 9.122399500469107e-05, "loss": 1.2833, "mean_token_accuracy": 0.6725562021136284, "num_tokens": 107583403.0, "step": 3285 }, { "epoch": 0.2728252757276723, "grad_norm": 0.9987186789512634, "learning_rate": 9.118300003621459e-05, "loss": 1.338, "mean_token_accuracy": 0.6650476589798927, "num_tokens": 107747243.0, "step": 3290 }, { "epoch": 0.27323990380628577, "grad_norm": 1.0254756212234497, "learning_rate": 9.114191879808484e-05, "loss": 1.2657, "mean_token_accuracy": 0.6778714567422867, "num_tokens": 107911083.0, "step": 3295 }, { "epoch": 0.27365453188489924, "grad_norm": 0.9749782085418701, "learning_rate": 9.110075137635831e-05, "loss": 1.3152, "mean_token_accuracy": 0.6695625618100166, "num_tokens": 108074923.0, "step": 3300 }, { "epoch": 0.2740691599635127, "grad_norm": 0.9420017600059509, "learning_rate": 9.105949785727203e-05, "loss": 1.2318, "mean_token_accuracy": 0.6857771277427673, "num_tokens": 108238763.0, "step": 3305 }, { "epoch": 0.2744837880421262, "grad_norm": 2.3964684009552, "learning_rate": 9.101815832724338e-05, "loss": 1.366, "mean_token_accuracy": 0.6613514140248299, "num_tokens": 108402603.0, "step": 3310 }, { "epoch": 0.2748984161207397, "grad_norm": 0.9832311272621155, "learning_rate": 9.097673287286991e-05, "loss": 1.2669, "mean_token_accuracy": 0.6781463831663131, "num_tokens": 108566443.0, "step": 3315 }, { "epoch": 0.2753130441993532, "grad_norm": 1.0415925979614258, "learning_rate": 9.093522158092914e-05, "loss": 1.3111, "mean_token_accuracy": 0.669195581972599, "num_tokens": 108729715.0, "step": 3320 }, { "epoch": 0.2757276722779667, "grad_norm": 1.0237131118774414, "learning_rate": 9.089362453837845e-05, "loss": 1.3479, "mean_token_accuracy": 0.6657319158315659, "num_tokens": 108893555.0, "step": 3325 }, { "epoch": 0.27614230035658016, "grad_norm": 1.0040159225463867, "learning_rate": 9.085194183235481e-05, "loss": 1.2577, "mean_token_accuracy": 0.6773888096213341, "num_tokens": 109057395.0, "step": 3330 }, { "epoch": 0.27655692843519364, "grad_norm": 1.0257548093795776, "learning_rate": 9.081017355017467e-05, "loss": 1.3377, "mean_token_accuracy": 0.6641862168908119, "num_tokens": 109221235.0, "step": 3335 }, { "epoch": 0.2769715565138071, "grad_norm": 1.0096317529678345, "learning_rate": 9.07683197793337e-05, "loss": 1.3428, "mean_token_accuracy": 0.6685923337936401, "num_tokens": 109384292.0, "step": 3340 }, { "epoch": 0.2773861845924206, "grad_norm": 0.9559937715530396, "learning_rate": 9.07263806075067e-05, "loss": 1.3623, "mean_token_accuracy": 0.665658600628376, "num_tokens": 109548132.0, "step": 3345 }, { "epoch": 0.27780081267103407, "grad_norm": 0.9451937079429626, "learning_rate": 9.068435612254733e-05, "loss": 1.3313, "mean_token_accuracy": 0.6634958505630493, "num_tokens": 109711972.0, "step": 3350 }, { "epoch": 0.27821544074964755, "grad_norm": 0.9886478781700134, "learning_rate": 9.064224641248798e-05, "loss": 1.2324, "mean_token_accuracy": 0.6831625834107399, "num_tokens": 109875744.0, "step": 3355 }, { "epoch": 0.278630068828261, "grad_norm": 1.0173379182815552, "learning_rate": 9.060005156553955e-05, "loss": 1.3514, "mean_token_accuracy": 0.6684750705957413, "num_tokens": 110039584.0, "step": 3360 }, { "epoch": 0.27904469690687456, "grad_norm": 0.9815647602081299, "learning_rate": 9.055777167009133e-05, "loss": 1.3492, "mean_token_accuracy": 0.6633675500750542, "num_tokens": 110203424.0, "step": 3365 }, { "epoch": 0.27945932498548803, "grad_norm": 1.0273895263671875, "learning_rate": 9.051540681471071e-05, "loss": 1.2606, "mean_token_accuracy": 0.6797031410038471, "num_tokens": 110366989.0, "step": 3370 }, { "epoch": 0.2798739530641015, "grad_norm": 0.9994346499443054, "learning_rate": 9.047295708814307e-05, "loss": 1.2776, "mean_token_accuracy": 0.6767400532960892, "num_tokens": 110529452.0, "step": 3375 }, { "epoch": 0.280288581142715, "grad_norm": 0.9704295992851257, "learning_rate": 9.043042257931163e-05, "loss": 1.2921, "mean_token_accuracy": 0.6753299072384834, "num_tokens": 110693292.0, "step": 3380 }, { "epoch": 0.28070320922132846, "grad_norm": 1.0179647207260132, "learning_rate": 9.038780337731712e-05, "loss": 1.3267, "mean_token_accuracy": 0.6659090921282769, "num_tokens": 110857132.0, "step": 3385 }, { "epoch": 0.28111783729994194, "grad_norm": 1.0432519912719727, "learning_rate": 9.034509957143775e-05, "loss": 1.3504, "mean_token_accuracy": 0.6619195982813835, "num_tokens": 111020972.0, "step": 3390 }, { "epoch": 0.2815324653785554, "grad_norm": 1.0100421905517578, "learning_rate": 9.030231125112896e-05, "loss": 1.2802, "mean_token_accuracy": 0.6753299161791801, "num_tokens": 111184812.0, "step": 3395 }, { "epoch": 0.2819470934571689, "grad_norm": 0.9555953741073608, "learning_rate": 9.025943850602316e-05, "loss": 1.3427, "mean_token_accuracy": 0.6632758989930153, "num_tokens": 111347809.0, "step": 3400 }, { "epoch": 0.28236172153578243, "grad_norm": 1.0390369892120361, "learning_rate": 9.021648142592971e-05, "loss": 1.3525, "mean_token_accuracy": 0.6661779060959816, "num_tokens": 111511649.0, "step": 3405 }, { "epoch": 0.2827763496143959, "grad_norm": 1.0647310018539429, "learning_rate": 9.017344010083457e-05, "loss": 1.2919, "mean_token_accuracy": 0.6720491155982018, "num_tokens": 111675489.0, "step": 3410 }, { "epoch": 0.2831909776930094, "grad_norm": 1.0379363298416138, "learning_rate": 9.01303146209002e-05, "loss": 1.3333, "mean_token_accuracy": 0.6658174470067024, "num_tokens": 111839329.0, "step": 3415 }, { "epoch": 0.28360560577162286, "grad_norm": 1.0758144855499268, "learning_rate": 9.008710507646529e-05, "loss": 1.2527, "mean_token_accuracy": 0.6785129502415657, "num_tokens": 112003169.0, "step": 3420 }, { "epoch": 0.28402023385023634, "grad_norm": 0.9726275205612183, "learning_rate": 9.004381155804473e-05, "loss": 1.3586, "mean_token_accuracy": 0.6619318142533303, "num_tokens": 112167009.0, "step": 3425 }, { "epoch": 0.2844348619288498, "grad_norm": 1.0184444189071655, "learning_rate": 9.000043415632923e-05, "loss": 1.3267, "mean_token_accuracy": 0.6635813802480698, "num_tokens": 112330849.0, "step": 3430 }, { "epoch": 0.2848494900074633, "grad_norm": 1.0357671976089478, "learning_rate": 8.995697296218526e-05, "loss": 1.3327, "mean_token_accuracy": 0.6685667164623738, "num_tokens": 112494689.0, "step": 3435 }, { "epoch": 0.28526411808607677, "grad_norm": 0.9924004077911377, "learning_rate": 8.991342806665481e-05, "loss": 1.2824, "mean_token_accuracy": 0.6740469232201576, "num_tokens": 112658529.0, "step": 3440 }, { "epoch": 0.2856787461646903, "grad_norm": 0.9633674621582031, "learning_rate": 8.98697995609552e-05, "loss": 1.3339, "mean_token_accuracy": 0.6692316338419915, "num_tokens": 112821517.0, "step": 3445 }, { "epoch": 0.2860933742433038, "grad_norm": 0.9266213774681091, "learning_rate": 8.982608753647888e-05, "loss": 1.2695, "mean_token_accuracy": 0.6778958901762963, "num_tokens": 112985357.0, "step": 3450 }, { "epoch": 0.28650800232191725, "grad_norm": 0.9768882393836975, "learning_rate": 8.978229208479331e-05, "loss": 1.2095, "mean_token_accuracy": 0.6911997899413109, "num_tokens": 113148805.0, "step": 3455 }, { "epoch": 0.28692263040053073, "grad_norm": 1.002746820449829, "learning_rate": 8.973841329764066e-05, "loss": 1.2813, "mean_token_accuracy": 0.6760325044393539, "num_tokens": 113312645.0, "step": 3460 }, { "epoch": 0.2873372584791442, "grad_norm": 0.9470506310462952, "learning_rate": 8.969445126693768e-05, "loss": 1.1976, "mean_token_accuracy": 0.6911351397633553, "num_tokens": 113476485.0, "step": 3465 }, { "epoch": 0.2877518865577577, "grad_norm": 0.9840176105499268, "learning_rate": 8.965040608477549e-05, "loss": 1.3362, "mean_token_accuracy": 0.6656280577182769, "num_tokens": 113640325.0, "step": 3470 }, { "epoch": 0.28816651463637116, "grad_norm": 1.018842101097107, "learning_rate": 8.960627784341944e-05, "loss": 1.3188, "mean_token_accuracy": 0.6690188199281693, "num_tokens": 113804165.0, "step": 3475 }, { "epoch": 0.28858114271498464, "grad_norm": 1.047323226928711, "learning_rate": 8.956206663530881e-05, "loss": 1.2952, "mean_token_accuracy": 0.6749450147151947, "num_tokens": 113968005.0, "step": 3480 }, { "epoch": 0.2889957707935981, "grad_norm": 1.0208772420883179, "learning_rate": 8.951777255305673e-05, "loss": 1.3619, "mean_token_accuracy": 0.6644733607769012, "num_tokens": 114131845.0, "step": 3485 }, { "epoch": 0.28941039887221165, "grad_norm": 0.9302547574043274, "learning_rate": 8.94733956894499e-05, "loss": 1.2659, "mean_token_accuracy": 0.6765823513269424, "num_tokens": 114295685.0, "step": 3490 }, { "epoch": 0.2898250269508251, "grad_norm": 0.9998297691345215, "learning_rate": 8.942893613744843e-05, "loss": 1.3208, "mean_token_accuracy": 0.6723768964409829, "num_tokens": 114458469.0, "step": 3495 }, { "epoch": 0.2902396550294386, "grad_norm": 0.9959949851036072, "learning_rate": 8.938439399018567e-05, "loss": 1.3665, "mean_token_accuracy": 0.6610520526766777, "num_tokens": 114622309.0, "step": 3500 }, { "epoch": 0.2906542831080521, "grad_norm": 1.010382890701294, "learning_rate": 8.9339769340968e-05, "loss": 1.2943, "mean_token_accuracy": 0.6740163698792457, "num_tokens": 114786149.0, "step": 3505 }, { "epoch": 0.29106891118666556, "grad_norm": 0.9420673251152039, "learning_rate": 8.929506228327453e-05, "loss": 1.2948, "mean_token_accuracy": 0.6721407622098923, "num_tokens": 114949989.0, "step": 3510 }, { "epoch": 0.29148353926527903, "grad_norm": 0.9875288605690002, "learning_rate": 8.925027291075713e-05, "loss": 1.2495, "mean_token_accuracy": 0.6839748278260231, "num_tokens": 115113829.0, "step": 3515 }, { "epoch": 0.2918981673438925, "grad_norm": 1.045227289199829, "learning_rate": 8.920540131724e-05, "loss": 1.3585, "mean_token_accuracy": 0.6651698425412178, "num_tokens": 115277669.0, "step": 3520 }, { "epoch": 0.292312795422506, "grad_norm": 0.992067277431488, "learning_rate": 8.916044759671964e-05, "loss": 1.2662, "mean_token_accuracy": 0.6729960918426514, "num_tokens": 115441509.0, "step": 3525 }, { "epoch": 0.2927274235011195, "grad_norm": 0.9711788892745972, "learning_rate": 8.911541184336455e-05, "loss": 1.2847, "mean_token_accuracy": 0.674321848154068, "num_tokens": 115605349.0, "step": 3530 }, { "epoch": 0.293142051579733, "grad_norm": 0.9849469661712646, "learning_rate": 8.907029415151509e-05, "loss": 1.3259, "mean_token_accuracy": 0.6654081135988236, "num_tokens": 115769189.0, "step": 3535 }, { "epoch": 0.2935566796583465, "grad_norm": 0.9864558577537537, "learning_rate": 8.902509461568324e-05, "loss": 1.3397, "mean_token_accuracy": 0.6647116288542747, "num_tokens": 115933029.0, "step": 3540 }, { "epoch": 0.29397130773695995, "grad_norm": 1.0209910869598389, "learning_rate": 8.897981333055249e-05, "loss": 1.3385, "mean_token_accuracy": 0.6657764717936516, "num_tokens": 116096847.0, "step": 3545 }, { "epoch": 0.29438593581557343, "grad_norm": 0.9152947068214417, "learning_rate": 8.893445039097747e-05, "loss": 1.2298, "mean_token_accuracy": 0.6811148285865783, "num_tokens": 116260162.0, "step": 3550 }, { "epoch": 0.2948005638941869, "grad_norm": 1.0247241258621216, "learning_rate": 8.888900589198397e-05, "loss": 1.3976, "mean_token_accuracy": 0.6553091421723366, "num_tokens": 116424002.0, "step": 3555 }, { "epoch": 0.2952151919728004, "grad_norm": 1.0255168676376343, "learning_rate": 8.884347992876856e-05, "loss": 1.2533, "mean_token_accuracy": 0.6799609005451203, "num_tokens": 116587842.0, "step": 3560 }, { "epoch": 0.29562982005141386, "grad_norm": 1.0117552280426025, "learning_rate": 8.879787259669848e-05, "loss": 1.326, "mean_token_accuracy": 0.6681744247674942, "num_tokens": 116751057.0, "step": 3565 }, { "epoch": 0.29604444813002734, "grad_norm": 0.9783384203910828, "learning_rate": 8.875218399131142e-05, "loss": 1.2761, "mean_token_accuracy": 0.6763746306300163, "num_tokens": 116914897.0, "step": 3570 }, { "epoch": 0.29645907620864087, "grad_norm": 1.0150396823883057, "learning_rate": 8.870641420831534e-05, "loss": 1.224, "mean_token_accuracy": 0.6861681327223778, "num_tokens": 117078737.0, "step": 3575 }, { "epoch": 0.29687370428725435, "grad_norm": 0.9749643802642822, "learning_rate": 8.86605633435882e-05, "loss": 1.2896, "mean_token_accuracy": 0.6753237992525101, "num_tokens": 117242577.0, "step": 3580 }, { "epoch": 0.2972883323658678, "grad_norm": 1.0319440364837646, "learning_rate": 8.861463149317786e-05, "loss": 1.3142, "mean_token_accuracy": 0.6703567981719971, "num_tokens": 117406417.0, "step": 3585 }, { "epoch": 0.2977029604444813, "grad_norm": 1.0056684017181396, "learning_rate": 8.85686187533018e-05, "loss": 1.275, "mean_token_accuracy": 0.677285224199295, "num_tokens": 117568404.0, "step": 3590 }, { "epoch": 0.2981175885230948, "grad_norm": 0.9867774844169617, "learning_rate": 8.852252522034697e-05, "loss": 1.3728, "mean_token_accuracy": 0.6640212625265122, "num_tokens": 117732244.0, "step": 3595 }, { "epoch": 0.29853221660170826, "grad_norm": 0.9824956059455872, "learning_rate": 8.847635099086953e-05, "loss": 1.294, "mean_token_accuracy": 0.6750061109662056, "num_tokens": 117896084.0, "step": 3600 }, { "epoch": 0.29894684468032173, "grad_norm": 1.0192714929580688, "learning_rate": 8.84300961615947e-05, "loss": 1.2946, "mean_token_accuracy": 0.6741141274571418, "num_tokens": 118059924.0, "step": 3605 }, { "epoch": 0.2993614727589352, "grad_norm": 0.999767541885376, "learning_rate": 8.838376082941654e-05, "loss": 1.2483, "mean_token_accuracy": 0.6844934061169624, "num_tokens": 118223311.0, "step": 3610 }, { "epoch": 0.29977610083754874, "grad_norm": 0.9591719508171082, "learning_rate": 8.833734509139778e-05, "loss": 1.2544, "mean_token_accuracy": 0.6827162742614746, "num_tokens": 118387151.0, "step": 3615 }, { "epoch": 0.3001907289161622, "grad_norm": 0.9481289386749268, "learning_rate": 8.829084904476949e-05, "loss": 1.2819, "mean_token_accuracy": 0.6749938905239106, "num_tokens": 118550991.0, "step": 3620 }, { "epoch": 0.3006053569947757, "grad_norm": 0.9553553462028503, "learning_rate": 8.824427278693108e-05, "loss": 1.2809, "mean_token_accuracy": 0.6812683321535588, "num_tokens": 118714831.0, "step": 3625 }, { "epoch": 0.3010199850733892, "grad_norm": 1.0011934041976929, "learning_rate": 8.819761641544992e-05, "loss": 1.3812, "mean_token_accuracy": 0.659054248034954, "num_tokens": 118878671.0, "step": 3630 }, { "epoch": 0.30143461315200265, "grad_norm": 1.0294073820114136, "learning_rate": 8.81508800280612e-05, "loss": 1.353, "mean_token_accuracy": 0.6636974602937699, "num_tokens": 119042511.0, "step": 3635 }, { "epoch": 0.3018492412306161, "grad_norm": 1.0390793085098267, "learning_rate": 8.810406372266778e-05, "loss": 1.284, "mean_token_accuracy": 0.6743035152554512, "num_tokens": 119206351.0, "step": 3640 }, { "epoch": 0.3022638693092296, "grad_norm": 0.9407466650009155, "learning_rate": 8.805716759733984e-05, "loss": 1.2414, "mean_token_accuracy": 0.679227763414383, "num_tokens": 119370191.0, "step": 3645 }, { "epoch": 0.3026784973878431, "grad_norm": 0.9619264006614685, "learning_rate": 8.801019175031486e-05, "loss": 1.308, "mean_token_accuracy": 0.6739919319748878, "num_tokens": 119534031.0, "step": 3650 }, { "epoch": 0.3030931254664566, "grad_norm": 1.022631049156189, "learning_rate": 8.796313627999728e-05, "loss": 1.2781, "mean_token_accuracy": 0.6755681842565536, "num_tokens": 119697871.0, "step": 3655 }, { "epoch": 0.3035077535450701, "grad_norm": 1.0383825302124023, "learning_rate": 8.791600128495832e-05, "loss": 1.3631, "mean_token_accuracy": 0.6621700882911682, "num_tokens": 119861711.0, "step": 3660 }, { "epoch": 0.30392238162368357, "grad_norm": 0.9666171669960022, "learning_rate": 8.786878686393579e-05, "loss": 1.2866, "mean_token_accuracy": 0.6758003398776055, "num_tokens": 120025551.0, "step": 3665 }, { "epoch": 0.30433700970229705, "grad_norm": 0.9437312483787537, "learning_rate": 8.78214931158339e-05, "loss": 1.2863, "mean_token_accuracy": 0.6766190111637116, "num_tokens": 120189391.0, "step": 3670 }, { "epoch": 0.3047516377809105, "grad_norm": 1.012891173362732, "learning_rate": 8.777412013972304e-05, "loss": 1.3006, "mean_token_accuracy": 0.6769367054104805, "num_tokens": 120353231.0, "step": 3675 }, { "epoch": 0.305166265859524, "grad_norm": 1.033991813659668, "learning_rate": 8.772666803483956e-05, "loss": 1.2896, "mean_token_accuracy": 0.6741019040346146, "num_tokens": 120517071.0, "step": 3680 }, { "epoch": 0.3055808939381375, "grad_norm": 0.9737828373908997, "learning_rate": 8.767913690058551e-05, "loss": 1.2837, "mean_token_accuracy": 0.675384895503521, "num_tokens": 120680911.0, "step": 3685 }, { "epoch": 0.30599552201675095, "grad_norm": 0.9874985218048096, "learning_rate": 8.763152683652857e-05, "loss": 1.3349, "mean_token_accuracy": 0.6681194260716439, "num_tokens": 120844087.0, "step": 3690 }, { "epoch": 0.30641015009536443, "grad_norm": 1.0114009380340576, "learning_rate": 8.758383794240172e-05, "loss": 1.3402, "mean_token_accuracy": 0.6662512198090553, "num_tokens": 121007927.0, "step": 3695 }, { "epoch": 0.30682477817397796, "grad_norm": 0.9285961389541626, "learning_rate": 8.753607031810312e-05, "loss": 1.2309, "mean_token_accuracy": 0.6873839244246482, "num_tokens": 121171767.0, "step": 3700 }, { "epoch": 0.30723940625259144, "grad_norm": 0.9762885570526123, "learning_rate": 8.748822406369574e-05, "loss": 1.2427, "mean_token_accuracy": 0.682667401432991, "num_tokens": 121335607.0, "step": 3705 }, { "epoch": 0.3076540343312049, "grad_norm": 1.032778024673462, "learning_rate": 8.74402992794074e-05, "loss": 1.2541, "mean_token_accuracy": 0.6826001942157746, "num_tokens": 121499447.0, "step": 3710 }, { "epoch": 0.3080686624098184, "grad_norm": 1.0014163255691528, "learning_rate": 8.739229606563035e-05, "loss": 1.2626, "mean_token_accuracy": 0.6812499985098839, "num_tokens": 121663287.0, "step": 3715 }, { "epoch": 0.30848329048843187, "grad_norm": 0.9528563022613525, "learning_rate": 8.734421452292114e-05, "loss": 1.2527, "mean_token_accuracy": 0.6809811815619469, "num_tokens": 121827127.0, "step": 3720 }, { "epoch": 0.30889791856704535, "grad_norm": 1.0677566528320312, "learning_rate": 8.72960547520004e-05, "loss": 1.2492, "mean_token_accuracy": 0.6800281047821045, "num_tokens": 121990967.0, "step": 3725 }, { "epoch": 0.3093125466456588, "grad_norm": 0.9664039611816406, "learning_rate": 8.724781685375265e-05, "loss": 1.3207, "mean_token_accuracy": 0.6690554708242417, "num_tokens": 122154807.0, "step": 3730 }, { "epoch": 0.3097271747242723, "grad_norm": 1.023189902305603, "learning_rate": 8.719950092922604e-05, "loss": 1.2411, "mean_token_accuracy": 0.6794843584299087, "num_tokens": 122318647.0, "step": 3735 }, { "epoch": 0.31014180280288584, "grad_norm": 0.9762243032455444, "learning_rate": 8.715110707963221e-05, "loss": 1.3371, "mean_token_accuracy": 0.6647421836853027, "num_tokens": 122482487.0, "step": 3740 }, { "epoch": 0.3105564308814993, "grad_norm": 0.9874435663223267, "learning_rate": 8.710263540634602e-05, "loss": 1.2827, "mean_token_accuracy": 0.6774367719888688, "num_tokens": 122646034.0, "step": 3745 }, { "epoch": 0.3109710589601128, "grad_norm": 0.9017988443374634, "learning_rate": 8.705408601090532e-05, "loss": 1.1893, "mean_token_accuracy": 0.6915566936135292, "num_tokens": 122809874.0, "step": 3750 }, { "epoch": 0.31138568703872627, "grad_norm": 0.9966082572937012, "learning_rate": 8.70054589950108e-05, "loss": 1.3368, "mean_token_accuracy": 0.6670882239937782, "num_tokens": 122973714.0, "step": 3755 }, { "epoch": 0.31180031511733974, "grad_norm": 1.0282634496688843, "learning_rate": 8.695675446052579e-05, "loss": 1.3208, "mean_token_accuracy": 0.6717497557401657, "num_tokens": 123137554.0, "step": 3760 }, { "epoch": 0.3122149431959532, "grad_norm": 0.9596050381660461, "learning_rate": 8.690797250947593e-05, "loss": 1.3403, "mean_token_accuracy": 0.6624877825379372, "num_tokens": 123301394.0, "step": 3765 }, { "epoch": 0.3126295712745667, "grad_norm": 1.0352023839950562, "learning_rate": 8.685911324404906e-05, "loss": 1.3191, "mean_token_accuracy": 0.6656373083591461, "num_tokens": 123464324.0, "step": 3770 }, { "epoch": 0.3130441993531802, "grad_norm": 0.9956424832344055, "learning_rate": 8.681017676659499e-05, "loss": 1.3084, "mean_token_accuracy": 0.671352642774582, "num_tokens": 123628164.0, "step": 3775 }, { "epoch": 0.3134588274317937, "grad_norm": 0.9463441967964172, "learning_rate": 8.676116317962528e-05, "loss": 1.3019, "mean_token_accuracy": 0.6744090288877487, "num_tokens": 123791551.0, "step": 3780 }, { "epoch": 0.3138734555104072, "grad_norm": 0.963124692440033, "learning_rate": 8.671207258581298e-05, "loss": 1.2478, "mean_token_accuracy": 0.6797104150056839, "num_tokens": 123955391.0, "step": 3785 }, { "epoch": 0.31428808358902066, "grad_norm": 0.9857081174850464, "learning_rate": 8.666290508799249e-05, "loss": 1.3029, "mean_token_accuracy": 0.6718658357858658, "num_tokens": 124119231.0, "step": 3790 }, { "epoch": 0.31470271166763414, "grad_norm": 0.9956610202789307, "learning_rate": 8.661366078915926e-05, "loss": 1.3673, "mean_token_accuracy": 0.6612109035253525, "num_tokens": 124283071.0, "step": 3795 }, { "epoch": 0.3151173397462476, "grad_norm": 0.9534837603569031, "learning_rate": 8.656433979246972e-05, "loss": 1.2769, "mean_token_accuracy": 0.6763257592916488, "num_tokens": 124446911.0, "step": 3800 }, { "epoch": 0.3155319678248611, "grad_norm": 0.9826126098632812, "learning_rate": 8.651494220124086e-05, "loss": 1.297, "mean_token_accuracy": 0.6729411080479621, "num_tokens": 124610751.0, "step": 3805 }, { "epoch": 0.31594659590347457, "grad_norm": 0.9866495132446289, "learning_rate": 8.646546811895014e-05, "loss": 1.2753, "mean_token_accuracy": 0.675342133641243, "num_tokens": 124774591.0, "step": 3810 }, { "epoch": 0.31636122398208805, "grad_norm": 0.9472578763961792, "learning_rate": 8.641591764923532e-05, "loss": 1.2422, "mean_token_accuracy": 0.6868707269430161, "num_tokens": 124938431.0, "step": 3815 }, { "epoch": 0.3167758520607015, "grad_norm": 0.9497211575508118, "learning_rate": 8.636629089589409e-05, "loss": 1.2842, "mean_token_accuracy": 0.6793316245079041, "num_tokens": 125102271.0, "step": 3820 }, { "epoch": 0.31719048013931506, "grad_norm": 1.0438381433486938, "learning_rate": 8.631658796288399e-05, "loss": 1.3518, "mean_token_accuracy": 0.6632453590631485, "num_tokens": 125266111.0, "step": 3825 }, { "epoch": 0.31760510821792853, "grad_norm": 1.0412575006484985, "learning_rate": 8.626680895432213e-05, "loss": 1.3068, "mean_token_accuracy": 0.6738636314868927, "num_tokens": 125429951.0, "step": 3830 }, { "epoch": 0.318019736296542, "grad_norm": 1.0045636892318726, "learning_rate": 8.621695397448497e-05, "loss": 1.2654, "mean_token_accuracy": 0.6792705297470093, "num_tokens": 125593791.0, "step": 3835 }, { "epoch": 0.3184343643751555, "grad_norm": 0.9786563515663147, "learning_rate": 8.616702312780813e-05, "loss": 1.3044, "mean_token_accuracy": 0.6722140803933143, "num_tokens": 125757631.0, "step": 3840 }, { "epoch": 0.31884899245376896, "grad_norm": 1.0160499811172485, "learning_rate": 8.611701651888616e-05, "loss": 1.2634, "mean_token_accuracy": 0.6827040523290634, "num_tokens": 125921471.0, "step": 3845 }, { "epoch": 0.31926362053238244, "grad_norm": 0.9721868634223938, "learning_rate": 8.606693425247227e-05, "loss": 1.2613, "mean_token_accuracy": 0.6813662230968476, "num_tokens": 126085196.0, "step": 3850 }, { "epoch": 0.3196782486109959, "grad_norm": 0.9894058108329773, "learning_rate": 8.60167764334782e-05, "loss": 1.3418, "mean_token_accuracy": 0.6640518069267273, "num_tokens": 126249036.0, "step": 3855 }, { "epoch": 0.3200928766896094, "grad_norm": 1.0008037090301514, "learning_rate": 8.596654316697397e-05, "loss": 1.3918, "mean_token_accuracy": 0.6580717816948891, "num_tokens": 126410606.0, "step": 3860 }, { "epoch": 0.32050750476822293, "grad_norm": 0.9972857236862183, "learning_rate": 8.591623455818762e-05, "loss": 1.336, "mean_token_accuracy": 0.6668804988265038, "num_tokens": 126574446.0, "step": 3865 }, { "epoch": 0.3209221328468364, "grad_norm": 0.9058495759963989, "learning_rate": 8.586585071250498e-05, "loss": 1.351, "mean_token_accuracy": 0.6672898322343827, "num_tokens": 126738286.0, "step": 3870 }, { "epoch": 0.3213367609254499, "grad_norm": 0.9955798387527466, "learning_rate": 8.581539173546955e-05, "loss": 1.2712, "mean_token_accuracy": 0.6773826986551285, "num_tokens": 126902126.0, "step": 3875 }, { "epoch": 0.32175138900406336, "grad_norm": 1.066311001777649, "learning_rate": 8.57648577327822e-05, "loss": 1.2406, "mean_token_accuracy": 0.6785888627171517, "num_tokens": 127065196.0, "step": 3880 }, { "epoch": 0.32216601708267684, "grad_norm": 0.9382798671722412, "learning_rate": 8.571424881030093e-05, "loss": 1.2819, "mean_token_accuracy": 0.6794049352407455, "num_tokens": 127229036.0, "step": 3885 }, { "epoch": 0.3225806451612903, "grad_norm": 1.005324125289917, "learning_rate": 8.566356507404072e-05, "loss": 1.3124, "mean_token_accuracy": 0.6711082607507706, "num_tokens": 127392876.0, "step": 3890 }, { "epoch": 0.3229952732399038, "grad_norm": 0.9854289293289185, "learning_rate": 8.561280663017324e-05, "loss": 1.2908, "mean_token_accuracy": 0.66966642588377, "num_tokens": 127556716.0, "step": 3895 }, { "epoch": 0.32340990131851727, "grad_norm": 1.044110655784607, "learning_rate": 8.556197358502666e-05, "loss": 1.2587, "mean_token_accuracy": 0.6809384137392044, "num_tokens": 127720556.0, "step": 3900 }, { "epoch": 0.3238245293971308, "grad_norm": 0.9688938856124878, "learning_rate": 8.551106604508545e-05, "loss": 1.2507, "mean_token_accuracy": 0.6818731635808944, "num_tokens": 127884396.0, "step": 3905 }, { "epoch": 0.3242391574757443, "grad_norm": 0.9877737760543823, "learning_rate": 8.546008411699009e-05, "loss": 1.2911, "mean_token_accuracy": 0.6763318687677383, "num_tokens": 128048236.0, "step": 3910 }, { "epoch": 0.32465378555435775, "grad_norm": 0.9584382176399231, "learning_rate": 8.540902790753693e-05, "loss": 1.329, "mean_token_accuracy": 0.6675403237342834, "num_tokens": 128212076.0, "step": 3915 }, { "epoch": 0.32506841363297123, "grad_norm": 0.9776164889335632, "learning_rate": 8.535789752367791e-05, "loss": 1.3346, "mean_token_accuracy": 0.6679924249649047, "num_tokens": 128375916.0, "step": 3920 }, { "epoch": 0.3254830417115847, "grad_norm": 1.0723118782043457, "learning_rate": 8.530669307252033e-05, "loss": 1.266, "mean_token_accuracy": 0.6777309402823448, "num_tokens": 128539756.0, "step": 3925 }, { "epoch": 0.3258976697901982, "grad_norm": 0.9671254754066467, "learning_rate": 8.525541466132665e-05, "loss": 1.2936, "mean_token_accuracy": 0.6725837871432304, "num_tokens": 128703039.0, "step": 3930 }, { "epoch": 0.32631229786881166, "grad_norm": 0.9290532469749451, "learning_rate": 8.520406239751429e-05, "loss": 1.2614, "mean_token_accuracy": 0.68012585490942, "num_tokens": 128866879.0, "step": 3935 }, { "epoch": 0.32672692594742514, "grad_norm": 0.9890875220298767, "learning_rate": 8.515263638865533e-05, "loss": 1.2861, "mean_token_accuracy": 0.6782450526952744, "num_tokens": 129030618.0, "step": 3940 }, { "epoch": 0.3271415540260386, "grad_norm": 0.9538615942001343, "learning_rate": 8.510113674247636e-05, "loss": 1.3331, "mean_token_accuracy": 0.6691410079598427, "num_tokens": 129194458.0, "step": 3945 }, { "epoch": 0.32755618210465215, "grad_norm": 0.995694637298584, "learning_rate": 8.504956356685825e-05, "loss": 1.3451, "mean_token_accuracy": 0.6659946203231811, "num_tokens": 129358298.0, "step": 3950 }, { "epoch": 0.3279708101832656, "grad_norm": 0.9291507005691528, "learning_rate": 8.499791696983584e-05, "loss": 1.2219, "mean_token_accuracy": 0.68574658036232, "num_tokens": 129522138.0, "step": 3955 }, { "epoch": 0.3283854382618791, "grad_norm": 0.9434182047843933, "learning_rate": 8.494619705959779e-05, "loss": 1.2882, "mean_token_accuracy": 0.6761119276285171, "num_tokens": 129685978.0, "step": 3960 }, { "epoch": 0.3288000663404926, "grad_norm": 0.941871702671051, "learning_rate": 8.489440394448638e-05, "loss": 1.3127, "mean_token_accuracy": 0.6678580120205879, "num_tokens": 129849818.0, "step": 3965 }, { "epoch": 0.32921469441910606, "grad_norm": 0.9243893027305603, "learning_rate": 8.484253773299718e-05, "loss": 1.2996, "mean_token_accuracy": 0.6784090906381607, "num_tokens": 130013658.0, "step": 3970 }, { "epoch": 0.32962932249771953, "grad_norm": 0.9161133766174316, "learning_rate": 8.479059853377892e-05, "loss": 1.277, "mean_token_accuracy": 0.6793010771274567, "num_tokens": 130177498.0, "step": 3975 }, { "epoch": 0.330043950576333, "grad_norm": 0.9451107978820801, "learning_rate": 8.47385864556332e-05, "loss": 1.3267, "mean_token_accuracy": 0.671181571483612, "num_tokens": 130341338.0, "step": 3980 }, { "epoch": 0.3304585786549465, "grad_norm": 0.9498744606971741, "learning_rate": 8.468650160751428e-05, "loss": 1.3259, "mean_token_accuracy": 0.6695992231369019, "num_tokens": 130505178.0, "step": 3985 }, { "epoch": 0.33087320673356, "grad_norm": 0.9431107044219971, "learning_rate": 8.463434409852892e-05, "loss": 1.3873, "mean_token_accuracy": 0.6567815229296684, "num_tokens": 130669018.0, "step": 3990 }, { "epoch": 0.3312878348121735, "grad_norm": 0.9482833743095398, "learning_rate": 8.458211403793599e-05, "loss": 1.2723, "mean_token_accuracy": 0.6752749234437943, "num_tokens": 130832858.0, "step": 3995 }, { "epoch": 0.331702462890787, "grad_norm": 0.9537649154663086, "learning_rate": 8.452981153514643e-05, "loss": 1.2432, "mean_token_accuracy": 0.6842253148555756, "num_tokens": 130996698.0, "step": 4000 }, { "epoch": 0.33211709096940045, "grad_norm": 0.9574406743049622, "learning_rate": 8.44774366997229e-05, "loss": 1.2732, "mean_token_accuracy": 0.6766739994287491, "num_tokens": 131160538.0, "step": 4005 }, { "epoch": 0.33253171904801393, "grad_norm": 0.9847679734230042, "learning_rate": 8.442498964137952e-05, "loss": 1.3391, "mean_token_accuracy": 0.6670026913285255, "num_tokens": 131324378.0, "step": 4010 }, { "epoch": 0.3329463471266274, "grad_norm": 0.962680995464325, "learning_rate": 8.437247046998183e-05, "loss": 1.2861, "mean_token_accuracy": 0.6737597808241844, "num_tokens": 131488218.0, "step": 4015 }, { "epoch": 0.3333609752052409, "grad_norm": 1.0363256931304932, "learning_rate": 8.431987929554632e-05, "loss": 1.3114, "mean_token_accuracy": 0.6710471659898758, "num_tokens": 131652058.0, "step": 4020 }, { "epoch": 0.33377560328385436, "grad_norm": 1.0140748023986816, "learning_rate": 8.426721622824035e-05, "loss": 1.2832, "mean_token_accuracy": 0.6753176897764206, "num_tokens": 131815898.0, "step": 4025 }, { "epoch": 0.3341902313624679, "grad_norm": 0.9625388979911804, "learning_rate": 8.421448137838186e-05, "loss": 1.204, "mean_token_accuracy": 0.6887280084192753, "num_tokens": 131979738.0, "step": 4030 }, { "epoch": 0.33460485944108137, "grad_norm": 0.9752932190895081, "learning_rate": 8.416167485643923e-05, "loss": 1.3647, "mean_token_accuracy": 0.6627566017210483, "num_tokens": 132143578.0, "step": 4035 }, { "epoch": 0.33501948751969485, "grad_norm": 0.9496031999588013, "learning_rate": 8.410879677303087e-05, "loss": 1.3112, "mean_token_accuracy": 0.6705584079027176, "num_tokens": 132307418.0, "step": 4040 }, { "epoch": 0.3354341155983083, "grad_norm": 0.9461378455162048, "learning_rate": 8.405584723892521e-05, "loss": 1.1767, "mean_token_accuracy": 0.6935592070221901, "num_tokens": 132471091.0, "step": 4045 }, { "epoch": 0.3358487436769218, "grad_norm": 0.9161537885665894, "learning_rate": 8.400282636504027e-05, "loss": 1.3451, "mean_token_accuracy": 0.667497555911541, "num_tokens": 132634931.0, "step": 4050 }, { "epoch": 0.3362633717555353, "grad_norm": 0.9966378808021545, "learning_rate": 8.394973426244352e-05, "loss": 1.3231, "mean_token_accuracy": 0.6694892451167107, "num_tokens": 132798771.0, "step": 4055 }, { "epoch": 0.33667799983414876, "grad_norm": 0.9339237809181213, "learning_rate": 8.38965710423517e-05, "loss": 1.2202, "mean_token_accuracy": 0.6860826000571251, "num_tokens": 132962611.0, "step": 4060 }, { "epoch": 0.33709262791276223, "grad_norm": 0.9363735318183899, "learning_rate": 8.384333681613044e-05, "loss": 1.2093, "mean_token_accuracy": 0.68759775608778, "num_tokens": 133126451.0, "step": 4065 }, { "epoch": 0.3375072559913757, "grad_norm": 0.9587835073471069, "learning_rate": 8.37900316952942e-05, "loss": 1.254, "mean_token_accuracy": 0.6832692757248878, "num_tokens": 133290113.0, "step": 4070 }, { "epoch": 0.33792188406998924, "grad_norm": 0.9150619506835938, "learning_rate": 8.373665579150587e-05, "loss": 1.3307, "mean_token_accuracy": 0.6668804973363877, "num_tokens": 133453953.0, "step": 4075 }, { "epoch": 0.3383365121486027, "grad_norm": 0.99213707447052, "learning_rate": 8.368320921657666e-05, "loss": 1.2631, "mean_token_accuracy": 0.6791481450200081, "num_tokens": 133617318.0, "step": 4080 }, { "epoch": 0.3387511402272162, "grad_norm": 0.9785225987434387, "learning_rate": 8.362969208246582e-05, "loss": 1.271, "mean_token_accuracy": 0.67991201877594, "num_tokens": 133781158.0, "step": 4085 }, { "epoch": 0.3391657683058297, "grad_norm": 0.8981971740722656, "learning_rate": 8.357610450128042e-05, "loss": 1.3013, "mean_token_accuracy": 0.673093843460083, "num_tokens": 133944998.0, "step": 4090 }, { "epoch": 0.33958039638444315, "grad_norm": 0.930395245552063, "learning_rate": 8.352244658527504e-05, "loss": 1.2701, "mean_token_accuracy": 0.6777633026242256, "num_tokens": 134108425.0, "step": 4095 }, { "epoch": 0.3399950244630566, "grad_norm": 0.9164111018180847, "learning_rate": 8.346871844685167e-05, "loss": 1.226, "mean_token_accuracy": 0.6855266347527504, "num_tokens": 134272265.0, "step": 4100 }, { "epoch": 0.3404096525416701, "grad_norm": 0.9662408828735352, "learning_rate": 8.341492019855934e-05, "loss": 1.3584, "mean_token_accuracy": 0.6646721988916398, "num_tokens": 134436081.0, "step": 4105 }, { "epoch": 0.3408242806202836, "grad_norm": 0.9960159063339233, "learning_rate": 8.3361051953094e-05, "loss": 1.3134, "mean_token_accuracy": 0.6722201898694038, "num_tokens": 134599921.0, "step": 4110 }, { "epoch": 0.3412389086988971, "grad_norm": 0.8849697113037109, "learning_rate": 8.330711382329817e-05, "loss": 1.2717, "mean_token_accuracy": 0.6770100191235542, "num_tokens": 134763761.0, "step": 4115 }, { "epoch": 0.3416535367775106, "grad_norm": 0.9529863595962524, "learning_rate": 8.325310592216082e-05, "loss": 1.2397, "mean_token_accuracy": 0.6822275161743164, "num_tokens": 134927601.0, "step": 4120 }, { "epoch": 0.34206816485612407, "grad_norm": 0.9730038642883301, "learning_rate": 8.319902836281706e-05, "loss": 1.346, "mean_token_accuracy": 0.6663184270262719, "num_tokens": 135091441.0, "step": 4125 }, { "epoch": 0.34248279293473755, "grad_norm": 0.9488906860351562, "learning_rate": 8.31448812585479e-05, "loss": 1.289, "mean_token_accuracy": 0.6779081113636494, "num_tokens": 135255281.0, "step": 4130 }, { "epoch": 0.342897421013351, "grad_norm": 1.0032751560211182, "learning_rate": 8.309066472278004e-05, "loss": 1.3417, "mean_token_accuracy": 0.6674425736069679, "num_tokens": 135419121.0, "step": 4135 }, { "epoch": 0.3433120490919645, "grad_norm": 0.99687659740448, "learning_rate": 8.303637886908562e-05, "loss": 1.2785, "mean_token_accuracy": 0.6735092848539352, "num_tokens": 135582961.0, "step": 4140 }, { "epoch": 0.343726677170578, "grad_norm": 0.9653187990188599, "learning_rate": 8.2982023811182e-05, "loss": 1.3828, "mean_token_accuracy": 0.6618096277117729, "num_tokens": 135746801.0, "step": 4145 }, { "epoch": 0.34414130524919145, "grad_norm": 0.9359713196754456, "learning_rate": 8.292759966293152e-05, "loss": 1.2622, "mean_token_accuracy": 0.6757636874914169, "num_tokens": 135910641.0, "step": 4150 }, { "epoch": 0.344555933327805, "grad_norm": 0.9051123857498169, "learning_rate": 8.287310653834121e-05, "loss": 1.2661, "mean_token_accuracy": 0.6835622668266297, "num_tokens": 136073698.0, "step": 4155 }, { "epoch": 0.34497056140641846, "grad_norm": 0.9394945502281189, "learning_rate": 8.281854455156262e-05, "loss": 1.3008, "mean_token_accuracy": 0.672403471171856, "num_tokens": 136237538.0, "step": 4160 }, { "epoch": 0.34538518948503194, "grad_norm": 0.9870340824127197, "learning_rate": 8.276391381689152e-05, "loss": 1.2776, "mean_token_accuracy": 0.6784396395087242, "num_tokens": 136401378.0, "step": 4165 }, { "epoch": 0.3457998175636454, "grad_norm": 0.9649732112884521, "learning_rate": 8.270921444876775e-05, "loss": 1.323, "mean_token_accuracy": 0.6687744408845901, "num_tokens": 136565218.0, "step": 4170 }, { "epoch": 0.3462144456422589, "grad_norm": 0.9824701547622681, "learning_rate": 8.26544465617749e-05, "loss": 1.3662, "mean_token_accuracy": 0.6621753796935081, "num_tokens": 136728291.0, "step": 4175 }, { "epoch": 0.34662907372087237, "grad_norm": 0.9233903288841248, "learning_rate": 8.259961027064003e-05, "loss": 1.2652, "mean_token_accuracy": 0.6777920290827751, "num_tokens": 136892131.0, "step": 4180 }, { "epoch": 0.34704370179948585, "grad_norm": 0.8987513780593872, "learning_rate": 8.254470569023359e-05, "loss": 1.3039, "mean_token_accuracy": 0.6722690597176552, "num_tokens": 137055971.0, "step": 4185 }, { "epoch": 0.3474583298780993, "grad_norm": 1.0128217935562134, "learning_rate": 8.2489732935569e-05, "loss": 1.3247, "mean_token_accuracy": 0.6686278089880944, "num_tokens": 137219811.0, "step": 4190 }, { "epoch": 0.3478729579567128, "grad_norm": 0.9634391069412231, "learning_rate": 8.243469212180254e-05, "loss": 1.2436, "mean_token_accuracy": 0.6825207725167275, "num_tokens": 137383651.0, "step": 4195 }, { "epoch": 0.34828758603532634, "grad_norm": 0.9329254031181335, "learning_rate": 8.237958336423305e-05, "loss": 1.2822, "mean_token_accuracy": 0.6733443275094032, "num_tokens": 137547491.0, "step": 4200 }, { "epoch": 0.3487022141139398, "grad_norm": 0.9092962741851807, "learning_rate": 8.232440677830168e-05, "loss": 1.3034, "mean_token_accuracy": 0.6714442819356918, "num_tokens": 137711331.0, "step": 4205 }, { "epoch": 0.3491168421925533, "grad_norm": 1.0242863893508911, "learning_rate": 8.22691624795917e-05, "loss": 1.3614, "mean_token_accuracy": 0.663538607954979, "num_tokens": 137875171.0, "step": 4210 }, { "epoch": 0.34953147027116677, "grad_norm": 2.263258934020996, "learning_rate": 8.221385058382818e-05, "loss": 1.3176, "mean_token_accuracy": 0.6717314258217811, "num_tokens": 138039011.0, "step": 4215 }, { "epoch": 0.34994609834978024, "grad_norm": 0.9587223529815674, "learning_rate": 8.215847120687783e-05, "loss": 1.3566, "mean_token_accuracy": 0.6651515141129494, "num_tokens": 138202851.0, "step": 4220 }, { "epoch": 0.3503607264283937, "grad_norm": 0.9732730388641357, "learning_rate": 8.210302446474869e-05, "loss": 1.2507, "mean_token_accuracy": 0.6835585042834282, "num_tokens": 138366064.0, "step": 4225 }, { "epoch": 0.3507753545070072, "grad_norm": 0.9751377701759338, "learning_rate": 8.204751047358993e-05, "loss": 1.3179, "mean_token_accuracy": 0.6698497042059899, "num_tokens": 138529904.0, "step": 4230 }, { "epoch": 0.3511899825856207, "grad_norm": 0.9684930443763733, "learning_rate": 8.199192934969163e-05, "loss": 1.2834, "mean_token_accuracy": 0.6742214113473892, "num_tokens": 138693733.0, "step": 4235 }, { "epoch": 0.3516046106642342, "grad_norm": 0.9790882468223572, "learning_rate": 8.19362812094844e-05, "loss": 1.3213, "mean_token_accuracy": 0.6715420380234718, "num_tokens": 138857573.0, "step": 4240 }, { "epoch": 0.3520192387428477, "grad_norm": 0.9955540299415588, "learning_rate": 8.188056616953932e-05, "loss": 1.3012, "mean_token_accuracy": 0.6734115317463875, "num_tokens": 139021413.0, "step": 4245 }, { "epoch": 0.35243386682146116, "grad_norm": 0.9717499613761902, "learning_rate": 8.18247843465676e-05, "loss": 1.2503, "mean_token_accuracy": 0.6813660755753517, "num_tokens": 139185253.0, "step": 4250 }, { "epoch": 0.35284849490007464, "grad_norm": 0.985129177570343, "learning_rate": 8.176893585742031e-05, "loss": 1.2865, "mean_token_accuracy": 0.6772849515080452, "num_tokens": 139349093.0, "step": 4255 }, { "epoch": 0.3532631229786881, "grad_norm": 0.9623016715049744, "learning_rate": 8.171302081908819e-05, "loss": 1.2734, "mean_token_accuracy": 0.6783895179629326, "num_tokens": 139512598.0, "step": 4260 }, { "epoch": 0.3536777510573016, "grad_norm": 0.9025546312332153, "learning_rate": 8.165703934870142e-05, "loss": 1.2964, "mean_token_accuracy": 0.674847262352705, "num_tokens": 139676438.0, "step": 4265 }, { "epoch": 0.35409237913591507, "grad_norm": 0.9645045399665833, "learning_rate": 8.160099156352929e-05, "loss": 1.2387, "mean_token_accuracy": 0.6831665381789207, "num_tokens": 139839810.0, "step": 4270 }, { "epoch": 0.35450700721452855, "grad_norm": 0.9419810175895691, "learning_rate": 8.154487758098003e-05, "loss": 1.2537, "mean_token_accuracy": 0.6783122837543487, "num_tokens": 140003379.0, "step": 4275 }, { "epoch": 0.3549216352931421, "grad_norm": 0.9562869668006897, "learning_rate": 8.148869751860053e-05, "loss": 1.227, "mean_token_accuracy": 0.6850745335221291, "num_tokens": 140167219.0, "step": 4280 }, { "epoch": 0.35533626337175556, "grad_norm": 1.0040013790130615, "learning_rate": 8.143245149407612e-05, "loss": 1.2568, "mean_token_accuracy": 0.682539102435112, "num_tokens": 140331059.0, "step": 4285 }, { "epoch": 0.35575089145036903, "grad_norm": 0.9768736958503723, "learning_rate": 8.13761396252303e-05, "loss": 1.3256, "mean_token_accuracy": 0.6704789817333221, "num_tokens": 140494899.0, "step": 4290 }, { "epoch": 0.3561655195289825, "grad_norm": 0.9282314777374268, "learning_rate": 8.131976203002447e-05, "loss": 1.1742, "mean_token_accuracy": 0.6952040582895279, "num_tokens": 140658739.0, "step": 4295 }, { "epoch": 0.356580147607596, "grad_norm": 1.0825862884521484, "learning_rate": 8.126331882655775e-05, "loss": 1.3217, "mean_token_accuracy": 0.6707844614982605, "num_tokens": 140822579.0, "step": 4300 }, { "epoch": 0.35699477568620946, "grad_norm": 0.962060809135437, "learning_rate": 8.12068101330667e-05, "loss": 1.2716, "mean_token_accuracy": 0.6770894408226014, "num_tokens": 140986419.0, "step": 4305 }, { "epoch": 0.35740940376482294, "grad_norm": 0.98125821352005, "learning_rate": 8.115023606792505e-05, "loss": 1.222, "mean_token_accuracy": 0.6857465758919716, "num_tokens": 141150259.0, "step": 4310 }, { "epoch": 0.3578240318434364, "grad_norm": 0.9417420029640198, "learning_rate": 8.109359674964345e-05, "loss": 1.2183, "mean_token_accuracy": 0.6839931562542916, "num_tokens": 141314099.0, "step": 4315 }, { "epoch": 0.3582386599220499, "grad_norm": 0.9744819402694702, "learning_rate": 8.103689229686929e-05, "loss": 1.2387, "mean_token_accuracy": 0.6847996070981026, "num_tokens": 141477939.0, "step": 4320 }, { "epoch": 0.35865328800066343, "grad_norm": 0.9578313231468201, "learning_rate": 8.098012282838634e-05, "loss": 1.3259, "mean_token_accuracy": 0.6701979517936707, "num_tokens": 141641779.0, "step": 4325 }, { "epoch": 0.3590679160792769, "grad_norm": 0.9743626713752747, "learning_rate": 8.092328846311464e-05, "loss": 1.2282, "mean_token_accuracy": 0.6806573778390884, "num_tokens": 141805619.0, "step": 4330 }, { "epoch": 0.3594825441578904, "grad_norm": 0.9296370148658752, "learning_rate": 8.08663893201101e-05, "loss": 1.2349, "mean_token_accuracy": 0.6843108460307121, "num_tokens": 141969459.0, "step": 4335 }, { "epoch": 0.35989717223650386, "grad_norm": 0.92148357629776, "learning_rate": 8.080942551856436e-05, "loss": 1.2489, "mean_token_accuracy": 0.685392227768898, "num_tokens": 142133299.0, "step": 4340 }, { "epoch": 0.36031180031511734, "grad_norm": 0.9848425388336182, "learning_rate": 8.075239717780455e-05, "loss": 1.2599, "mean_token_accuracy": 0.6786168172955513, "num_tokens": 142297139.0, "step": 4345 }, { "epoch": 0.3607264283937308, "grad_norm": 1.043373465538025, "learning_rate": 8.069530441729291e-05, "loss": 1.3162, "mean_token_accuracy": 0.6734550461173058, "num_tokens": 142460884.0, "step": 4350 }, { "epoch": 0.3611410564723443, "grad_norm": 0.927104651927948, "learning_rate": 8.06381473566267e-05, "loss": 1.2539, "mean_token_accuracy": 0.681940370798111, "num_tokens": 142624724.0, "step": 4355 }, { "epoch": 0.36155568455095777, "grad_norm": 0.9074141383171082, "learning_rate": 8.058092611553782e-05, "loss": 1.2578, "mean_token_accuracy": 0.6808372125029564, "num_tokens": 142788207.0, "step": 4360 }, { "epoch": 0.3619703126295713, "grad_norm": 0.979074239730835, "learning_rate": 8.052364081389263e-05, "loss": 1.3162, "mean_token_accuracy": 0.6724095776677131, "num_tokens": 142952047.0, "step": 4365 }, { "epoch": 0.3623849407081848, "grad_norm": 0.9424482583999634, "learning_rate": 8.046629157169172e-05, "loss": 1.2837, "mean_token_accuracy": 0.6769992977380752, "num_tokens": 143115013.0, "step": 4370 }, { "epoch": 0.36279956878679825, "grad_norm": 0.969394862651825, "learning_rate": 8.040887850906957e-05, "loss": 1.2637, "mean_token_accuracy": 0.6814088448882103, "num_tokens": 143278853.0, "step": 4375 }, { "epoch": 0.36321419686541173, "grad_norm": 0.9611478447914124, "learning_rate": 8.035140174629438e-05, "loss": 1.2938, "mean_token_accuracy": 0.67542155534029, "num_tokens": 143442693.0, "step": 4380 }, { "epoch": 0.3636288249440252, "grad_norm": 0.9548108577728271, "learning_rate": 8.02938614037678e-05, "loss": 1.2743, "mean_token_accuracy": 0.6769916892051697, "num_tokens": 143606533.0, "step": 4385 }, { "epoch": 0.3640434530226387, "grad_norm": 0.9090794920921326, "learning_rate": 8.023625760202463e-05, "loss": 1.2429, "mean_token_accuracy": 0.6857918426394463, "num_tokens": 143770118.0, "step": 4390 }, { "epoch": 0.36445808110125216, "grad_norm": 0.881459653377533, "learning_rate": 8.01785904617326e-05, "loss": 1.2454, "mean_token_accuracy": 0.6801380708813667, "num_tokens": 143933958.0, "step": 4395 }, { "epoch": 0.36487270917986564, "grad_norm": 0.9274210929870605, "learning_rate": 8.012086010369218e-05, "loss": 1.2423, "mean_token_accuracy": 0.6870169594883919, "num_tokens": 144096876.0, "step": 4400 }, { "epoch": 0.3652873372584791, "grad_norm": 0.9701513648033142, "learning_rate": 8.00630666488362e-05, "loss": 1.1798, "mean_token_accuracy": 0.6948985800147056, "num_tokens": 144260716.0, "step": 4405 }, { "epoch": 0.36570196533709265, "grad_norm": 0.9466547966003418, "learning_rate": 8.000521021822972e-05, "loss": 1.2971, "mean_token_accuracy": 0.6743035256862641, "num_tokens": 144424556.0, "step": 4410 }, { "epoch": 0.3661165934157061, "grad_norm": 0.9436784982681274, "learning_rate": 7.994729093306968e-05, "loss": 1.2801, "mean_token_accuracy": 0.6743096321821213, "num_tokens": 144588396.0, "step": 4415 }, { "epoch": 0.3665312214943196, "grad_norm": 0.9617013931274414, "learning_rate": 7.98893089146847e-05, "loss": 1.2311, "mean_token_accuracy": 0.6872229784727096, "num_tokens": 144751878.0, "step": 4420 }, { "epoch": 0.3669458495729331, "grad_norm": 1.0159647464752197, "learning_rate": 7.983126428453482e-05, "loss": 1.2889, "mean_token_accuracy": 0.6737414434552192, "num_tokens": 144915718.0, "step": 4425 }, { "epoch": 0.36736047765154656, "grad_norm": 0.9690437912940979, "learning_rate": 7.977315716421125e-05, "loss": 1.271, "mean_token_accuracy": 0.6776820629835129, "num_tokens": 145079558.0, "step": 4430 }, { "epoch": 0.36777510573016003, "grad_norm": 0.9304080009460449, "learning_rate": 7.971498767543604e-05, "loss": 1.2298, "mean_token_accuracy": 0.684420819580555, "num_tokens": 145243398.0, "step": 4435 }, { "epoch": 0.3681897338087735, "grad_norm": 0.8567568063735962, "learning_rate": 7.965675594006198e-05, "loss": 1.2391, "mean_token_accuracy": 0.6845796659588814, "num_tokens": 145407238.0, "step": 4440 }, { "epoch": 0.368604361887387, "grad_norm": 1.0175611972808838, "learning_rate": 7.959846208007221e-05, "loss": 1.2621, "mean_token_accuracy": 0.6806634843349457, "num_tokens": 145571078.0, "step": 4445 }, { "epoch": 0.3690189899660005, "grad_norm": 1.036781668663025, "learning_rate": 7.954010621758e-05, "loss": 1.2196, "mean_token_accuracy": 0.687646621465683, "num_tokens": 145734918.0, "step": 4450 }, { "epoch": 0.369433618044614, "grad_norm": 0.9784215688705444, "learning_rate": 7.948168847482846e-05, "loss": 1.2364, "mean_token_accuracy": 0.6837732195854187, "num_tokens": 145898758.0, "step": 4455 }, { "epoch": 0.3698482461232275, "grad_norm": 0.9979178309440613, "learning_rate": 7.942320897419044e-05, "loss": 1.2061, "mean_token_accuracy": 0.6896566465497017, "num_tokens": 146062598.0, "step": 4460 }, { "epoch": 0.37026287420184095, "grad_norm": 0.9144283533096313, "learning_rate": 7.936466783816808e-05, "loss": 1.2689, "mean_token_accuracy": 0.6753910094499588, "num_tokens": 146226438.0, "step": 4465 }, { "epoch": 0.37067750228045443, "grad_norm": 0.8742868304252625, "learning_rate": 7.930606518939261e-05, "loss": 1.2428, "mean_token_accuracy": 0.6869623616337777, "num_tokens": 146390278.0, "step": 4470 }, { "epoch": 0.3710921303590679, "grad_norm": 0.9299134612083435, "learning_rate": 7.924740115062419e-05, "loss": 1.2078, "mean_token_accuracy": 0.6866507828235626, "num_tokens": 146554118.0, "step": 4475 }, { "epoch": 0.3715067584376814, "grad_norm": 0.9693060517311096, "learning_rate": 7.918867584475154e-05, "loss": 1.2774, "mean_token_accuracy": 0.6766065344214439, "num_tokens": 146716945.0, "step": 4480 }, { "epoch": 0.37192138651629486, "grad_norm": 0.9707568287849426, "learning_rate": 7.912988939479174e-05, "loss": 1.2349, "mean_token_accuracy": 0.6863880708813668, "num_tokens": 146880785.0, "step": 4485 }, { "epoch": 0.3723360145949084, "grad_norm": 0.9991249442100525, "learning_rate": 7.90710419238899e-05, "loss": 1.3343, "mean_token_accuracy": 0.6660679325461387, "num_tokens": 147044625.0, "step": 4490 }, { "epoch": 0.37275064267352187, "grad_norm": 0.9615370631217957, "learning_rate": 7.901213355531901e-05, "loss": 1.3219, "mean_token_accuracy": 0.6711326971650123, "num_tokens": 147208465.0, "step": 4495 }, { "epoch": 0.37316527075213535, "grad_norm": 0.9403915405273438, "learning_rate": 7.895316441247962e-05, "loss": 1.2407, "mean_token_accuracy": 0.6798142716288567, "num_tokens": 147372305.0, "step": 4500 }, { "epoch": 0.3735798988307488, "grad_norm": 1.0085158348083496, "learning_rate": 7.889413461889957e-05, "loss": 1.2442, "mean_token_accuracy": 0.6817213639616966, "num_tokens": 147535888.0, "step": 4505 }, { "epoch": 0.3739945269093623, "grad_norm": 0.9304342865943909, "learning_rate": 7.883504429823377e-05, "loss": 1.2535, "mean_token_accuracy": 0.6781647145748139, "num_tokens": 147699728.0, "step": 4510 }, { "epoch": 0.3744091549879758, "grad_norm": 0.9755577445030212, "learning_rate": 7.877589357426392e-05, "loss": 1.3527, "mean_token_accuracy": 0.666141252219677, "num_tokens": 147863568.0, "step": 4515 }, { "epoch": 0.37482378306658926, "grad_norm": 0.888272762298584, "learning_rate": 7.871668257089822e-05, "loss": 1.2179, "mean_token_accuracy": 0.6887952148914337, "num_tokens": 148027408.0, "step": 4520 }, { "epoch": 0.37523841114520273, "grad_norm": 0.8954936861991882, "learning_rate": 7.86574114121712e-05, "loss": 1.2342, "mean_token_accuracy": 0.6860153958201408, "num_tokens": 148191248.0, "step": 4525 }, { "epoch": 0.3756530392238162, "grad_norm": 0.9565839767456055, "learning_rate": 7.859808022224335e-05, "loss": 1.2451, "mean_token_accuracy": 0.6833516657352448, "num_tokens": 148355088.0, "step": 4530 }, { "epoch": 0.37606766730242974, "grad_norm": 0.9371515512466431, "learning_rate": 7.853868912540095e-05, "loss": 1.2363, "mean_token_accuracy": 0.684695751965046, "num_tokens": 148518928.0, "step": 4535 }, { "epoch": 0.3764822953810432, "grad_norm": 0.9867259860038757, "learning_rate": 7.847923824605572e-05, "loss": 1.3794, "mean_token_accuracy": 0.6623289346694946, "num_tokens": 148682768.0, "step": 4540 }, { "epoch": 0.3768969234596567, "grad_norm": 0.9817521572113037, "learning_rate": 7.841972770874469e-05, "loss": 1.2832, "mean_token_accuracy": 0.6744990229606629, "num_tokens": 148846608.0, "step": 4545 }, { "epoch": 0.3773115515382702, "grad_norm": 0.9609804153442383, "learning_rate": 7.836015763812978e-05, "loss": 1.3478, "mean_token_accuracy": 0.6655303075909614, "num_tokens": 149010448.0, "step": 4550 }, { "epoch": 0.37772617961688365, "grad_norm": 0.9168370962142944, "learning_rate": 7.830052815899769e-05, "loss": 1.4083, "mean_token_accuracy": 0.6589259445667267, "num_tokens": 149174288.0, "step": 4555 }, { "epoch": 0.3781408076954971, "grad_norm": 0.9401752352714539, "learning_rate": 7.824083939625953e-05, "loss": 1.3108, "mean_token_accuracy": 0.6691959947347641, "num_tokens": 149338128.0, "step": 4560 }, { "epoch": 0.3785554357741106, "grad_norm": 0.9858892560005188, "learning_rate": 7.818109147495057e-05, "loss": 1.2495, "mean_token_accuracy": 0.6821542009711266, "num_tokens": 149501968.0, "step": 4565 }, { "epoch": 0.3789700638527241, "grad_norm": 0.933495283126831, "learning_rate": 7.812128452023008e-05, "loss": 1.2661, "mean_token_accuracy": 0.6834799602627755, "num_tokens": 149665808.0, "step": 4570 }, { "epoch": 0.3793846919313376, "grad_norm": 1.671836495399475, "learning_rate": 7.806141865738092e-05, "loss": 1.2599, "mean_token_accuracy": 0.6817564889788628, "num_tokens": 149828698.0, "step": 4575 }, { "epoch": 0.3797993200099511, "grad_norm": 0.8905841708183289, "learning_rate": 7.80014940118094e-05, "loss": 1.2838, "mean_token_accuracy": 0.6760935962200165, "num_tokens": 149992538.0, "step": 4580 }, { "epoch": 0.38021394808856457, "grad_norm": 0.8741480112075806, "learning_rate": 7.794151070904492e-05, "loss": 1.2684, "mean_token_accuracy": 0.6797715067863465, "num_tokens": 150156378.0, "step": 4585 }, { "epoch": 0.38062857616717805, "grad_norm": 0.9679931998252869, "learning_rate": 7.788146887473984e-05, "loss": 1.2268, "mean_token_accuracy": 0.6868401765823364, "num_tokens": 150320218.0, "step": 4590 }, { "epoch": 0.3810432042457915, "grad_norm": 1.0136057138442993, "learning_rate": 7.7821368634669e-05, "loss": 1.264, "mean_token_accuracy": 0.6758919849991798, "num_tokens": 150484058.0, "step": 4595 }, { "epoch": 0.381457832324405, "grad_norm": 0.8925061821937561, "learning_rate": 7.77612101147297e-05, "loss": 1.2511, "mean_token_accuracy": 0.6838954016566277, "num_tokens": 150647898.0, "step": 4600 }, { "epoch": 0.3818724604030185, "grad_norm": 0.9632482528686523, "learning_rate": 7.770099344094126e-05, "loss": 1.2628, "mean_token_accuracy": 0.67837243527174, "num_tokens": 150811738.0, "step": 4605 }, { "epoch": 0.38228708848163195, "grad_norm": 0.9291356205940247, "learning_rate": 7.764071873944488e-05, "loss": 1.2562, "mean_token_accuracy": 0.6821419820189476, "num_tokens": 150975578.0, "step": 4610 }, { "epoch": 0.3827017165602455, "grad_norm": 0.9034490585327148, "learning_rate": 7.758038613650325e-05, "loss": 1.2158, "mean_token_accuracy": 0.6865102618932724, "num_tokens": 151139418.0, "step": 4615 }, { "epoch": 0.38311634463885896, "grad_norm": 0.9467730522155762, "learning_rate": 7.75199957585004e-05, "loss": 1.3079, "mean_token_accuracy": 0.6715264797210694, "num_tokens": 151302594.0, "step": 4620 }, { "epoch": 0.38353097271747244, "grad_norm": 0.9477065205574036, "learning_rate": 7.745954773194135e-05, "loss": 1.2109, "mean_token_accuracy": 0.6921913221478462, "num_tokens": 151465897.0, "step": 4625 }, { "epoch": 0.3839456007960859, "grad_norm": 0.9852918982505798, "learning_rate": 7.739904218345192e-05, "loss": 1.2157, "mean_token_accuracy": 0.6889140188694001, "num_tokens": 151629259.0, "step": 4630 }, { "epoch": 0.3843602288746994, "grad_norm": 0.9387479424476624, "learning_rate": 7.733847923977839e-05, "loss": 1.3291, "mean_token_accuracy": 0.6675158843398095, "num_tokens": 151793099.0, "step": 4635 }, { "epoch": 0.38477485695331287, "grad_norm": 0.9405195713043213, "learning_rate": 7.727785902778728e-05, "loss": 1.2637, "mean_token_accuracy": 0.6774315729737281, "num_tokens": 151956939.0, "step": 4640 }, { "epoch": 0.38518948503192635, "grad_norm": 0.9244207143783569, "learning_rate": 7.72171816744651e-05, "loss": 1.2571, "mean_token_accuracy": 0.6825879812240601, "num_tokens": 152120779.0, "step": 4645 }, { "epoch": 0.3856041131105398, "grad_norm": 0.9538646936416626, "learning_rate": 7.715644730691802e-05, "loss": 1.2506, "mean_token_accuracy": 0.6821515664458275, "num_tokens": 152284101.0, "step": 4650 }, { "epoch": 0.3860187411891533, "grad_norm": 0.9201942086219788, "learning_rate": 7.709565605237168e-05, "loss": 1.2071, "mean_token_accuracy": 0.6904081106185913, "num_tokens": 152447941.0, "step": 4655 }, { "epoch": 0.38643336926776684, "grad_norm": 1.0343220233917236, "learning_rate": 7.703480803817087e-05, "loss": 1.2443, "mean_token_accuracy": 0.6835288360714913, "num_tokens": 152611781.0, "step": 4660 }, { "epoch": 0.3868479973463803, "grad_norm": 0.9714875221252441, "learning_rate": 7.697390339177925e-05, "loss": 1.2861, "mean_token_accuracy": 0.6763013213872909, "num_tokens": 152775621.0, "step": 4665 }, { "epoch": 0.3872626254249938, "grad_norm": 0.9571571350097656, "learning_rate": 7.691294224077919e-05, "loss": 1.2557, "mean_token_accuracy": 0.6802358254790306, "num_tokens": 152939461.0, "step": 4670 }, { "epoch": 0.38767725350360727, "grad_norm": 0.9107460379600525, "learning_rate": 7.685192471287134e-05, "loss": 1.2313, "mean_token_accuracy": 0.6853433534502983, "num_tokens": 153103301.0, "step": 4675 }, { "epoch": 0.38809188158222074, "grad_norm": 0.8951613306999207, "learning_rate": 7.679085093587449e-05, "loss": 1.3226, "mean_token_accuracy": 0.6727455973625183, "num_tokens": 153267141.0, "step": 4680 }, { "epoch": 0.3885065096608342, "grad_norm": 1.0090018510818481, "learning_rate": 7.672972103772524e-05, "loss": 1.2378, "mean_token_accuracy": 0.689778833091259, "num_tokens": 153430981.0, "step": 4685 }, { "epoch": 0.3889211377394477, "grad_norm": 0.94710773229599, "learning_rate": 7.666853514647781e-05, "loss": 1.2251, "mean_token_accuracy": 0.6845307916402816, "num_tokens": 153594821.0, "step": 4690 }, { "epoch": 0.3893357658180612, "grad_norm": 0.9767709970474243, "learning_rate": 7.660729339030361e-05, "loss": 1.234, "mean_token_accuracy": 0.6856488302350044, "num_tokens": 153758661.0, "step": 4695 }, { "epoch": 0.3897503938966747, "grad_norm": 1.0423895120620728, "learning_rate": 7.654599589749119e-05, "loss": 1.2757, "mean_token_accuracy": 0.6767481967806817, "num_tokens": 153921750.0, "step": 4700 }, { "epoch": 0.3901650219752882, "grad_norm": 0.9277583956718445, "learning_rate": 7.648464279644575e-05, "loss": 1.2619, "mean_token_accuracy": 0.6783602133393287, "num_tokens": 154085590.0, "step": 4705 }, { "epoch": 0.39057965005390166, "grad_norm": 0.8938004374504089, "learning_rate": 7.642323421568906e-05, "loss": 1.1811, "mean_token_accuracy": 0.6952956974506378, "num_tokens": 154249430.0, "step": 4710 }, { "epoch": 0.39099427813251514, "grad_norm": 0.9139187335968018, "learning_rate": 7.636177028385909e-05, "loss": 1.2321, "mean_token_accuracy": 0.6850405231118202, "num_tokens": 154412883.0, "step": 4715 }, { "epoch": 0.3914089062111286, "grad_norm": 0.9672526121139526, "learning_rate": 7.63002511297097e-05, "loss": 1.2406, "mean_token_accuracy": 0.6823204308748245, "num_tokens": 154575713.0, "step": 4720 }, { "epoch": 0.3918235342897421, "grad_norm": 0.9284092783927917, "learning_rate": 7.623867688211053e-05, "loss": 1.1583, "mean_token_accuracy": 0.6993890523910522, "num_tokens": 154739553.0, "step": 4725 }, { "epoch": 0.39223816236835557, "grad_norm": 0.920409083366394, "learning_rate": 7.617704767004653e-05, "loss": 1.2206, "mean_token_accuracy": 0.6864186227321625, "num_tokens": 154903393.0, "step": 4730 }, { "epoch": 0.39265279044696905, "grad_norm": 0.9640065431594849, "learning_rate": 7.611536362261783e-05, "loss": 1.242, "mean_token_accuracy": 0.6832172498106956, "num_tokens": 155067233.0, "step": 4735 }, { "epoch": 0.3930674185255826, "grad_norm": 0.9314320683479309, "learning_rate": 7.605362486903946e-05, "loss": 1.2077, "mean_token_accuracy": 0.6876893937587738, "num_tokens": 155231073.0, "step": 4740 }, { "epoch": 0.39348204660419606, "grad_norm": 0.9220605492591858, "learning_rate": 7.5991831538641e-05, "loss": 1.2445, "mean_token_accuracy": 0.6840542554855347, "num_tokens": 155394913.0, "step": 4745 }, { "epoch": 0.39389667468280953, "grad_norm": 0.909056544303894, "learning_rate": 7.59299837608664e-05, "loss": 1.2791, "mean_token_accuracy": 0.6786107078194619, "num_tokens": 155558753.0, "step": 4750 }, { "epoch": 0.394311302761423, "grad_norm": 0.9505742192268372, "learning_rate": 7.586808166527361e-05, "loss": 1.241, "mean_token_accuracy": 0.6831500500440597, "num_tokens": 155722593.0, "step": 4755 }, { "epoch": 0.3947259308400365, "grad_norm": 0.8974282145500183, "learning_rate": 7.58061253815344e-05, "loss": 1.2235, "mean_token_accuracy": 0.6804130047559738, "num_tokens": 155886433.0, "step": 4760 }, { "epoch": 0.39514055891864996, "grad_norm": 0.9585201740264893, "learning_rate": 7.574411503943406e-05, "loss": 1.3285, "mean_token_accuracy": 0.666739983856678, "num_tokens": 156050273.0, "step": 4765 }, { "epoch": 0.39555518699726344, "grad_norm": 0.9355363845825195, "learning_rate": 7.568205076887109e-05, "loss": 1.2458, "mean_token_accuracy": 0.6835899338126182, "num_tokens": 156214113.0, "step": 4770 }, { "epoch": 0.3959698150758769, "grad_norm": 0.9360225200653076, "learning_rate": 7.561993269985703e-05, "loss": 1.3031, "mean_token_accuracy": 0.6800586521625519, "num_tokens": 156377953.0, "step": 4775 }, { "epoch": 0.3963844431544904, "grad_norm": 2.4573659896850586, "learning_rate": 7.555776096251599e-05, "loss": 1.3057, "mean_token_accuracy": 0.6769516438245773, "num_tokens": 156540933.0, "step": 4780 }, { "epoch": 0.39679907123310393, "grad_norm": 0.8595539331436157, "learning_rate": 7.549553568708462e-05, "loss": 1.2688, "mean_token_accuracy": 0.6785801604390145, "num_tokens": 156704773.0, "step": 4785 }, { "epoch": 0.3972136993117174, "grad_norm": 0.9471558928489685, "learning_rate": 7.543325700391169e-05, "loss": 1.2423, "mean_token_accuracy": 0.6851234093308449, "num_tokens": 156868613.0, "step": 4790 }, { "epoch": 0.3976283273903309, "grad_norm": 0.8847547769546509, "learning_rate": 7.537092504345781e-05, "loss": 1.1858, "mean_token_accuracy": 0.6924975574016571, "num_tokens": 157032453.0, "step": 4795 }, { "epoch": 0.39804295546894436, "grad_norm": 0.9858616590499878, "learning_rate": 7.530853993629524e-05, "loss": 1.2537, "mean_token_accuracy": 0.6855498313903808, "num_tokens": 157195089.0, "step": 4800 }, { "epoch": 0.39845758354755784, "grad_norm": 0.9740419387817383, "learning_rate": 7.524610181310752e-05, "loss": 1.2528, "mean_token_accuracy": 0.6816410079598427, "num_tokens": 157358929.0, "step": 4805 }, { "epoch": 0.3988722116261713, "grad_norm": 0.8679446578025818, "learning_rate": 7.518361080468931e-05, "loss": 1.2289, "mean_token_accuracy": 0.6904386594891548, "num_tokens": 157522769.0, "step": 4810 }, { "epoch": 0.3992868397047848, "grad_norm": 0.9794730544090271, "learning_rate": 7.512106704194602e-05, "loss": 1.3254, "mean_token_accuracy": 0.6709757059812546, "num_tokens": 157685877.0, "step": 4815 }, { "epoch": 0.39970146778339827, "grad_norm": 0.9422624707221985, "learning_rate": 7.505847065589357e-05, "loss": 1.3384, "mean_token_accuracy": 0.6665444731712341, "num_tokens": 157849717.0, "step": 4820 }, { "epoch": 0.4001160958620118, "grad_norm": 0.9462253451347351, "learning_rate": 7.499582177765811e-05, "loss": 1.2414, "mean_token_accuracy": 0.6833699867129326, "num_tokens": 158013557.0, "step": 4825 }, { "epoch": 0.4005307239406253, "grad_norm": 0.9390513300895691, "learning_rate": 7.493312053847578e-05, "loss": 1.2295, "mean_token_accuracy": 0.685948196053505, "num_tokens": 158177397.0, "step": 4830 }, { "epoch": 0.40094535201923875, "grad_norm": 0.9818820953369141, "learning_rate": 7.487036706969234e-05, "loss": 1.2479, "mean_token_accuracy": 0.6862170085310936, "num_tokens": 158341237.0, "step": 4835 }, { "epoch": 0.40135998009785223, "grad_norm": 0.9143067002296448, "learning_rate": 7.480756150276303e-05, "loss": 1.3002, "mean_token_accuracy": 0.6738724946975708, "num_tokens": 158504344.0, "step": 4840 }, { "epoch": 0.4017746081764657, "grad_norm": 0.9471181035041809, "learning_rate": 7.47447039692522e-05, "loss": 1.3073, "mean_token_accuracy": 0.6732954531908035, "num_tokens": 158668184.0, "step": 4845 }, { "epoch": 0.4021892362550792, "grad_norm": 0.9081429243087769, "learning_rate": 7.468179460083302e-05, "loss": 1.2684, "mean_token_accuracy": 0.6803519010543824, "num_tokens": 158832024.0, "step": 4850 }, { "epoch": 0.40260386433369266, "grad_norm": 0.93732088804245, "learning_rate": 7.461883352928734e-05, "loss": 1.2306, "mean_token_accuracy": 0.6859359741210938, "num_tokens": 158995864.0, "step": 4855 }, { "epoch": 0.40301849241230614, "grad_norm": 0.9196427464485168, "learning_rate": 7.455582088650521e-05, "loss": 1.2741, "mean_token_accuracy": 0.6800830900669098, "num_tokens": 159159704.0, "step": 4860 }, { "epoch": 0.40343312049091967, "grad_norm": 0.9257554411888123, "learning_rate": 7.449275680448475e-05, "loss": 1.2941, "mean_token_accuracy": 0.6768084064126014, "num_tokens": 159323544.0, "step": 4865 }, { "epoch": 0.40384774856953315, "grad_norm": 0.9372634291648865, "learning_rate": 7.442964141533187e-05, "loss": 1.2781, "mean_token_accuracy": 0.6789235323667526, "num_tokens": 159487002.0, "step": 4870 }, { "epoch": 0.4042623766481466, "grad_norm": 0.920061469078064, "learning_rate": 7.436647485125993e-05, "loss": 1.3087, "mean_token_accuracy": 0.6718169584870338, "num_tokens": 159650842.0, "step": 4875 }, { "epoch": 0.4046770047267601, "grad_norm": 0.9787533283233643, "learning_rate": 7.430325724458945e-05, "loss": 1.2569, "mean_token_accuracy": 0.6805657401680947, "num_tokens": 159814682.0, "step": 4880 }, { "epoch": 0.4050916328053736, "grad_norm": 0.9405059814453125, "learning_rate": 7.423998872774795e-05, "loss": 1.2982, "mean_token_accuracy": 0.6744195967912674, "num_tokens": 159978522.0, "step": 4885 }, { "epoch": 0.40550626088398706, "grad_norm": 0.9164022207260132, "learning_rate": 7.417666943326954e-05, "loss": 1.2315, "mean_token_accuracy": 0.6812695115804672, "num_tokens": 160142292.0, "step": 4890 }, { "epoch": 0.40592088896260053, "grad_norm": 0.896062433719635, "learning_rate": 7.411329949379473e-05, "loss": 1.2897, "mean_token_accuracy": 0.6766129016876221, "num_tokens": 160306132.0, "step": 4895 }, { "epoch": 0.406335517041214, "grad_norm": 0.9773444533348083, "learning_rate": 7.40498790420701e-05, "loss": 1.2571, "mean_token_accuracy": 0.6823374912142753, "num_tokens": 160469972.0, "step": 4900 }, { "epoch": 0.4067501451198275, "grad_norm": 0.9660437107086182, "learning_rate": 7.398640821094803e-05, "loss": 1.3766, "mean_token_accuracy": 0.6599706739187241, "num_tokens": 160633812.0, "step": 4905 }, { "epoch": 0.407164773198441, "grad_norm": 0.9986489415168762, "learning_rate": 7.39228871333865e-05, "loss": 1.2598, "mean_token_accuracy": 0.6785618305206299, "num_tokens": 160797652.0, "step": 4910 }, { "epoch": 0.4075794012770545, "grad_norm": 0.9626627564430237, "learning_rate": 7.385931594244865e-05, "loss": 1.2133, "mean_token_accuracy": 0.6895649999380111, "num_tokens": 160961492.0, "step": 4915 }, { "epoch": 0.407994029355668, "grad_norm": 0.9181817770004272, "learning_rate": 7.379569477130269e-05, "loss": 1.179, "mean_token_accuracy": 0.6955156370997428, "num_tokens": 161125332.0, "step": 4920 }, { "epoch": 0.40840865743428145, "grad_norm": 0.9096157550811768, "learning_rate": 7.373202375322144e-05, "loss": 1.2787, "mean_token_accuracy": 0.6760080680251122, "num_tokens": 161289172.0, "step": 4925 }, { "epoch": 0.40882328551289493, "grad_norm": 0.9276206493377686, "learning_rate": 7.36683030215822e-05, "loss": 1.2459, "mean_token_accuracy": 0.6854533195495606, "num_tokens": 161453012.0, "step": 4930 }, { "epoch": 0.4092379135915084, "grad_norm": 0.9503964185714722, "learning_rate": 7.360453270986642e-05, "loss": 1.2499, "mean_token_accuracy": 0.681518816947937, "num_tokens": 161616852.0, "step": 4935 }, { "epoch": 0.4096525416701219, "grad_norm": 0.943750262260437, "learning_rate": 7.354071295165936e-05, "loss": 1.2608, "mean_token_accuracy": 0.681390517950058, "num_tokens": 161780692.0, "step": 4940 }, { "epoch": 0.41006716974873536, "grad_norm": 0.9272693991661072, "learning_rate": 7.347684388064987e-05, "loss": 1.3221, "mean_token_accuracy": 0.6698435947299004, "num_tokens": 161944532.0, "step": 4945 }, { "epoch": 0.4104817978273489, "grad_norm": 0.8784942626953125, "learning_rate": 7.341292563063014e-05, "loss": 1.2676, "mean_token_accuracy": 0.6795271277427674, "num_tokens": 162108372.0, "step": 4950 }, { "epoch": 0.41089642590596237, "grad_norm": 0.922111451625824, "learning_rate": 7.334895833549533e-05, "loss": 1.1891, "mean_token_accuracy": 0.6913672983646393, "num_tokens": 162272212.0, "step": 4955 }, { "epoch": 0.41131105398457585, "grad_norm": 0.9174176454544067, "learning_rate": 7.328494212924335e-05, "loss": 1.2044, "mean_token_accuracy": 0.6894794717431069, "num_tokens": 162436052.0, "step": 4960 }, { "epoch": 0.4117256820631893, "grad_norm": 0.925289511680603, "learning_rate": 7.322087714597461e-05, "loss": 1.2223, "mean_token_accuracy": 0.6847018539905548, "num_tokens": 162599892.0, "step": 4965 }, { "epoch": 0.4121403101418028, "grad_norm": 0.9320161938667297, "learning_rate": 7.315676351989164e-05, "loss": 1.2919, "mean_token_accuracy": 0.6725500985980034, "num_tokens": 162763732.0, "step": 4970 }, { "epoch": 0.4125549382204163, "grad_norm": 0.918063759803772, "learning_rate": 7.309260138529892e-05, "loss": 1.2638, "mean_token_accuracy": 0.6777309387922287, "num_tokens": 162927572.0, "step": 4975 }, { "epoch": 0.41296956629902976, "grad_norm": 0.9324549436569214, "learning_rate": 7.302839087660251e-05, "loss": 1.2293, "mean_token_accuracy": 0.6880620747804642, "num_tokens": 163091412.0, "step": 4980 }, { "epoch": 0.41338419437764323, "grad_norm": 0.9459344148635864, "learning_rate": 7.296413212830979e-05, "loss": 1.296, "mean_token_accuracy": 0.6782135888934135, "num_tokens": 163255252.0, "step": 4985 }, { "epoch": 0.41379882245625677, "grad_norm": 0.9537709355354309, "learning_rate": 7.289982527502923e-05, "loss": 1.2983, "mean_token_accuracy": 0.6713343113660812, "num_tokens": 163419092.0, "step": 4990 }, { "epoch": 0.41421345053487024, "grad_norm": 0.9475631713867188, "learning_rate": 7.283547045147005e-05, "loss": 1.2518, "mean_token_accuracy": 0.6799449473619461, "num_tokens": 163582720.0, "step": 4995 }, { "epoch": 0.4146280786134837, "grad_norm": 0.9904707074165344, "learning_rate": 7.277106779244196e-05, "loss": 1.3101, "mean_token_accuracy": 0.6728284910321236, "num_tokens": 163746262.0, "step": 5000 }, { "epoch": 0.4150427066920972, "grad_norm": 0.931486964225769, "learning_rate": 7.270661743285489e-05, "loss": 1.3136, "mean_token_accuracy": 0.6726686611771584, "num_tokens": 163909642.0, "step": 5005 }, { "epoch": 0.4154573347707107, "grad_norm": 0.9016469717025757, "learning_rate": 7.264211950771865e-05, "loss": 1.2161, "mean_token_accuracy": 0.6878421351313591, "num_tokens": 164073482.0, "step": 5010 }, { "epoch": 0.41587196284932415, "grad_norm": 0.9519628882408142, "learning_rate": 7.257757415214275e-05, "loss": 1.2858, "mean_token_accuracy": 0.6751099690794945, "num_tokens": 164237322.0, "step": 5015 }, { "epoch": 0.4162865909279376, "grad_norm": 0.9091925621032715, "learning_rate": 7.251298150133598e-05, "loss": 1.248, "mean_token_accuracy": 0.6883980944752693, "num_tokens": 164401162.0, "step": 5020 }, { "epoch": 0.4167012190065511, "grad_norm": 0.90833979845047, "learning_rate": 7.24483416906063e-05, "loss": 1.2673, "mean_token_accuracy": 0.6829545453190804, "num_tokens": 164565002.0, "step": 5025 }, { "epoch": 0.4171158470851646, "grad_norm": 0.8844804763793945, "learning_rate": 7.238365485536038e-05, "loss": 1.161, "mean_token_accuracy": 0.6969330415129662, "num_tokens": 164728842.0, "step": 5030 }, { "epoch": 0.4175304751637781, "grad_norm": 0.9600077867507935, "learning_rate": 7.231892113110342e-05, "loss": 1.2595, "mean_token_accuracy": 0.6765945747494697, "num_tokens": 164892682.0, "step": 5035 }, { "epoch": 0.4179451032423916, "grad_norm": 0.8916535973548889, "learning_rate": 7.225414065343886e-05, "loss": 1.2524, "mean_token_accuracy": 0.6826246321201325, "num_tokens": 165056522.0, "step": 5040 }, { "epoch": 0.41835973132100507, "grad_norm": 0.9368508458137512, "learning_rate": 7.218931355806808e-05, "loss": 1.2071, "mean_token_accuracy": 0.686858506500721, "num_tokens": 165220362.0, "step": 5045 }, { "epoch": 0.41877435939961855, "grad_norm": 0.9774593710899353, "learning_rate": 7.212443998079006e-05, "loss": 1.3668, "mean_token_accuracy": 0.6624511271715164, "num_tokens": 165384202.0, "step": 5050 }, { "epoch": 0.419188987478232, "grad_norm": 0.9264464974403381, "learning_rate": 7.205952005750121e-05, "loss": 1.2484, "mean_token_accuracy": 0.678195258975029, "num_tokens": 165548042.0, "step": 5055 }, { "epoch": 0.4196036155568455, "grad_norm": 0.8728965520858765, "learning_rate": 7.199455392419502e-05, "loss": 1.2964, "mean_token_accuracy": 0.676967254281044, "num_tokens": 165711882.0, "step": 5060 }, { "epoch": 0.420018243635459, "grad_norm": 0.9150432348251343, "learning_rate": 7.192954171696173e-05, "loss": 1.2071, "mean_token_accuracy": 0.6928763419389725, "num_tokens": 165875722.0, "step": 5065 }, { "epoch": 0.42043287171407245, "grad_norm": 0.9414107203483582, "learning_rate": 7.186448357198819e-05, "loss": 1.2934, "mean_token_accuracy": 0.6748655915260315, "num_tokens": 166039562.0, "step": 5070 }, { "epoch": 0.420847499792686, "grad_norm": 0.9863566756248474, "learning_rate": 7.179937962555734e-05, "loss": 1.2448, "mean_token_accuracy": 0.6804740965366364, "num_tokens": 166203402.0, "step": 5075 }, { "epoch": 0.42126212787129946, "grad_norm": 0.9362086057662964, "learning_rate": 7.173423001404821e-05, "loss": 1.1731, "mean_token_accuracy": 0.6968719467520714, "num_tokens": 166367242.0, "step": 5080 }, { "epoch": 0.42167675594991294, "grad_norm": 0.9288221597671509, "learning_rate": 7.166903487393539e-05, "loss": 1.305, "mean_token_accuracy": 0.6745540015399456, "num_tokens": 166531082.0, "step": 5085 }, { "epoch": 0.4220913840285264, "grad_norm": 0.9393354654312134, "learning_rate": 7.160379434178888e-05, "loss": 1.282, "mean_token_accuracy": 0.6779623195528984, "num_tokens": 166693932.0, "step": 5090 }, { "epoch": 0.4225060121071399, "grad_norm": 0.8841371536254883, "learning_rate": 7.153850855427376e-05, "loss": 1.2141, "mean_token_accuracy": 0.6882575780153275, "num_tokens": 166857772.0, "step": 5095 }, { "epoch": 0.42292064018575337, "grad_norm": 0.9394116997718811, "learning_rate": 7.147317764814992e-05, "loss": 1.2814, "mean_token_accuracy": 0.6750122174620629, "num_tokens": 167021612.0, "step": 5100 }, { "epoch": 0.42333526826436685, "grad_norm": 0.8560900092124939, "learning_rate": 7.140780176027177e-05, "loss": 1.1895, "mean_token_accuracy": 0.695643937587738, "num_tokens": 167185452.0, "step": 5105 }, { "epoch": 0.4237498963429803, "grad_norm": 0.9591403603553772, "learning_rate": 7.13423810275879e-05, "loss": 1.2349, "mean_token_accuracy": 0.6861192584037781, "num_tokens": 167349292.0, "step": 5110 }, { "epoch": 0.4241645244215938, "grad_norm": 0.9461228251457214, "learning_rate": 7.127691558714091e-05, "loss": 1.3244, "mean_token_accuracy": 0.6665647983551025, "num_tokens": 167512669.0, "step": 5115 }, { "epoch": 0.42457915250020734, "grad_norm": 0.946561872959137, "learning_rate": 7.121140557606699e-05, "loss": 1.2473, "mean_token_accuracy": 0.6869655027985573, "num_tokens": 167675955.0, "step": 5120 }, { "epoch": 0.4249937805788208, "grad_norm": 0.9129251837730408, "learning_rate": 7.114585113159571e-05, "loss": 1.2625, "mean_token_accuracy": 0.6778080701828003, "num_tokens": 167839038.0, "step": 5125 }, { "epoch": 0.4254084086574343, "grad_norm": 0.9162349700927734, "learning_rate": 7.108025239104978e-05, "loss": 1.276, "mean_token_accuracy": 0.676509042084217, "num_tokens": 168002878.0, "step": 5130 }, { "epoch": 0.42582303673604777, "grad_norm": 0.9409959316253662, "learning_rate": 7.101460949184464e-05, "loss": 1.2804, "mean_token_accuracy": 0.6783541053533554, "num_tokens": 168166718.0, "step": 5135 }, { "epoch": 0.42623766481466124, "grad_norm": 0.9891248941421509, "learning_rate": 7.094892257148821e-05, "loss": 1.2819, "mean_token_accuracy": 0.6758308865129947, "num_tokens": 168330558.0, "step": 5140 }, { "epoch": 0.4266522928932747, "grad_norm": 0.899552583694458, "learning_rate": 7.088319176758069e-05, "loss": 1.1743, "mean_token_accuracy": 0.6964381694793701, "num_tokens": 168494398.0, "step": 5145 }, { "epoch": 0.4270669209718882, "grad_norm": 0.8751155138015747, "learning_rate": 7.081741721781418e-05, "loss": 1.2244, "mean_token_accuracy": 0.6883736550807953, "num_tokens": 168658238.0, "step": 5150 }, { "epoch": 0.4274815490505017, "grad_norm": 0.9431947469711304, "learning_rate": 7.07515990599724e-05, "loss": 1.2772, "mean_token_accuracy": 0.6795393422245979, "num_tokens": 168822078.0, "step": 5155 }, { "epoch": 0.4278961771291152, "grad_norm": 0.9290785789489746, "learning_rate": 7.068573743193047e-05, "loss": 1.2116, "mean_token_accuracy": 0.6866263419389724, "num_tokens": 168985918.0, "step": 5160 }, { "epoch": 0.4283108052077287, "grad_norm": 0.9355618953704834, "learning_rate": 7.061983247165447e-05, "loss": 1.2549, "mean_token_accuracy": 0.6818164244294167, "num_tokens": 169149728.0, "step": 5165 }, { "epoch": 0.42872543328634216, "grad_norm": 0.9035906195640564, "learning_rate": 7.055388431720139e-05, "loss": 1.2103, "mean_token_accuracy": 0.6902064979076385, "num_tokens": 169313568.0, "step": 5170 }, { "epoch": 0.42914006136495564, "grad_norm": 0.9600555300712585, "learning_rate": 7.048789310671859e-05, "loss": 1.281, "mean_token_accuracy": 0.6786229223012924, "num_tokens": 169477408.0, "step": 5175 }, { "epoch": 0.4295546894435691, "grad_norm": 0.9158090353012085, "learning_rate": 7.042185897844367e-05, "loss": 1.2422, "mean_token_accuracy": 0.6817509770393372, "num_tokens": 169641248.0, "step": 5180 }, { "epoch": 0.4299693175221826, "grad_norm": 0.856984555721283, "learning_rate": 7.035578207070412e-05, "loss": 1.2134, "mean_token_accuracy": 0.69148338586092, "num_tokens": 169805088.0, "step": 5185 }, { "epoch": 0.43038394560079607, "grad_norm": 0.9986769556999207, "learning_rate": 7.028966252191709e-05, "loss": 1.3264, "mean_token_accuracy": 0.6711939051747322, "num_tokens": 169968657.0, "step": 5190 }, { "epoch": 0.43079857367940955, "grad_norm": 0.9320401549339294, "learning_rate": 7.022350047058897e-05, "loss": 1.2592, "mean_token_accuracy": 0.6817143216729165, "num_tokens": 170132497.0, "step": 5195 }, { "epoch": 0.4312132017580231, "grad_norm": 0.8438582420349121, "learning_rate": 7.015729605531526e-05, "loss": 1.1353, "mean_token_accuracy": 0.7033174499869347, "num_tokens": 170296337.0, "step": 5200 }, { "epoch": 0.43162782983663656, "grad_norm": 0.9292356371879578, "learning_rate": 7.009104941478015e-05, "loss": 1.2829, "mean_token_accuracy": 0.6764112934470177, "num_tokens": 170460177.0, "step": 5205 }, { "epoch": 0.43204245791525003, "grad_norm": 0.9409948587417603, "learning_rate": 7.002476068775633e-05, "loss": 1.2898, "mean_token_accuracy": 0.6755437485873699, "num_tokens": 170624017.0, "step": 5210 }, { "epoch": 0.4324570859938635, "grad_norm": 0.9407399892807007, "learning_rate": 6.995843001310463e-05, "loss": 1.2212, "mean_token_accuracy": 0.6895344540476799, "num_tokens": 170787857.0, "step": 5215 }, { "epoch": 0.432871714072477, "grad_norm": 0.9206990599632263, "learning_rate": 6.98920575297737e-05, "loss": 1.3201, "mean_token_accuracy": 0.6725439876317978, "num_tokens": 170951697.0, "step": 5220 }, { "epoch": 0.43328634215109046, "grad_norm": 0.9319395422935486, "learning_rate": 6.982564337679986e-05, "loss": 1.2724, "mean_token_accuracy": 0.6791238963603974, "num_tokens": 171115537.0, "step": 5225 }, { "epoch": 0.43370097022970394, "grad_norm": 0.9000293016433716, "learning_rate": 6.975918769330669e-05, "loss": 1.1855, "mean_token_accuracy": 0.6975562095642089, "num_tokens": 171279377.0, "step": 5230 }, { "epoch": 0.4341155983083174, "grad_norm": 0.9152920842170715, "learning_rate": 6.969269061850474e-05, "loss": 1.2542, "mean_token_accuracy": 0.6791566848754883, "num_tokens": 171443130.0, "step": 5235 }, { "epoch": 0.4345302263869309, "grad_norm": 0.906944990158081, "learning_rate": 6.962615229169129e-05, "loss": 1.2295, "mean_token_accuracy": 0.6885508254170418, "num_tokens": 171606970.0, "step": 5240 }, { "epoch": 0.43494485446554443, "grad_norm": 0.9342389702796936, "learning_rate": 6.955957285225001e-05, "loss": 1.3235, "mean_token_accuracy": 0.670918869972229, "num_tokens": 171770810.0, "step": 5245 }, { "epoch": 0.4353594825441579, "grad_norm": 0.9459275007247925, "learning_rate": 6.949295243965073e-05, "loss": 1.2582, "mean_token_accuracy": 0.6794415950775147, "num_tokens": 171934650.0, "step": 5250 }, { "epoch": 0.4357741106227714, "grad_norm": 0.9350090026855469, "learning_rate": 6.942629119344907e-05, "loss": 1.1836, "mean_token_accuracy": 0.6913062110543251, "num_tokens": 172098490.0, "step": 5255 }, { "epoch": 0.43618873870138486, "grad_norm": 0.9977842569351196, "learning_rate": 6.935958925328622e-05, "loss": 1.279, "mean_token_accuracy": 0.6781463846564293, "num_tokens": 172262330.0, "step": 5260 }, { "epoch": 0.43660336677999834, "grad_norm": 0.9499651789665222, "learning_rate": 6.929284675888859e-05, "loss": 1.2166, "mean_token_accuracy": 0.6865652456879616, "num_tokens": 172426170.0, "step": 5265 }, { "epoch": 0.4370179948586118, "grad_norm": 0.9373361468315125, "learning_rate": 6.922606385006757e-05, "loss": 1.2732, "mean_token_accuracy": 0.6814699441194534, "num_tokens": 172590010.0, "step": 5270 }, { "epoch": 0.4374326229372253, "grad_norm": 0.9018722176551819, "learning_rate": 6.91592406667192e-05, "loss": 1.2672, "mean_token_accuracy": 0.6811365976929664, "num_tokens": 172753579.0, "step": 5275 }, { "epoch": 0.43784725101583877, "grad_norm": 0.894157350063324, "learning_rate": 6.909237734882384e-05, "loss": 1.2316, "mean_token_accuracy": 0.684879033267498, "num_tokens": 172917419.0, "step": 5280 }, { "epoch": 0.4382618790944523, "grad_norm": 0.9963103532791138, "learning_rate": 6.902547403644601e-05, "loss": 1.3413, "mean_token_accuracy": 0.6668132960796356, "num_tokens": 173081259.0, "step": 5285 }, { "epoch": 0.4386765071730658, "grad_norm": 0.9677658677101135, "learning_rate": 6.895853086973395e-05, "loss": 1.3384, "mean_token_accuracy": 0.6679078742861748, "num_tokens": 173245094.0, "step": 5290 }, { "epoch": 0.43909113525167925, "grad_norm": 0.8745771646499634, "learning_rate": 6.88915479889194e-05, "loss": 1.1847, "mean_token_accuracy": 0.691142700612545, "num_tokens": 173407960.0, "step": 5295 }, { "epoch": 0.43950576333029273, "grad_norm": 0.9091145992279053, "learning_rate": 6.882452553431728e-05, "loss": 1.2406, "mean_token_accuracy": 0.6878787890076637, "num_tokens": 173571800.0, "step": 5300 }, { "epoch": 0.4399203914089062, "grad_norm": 0.9397043585777283, "learning_rate": 6.875746364632544e-05, "loss": 1.2601, "mean_token_accuracy": 0.6831928133964539, "num_tokens": 173735640.0, "step": 5305 }, { "epoch": 0.4403350194875197, "grad_norm": 0.8953580260276794, "learning_rate": 6.86903624654243e-05, "loss": 1.2124, "mean_token_accuracy": 0.6883370012044907, "num_tokens": 173899480.0, "step": 5310 }, { "epoch": 0.44074964756613316, "grad_norm": 0.9146726727485657, "learning_rate": 6.862322213217661e-05, "loss": 1.2162, "mean_token_accuracy": 0.6873961389064789, "num_tokens": 174063320.0, "step": 5315 }, { "epoch": 0.44116427564474664, "grad_norm": 0.9711309671401978, "learning_rate": 6.855604278722716e-05, "loss": 1.2564, "mean_token_accuracy": 0.682221406698227, "num_tokens": 174227160.0, "step": 5320 }, { "epoch": 0.44157890372336017, "grad_norm": 0.9068252444267273, "learning_rate": 6.84888245713024e-05, "loss": 1.231, "mean_token_accuracy": 0.68525170981884, "num_tokens": 174391000.0, "step": 5325 }, { "epoch": 0.44199353180197365, "grad_norm": 0.9210191369056702, "learning_rate": 6.842156762521026e-05, "loss": 1.258, "mean_token_accuracy": 0.6829973116517067, "num_tokens": 174554840.0, "step": 5330 }, { "epoch": 0.4424081598805871, "grad_norm": 0.9596512913703918, "learning_rate": 6.835427208983977e-05, "loss": 1.2529, "mean_token_accuracy": 0.6870601147413253, "num_tokens": 174718680.0, "step": 5335 }, { "epoch": 0.4428227879592006, "grad_norm": 0.9190239310264587, "learning_rate": 6.828693810616083e-05, "loss": 1.2416, "mean_token_accuracy": 0.6829301044344902, "num_tokens": 174882520.0, "step": 5340 }, { "epoch": 0.4432374160378141, "grad_norm": 0.935261070728302, "learning_rate": 6.821956581522382e-05, "loss": 1.2233, "mean_token_accuracy": 0.6880627810955048, "num_tokens": 175045555.0, "step": 5345 }, { "epoch": 0.44365204411642756, "grad_norm": 0.9144822955131531, "learning_rate": 6.815215535815944e-05, "loss": 1.2118, "mean_token_accuracy": 0.6848851442337036, "num_tokens": 175209395.0, "step": 5350 }, { "epoch": 0.44406667219504103, "grad_norm": 0.9519127607345581, "learning_rate": 6.80847068761783e-05, "loss": 1.2238, "mean_token_accuracy": 0.6834555193781853, "num_tokens": 175373235.0, "step": 5355 }, { "epoch": 0.4444813002736545, "grad_norm": 0.9514979124069214, "learning_rate": 6.801722051057064e-05, "loss": 1.163, "mean_token_accuracy": 0.6971590876579284, "num_tokens": 175537075.0, "step": 5360 }, { "epoch": 0.444895928352268, "grad_norm": 0.9447032809257507, "learning_rate": 6.794969640270611e-05, "loss": 1.2814, "mean_token_accuracy": 0.6770955502986908, "num_tokens": 175700915.0, "step": 5365 }, { "epoch": 0.4453105564308815, "grad_norm": 0.9713386297225952, "learning_rate": 6.788213469403342e-05, "loss": 1.2513, "mean_token_accuracy": 0.6837243407964706, "num_tokens": 175864755.0, "step": 5370 }, { "epoch": 0.445725184509495, "grad_norm": 0.9300346970558167, "learning_rate": 6.781453552608e-05, "loss": 1.1998, "mean_token_accuracy": 0.6904753193259239, "num_tokens": 176028595.0, "step": 5375 }, { "epoch": 0.4461398125881085, "grad_norm": 0.8931049108505249, "learning_rate": 6.774689904045176e-05, "loss": 1.2284, "mean_token_accuracy": 0.6878054708242416, "num_tokens": 176192435.0, "step": 5380 }, { "epoch": 0.44655444066672195, "grad_norm": 0.9139026999473572, "learning_rate": 6.767922537883283e-05, "loss": 1.1994, "mean_token_accuracy": 0.6937866598367691, "num_tokens": 176356275.0, "step": 5385 }, { "epoch": 0.44696906874533543, "grad_norm": 0.8814029097557068, "learning_rate": 6.761151468298514e-05, "loss": 1.2196, "mean_token_accuracy": 0.6869806960225106, "num_tokens": 176520115.0, "step": 5390 }, { "epoch": 0.4473836968239489, "grad_norm": 0.8821919560432434, "learning_rate": 6.75437670947483e-05, "loss": 1.2221, "mean_token_accuracy": 0.6869073823094368, "num_tokens": 176683955.0, "step": 5395 }, { "epoch": 0.4477983249025624, "grad_norm": 0.9081880450248718, "learning_rate": 6.74759827560391e-05, "loss": 1.2343, "mean_token_accuracy": 0.6858565524220467, "num_tokens": 176847795.0, "step": 5400 }, { "epoch": 0.44821295298117586, "grad_norm": 0.9185976982116699, "learning_rate": 6.740816180885135e-05, "loss": 1.2193, "mean_token_accuracy": 0.68883186429739, "num_tokens": 177011635.0, "step": 5405 }, { "epoch": 0.4486275810597894, "grad_norm": 0.9622422456741333, "learning_rate": 6.73403043952556e-05, "loss": 1.218, "mean_token_accuracy": 0.689094577729702, "num_tokens": 177175475.0, "step": 5410 }, { "epoch": 0.44904220913840287, "grad_norm": 0.8946992754936218, "learning_rate": 6.72724106573987e-05, "loss": 1.305, "mean_token_accuracy": 0.6759164288640023, "num_tokens": 177339315.0, "step": 5415 }, { "epoch": 0.44945683721701635, "grad_norm": 0.8865119218826294, "learning_rate": 6.720448073750367e-05, "loss": 1.1819, "mean_token_accuracy": 0.6931940361857414, "num_tokens": 177503155.0, "step": 5420 }, { "epoch": 0.4498714652956298, "grad_norm": 0.9959166049957275, "learning_rate": 6.713651477786926e-05, "loss": 1.1997, "mean_token_accuracy": 0.6922165229916573, "num_tokens": 177666995.0, "step": 5425 }, { "epoch": 0.4502860933742433, "grad_norm": 0.9162701964378357, "learning_rate": 6.706851292086975e-05, "loss": 1.2072, "mean_token_accuracy": 0.6892473086714744, "num_tokens": 177830835.0, "step": 5430 }, { "epoch": 0.4507007214528568, "grad_norm": 0.9049626588821411, "learning_rate": 6.700047530895463e-05, "loss": 1.1919, "mean_token_accuracy": 0.6913673028349876, "num_tokens": 177994675.0, "step": 5435 }, { "epoch": 0.45111534953147026, "grad_norm": 0.9806783199310303, "learning_rate": 6.693240208464827e-05, "loss": 1.3308, "mean_token_accuracy": 0.6697458475828171, "num_tokens": 178158515.0, "step": 5440 }, { "epoch": 0.45152997761008373, "grad_norm": 0.9126311540603638, "learning_rate": 6.686429339054961e-05, "loss": 1.2807, "mean_token_accuracy": 0.6787044301629066, "num_tokens": 178322221.0, "step": 5445 }, { "epoch": 0.45194460568869727, "grad_norm": 0.9570632576942444, "learning_rate": 6.679614936933196e-05, "loss": 1.2483, "mean_token_accuracy": 0.6813285246491432, "num_tokens": 178485110.0, "step": 5450 }, { "epoch": 0.45235923376731074, "grad_norm": 0.9193328022956848, "learning_rate": 6.67279701637426e-05, "loss": 1.3085, "mean_token_accuracy": 0.671994136273861, "num_tokens": 178648950.0, "step": 5455 }, { "epoch": 0.4527738618459242, "grad_norm": 0.9053947925567627, "learning_rate": 6.665975591660247e-05, "loss": 1.2362, "mean_token_accuracy": 0.6855449616909027, "num_tokens": 178812790.0, "step": 5460 }, { "epoch": 0.4531884899245377, "grad_norm": 0.9453768730163574, "learning_rate": 6.659150677080598e-05, "loss": 1.24, "mean_token_accuracy": 0.6845857784152031, "num_tokens": 178976630.0, "step": 5465 }, { "epoch": 0.4536031180031512, "grad_norm": 0.9386938810348511, "learning_rate": 6.652322286932061e-05, "loss": 1.1507, "mean_token_accuracy": 0.6952223852276802, "num_tokens": 179140470.0, "step": 5470 }, { "epoch": 0.45401774608176465, "grad_norm": 0.9033142328262329, "learning_rate": 6.645490435518668e-05, "loss": 1.2315, "mean_token_accuracy": 0.685832105576992, "num_tokens": 179304310.0, "step": 5475 }, { "epoch": 0.4544323741603781, "grad_norm": 0.8971104621887207, "learning_rate": 6.638655137151695e-05, "loss": 1.1798, "mean_token_accuracy": 0.6961326941847801, "num_tokens": 179468150.0, "step": 5480 }, { "epoch": 0.4548470022389916, "grad_norm": 0.8761679530143738, "learning_rate": 6.631816406149648e-05, "loss": 1.2451, "mean_token_accuracy": 0.6850073292851449, "num_tokens": 179631990.0, "step": 5485 }, { "epoch": 0.4552616303176051, "grad_norm": 0.9151933193206787, "learning_rate": 6.624974256838215e-05, "loss": 1.2634, "mean_token_accuracy": 0.6781891547143459, "num_tokens": 179795830.0, "step": 5490 }, { "epoch": 0.4556762583962186, "grad_norm": 0.8931605815887451, "learning_rate": 6.618128703550246e-05, "loss": 1.1539, "mean_token_accuracy": 0.6978433474898338, "num_tokens": 179959670.0, "step": 5495 }, { "epoch": 0.4560908864748321, "grad_norm": 0.8990060091018677, "learning_rate": 6.61127976062573e-05, "loss": 1.2154, "mean_token_accuracy": 0.6862170100212097, "num_tokens": 180123510.0, "step": 5500 }, { "epoch": 0.45650551455344557, "grad_norm": 0.8971739411354065, "learning_rate": 6.604427442411746e-05, "loss": 1.2498, "mean_token_accuracy": 0.6818304002285004, "num_tokens": 180287350.0, "step": 5505 }, { "epoch": 0.45692014263205905, "grad_norm": 0.9555619359016418, "learning_rate": 6.597571763262449e-05, "loss": 1.2663, "mean_token_accuracy": 0.6820014685392379, "num_tokens": 180451190.0, "step": 5510 }, { "epoch": 0.4573347707106725, "grad_norm": 0.9414533972740173, "learning_rate": 6.590712737539031e-05, "loss": 1.2378, "mean_token_accuracy": 0.6874816685914993, "num_tokens": 180615030.0, "step": 5515 }, { "epoch": 0.457749398789286, "grad_norm": 0.9812831282615662, "learning_rate": 6.5838503796097e-05, "loss": 1.19, "mean_token_accuracy": 0.6898047953844071, "num_tokens": 180778332.0, "step": 5520 }, { "epoch": 0.4581640268678995, "grad_norm": 0.9169635772705078, "learning_rate": 6.576984703849639e-05, "loss": 1.2571, "mean_token_accuracy": 0.6822153061628342, "num_tokens": 180942172.0, "step": 5525 }, { "epoch": 0.45857865494651295, "grad_norm": 0.9392279982566833, "learning_rate": 6.570115724640984e-05, "loss": 1.229, "mean_token_accuracy": 0.6863025456666947, "num_tokens": 181106012.0, "step": 5530 }, { "epoch": 0.4589932830251265, "grad_norm": 0.9157460927963257, "learning_rate": 6.563243456372788e-05, "loss": 1.2381, "mean_token_accuracy": 0.6850562125444413, "num_tokens": 181269852.0, "step": 5535 }, { "epoch": 0.45940791110373996, "grad_norm": 0.9607328772544861, "learning_rate": 6.556367913441e-05, "loss": 1.2847, "mean_token_accuracy": 0.6747739523649215, "num_tokens": 181433692.0, "step": 5540 }, { "epoch": 0.45982253918235344, "grad_norm": 0.8823652863502502, "learning_rate": 6.54948911024842e-05, "loss": 1.2308, "mean_token_accuracy": 0.6844797477126121, "num_tokens": 181596791.0, "step": 5545 }, { "epoch": 0.4602371672609669, "grad_norm": 0.8790220022201538, "learning_rate": 6.542607061204683e-05, "loss": 1.2398, "mean_token_accuracy": 0.6900232210755348, "num_tokens": 181760631.0, "step": 5550 }, { "epoch": 0.4606517953395804, "grad_norm": 0.9489864110946655, "learning_rate": 6.535721780726228e-05, "loss": 1.2872, "mean_token_accuracy": 0.6787390008568763, "num_tokens": 181924471.0, "step": 5555 }, { "epoch": 0.46106642341819387, "grad_norm": 0.9242657423019409, "learning_rate": 6.528833283236249e-05, "loss": 1.2546, "mean_token_accuracy": 0.6824230194091797, "num_tokens": 182088311.0, "step": 5560 }, { "epoch": 0.46148105149680735, "grad_norm": 1.01797616481781, "learning_rate": 6.521941583164695e-05, "loss": 1.3627, "mean_token_accuracy": 0.6646261021494866, "num_tokens": 182252151.0, "step": 5565 }, { "epoch": 0.4618956795754208, "grad_norm": 0.989361584186554, "learning_rate": 6.515046694948213e-05, "loss": 1.2577, "mean_token_accuracy": 0.6792399823665619, "num_tokens": 182415991.0, "step": 5570 }, { "epoch": 0.46231030765403436, "grad_norm": 0.9516648650169373, "learning_rate": 6.508148633030132e-05, "loss": 1.2551, "mean_token_accuracy": 0.6816052556037903, "num_tokens": 182579752.0, "step": 5575 }, { "epoch": 0.46272493573264784, "grad_norm": 0.9034631252288818, "learning_rate": 6.501247411860429e-05, "loss": 1.1766, "mean_token_accuracy": 0.6908907622098923, "num_tokens": 182743592.0, "step": 5580 }, { "epoch": 0.4631395638112613, "grad_norm": 0.9333795309066772, "learning_rate": 6.494343045895702e-05, "loss": 1.2323, "mean_token_accuracy": 0.6843536138534546, "num_tokens": 182907432.0, "step": 5585 }, { "epoch": 0.4635541918898748, "grad_norm": 0.9616758227348328, "learning_rate": 6.487435549599132e-05, "loss": 1.2525, "mean_token_accuracy": 0.683321113884449, "num_tokens": 183071272.0, "step": 5590 }, { "epoch": 0.46396881996848827, "grad_norm": 0.9114391207695007, "learning_rate": 6.480524937440456e-05, "loss": 1.2472, "mean_token_accuracy": 0.6821236565709115, "num_tokens": 183235112.0, "step": 5595 }, { "epoch": 0.46438344804710174, "grad_norm": 0.9227198958396912, "learning_rate": 6.473611223895947e-05, "loss": 1.249, "mean_token_accuracy": 0.68401148468256, "num_tokens": 183398952.0, "step": 5600 }, { "epoch": 0.4647980761257152, "grad_norm": 0.9138762950897217, "learning_rate": 6.466694423448365e-05, "loss": 1.2531, "mean_token_accuracy": 0.6837206363677979, "num_tokens": 183562480.0, "step": 5605 }, { "epoch": 0.4652127042043287, "grad_norm": 0.8999396562576294, "learning_rate": 6.459774550586942e-05, "loss": 1.2405, "mean_token_accuracy": 0.6830950632691384, "num_tokens": 183726320.0, "step": 5610 }, { "epoch": 0.4656273322829422, "grad_norm": 0.9314269423484802, "learning_rate": 6.452851619807342e-05, "loss": 1.2525, "mean_token_accuracy": 0.6828629061579704, "num_tokens": 183890160.0, "step": 5615 }, { "epoch": 0.4660419603615557, "grad_norm": 0.9825212955474854, "learning_rate": 6.445925645611641e-05, "loss": 1.2568, "mean_token_accuracy": 0.6807856827974319, "num_tokens": 184054000.0, "step": 5620 }, { "epoch": 0.4664565884401692, "grad_norm": 0.9416497945785522, "learning_rate": 6.438996642508283e-05, "loss": 1.2582, "mean_token_accuracy": 0.683064517378807, "num_tokens": 184217840.0, "step": 5625 }, { "epoch": 0.46687121651878266, "grad_norm": 0.9385108947753906, "learning_rate": 6.432064625012064e-05, "loss": 1.2555, "mean_token_accuracy": 0.6771627545356751, "num_tokens": 184381680.0, "step": 5630 }, { "epoch": 0.46728584459739614, "grad_norm": 0.9490394592285156, "learning_rate": 6.425129607644089e-05, "loss": 1.2297, "mean_token_accuracy": 0.684237539768219, "num_tokens": 184545520.0, "step": 5635 }, { "epoch": 0.4677004726760096, "grad_norm": 0.9528844952583313, "learning_rate": 6.418191604931748e-05, "loss": 1.1835, "mean_token_accuracy": 0.6946603089571, "num_tokens": 184709360.0, "step": 5640 }, { "epoch": 0.4681151007546231, "grad_norm": 0.8833242654800415, "learning_rate": 6.411250631408687e-05, "loss": 1.2603, "mean_token_accuracy": 0.6811895027756691, "num_tokens": 184872732.0, "step": 5645 }, { "epoch": 0.46852972883323657, "grad_norm": 0.9160747528076172, "learning_rate": 6.404306701614773e-05, "loss": 1.2691, "mean_token_accuracy": 0.6816104598343372, "num_tokens": 185036572.0, "step": 5650 }, { "epoch": 0.46894435691185005, "grad_norm": 0.9168289303779602, "learning_rate": 6.397359830096067e-05, "loss": 1.2455, "mean_token_accuracy": 0.6794538110494613, "num_tokens": 185200412.0, "step": 5655 }, { "epoch": 0.4693589849904636, "grad_norm": 0.9233760833740234, "learning_rate": 6.390410031404792e-05, "loss": 1.2416, "mean_token_accuracy": 0.6828624308109283, "num_tokens": 185363403.0, "step": 5660 }, { "epoch": 0.46977361306907706, "grad_norm": 0.9137739539146423, "learning_rate": 6.383457320099303e-05, "loss": 1.1292, "mean_token_accuracy": 0.7065860241651535, "num_tokens": 185527243.0, "step": 5665 }, { "epoch": 0.47018824114769053, "grad_norm": 0.9377095699310303, "learning_rate": 6.376501710744056e-05, "loss": 1.1629, "mean_token_accuracy": 0.6961693525314331, "num_tokens": 185691083.0, "step": 5670 }, { "epoch": 0.470602869226304, "grad_norm": 0.8585503101348877, "learning_rate": 6.369543217909577e-05, "loss": 1.2209, "mean_token_accuracy": 0.6888868555426597, "num_tokens": 185854923.0, "step": 5675 }, { "epoch": 0.4710174973049175, "grad_norm": 0.9265078902244568, "learning_rate": 6.362581856172433e-05, "loss": 1.2307, "mean_token_accuracy": 0.6839687183499337, "num_tokens": 186018763.0, "step": 5680 }, { "epoch": 0.47143212538353096, "grad_norm": 0.8388371467590332, "learning_rate": 6.355617640115203e-05, "loss": 1.2153, "mean_token_accuracy": 0.6901798859238625, "num_tokens": 186182351.0, "step": 5685 }, { "epoch": 0.47184675346214444, "grad_norm": 0.898435115814209, "learning_rate": 6.348650584326439e-05, "loss": 1.2653, "mean_token_accuracy": 0.6818548381328583, "num_tokens": 186346191.0, "step": 5690 }, { "epoch": 0.4722613815407579, "grad_norm": 0.9321533441543579, "learning_rate": 6.341680703400651e-05, "loss": 1.2397, "mean_token_accuracy": 0.6852517113089561, "num_tokens": 186510031.0, "step": 5695 }, { "epoch": 0.47267600961937145, "grad_norm": 0.9423805475234985, "learning_rate": 6.334708011938258e-05, "loss": 1.2926, "mean_token_accuracy": 0.6789161786437035, "num_tokens": 186673871.0, "step": 5700 }, { "epoch": 0.47309063769798493, "grad_norm": 0.9340402483940125, "learning_rate": 6.327732524545571e-05, "loss": 1.2594, "mean_token_accuracy": 0.6856793776154518, "num_tokens": 186837711.0, "step": 5705 }, { "epoch": 0.4735052657765984, "grad_norm": 0.9308452010154724, "learning_rate": 6.320754255834758e-05, "loss": 1.2601, "mean_token_accuracy": 0.6822519540786743, "num_tokens": 187001551.0, "step": 5710 }, { "epoch": 0.4739198938552119, "grad_norm": 0.8753827810287476, "learning_rate": 6.313773220423812e-05, "loss": 1.1713, "mean_token_accuracy": 0.69445870667696, "num_tokens": 187165391.0, "step": 5715 }, { "epoch": 0.47433452193382536, "grad_norm": 0.9730857610702515, "learning_rate": 6.30678943293652e-05, "loss": 1.3436, "mean_token_accuracy": 0.6642106577754021, "num_tokens": 187329231.0, "step": 5720 }, { "epoch": 0.47474915001243884, "grad_norm": 0.9346562623977661, "learning_rate": 6.29980290800244e-05, "loss": 1.2378, "mean_token_accuracy": 0.6841153427958488, "num_tokens": 187493071.0, "step": 5725 }, { "epoch": 0.4751637780910523, "grad_norm": 0.9891049861907959, "learning_rate": 6.292813660256856e-05, "loss": 1.3049, "mean_token_accuracy": 0.6780832409858704, "num_tokens": 187656534.0, "step": 5730 }, { "epoch": 0.4755784061696658, "grad_norm": 0.8796224594116211, "learning_rate": 6.285821704340765e-05, "loss": 1.2258, "mean_token_accuracy": 0.6818120688199997, "num_tokens": 187820374.0, "step": 5735 }, { "epoch": 0.47599303424827927, "grad_norm": 0.9198670387268066, "learning_rate": 6.278827054900828e-05, "loss": 1.2519, "mean_token_accuracy": 0.6800464317202568, "num_tokens": 187984214.0, "step": 5740 }, { "epoch": 0.4764076623268928, "grad_norm": 0.9337548613548279, "learning_rate": 6.271829726589355e-05, "loss": 1.2797, "mean_token_accuracy": 0.6787939861416816, "num_tokens": 188148054.0, "step": 5745 }, { "epoch": 0.4768222904055063, "grad_norm": 0.9277735948562622, "learning_rate": 6.264829734064264e-05, "loss": 1.338, "mean_token_accuracy": 0.668963835388422, "num_tokens": 188311894.0, "step": 5750 }, { "epoch": 0.47723691848411975, "grad_norm": 0.9833148121833801, "learning_rate": 6.257827091989055e-05, "loss": 1.2735, "mean_token_accuracy": 0.6778641879558563, "num_tokens": 188474887.0, "step": 5755 }, { "epoch": 0.47765154656273323, "grad_norm": 0.9519672989845276, "learning_rate": 6.250821815032779e-05, "loss": 1.2091, "mean_token_accuracy": 0.6879032239317894, "num_tokens": 188638727.0, "step": 5760 }, { "epoch": 0.4780661746413467, "grad_norm": 0.9036309719085693, "learning_rate": 6.243813917870005e-05, "loss": 1.2139, "mean_token_accuracy": 0.6871397942304611, "num_tokens": 188801674.0, "step": 5765 }, { "epoch": 0.4784808027199602, "grad_norm": 0.9152082204818726, "learning_rate": 6.236803415180792e-05, "loss": 1.2659, "mean_token_accuracy": 0.6844013914465904, "num_tokens": 188964908.0, "step": 5770 }, { "epoch": 0.47889543079857366, "grad_norm": 0.9431639313697815, "learning_rate": 6.229790321650661e-05, "loss": 1.1818, "mean_token_accuracy": 0.6987719893455505, "num_tokens": 189128748.0, "step": 5775 }, { "epoch": 0.47931005887718714, "grad_norm": 0.9656767845153809, "learning_rate": 6.22277465197055e-05, "loss": 1.303, "mean_token_accuracy": 0.6747922793030738, "num_tokens": 189292588.0, "step": 5780 }, { "epoch": 0.47972468695580067, "grad_norm": 0.8947068452835083, "learning_rate": 6.215756420836801e-05, "loss": 1.2374, "mean_token_accuracy": 0.6805779531598091, "num_tokens": 189456428.0, "step": 5785 }, { "epoch": 0.48013931503441415, "grad_norm": 0.9972400665283203, "learning_rate": 6.208735642951121e-05, "loss": 1.281, "mean_token_accuracy": 0.6826551795005799, "num_tokens": 189620268.0, "step": 5790 }, { "epoch": 0.4805539431130276, "grad_norm": 0.994186520576477, "learning_rate": 6.20171233302055e-05, "loss": 1.2425, "mean_token_accuracy": 0.6841886594891549, "num_tokens": 189784108.0, "step": 5795 }, { "epoch": 0.4809685711916411, "grad_norm": 0.9338303208351135, "learning_rate": 6.194686505757437e-05, "loss": 1.165, "mean_token_accuracy": 0.6987719908356667, "num_tokens": 189947948.0, "step": 5800 }, { "epoch": 0.4813831992702546, "grad_norm": 0.8793720006942749, "learning_rate": 6.187658175879397e-05, "loss": 1.185, "mean_token_accuracy": 0.6946175426244736, "num_tokens": 190111788.0, "step": 5805 }, { "epoch": 0.48179782734886806, "grad_norm": 0.9103440046310425, "learning_rate": 6.18062735810929e-05, "loss": 1.1409, "mean_token_accuracy": 0.7049303486943245, "num_tokens": 190275628.0, "step": 5810 }, { "epoch": 0.48221245542748153, "grad_norm": 0.9415132403373718, "learning_rate": 6.173594067175192e-05, "loss": 1.1824, "mean_token_accuracy": 0.696218229830265, "num_tokens": 190439468.0, "step": 5815 }, { "epoch": 0.482627083506095, "grad_norm": 0.9324992895126343, "learning_rate": 6.166558317810353e-05, "loss": 1.2044, "mean_token_accuracy": 0.689943790435791, "num_tokens": 190603308.0, "step": 5820 }, { "epoch": 0.48304171158470854, "grad_norm": 0.9188627004623413, "learning_rate": 6.159520124753179e-05, "loss": 1.2093, "mean_token_accuracy": 0.6883675456047058, "num_tokens": 190767148.0, "step": 5825 }, { "epoch": 0.483456339663322, "grad_norm": 0.873873233795166, "learning_rate": 6.152479502747189e-05, "loss": 1.2675, "mean_token_accuracy": 0.6782244190573692, "num_tokens": 190930407.0, "step": 5830 }, { "epoch": 0.4838709677419355, "grad_norm": 0.9298220276832581, "learning_rate": 6.145436466540995e-05, "loss": 1.2797, "mean_token_accuracy": 0.6794084221124649, "num_tokens": 191093428.0, "step": 5835 }, { "epoch": 0.484285595820549, "grad_norm": 0.8735694289207458, "learning_rate": 6.138391030888267e-05, "loss": 1.2099, "mean_token_accuracy": 0.6898154929280281, "num_tokens": 191257268.0, "step": 5840 }, { "epoch": 0.48470022389916245, "grad_norm": 0.9457949995994568, "learning_rate": 6.131343210547694e-05, "loss": 1.1883, "mean_token_accuracy": 0.6914406135678292, "num_tokens": 191421108.0, "step": 5845 }, { "epoch": 0.48511485197777593, "grad_norm": 0.909713864326477, "learning_rate": 6.124293020282969e-05, "loss": 1.1569, "mean_token_accuracy": 0.698619256913662, "num_tokens": 191584948.0, "step": 5850 }, { "epoch": 0.4855294800563894, "grad_norm": 0.9126371145248413, "learning_rate": 6.117240474862743e-05, "loss": 1.2108, "mean_token_accuracy": 0.6895588994026184, "num_tokens": 191748788.0, "step": 5855 }, { "epoch": 0.4859441081350029, "grad_norm": 0.8835738301277161, "learning_rate": 6.110185589060608e-05, "loss": 1.3032, "mean_token_accuracy": 0.6763624176383018, "num_tokens": 191912628.0, "step": 5860 }, { "epoch": 0.48635873621361636, "grad_norm": 0.9455219507217407, "learning_rate": 6.1031283776550475e-05, "loss": 1.1685, "mean_token_accuracy": 0.6940554723143577, "num_tokens": 192076468.0, "step": 5865 }, { "epoch": 0.4867733642922299, "grad_norm": 0.9001264572143555, "learning_rate": 6.096068855429429e-05, "loss": 1.1785, "mean_token_accuracy": 0.6976967230439186, "num_tokens": 192240308.0, "step": 5870 }, { "epoch": 0.48718799237084337, "grad_norm": 0.9531750082969666, "learning_rate": 6.08900703717195e-05, "loss": 1.1979, "mean_token_accuracy": 0.6936278119683266, "num_tokens": 192404148.0, "step": 5875 }, { "epoch": 0.48760262044945685, "grad_norm": 0.9018110036849976, "learning_rate": 6.081942937675625e-05, "loss": 1.1978, "mean_token_accuracy": 0.6919415920972825, "num_tokens": 192567988.0, "step": 5880 }, { "epoch": 0.4880172485280703, "grad_norm": 0.9620280861854553, "learning_rate": 6.074876571738246e-05, "loss": 1.2369, "mean_token_accuracy": 0.6848770201206207, "num_tokens": 192731608.0, "step": 5885 }, { "epoch": 0.4884318766066838, "grad_norm": 0.931185781955719, "learning_rate": 6.0678079541623475e-05, "loss": 1.1997, "mean_token_accuracy": 0.6906952559947968, "num_tokens": 192895448.0, "step": 5890 }, { "epoch": 0.4888465046852973, "grad_norm": 0.93744295835495, "learning_rate": 6.060737099755189e-05, "loss": 1.2619, "mean_token_accuracy": 0.6782441407442092, "num_tokens": 193059288.0, "step": 5895 }, { "epoch": 0.48926113276391076, "grad_norm": 0.8988864421844482, "learning_rate": 6.053664023328708e-05, "loss": 1.239, "mean_token_accuracy": 0.6839809373021126, "num_tokens": 193223128.0, "step": 5900 }, { "epoch": 0.48967576084252423, "grad_norm": 0.9602072834968567, "learning_rate": 6.046588739699502e-05, "loss": 1.3042, "mean_token_accuracy": 0.6769978016614914, "num_tokens": 193386968.0, "step": 5905 }, { "epoch": 0.49009038892113777, "grad_norm": 0.9107131361961365, "learning_rate": 6.039511263688789e-05, "loss": 1.1768, "mean_token_accuracy": 0.6962706178426743, "num_tokens": 193550218.0, "step": 5910 }, { "epoch": 0.49050501699975124, "grad_norm": 0.945172905921936, "learning_rate": 6.0324316101223796e-05, "loss": 1.2521, "mean_token_accuracy": 0.6807856783270836, "num_tokens": 193714058.0, "step": 5915 }, { "epoch": 0.4909196450783647, "grad_norm": 0.8420608639717102, "learning_rate": 6.0253497938306494e-05, "loss": 1.1644, "mean_token_accuracy": 0.700329914689064, "num_tokens": 193877898.0, "step": 5920 }, { "epoch": 0.4913342731569782, "grad_norm": 0.8994563221931458, "learning_rate": 6.0182658296485005e-05, "loss": 1.2116, "mean_token_accuracy": 0.691538368165493, "num_tokens": 194041738.0, "step": 5925 }, { "epoch": 0.4917489012355917, "grad_norm": 0.9086576700210571, "learning_rate": 6.011179732415335e-05, "loss": 1.1416, "mean_token_accuracy": 0.6986620262265205, "num_tokens": 194205578.0, "step": 5930 }, { "epoch": 0.49216352931420515, "grad_norm": 0.8597748875617981, "learning_rate": 6.0040915169750265e-05, "loss": 1.2576, "mean_token_accuracy": 0.6861559227108955, "num_tokens": 194369418.0, "step": 5935 }, { "epoch": 0.4925781573928186, "grad_norm": 0.9271917343139648, "learning_rate": 5.997001198175882e-05, "loss": 1.2655, "mean_token_accuracy": 0.6808162242174148, "num_tokens": 194533258.0, "step": 5940 }, { "epoch": 0.4929927854714321, "grad_norm": 0.9166371822357178, "learning_rate": 5.989908790870616e-05, "loss": 1.3046, "mean_token_accuracy": 0.6792445093393326, "num_tokens": 194696193.0, "step": 5945 }, { "epoch": 0.4934074135500456, "grad_norm": 0.9166553616523743, "learning_rate": 5.9828143099163206e-05, "loss": 1.27, "mean_token_accuracy": 0.6806940406560897, "num_tokens": 194860033.0, "step": 5950 }, { "epoch": 0.4938220416286591, "grad_norm": 0.924065351486206, "learning_rate": 5.975717770174424e-05, "loss": 1.2329, "mean_token_accuracy": 0.6834431111812591, "num_tokens": 195023520.0, "step": 5955 }, { "epoch": 0.4942366697072726, "grad_norm": 0.907636284828186, "learning_rate": 5.968619186510678e-05, "loss": 1.2479, "mean_token_accuracy": 0.681549359858036, "num_tokens": 195187360.0, "step": 5960 }, { "epoch": 0.49465129778588607, "grad_norm": 0.9144726991653442, "learning_rate": 5.961518573795105e-05, "loss": 1.257, "mean_token_accuracy": 0.6842314228415489, "num_tokens": 195351200.0, "step": 5965 }, { "epoch": 0.49506592586449955, "grad_norm": 0.9033091068267822, "learning_rate": 5.9544159469019855e-05, "loss": 1.2346, "mean_token_accuracy": 0.6842741966247559, "num_tokens": 195515040.0, "step": 5970 }, { "epoch": 0.495480553943113, "grad_norm": 0.8993896842002869, "learning_rate": 5.9473113207098194e-05, "loss": 1.2514, "mean_token_accuracy": 0.6854166716337204, "num_tokens": 195678880.0, "step": 5975 }, { "epoch": 0.4958951820217265, "grad_norm": 0.8590402603149414, "learning_rate": 5.940204710101288e-05, "loss": 1.3325, "mean_token_accuracy": 0.6693731665611267, "num_tokens": 195842720.0, "step": 5980 }, { "epoch": 0.49630981010034, "grad_norm": 0.8978489637374878, "learning_rate": 5.933096129963238e-05, "loss": 1.1963, "mean_token_accuracy": 0.6939699441194535, "num_tokens": 196006560.0, "step": 5985 }, { "epoch": 0.49672443817895345, "grad_norm": 0.9233525991439819, "learning_rate": 5.925985595186634e-05, "loss": 1.1436, "mean_token_accuracy": 0.7024804502725601, "num_tokens": 196170400.0, "step": 5990 }, { "epoch": 0.497139066257567, "grad_norm": 0.9250538349151611, "learning_rate": 5.9188731206665396e-05, "loss": 1.2149, "mean_token_accuracy": 0.6864369481801986, "num_tokens": 196334240.0, "step": 5995 }, { "epoch": 0.49755369433618046, "grad_norm": 0.8921429514884949, "learning_rate": 5.911758721302082e-05, "loss": 1.1808, "mean_token_accuracy": 0.697421795129776, "num_tokens": 196498080.0, "step": 6000 }, { "epoch": 0.49796832241479394, "grad_norm": 0.9132588505744934, "learning_rate": 5.904642411996418e-05, "loss": 1.24, "mean_token_accuracy": 0.6872372940182686, "num_tokens": 196661920.0, "step": 6005 }, { "epoch": 0.4983829504934074, "grad_norm": 0.906562864780426, "learning_rate": 5.897524207656708e-05, "loss": 1.2232, "mean_token_accuracy": 0.6900476559996604, "num_tokens": 196825760.0, "step": 6010 }, { "epoch": 0.4987975785720209, "grad_norm": 0.9356030821800232, "learning_rate": 5.890404123194081e-05, "loss": 1.1835, "mean_token_accuracy": 0.691752202808857, "num_tokens": 196989600.0, "step": 6015 }, { "epoch": 0.49921220665063437, "grad_norm": 0.9574745893478394, "learning_rate": 5.883282173523603e-05, "loss": 1.3666, "mean_token_accuracy": 0.6597079634666443, "num_tokens": 197153440.0, "step": 6020 }, { "epoch": 0.49962683472924785, "grad_norm": 0.9683645963668823, "learning_rate": 5.876158373564249e-05, "loss": 1.2422, "mean_token_accuracy": 0.6861742407083511, "num_tokens": 197317280.0, "step": 6025 }, { "epoch": 0.5000414628078613, "grad_norm": 0.9355904459953308, "learning_rate": 5.869032738238871e-05, "loss": 1.2374, "mean_token_accuracy": 0.6827791914343834, "num_tokens": 197480273.0, "step": 6030 }, { "epoch": 0.5004560908864748, "grad_norm": 0.9154330492019653, "learning_rate": 5.861905282474161e-05, "loss": 1.274, "mean_token_accuracy": 0.6810056194663048, "num_tokens": 197644113.0, "step": 6035 }, { "epoch": 0.5008707189650883, "grad_norm": 0.9223089814186096, "learning_rate": 5.854776021200632e-05, "loss": 1.2341, "mean_token_accuracy": 0.6852827444672585, "num_tokens": 197807814.0, "step": 6040 }, { "epoch": 0.5012853470437018, "grad_norm": 0.8918419480323792, "learning_rate": 5.847644969352569e-05, "loss": 1.2313, "mean_token_accuracy": 0.6858565464615822, "num_tokens": 197971654.0, "step": 6045 }, { "epoch": 0.5016999751223152, "grad_norm": 0.9654561281204224, "learning_rate": 5.840512141868021e-05, "loss": 1.243, "mean_token_accuracy": 0.6822886109352112, "num_tokens": 198135494.0, "step": 6050 }, { "epoch": 0.5021146032009288, "grad_norm": 0.8958101868629456, "learning_rate": 5.833377553688743e-05, "loss": 1.251, "mean_token_accuracy": 0.6814332827925682, "num_tokens": 198299334.0, "step": 6055 }, { "epoch": 0.5025292312795423, "grad_norm": 0.9199835062026978, "learning_rate": 5.8262412197601856e-05, "loss": 1.2257, "mean_token_accuracy": 0.6889723837375641, "num_tokens": 198463174.0, "step": 6060 }, { "epoch": 0.5029438593581558, "grad_norm": 0.931423544883728, "learning_rate": 5.819103155031459e-05, "loss": 1.2309, "mean_token_accuracy": 0.6848666816949844, "num_tokens": 198626162.0, "step": 6065 }, { "epoch": 0.5033584874367693, "grad_norm": 0.9099568724632263, "learning_rate": 5.811963374455291e-05, "loss": 1.2458, "mean_token_accuracy": 0.6816876038908959, "num_tokens": 198789346.0, "step": 6070 }, { "epoch": 0.5037731155153827, "grad_norm": 0.8975064754486084, "learning_rate": 5.80482189298801e-05, "loss": 1.1754, "mean_token_accuracy": 0.6951735079288482, "num_tokens": 198953186.0, "step": 6075 }, { "epoch": 0.5041877435939962, "grad_norm": 0.906932532787323, "learning_rate": 5.797678725589507e-05, "loss": 1.2362, "mean_token_accuracy": 0.6864308372139931, "num_tokens": 199117026.0, "step": 6080 }, { "epoch": 0.5046023716726097, "grad_norm": 0.932145893573761, "learning_rate": 5.790533887223201e-05, "loss": 1.2296, "mean_token_accuracy": 0.6862781062722206, "num_tokens": 199280866.0, "step": 6085 }, { "epoch": 0.5050169997512232, "grad_norm": 0.8927560448646545, "learning_rate": 5.7833873928560134e-05, "loss": 1.1698, "mean_token_accuracy": 0.6978782877326012, "num_tokens": 199444061.0, "step": 6090 }, { "epoch": 0.5054316278298366, "grad_norm": 0.8670108318328857, "learning_rate": 5.7762392574583356e-05, "loss": 1.1605, "mean_token_accuracy": 0.6997556135058403, "num_tokens": 199607901.0, "step": 6095 }, { "epoch": 0.5058462559084501, "grad_norm": 0.8638862371444702, "learning_rate": 5.7690894960039953e-05, "loss": 1.2513, "mean_token_accuracy": 0.6832722336053848, "num_tokens": 199771741.0, "step": 6100 }, { "epoch": 0.5062608839870636, "grad_norm": 0.8774763345718384, "learning_rate": 5.761938123470227e-05, "loss": 1.177, "mean_token_accuracy": 0.6960777178406715, "num_tokens": 199935581.0, "step": 6105 }, { "epoch": 0.5066755120656771, "grad_norm": 0.9620174169540405, "learning_rate": 5.7547851548376405e-05, "loss": 1.2632, "mean_token_accuracy": 0.6802113875746727, "num_tokens": 200099421.0, "step": 6110 }, { "epoch": 0.5070901401442905, "grad_norm": 0.9467723965644836, "learning_rate": 5.7476306050901876e-05, "loss": 1.1143, "mean_token_accuracy": 0.708003418147564, "num_tokens": 200263261.0, "step": 6115 }, { "epoch": 0.507504768222904, "grad_norm": 0.8758918642997742, "learning_rate": 5.740474489215133e-05, "loss": 1.1709, "mean_token_accuracy": 0.6971163272857666, "num_tokens": 200427101.0, "step": 6120 }, { "epoch": 0.5079193963015175, "grad_norm": 0.9225305914878845, "learning_rate": 5.733316822203022e-05, "loss": 1.2875, "mean_token_accuracy": 0.6764418363571167, "num_tokens": 200590941.0, "step": 6125 }, { "epoch": 0.508334024380131, "grad_norm": 0.9075090289115906, "learning_rate": 5.7261576190476515e-05, "loss": 1.1457, "mean_token_accuracy": 0.7037573307752609, "num_tokens": 200754781.0, "step": 6130 }, { "epoch": 0.5087486524587445, "grad_norm": 0.8592643141746521, "learning_rate": 5.7189968947460316e-05, "loss": 1.1999, "mean_token_accuracy": 0.6876955017447471, "num_tokens": 200918621.0, "step": 6135 }, { "epoch": 0.509163280537358, "grad_norm": 0.9491396546363831, "learning_rate": 5.711834664298362e-05, "loss": 1.2697, "mean_token_accuracy": 0.6799853354692459, "num_tokens": 201082461.0, "step": 6140 }, { "epoch": 0.5095779086159715, "grad_norm": 0.9179980158805847, "learning_rate": 5.704670942707997e-05, "loss": 1.1817, "mean_token_accuracy": 0.696865837275982, "num_tokens": 201246301.0, "step": 6145 }, { "epoch": 0.509992536694585, "grad_norm": 0.8991859555244446, "learning_rate": 5.697505744981415e-05, "loss": 1.2769, "mean_token_accuracy": 0.6775598764419556, "num_tokens": 201410141.0, "step": 6150 }, { "epoch": 0.5104071647731985, "grad_norm": 0.8738629817962646, "learning_rate": 5.690339086128187e-05, "loss": 1.2654, "mean_token_accuracy": 0.683230721950531, "num_tokens": 201573685.0, "step": 6155 }, { "epoch": 0.510821792851812, "grad_norm": 0.8947060108184814, "learning_rate": 5.683170981160941e-05, "loss": 1.2279, "mean_token_accuracy": 0.688263687491417, "num_tokens": 201737525.0, "step": 6160 }, { "epoch": 0.5112364209304254, "grad_norm": 0.8794062733650208, "learning_rate": 5.6760014450953406e-05, "loss": 1.2145, "mean_token_accuracy": 0.6862536638975143, "num_tokens": 201901365.0, "step": 6165 }, { "epoch": 0.5116510490090389, "grad_norm": 0.8858832716941833, "learning_rate": 5.66883049295004e-05, "loss": 1.1711, "mean_token_accuracy": 0.6965915784239769, "num_tokens": 202065168.0, "step": 6170 }, { "epoch": 0.5120656770876524, "grad_norm": 0.8891280889511108, "learning_rate": 5.6616581397466664e-05, "loss": 1.3551, "mean_token_accuracy": 0.6664528340101242, "num_tokens": 202229008.0, "step": 6175 }, { "epoch": 0.5124803051662659, "grad_norm": 0.8534021973609924, "learning_rate": 5.654484400509778e-05, "loss": 1.1872, "mean_token_accuracy": 0.6963587433099747, "num_tokens": 202392848.0, "step": 6180 }, { "epoch": 0.5128949332448793, "grad_norm": 0.9156395792961121, "learning_rate": 5.6473092902668366e-05, "loss": 1.259, "mean_token_accuracy": 0.6817326471209526, "num_tokens": 202556688.0, "step": 6185 }, { "epoch": 0.5133095613234928, "grad_norm": 0.9103695154190063, "learning_rate": 5.640132824048179e-05, "loss": 1.2786, "mean_token_accuracy": 0.681138950586319, "num_tokens": 202719652.0, "step": 6190 }, { "epoch": 0.5137241894021063, "grad_norm": 0.9157353639602661, "learning_rate": 5.632955016886978e-05, "loss": 1.2264, "mean_token_accuracy": 0.6895894408226013, "num_tokens": 202883492.0, "step": 6195 }, { "epoch": 0.5141388174807198, "grad_norm": 0.9072819948196411, "learning_rate": 5.62577588381922e-05, "loss": 1.2243, "mean_token_accuracy": 0.6888135358691215, "num_tokens": 203047332.0, "step": 6200 }, { "epoch": 0.5145534455593332, "grad_norm": 0.9278843402862549, "learning_rate": 5.618595439883664e-05, "loss": 1.2849, "mean_token_accuracy": 0.6793351963162422, "num_tokens": 203210230.0, "step": 6205 }, { "epoch": 0.5149680736379467, "grad_norm": 0.8833913207054138, "learning_rate": 5.61141370012182e-05, "loss": 1.1722, "mean_token_accuracy": 0.6963343143463134, "num_tokens": 203374070.0, "step": 6210 }, { "epoch": 0.5153827017165602, "grad_norm": 0.9043241739273071, "learning_rate": 5.6042306795779085e-05, "loss": 1.2276, "mean_token_accuracy": 0.6884836286306382, "num_tokens": 203537910.0, "step": 6215 }, { "epoch": 0.5157973297951737, "grad_norm": 0.9065195322036743, "learning_rate": 5.597046393298836e-05, "loss": 1.2534, "mean_token_accuracy": 0.6853250235319137, "num_tokens": 203701750.0, "step": 6220 }, { "epoch": 0.5162119578737873, "grad_norm": 0.8881427049636841, "learning_rate": 5.589860856334158e-05, "loss": 1.204, "mean_token_accuracy": 0.6931936025619507, "num_tokens": 203865453.0, "step": 6225 }, { "epoch": 0.5166265859524007, "grad_norm": 0.9282435774803162, "learning_rate": 5.582674083736049e-05, "loss": 1.2401, "mean_token_accuracy": 0.6892106533050537, "num_tokens": 204029293.0, "step": 6230 }, { "epoch": 0.5170412140310142, "grad_norm": 0.953029990196228, "learning_rate": 5.57548609055928e-05, "loss": 1.3481, "mean_token_accuracy": 0.6690982446074486, "num_tokens": 204193133.0, "step": 6235 }, { "epoch": 0.5174558421096277, "grad_norm": 0.884660542011261, "learning_rate": 5.568296891861166e-05, "loss": 1.1057, "mean_token_accuracy": 0.7079239949584007, "num_tokens": 204356973.0, "step": 6240 }, { "epoch": 0.5178704701882412, "grad_norm": 0.8669636845588684, "learning_rate": 5.561106502701557e-05, "loss": 1.1728, "mean_token_accuracy": 0.6963220924139023, "num_tokens": 204520813.0, "step": 6245 }, { "epoch": 0.5182850982668546, "grad_norm": 0.9606974720954895, "learning_rate": 5.5539149381427934e-05, "loss": 1.2909, "mean_token_accuracy": 0.6754643246531487, "num_tokens": 204684653.0, "step": 6250 }, { "epoch": 0.5186997263454681, "grad_norm": 0.9595761895179749, "learning_rate": 5.546722213249678e-05, "loss": 1.2513, "mean_token_accuracy": 0.6835227265954018, "num_tokens": 204848493.0, "step": 6255 }, { "epoch": 0.5191143544240816, "grad_norm": 0.8764197826385498, "learning_rate": 5.539528343089445e-05, "loss": 1.2467, "mean_token_accuracy": 0.6843291744589806, "num_tokens": 205012333.0, "step": 6260 }, { "epoch": 0.5195289825026951, "grad_norm": 0.9235277771949768, "learning_rate": 5.5323333427317256e-05, "loss": 1.2007, "mean_token_accuracy": 0.6919721379876137, "num_tokens": 205176173.0, "step": 6265 }, { "epoch": 0.5199436105813086, "grad_norm": 0.9578131437301636, "learning_rate": 5.525137227248522e-05, "loss": 1.2815, "mean_token_accuracy": 0.6770711153745651, "num_tokens": 205340013.0, "step": 6270 }, { "epoch": 0.520358238659922, "grad_norm": 0.9407724738121033, "learning_rate": 5.51794001171417e-05, "loss": 1.2213, "mean_token_accuracy": 0.6865285962820054, "num_tokens": 205503853.0, "step": 6275 }, { "epoch": 0.5207728667385355, "grad_norm": 0.934880256652832, "learning_rate": 5.5107417112053094e-05, "loss": 1.1946, "mean_token_accuracy": 0.6924914509057999, "num_tokens": 205667693.0, "step": 6280 }, { "epoch": 0.521187494817149, "grad_norm": 1.0713592767715454, "learning_rate": 5.503542340800852e-05, "loss": 1.1717, "mean_token_accuracy": 0.6967314258217812, "num_tokens": 205831533.0, "step": 6285 }, { "epoch": 0.5216021228957625, "grad_norm": 0.9055458903312683, "learning_rate": 5.496341915581957e-05, "loss": 1.1642, "mean_token_accuracy": 0.699040810763836, "num_tokens": 205995373.0, "step": 6290 }, { "epoch": 0.5220167509743759, "grad_norm": 0.8946324586868286, "learning_rate": 5.4891404506319825e-05, "loss": 1.2236, "mean_token_accuracy": 0.6841092362999917, "num_tokens": 206159213.0, "step": 6295 }, { "epoch": 0.5224313790529894, "grad_norm": 0.9214036464691162, "learning_rate": 5.481937961036476e-05, "loss": 1.293, "mean_token_accuracy": 0.6736925691366196, "num_tokens": 206323053.0, "step": 6300 }, { "epoch": 0.5228460071316029, "grad_norm": 0.9672065377235413, "learning_rate": 5.474734461883124e-05, "loss": 1.2162, "mean_token_accuracy": 0.6890579178929329, "num_tokens": 206486893.0, "step": 6305 }, { "epoch": 0.5232606352102165, "grad_norm": 0.9097797870635986, "learning_rate": 5.4675299682617285e-05, "loss": 1.2221, "mean_token_accuracy": 0.6881767675280571, "num_tokens": 206650573.0, "step": 6310 }, { "epoch": 0.52367526328883, "grad_norm": 0.8895254731178284, "learning_rate": 5.460324495264179e-05, "loss": 1.0871, "mean_token_accuracy": 0.7105632960796356, "num_tokens": 206814413.0, "step": 6315 }, { "epoch": 0.5240898913674434, "grad_norm": 0.8365790843963623, "learning_rate": 5.453118057984411e-05, "loss": 1.2284, "mean_token_accuracy": 0.6855632916092873, "num_tokens": 206978253.0, "step": 6320 }, { "epoch": 0.5245045194460569, "grad_norm": 0.8553270697593689, "learning_rate": 5.445910671518384e-05, "loss": 1.1792, "mean_token_accuracy": 0.6980083122849464, "num_tokens": 207142093.0, "step": 6325 }, { "epoch": 0.5249191475246704, "grad_norm": 0.9196605086326599, "learning_rate": 5.438702350964044e-05, "loss": 1.3084, "mean_token_accuracy": 0.676484601944685, "num_tokens": 207305933.0, "step": 6330 }, { "epoch": 0.5253337756032839, "grad_norm": 0.9052113890647888, "learning_rate": 5.4314931114212956e-05, "loss": 1.2114, "mean_token_accuracy": 0.691580730676651, "num_tokens": 207468907.0, "step": 6335 }, { "epoch": 0.5257484036818973, "grad_norm": 0.9065616130828857, "learning_rate": 5.424282967991965e-05, "loss": 1.1778, "mean_token_accuracy": 0.6990835785865783, "num_tokens": 207632747.0, "step": 6340 }, { "epoch": 0.5261630317605108, "grad_norm": 13.21120548248291, "learning_rate": 5.4170719357797774e-05, "loss": 1.2799, "mean_token_accuracy": 0.6758736535906792, "num_tokens": 207796587.0, "step": 6345 }, { "epoch": 0.5265776598391243, "grad_norm": 0.8727403283119202, "learning_rate": 5.4098600298903105e-05, "loss": 1.2515, "mean_token_accuracy": 0.6862014904618263, "num_tokens": 207960159.0, "step": 6350 }, { "epoch": 0.5269922879177378, "grad_norm": 0.9306917190551758, "learning_rate": 5.402647265430982e-05, "loss": 1.2238, "mean_token_accuracy": 0.6863636314868927, "num_tokens": 208123999.0, "step": 6355 }, { "epoch": 0.5274069159963513, "grad_norm": 0.8838227987289429, "learning_rate": 5.3954336575110066e-05, "loss": 1.224, "mean_token_accuracy": 0.6893084019422531, "num_tokens": 208287839.0, "step": 6360 }, { "epoch": 0.5278215440749647, "grad_norm": 0.904277503490448, "learning_rate": 5.388219221241357e-05, "loss": 1.1809, "mean_token_accuracy": 0.69630376547575, "num_tokens": 208451679.0, "step": 6365 }, { "epoch": 0.5282361721535782, "grad_norm": 0.9271522760391235, "learning_rate": 5.3810039717347536e-05, "loss": 1.2546, "mean_token_accuracy": 0.6841703325510025, "num_tokens": 208615519.0, "step": 6370 }, { "epoch": 0.5286508002321917, "grad_norm": 0.9033722281455994, "learning_rate": 5.37378792410561e-05, "loss": 1.1427, "mean_token_accuracy": 0.6994868069887161, "num_tokens": 208779359.0, "step": 6375 }, { "epoch": 0.5290654283108052, "grad_norm": 0.8846749067306519, "learning_rate": 5.3665710934700184e-05, "loss": 1.229, "mean_token_accuracy": 0.689711631834507, "num_tokens": 208943199.0, "step": 6380 }, { "epoch": 0.5294800563894186, "grad_norm": 0.8874713778495789, "learning_rate": 5.3593534949457094e-05, "loss": 1.2066, "mean_token_accuracy": 0.692864128947258, "num_tokens": 209107039.0, "step": 6385 }, { "epoch": 0.5298946844680322, "grad_norm": 0.8738123774528503, "learning_rate": 5.352135143652018e-05, "loss": 1.197, "mean_token_accuracy": 0.6921893879771233, "num_tokens": 209270820.0, "step": 6390 }, { "epoch": 0.5303093125466457, "grad_norm": 0.9200013875961304, "learning_rate": 5.344916054709863e-05, "loss": 1.1546, "mean_token_accuracy": 0.697586752474308, "num_tokens": 209434660.0, "step": 6395 }, { "epoch": 0.5307239406252592, "grad_norm": 0.9030777812004089, "learning_rate": 5.3376962432417045e-05, "loss": 1.2219, "mean_token_accuracy": 0.6877749294042588, "num_tokens": 209598500.0, "step": 6400 }, { "epoch": 0.5311385687038727, "grad_norm": 0.9108365178108215, "learning_rate": 5.3304757243715164e-05, "loss": 1.2661, "mean_token_accuracy": 0.6783941894769668, "num_tokens": 209762253.0, "step": 6405 }, { "epoch": 0.5315531967824861, "grad_norm": 0.9172254800796509, "learning_rate": 5.3232545132247544e-05, "loss": 1.1724, "mean_token_accuracy": 0.7031769335269928, "num_tokens": 209926093.0, "step": 6410 }, { "epoch": 0.5319678248610996, "grad_norm": 0.9194810390472412, "learning_rate": 5.316032624928326e-05, "loss": 1.2083, "mean_token_accuracy": 0.6897408664226532, "num_tokens": 210089227.0, "step": 6415 }, { "epoch": 0.5323824529397131, "grad_norm": 0.8845224380493164, "learning_rate": 5.308810074610554e-05, "loss": 1.2051, "mean_token_accuracy": 0.6930779546499253, "num_tokens": 210253067.0, "step": 6420 }, { "epoch": 0.5327970810183266, "grad_norm": 0.8977319598197937, "learning_rate": 5.3015868774011525e-05, "loss": 1.19, "mean_token_accuracy": 0.6922536373138428, "num_tokens": 210416397.0, "step": 6425 }, { "epoch": 0.53321170909694, "grad_norm": 0.9427075982093811, "learning_rate": 5.2943630484311844e-05, "loss": 1.2702, "mean_token_accuracy": 0.6780119732022285, "num_tokens": 210580237.0, "step": 6430 }, { "epoch": 0.5336263371755535, "grad_norm": 0.9184347987174988, "learning_rate": 5.287138602833045e-05, "loss": 1.2362, "mean_token_accuracy": 0.6841031283140182, "num_tokens": 210744077.0, "step": 6435 }, { "epoch": 0.534040965254167, "grad_norm": 0.9414148330688477, "learning_rate": 5.279913555740411e-05, "loss": 1.1999, "mean_token_accuracy": 0.6908468291163444, "num_tokens": 210907643.0, "step": 6440 }, { "epoch": 0.5344555933327805, "grad_norm": 0.9243748188018799, "learning_rate": 5.272687922288227e-05, "loss": 1.1242, "mean_token_accuracy": 0.7059323117136955, "num_tokens": 211071483.0, "step": 6445 }, { "epoch": 0.534870221411394, "grad_norm": 0.8637745976448059, "learning_rate": 5.265461717612663e-05, "loss": 1.1653, "mean_token_accuracy": 0.6991996586322784, "num_tokens": 211235323.0, "step": 6450 }, { "epoch": 0.5352848494900074, "grad_norm": 0.8968706727027893, "learning_rate": 5.2582349568510835e-05, "loss": 1.2218, "mean_token_accuracy": 0.6856182813644409, "num_tokens": 211399163.0, "step": 6455 }, { "epoch": 0.5356994775686209, "grad_norm": 0.9248412847518921, "learning_rate": 5.251007655142024e-05, "loss": 1.2183, "mean_token_accuracy": 0.6859115362167358, "num_tokens": 211563003.0, "step": 6460 }, { "epoch": 0.5361141056472344, "grad_norm": 0.8828514814376831, "learning_rate": 5.243779827625146e-05, "loss": 1.1561, "mean_token_accuracy": 0.6985092923045159, "num_tokens": 211726843.0, "step": 6465 }, { "epoch": 0.5365287337258479, "grad_norm": 0.8828794360160828, "learning_rate": 5.236551489441216e-05, "loss": 1.1948, "mean_token_accuracy": 0.6899408712983132, "num_tokens": 211889870.0, "step": 6470 }, { "epoch": 0.5369433618044614, "grad_norm": 0.9275861978530884, "learning_rate": 5.229322655732071e-05, "loss": 1.1307, "mean_token_accuracy": 0.7023704811930657, "num_tokens": 212053710.0, "step": 6475 }, { "epoch": 0.5373579898830749, "grad_norm": 0.913375198841095, "learning_rate": 5.222093341640584e-05, "loss": 1.1373, "mean_token_accuracy": 0.7017656400799751, "num_tokens": 212217550.0, "step": 6480 }, { "epoch": 0.5377726179616884, "grad_norm": 0.8686186671257019, "learning_rate": 5.214863562310634e-05, "loss": 1.2431, "mean_token_accuracy": 0.6821007788181305, "num_tokens": 212381009.0, "step": 6485 }, { "epoch": 0.5381872460403019, "grad_norm": 0.9184940457344055, "learning_rate": 5.20763333288708e-05, "loss": 1.228, "mean_token_accuracy": 0.6859359756112099, "num_tokens": 212544849.0, "step": 6490 }, { "epoch": 0.5386018741189154, "grad_norm": 0.8911036849021912, "learning_rate": 5.200402668515716e-05, "loss": 1.1528, "mean_token_accuracy": 0.699456250667572, "num_tokens": 212708689.0, "step": 6495 }, { "epoch": 0.5390165021975288, "grad_norm": 0.885284960269928, "learning_rate": 5.1931715843432506e-05, "loss": 1.2053, "mean_token_accuracy": 0.6943487286567688, "num_tokens": 212872529.0, "step": 6500 }, { "epoch": 0.5394311302761423, "grad_norm": 0.9448803663253784, "learning_rate": 5.185940095517274e-05, "loss": 1.2753, "mean_token_accuracy": 0.6810500115156174, "num_tokens": 213035919.0, "step": 6505 }, { "epoch": 0.5398457583547558, "grad_norm": 0.8612961173057556, "learning_rate": 5.178708217186222e-05, "loss": 1.2534, "mean_token_accuracy": 0.683718228340149, "num_tokens": 213199759.0, "step": 6510 }, { "epoch": 0.5402603864333693, "grad_norm": 0.8703817129135132, "learning_rate": 5.171475964499346e-05, "loss": 1.1027, "mean_token_accuracy": 0.7086999043822289, "num_tokens": 213362820.0, "step": 6515 }, { "epoch": 0.5406750145119827, "grad_norm": 0.900538444519043, "learning_rate": 5.164243352606679e-05, "loss": 1.1833, "mean_token_accuracy": 0.6918194040656089, "num_tokens": 213526660.0, "step": 6520 }, { "epoch": 0.5410896425905962, "grad_norm": 0.8728988766670227, "learning_rate": 5.157010396659014e-05, "loss": 1.1, "mean_token_accuracy": 0.7105594664812088, "num_tokens": 213690000.0, "step": 6525 }, { "epoch": 0.5415042706692097, "grad_norm": 0.9673345685005188, "learning_rate": 5.149777111807859e-05, "loss": 1.2185, "mean_token_accuracy": 0.6913367554545402, "num_tokens": 213853840.0, "step": 6530 }, { "epoch": 0.5419188987478232, "grad_norm": 0.9376686811447144, "learning_rate": 5.142543513205409e-05, "loss": 1.3058, "mean_token_accuracy": 0.6771444231271744, "num_tokens": 214017680.0, "step": 6535 }, { "epoch": 0.5423335268264367, "grad_norm": 0.8954134583473206, "learning_rate": 5.135309616004523e-05, "loss": 1.1879, "mean_token_accuracy": 0.6946840047836303, "num_tokens": 214181489.0, "step": 6540 }, { "epoch": 0.5427481549050501, "grad_norm": 0.9150399565696716, "learning_rate": 5.128075435358679e-05, "loss": 1.2449, "mean_token_accuracy": 0.6834860697388649, "num_tokens": 214345329.0, "step": 6545 }, { "epoch": 0.5431627829836636, "grad_norm": 0.9100620150566101, "learning_rate": 5.120840986421955e-05, "loss": 1.1598, "mean_token_accuracy": 0.6999737590551376, "num_tokens": 214509101.0, "step": 6550 }, { "epoch": 0.5435774110622771, "grad_norm": 0.8855962157249451, "learning_rate": 5.113606284348984e-05, "loss": 1.1788, "mean_token_accuracy": 0.6972385168075561, "num_tokens": 214672941.0, "step": 6555 }, { "epoch": 0.5439920391408907, "grad_norm": 0.9107732772827148, "learning_rate": 5.106371344294936e-05, "loss": 1.2798, "mean_token_accuracy": 0.6762696966528893, "num_tokens": 214835862.0, "step": 6560 }, { "epoch": 0.5444066672195041, "grad_norm": 0.8324126601219177, "learning_rate": 5.099136181415475e-05, "loss": 1.1893, "mean_token_accuracy": 0.6961143642663956, "num_tokens": 214999702.0, "step": 6565 }, { "epoch": 0.5448212952981176, "grad_norm": 0.8903244137763977, "learning_rate": 5.091900810866732e-05, "loss": 1.2255, "mean_token_accuracy": 0.6866263464093209, "num_tokens": 215163542.0, "step": 6570 }, { "epoch": 0.5452359233767311, "grad_norm": 1.8215476274490356, "learning_rate": 5.084665247805276e-05, "loss": 1.2491, "mean_token_accuracy": 0.6820075765252114, "num_tokens": 215327382.0, "step": 6575 }, { "epoch": 0.5456505514553446, "grad_norm": 0.8984115123748779, "learning_rate": 5.0774295073880774e-05, "loss": 1.2231, "mean_token_accuracy": 0.6867302045226097, "num_tokens": 215491222.0, "step": 6580 }, { "epoch": 0.5460651795339581, "grad_norm": 0.9719600677490234, "learning_rate": 5.070193604772477e-05, "loss": 1.238, "mean_token_accuracy": 0.686223118007183, "num_tokens": 215655062.0, "step": 6585 }, { "epoch": 0.5464798076125715, "grad_norm": 0.9477178454399109, "learning_rate": 5.062957555116159e-05, "loss": 1.2902, "mean_token_accuracy": 0.6786129310727119, "num_tokens": 215818714.0, "step": 6590 }, { "epoch": 0.546894435691185, "grad_norm": 0.8436419367790222, "learning_rate": 5.055721373577111e-05, "loss": 1.2024, "mean_token_accuracy": 0.6898107171058655, "num_tokens": 215982484.0, "step": 6595 }, { "epoch": 0.5473090637697985, "grad_norm": 0.8872175216674805, "learning_rate": 5.048485075313598e-05, "loss": 1.2846, "mean_token_accuracy": 0.6801319614052772, "num_tokens": 216146324.0, "step": 6600 }, { "epoch": 0.547723691848412, "grad_norm": 0.9372970461845398, "learning_rate": 5.0412486754841347e-05, "loss": 1.193, "mean_token_accuracy": 0.6931696027517319, "num_tokens": 216310164.0, "step": 6605 }, { "epoch": 0.5481383199270254, "grad_norm": 0.9212945103645325, "learning_rate": 5.03401218924744e-05, "loss": 1.1958, "mean_token_accuracy": 0.6920821130275726, "num_tokens": 216474004.0, "step": 6610 }, { "epoch": 0.5485529480056389, "grad_norm": 0.8940766453742981, "learning_rate": 5.0267756317624216e-05, "loss": 1.1878, "mean_token_accuracy": 0.6963892981410027, "num_tokens": 216637844.0, "step": 6615 }, { "epoch": 0.5489675760842524, "grad_norm": 0.8963186144828796, "learning_rate": 5.019539018188132e-05, "loss": 1.16, "mean_token_accuracy": 0.6982710182666778, "num_tokens": 216801684.0, "step": 6620 }, { "epoch": 0.5493822041628659, "grad_norm": 0.8630821108818054, "learning_rate": 5.0123023636837395e-05, "loss": 1.2503, "mean_token_accuracy": 0.6825696498155593, "num_tokens": 216965524.0, "step": 6625 }, { "epoch": 0.5497968322414793, "grad_norm": 0.9982067942619324, "learning_rate": 5.005065683408508e-05, "loss": 1.2151, "mean_token_accuracy": 0.6896871969103813, "num_tokens": 217129364.0, "step": 6630 }, { "epoch": 0.5502114603200928, "grad_norm": 0.8645791411399841, "learning_rate": 4.99782899252174e-05, "loss": 1.1423, "mean_token_accuracy": 0.7014846056699753, "num_tokens": 217293204.0, "step": 6635 }, { "epoch": 0.5506260883987064, "grad_norm": 0.9290412068367004, "learning_rate": 4.9905923061827736e-05, "loss": 1.2004, "mean_token_accuracy": 0.6902295619249343, "num_tokens": 217456117.0, "step": 6640 }, { "epoch": 0.5510407164773199, "grad_norm": 0.8934392929077148, "learning_rate": 4.98335563955093e-05, "loss": 1.1539, "mean_token_accuracy": 0.6991263449192047, "num_tokens": 217619957.0, "step": 6645 }, { "epoch": 0.5514553445559334, "grad_norm": 0.8861909508705139, "learning_rate": 4.976119007785494e-05, "loss": 1.2497, "mean_token_accuracy": 0.6817570835351944, "num_tokens": 217783797.0, "step": 6650 }, { "epoch": 0.5518699726345468, "grad_norm": 0.8809149861335754, "learning_rate": 4.9688824260456726e-05, "loss": 1.1592, "mean_token_accuracy": 0.7015334829688072, "num_tokens": 217947637.0, "step": 6655 }, { "epoch": 0.5522846007131603, "grad_norm": 0.940298318862915, "learning_rate": 4.9616459094905715e-05, "loss": 1.2212, "mean_token_accuracy": 0.6881417751312255, "num_tokens": 218111131.0, "step": 6660 }, { "epoch": 0.5526992287917738, "grad_norm": 0.8721581101417542, "learning_rate": 4.954409473279158e-05, "loss": 1.2098, "mean_token_accuracy": 0.6854899793863296, "num_tokens": 218274971.0, "step": 6665 }, { "epoch": 0.5531138568703873, "grad_norm": 0.9390237927436829, "learning_rate": 4.947173132570231e-05, "loss": 1.2649, "mean_token_accuracy": 0.681109482049942, "num_tokens": 218438811.0, "step": 6670 }, { "epoch": 0.5535284849490008, "grad_norm": 0.9084700345993042, "learning_rate": 4.9399369025223905e-05, "loss": 1.2492, "mean_token_accuracy": 0.6816715553402901, "num_tokens": 218602651.0, "step": 6675 }, { "epoch": 0.5539431130276142, "grad_norm": 0.9213350415229797, "learning_rate": 4.932700798294006e-05, "loss": 1.234, "mean_token_accuracy": 0.683847026526928, "num_tokens": 218766420.0, "step": 6680 }, { "epoch": 0.5543577411062277, "grad_norm": 0.9205579161643982, "learning_rate": 4.9254648350431787e-05, "loss": 1.2252, "mean_token_accuracy": 0.686002540588379, "num_tokens": 218929489.0, "step": 6685 }, { "epoch": 0.5547723691848412, "grad_norm": 0.8921565413475037, "learning_rate": 4.9182290279277175e-05, "loss": 1.1792, "mean_token_accuracy": 0.694709187746048, "num_tokens": 219093329.0, "step": 6690 }, { "epoch": 0.5551869972634547, "grad_norm": 0.9167972207069397, "learning_rate": 4.9109933921051076e-05, "loss": 1.1969, "mean_token_accuracy": 0.6931573793292045, "num_tokens": 219257169.0, "step": 6695 }, { "epoch": 0.5556016253420681, "grad_norm": 0.914140522480011, "learning_rate": 4.903757942732469e-05, "loss": 1.1886, "mean_token_accuracy": 0.6917277619242668, "num_tokens": 219421009.0, "step": 6700 }, { "epoch": 0.5560162534206816, "grad_norm": 0.8975273966789246, "learning_rate": 4.896522694966533e-05, "loss": 1.1615, "mean_token_accuracy": 0.69662756472826, "num_tokens": 219584849.0, "step": 6705 }, { "epoch": 0.5564308814992951, "grad_norm": 0.8833522796630859, "learning_rate": 4.8892876639636126e-05, "loss": 1.218, "mean_token_accuracy": 0.6935483857989311, "num_tokens": 219748689.0, "step": 6710 }, { "epoch": 0.5568455095779086, "grad_norm": 0.9619110226631165, "learning_rate": 4.8820528648795634e-05, "loss": 1.2085, "mean_token_accuracy": 0.6911351323127747, "num_tokens": 219912529.0, "step": 6715 }, { "epoch": 0.557260137656522, "grad_norm": 0.8885596990585327, "learning_rate": 4.874818312869753e-05, "loss": 1.2336, "mean_token_accuracy": 0.6844024941325187, "num_tokens": 220076369.0, "step": 6720 }, { "epoch": 0.5576747657351356, "grad_norm": 0.9228293895721436, "learning_rate": 4.8675840230890355e-05, "loss": 1.2688, "mean_token_accuracy": 0.6799242347478867, "num_tokens": 220240209.0, "step": 6725 }, { "epoch": 0.5580893938137491, "grad_norm": 0.975376546382904, "learning_rate": 4.860350010691716e-05, "loss": 1.2384, "mean_token_accuracy": 0.6906524926424027, "num_tokens": 220404049.0, "step": 6730 }, { "epoch": 0.5585040218923626, "grad_norm": 0.9415110349655151, "learning_rate": 4.8531162908315134e-05, "loss": 1.2271, "mean_token_accuracy": 0.685777124762535, "num_tokens": 220567889.0, "step": 6735 }, { "epoch": 0.5589186499709761, "grad_norm": 0.9494465589523315, "learning_rate": 4.845882878661538e-05, "loss": 1.2185, "mean_token_accuracy": 0.6860642716288566, "num_tokens": 220731729.0, "step": 6740 }, { "epoch": 0.5593332780495895, "grad_norm": 0.9446563124656677, "learning_rate": 4.838649789334257e-05, "loss": 1.2448, "mean_token_accuracy": 0.6815738022327423, "num_tokens": 220895569.0, "step": 6745 }, { "epoch": 0.559747906128203, "grad_norm": 0.8738687634468079, "learning_rate": 4.8314170380014546e-05, "loss": 1.2853, "mean_token_accuracy": 0.6820564493536949, "num_tokens": 221059409.0, "step": 6750 }, { "epoch": 0.5601625342068165, "grad_norm": 0.9208701848983765, "learning_rate": 4.824184639814215e-05, "loss": 1.1718, "mean_token_accuracy": 0.6970601424574852, "num_tokens": 221222577.0, "step": 6755 }, { "epoch": 0.56057716228543, "grad_norm": 0.8384931683540344, "learning_rate": 4.816952609922879e-05, "loss": 1.228, "mean_token_accuracy": 0.6885202810168266, "num_tokens": 221386417.0, "step": 6760 }, { "epoch": 0.5609917903640435, "grad_norm": 0.8435602188110352, "learning_rate": 4.809720963477013e-05, "loss": 1.1915, "mean_token_accuracy": 0.6923203811049461, "num_tokens": 221550257.0, "step": 6765 }, { "epoch": 0.5614064184426569, "grad_norm": 0.9238464832305908, "learning_rate": 4.802489715625385e-05, "loss": 1.1586, "mean_token_accuracy": 0.6958699896931648, "num_tokens": 221714097.0, "step": 6770 }, { "epoch": 0.5618210465212704, "grad_norm": 0.9219141602516174, "learning_rate": 4.795258881515925e-05, "loss": 1.2861, "mean_token_accuracy": 0.6787390008568763, "num_tokens": 221877937.0, "step": 6775 }, { "epoch": 0.5622356745998839, "grad_norm": 0.9290852546691895, "learning_rate": 4.7880284762957e-05, "loss": 1.1626, "mean_token_accuracy": 0.6970918908715248, "num_tokens": 222041777.0, "step": 6780 }, { "epoch": 0.5626503026784974, "grad_norm": 0.9057936668395996, "learning_rate": 4.7807985151108726e-05, "loss": 1.2176, "mean_token_accuracy": 0.689833813905716, "num_tokens": 222205617.0, "step": 6785 }, { "epoch": 0.5630649307571108, "grad_norm": 0.9059953093528748, "learning_rate": 4.773569013106679e-05, "loss": 1.1597, "mean_token_accuracy": 0.6991935476660729, "num_tokens": 222369457.0, "step": 6790 }, { "epoch": 0.5634795588357243, "grad_norm": 0.8961343765258789, "learning_rate": 4.766339985427396e-05, "loss": 1.2886, "mean_token_accuracy": 0.67652125954628, "num_tokens": 222533297.0, "step": 6795 }, { "epoch": 0.5638941869143378, "grad_norm": 0.9040511250495911, "learning_rate": 4.759111447216301e-05, "loss": 1.2778, "mean_token_accuracy": 0.679087245464325, "num_tokens": 222697137.0, "step": 6800 }, { "epoch": 0.5643088149929513, "grad_norm": 0.9621016979217529, "learning_rate": 4.7518834136156477e-05, "loss": 1.1837, "mean_token_accuracy": 0.6943792745471, "num_tokens": 222860977.0, "step": 6805 }, { "epoch": 0.5647234430715649, "grad_norm": 1.0572618246078491, "learning_rate": 4.74465589976664e-05, "loss": 1.1642, "mean_token_accuracy": 0.6983809888362884, "num_tokens": 223024817.0, "step": 6810 }, { "epoch": 0.5651380711501783, "grad_norm": 0.9216713309288025, "learning_rate": 4.73742892080938e-05, "loss": 1.1888, "mean_token_accuracy": 0.6927174970507621, "num_tokens": 223188657.0, "step": 6815 }, { "epoch": 0.5655526992287918, "grad_norm": 0.919407069683075, "learning_rate": 4.73020249188286e-05, "loss": 1.1847, "mean_token_accuracy": 0.6968169614672661, "num_tokens": 223352497.0, "step": 6820 }, { "epoch": 0.5659673273074053, "grad_norm": 0.924695611000061, "learning_rate": 4.7229766281249165e-05, "loss": 1.1871, "mean_token_accuracy": 0.6970416769385338, "num_tokens": 223515783.0, "step": 6825 }, { "epoch": 0.5663819553860188, "grad_norm": 0.9437134265899658, "learning_rate": 4.7157513446722e-05, "loss": 1.1951, "mean_token_accuracy": 0.6948680371046067, "num_tokens": 223679623.0, "step": 6830 }, { "epoch": 0.5667965834646322, "grad_norm": 0.8854874968528748, "learning_rate": 4.708526656660148e-05, "loss": 1.2441, "mean_token_accuracy": 0.6849584549665451, "num_tokens": 223843463.0, "step": 6835 }, { "epoch": 0.5672112115432457, "grad_norm": 0.918292760848999, "learning_rate": 4.701302579222952e-05, "loss": 1.173, "mean_token_accuracy": 0.6970736175775528, "num_tokens": 224006590.0, "step": 6840 }, { "epoch": 0.5676258396218592, "grad_norm": 0.9339110851287842, "learning_rate": 4.6940791274935224e-05, "loss": 1.2188, "mean_token_accuracy": 0.6935510709881783, "num_tokens": 224169455.0, "step": 6845 }, { "epoch": 0.5680404677004727, "grad_norm": 0.9341229796409607, "learning_rate": 4.686856316603456e-05, "loss": 1.1925, "mean_token_accuracy": 0.6918010711669922, "num_tokens": 224333295.0, "step": 6850 }, { "epoch": 0.5684550957790862, "grad_norm": 0.9337224364280701, "learning_rate": 4.679634161683012e-05, "loss": 1.1728, "mean_token_accuracy": 0.6961937919259071, "num_tokens": 224497135.0, "step": 6855 }, { "epoch": 0.5688697238576996, "grad_norm": 0.9041075706481934, "learning_rate": 4.672412677861076e-05, "loss": 1.1957, "mean_token_accuracy": 0.6941210865974426, "num_tokens": 224660221.0, "step": 6860 }, { "epoch": 0.5692843519363131, "grad_norm": 0.9238066673278809, "learning_rate": 4.6651918802651215e-05, "loss": 1.1911, "mean_token_accuracy": 0.6899743393063545, "num_tokens": 224824061.0, "step": 6865 }, { "epoch": 0.5696989800149266, "grad_norm": 0.956802487373352, "learning_rate": 4.657971784021189e-05, "loss": 1.1953, "mean_token_accuracy": 0.6936889082193375, "num_tokens": 224987901.0, "step": 6870 }, { "epoch": 0.5701136080935401, "grad_norm": 0.905755877494812, "learning_rate": 4.650752404253853e-05, "loss": 1.2357, "mean_token_accuracy": 0.6879765421152115, "num_tokens": 225151741.0, "step": 6875 }, { "epoch": 0.5705282361721535, "grad_norm": 0.8741195797920227, "learning_rate": 4.6435337560861796e-05, "loss": 1.1974, "mean_token_accuracy": 0.6937866553664207, "num_tokens": 225315581.0, "step": 6880 }, { "epoch": 0.570942864250767, "grad_norm": 0.911460280418396, "learning_rate": 4.636315854639707e-05, "loss": 1.1911, "mean_token_accuracy": 0.6958699867129325, "num_tokens": 225479421.0, "step": 6885 }, { "epoch": 0.5713574923293806, "grad_norm": 0.9902095794677734, "learning_rate": 4.629098715034411e-05, "loss": 1.2458, "mean_token_accuracy": 0.6900232106447219, "num_tokens": 225643261.0, "step": 6890 }, { "epoch": 0.5717721204079941, "grad_norm": 0.9339204430580139, "learning_rate": 4.621882352388665e-05, "loss": 1.2106, "mean_token_accuracy": 0.6915994614362717, "num_tokens": 225807101.0, "step": 6895 }, { "epoch": 0.5721867484866076, "grad_norm": 0.8947141170501709, "learning_rate": 4.61466678181922e-05, "loss": 1.2181, "mean_token_accuracy": 0.6909457445144653, "num_tokens": 225970941.0, "step": 6900 }, { "epoch": 0.572601376565221, "grad_norm": 0.881833553314209, "learning_rate": 4.6074520184411685e-05, "loss": 1.2302, "mean_token_accuracy": 0.6894855827093125, "num_tokens": 226134781.0, "step": 6905 }, { "epoch": 0.5730160046438345, "grad_norm": 0.9418825507164001, "learning_rate": 4.6002380773679064e-05, "loss": 1.1899, "mean_token_accuracy": 0.6962304413318634, "num_tokens": 226298621.0, "step": 6910 }, { "epoch": 0.573430632722448, "grad_norm": 0.9378498196601868, "learning_rate": 4.5930249737111134e-05, "loss": 1.2173, "mean_token_accuracy": 0.6883369967341423, "num_tokens": 226462461.0, "step": 6915 }, { "epoch": 0.5738452608010615, "grad_norm": 0.9858556985855103, "learning_rate": 4.5858127225807126e-05, "loss": 1.2337, "mean_token_accuracy": 0.6850562021136284, "num_tokens": 226626301.0, "step": 6920 }, { "epoch": 0.5742598888796749, "grad_norm": 0.9240570068359375, "learning_rate": 4.5786013390848406e-05, "loss": 1.2609, "mean_token_accuracy": 0.6839687198400497, "num_tokens": 226790141.0, "step": 6925 }, { "epoch": 0.5746745169582884, "grad_norm": 0.8796793222427368, "learning_rate": 4.5713908383298134e-05, "loss": 1.291, "mean_token_accuracy": 0.6791238993406296, "num_tokens": 226953981.0, "step": 6930 }, { "epoch": 0.5750891450369019, "grad_norm": 0.878657341003418, "learning_rate": 4.564181235420106e-05, "loss": 1.1323, "mean_token_accuracy": 0.7025233536958695, "num_tokens": 227116801.0, "step": 6935 }, { "epoch": 0.5755037731155154, "grad_norm": 0.8942219018936157, "learning_rate": 4.556972545458307e-05, "loss": 1.1995, "mean_token_accuracy": 0.6889784947037697, "num_tokens": 227280641.0, "step": 6940 }, { "epoch": 0.5759184011941288, "grad_norm": 0.9079373478889465, "learning_rate": 4.549764783545091e-05, "loss": 1.1846, "mean_token_accuracy": 0.6932490170001984, "num_tokens": 227444481.0, "step": 6945 }, { "epoch": 0.5763330292727423, "grad_norm": 0.958938717842102, "learning_rate": 4.5425579647791916e-05, "loss": 1.2102, "mean_token_accuracy": 0.6875183284282684, "num_tokens": 227608321.0, "step": 6950 }, { "epoch": 0.5767476573513558, "grad_norm": 0.9544787406921387, "learning_rate": 4.535352104257369e-05, "loss": 1.2189, "mean_token_accuracy": 0.6896444261074066, "num_tokens": 227772161.0, "step": 6955 }, { "epoch": 0.5771622854299693, "grad_norm": 0.8173982501029968, "learning_rate": 4.52814721707437e-05, "loss": 1.1961, "mean_token_accuracy": 0.6936155915260315, "num_tokens": 227936001.0, "step": 6960 }, { "epoch": 0.5775769135085828, "grad_norm": 0.9662452340126038, "learning_rate": 4.520943318322907e-05, "loss": 1.2135, "mean_token_accuracy": 0.6873044952750206, "num_tokens": 228099841.0, "step": 6965 }, { "epoch": 0.5779915415871962, "grad_norm": 0.9559817910194397, "learning_rate": 4.5137404230936205e-05, "loss": 1.237, "mean_token_accuracy": 0.6828873425722122, "num_tokens": 228263681.0, "step": 6970 }, { "epoch": 0.5784061696658098, "grad_norm": 0.8829718828201294, "learning_rate": 4.506538546475047e-05, "loss": 1.151, "mean_token_accuracy": 0.6998533725738525, "num_tokens": 228427521.0, "step": 6975 }, { "epoch": 0.5788207977444233, "grad_norm": 0.9070400595664978, "learning_rate": 4.499337703553593e-05, "loss": 1.2105, "mean_token_accuracy": 0.6938962921500206, "num_tokens": 228590809.0, "step": 6980 }, { "epoch": 0.5792354258230368, "grad_norm": 0.8562132716178894, "learning_rate": 4.492137909413497e-05, "loss": 1.1862, "mean_token_accuracy": 0.693982158601284, "num_tokens": 228754649.0, "step": 6985 }, { "epoch": 0.5796500539016503, "grad_norm": 0.8953412175178528, "learning_rate": 4.484939179136804e-05, "loss": 1.1653, "mean_token_accuracy": 0.7018756113946438, "num_tokens": 228918489.0, "step": 6990 }, { "epoch": 0.5800646819802637, "grad_norm": 0.9159473776817322, "learning_rate": 4.477741527803322e-05, "loss": 1.2287, "mean_token_accuracy": 0.6866508081555367, "num_tokens": 229082268.0, "step": 6995 }, { "epoch": 0.5804793100588772, "grad_norm": 0.9010199308395386, "learning_rate": 4.4705449704906085e-05, "loss": 1.2679, "mean_token_accuracy": 0.6817631974816323, "num_tokens": 229246108.0, "step": 7000 }, { "epoch": 0.5808939381374907, "grad_norm": 0.8651865720748901, "learning_rate": 4.463349522273925e-05, "loss": 1.1858, "mean_token_accuracy": 0.6942204251885414, "num_tokens": 229409948.0, "step": 7005 }, { "epoch": 0.5813085662161042, "grad_norm": 0.9483944177627563, "learning_rate": 4.456155198226207e-05, "loss": 1.205, "mean_token_accuracy": 0.6894000515341758, "num_tokens": 229573788.0, "step": 7010 }, { "epoch": 0.5817231942947176, "grad_norm": 0.9426966905593872, "learning_rate": 4.4489620134180424e-05, "loss": 1.3329, "mean_token_accuracy": 0.668371208012104, "num_tokens": 229737628.0, "step": 7015 }, { "epoch": 0.5821378223733311, "grad_norm": 0.9136776924133301, "learning_rate": 4.441769982917626e-05, "loss": 1.2323, "mean_token_accuracy": 0.6856359332799912, "num_tokens": 229900650.0, "step": 7020 }, { "epoch": 0.5825524504519446, "grad_norm": 0.9698913097381592, "learning_rate": 4.434579121790735e-05, "loss": 1.1771, "mean_token_accuracy": 0.6947214111685753, "num_tokens": 230064490.0, "step": 7025 }, { "epoch": 0.5829670785305581, "grad_norm": 0.8887882828712463, "learning_rate": 4.4273894451007e-05, "loss": 1.2011, "mean_token_accuracy": 0.6927236124873162, "num_tokens": 230228330.0, "step": 7030 }, { "epoch": 0.5833817066091715, "grad_norm": 0.8867167234420776, "learning_rate": 4.420200967908373e-05, "loss": 1.1936, "mean_token_accuracy": 0.691513928771019, "num_tokens": 230392170.0, "step": 7035 }, { "epoch": 0.583796334687785, "grad_norm": 0.9249410033226013, "learning_rate": 4.413013705272084e-05, "loss": 1.2341, "mean_token_accuracy": 0.6920149102807045, "num_tokens": 230556010.0, "step": 7040 }, { "epoch": 0.5842109627663985, "grad_norm": 0.9408177733421326, "learning_rate": 4.405827672247628e-05, "loss": 1.2252, "mean_token_accuracy": 0.6882942304015159, "num_tokens": 230719850.0, "step": 7045 }, { "epoch": 0.584625590845012, "grad_norm": 0.9278060793876648, "learning_rate": 4.398642883888219e-05, "loss": 1.2217, "mean_token_accuracy": 0.6905486330389976, "num_tokens": 230883690.0, "step": 7050 }, { "epoch": 0.5850402189236255, "grad_norm": 0.9284443855285645, "learning_rate": 4.391459355244464e-05, "loss": 1.145, "mean_token_accuracy": 0.7007453590631485, "num_tokens": 231047530.0, "step": 7055 }, { "epoch": 0.585454847002239, "grad_norm": 0.9322700500488281, "learning_rate": 4.384277101364336e-05, "loss": 1.2248, "mean_token_accuracy": 0.6901820629835129, "num_tokens": 231211370.0, "step": 7060 }, { "epoch": 0.5858694750808525, "grad_norm": 0.9473124742507935, "learning_rate": 4.3770961372931305e-05, "loss": 1.1841, "mean_token_accuracy": 0.6927052780985832, "num_tokens": 231375210.0, "step": 7065 }, { "epoch": 0.586284103159466, "grad_norm": 0.9334824085235596, "learning_rate": 4.369916478073449e-05, "loss": 1.2233, "mean_token_accuracy": 0.6873633891344071, "num_tokens": 231538074.0, "step": 7070 }, { "epoch": 0.5866987312380795, "grad_norm": 0.9093325138092041, "learning_rate": 4.36273813874515e-05, "loss": 1.2381, "mean_token_accuracy": 0.6827529326081276, "num_tokens": 231701914.0, "step": 7075 }, { "epoch": 0.587113359316693, "grad_norm": 0.9013286232948303, "learning_rate": 4.355561134345336e-05, "loss": 1.1764, "mean_token_accuracy": 0.6988963022828102, "num_tokens": 231864747.0, "step": 7080 }, { "epoch": 0.5875279873953064, "grad_norm": 0.9422785639762878, "learning_rate": 4.348385479908309e-05, "loss": 1.1431, "mean_token_accuracy": 0.7011178985238076, "num_tokens": 232028475.0, "step": 7085 }, { "epoch": 0.5879426154739199, "grad_norm": 0.881239116191864, "learning_rate": 4.3412111904655414e-05, "loss": 1.1635, "mean_token_accuracy": 0.6978590875864029, "num_tokens": 232191579.0, "step": 7090 }, { "epoch": 0.5883572435525334, "grad_norm": 0.9406387209892273, "learning_rate": 4.3340382810456506e-05, "loss": 1.1867, "mean_token_accuracy": 0.6942754134535789, "num_tokens": 232355419.0, "step": 7095 }, { "epoch": 0.5887718716311469, "grad_norm": 0.949264407157898, "learning_rate": 4.326866766674362e-05, "loss": 1.2357, "mean_token_accuracy": 0.6850134387612343, "num_tokens": 232519259.0, "step": 7100 }, { "epoch": 0.5891864997097603, "grad_norm": 0.9393827319145203, "learning_rate": 4.3196966623744756e-05, "loss": 1.1852, "mean_token_accuracy": 0.6932001456618309, "num_tokens": 232683099.0, "step": 7105 }, { "epoch": 0.5896011277883738, "grad_norm": 0.8733914494514465, "learning_rate": 4.3125279831658386e-05, "loss": 1.2229, "mean_token_accuracy": 0.6882759019732475, "num_tokens": 232846939.0, "step": 7110 }, { "epoch": 0.5900157558669873, "grad_norm": 0.9470304846763611, "learning_rate": 4.3053607440653187e-05, "loss": 1.239, "mean_token_accuracy": 0.6860337257385254, "num_tokens": 233010779.0, "step": 7115 }, { "epoch": 0.5904303839456008, "grad_norm": 0.9031630754470825, "learning_rate": 4.298194960086758e-05, "loss": 1.1668, "mean_token_accuracy": 0.6998717039823532, "num_tokens": 233174619.0, "step": 7120 }, { "epoch": 0.5908450120242142, "grad_norm": 0.9249277114868164, "learning_rate": 4.291030646240955e-05, "loss": 1.2993, "mean_token_accuracy": 0.6730938404798508, "num_tokens": 233338459.0, "step": 7125 }, { "epoch": 0.5912596401028277, "grad_norm": 0.929760217666626, "learning_rate": 4.2838678175356285e-05, "loss": 1.2512, "mean_token_accuracy": 0.6843751162290573, "num_tokens": 233501361.0, "step": 7130 }, { "epoch": 0.5916742681814412, "grad_norm": 0.9114911556243896, "learning_rate": 4.276706488975388e-05, "loss": 1.0913, "mean_token_accuracy": 0.7118707254529, "num_tokens": 233665201.0, "step": 7135 }, { "epoch": 0.5920888962600547, "grad_norm": 0.9281850457191467, "learning_rate": 4.269546675561697e-05, "loss": 1.1847, "mean_token_accuracy": 0.6965053781867028, "num_tokens": 233829041.0, "step": 7140 }, { "epoch": 0.5925035243386683, "grad_norm": 0.9317891001701355, "learning_rate": 4.262388392292845e-05, "loss": 1.1841, "mean_token_accuracy": 0.7003548637032508, "num_tokens": 233992612.0, "step": 7145 }, { "epoch": 0.5929181524172817, "grad_norm": 0.895683765411377, "learning_rate": 4.2552316541639216e-05, "loss": 1.1809, "mean_token_accuracy": 0.6913734123110771, "num_tokens": 234156452.0, "step": 7150 }, { "epoch": 0.5933327804958952, "grad_norm": 0.8942002654075623, "learning_rate": 4.248076476166771e-05, "loss": 1.1815, "mean_token_accuracy": 0.6950024455785752, "num_tokens": 234320292.0, "step": 7155 }, { "epoch": 0.5937474085745087, "grad_norm": 0.9098842144012451, "learning_rate": 4.240922873289976e-05, "loss": 1.2284, "mean_token_accuracy": 0.6897869989275932, "num_tokens": 234484006.0, "step": 7160 }, { "epoch": 0.5941620366531222, "grad_norm": 0.9092161059379578, "learning_rate": 4.233770860518821e-05, "loss": 1.1719, "mean_token_accuracy": 0.6982835829257965, "num_tokens": 234647277.0, "step": 7165 }, { "epoch": 0.5945766647317356, "grad_norm": 0.9007692933082581, "learning_rate": 4.226620452835252e-05, "loss": 1.1977, "mean_token_accuracy": 0.6938905209302902, "num_tokens": 234811117.0, "step": 7170 }, { "epoch": 0.5949912928103491, "grad_norm": 0.9069666266441345, "learning_rate": 4.2194716652178576e-05, "loss": 1.2028, "mean_token_accuracy": 0.6928763464093208, "num_tokens": 234974957.0, "step": 7175 }, { "epoch": 0.5954059208889626, "grad_norm": 0.9262700080871582, "learning_rate": 4.2123245126418346e-05, "loss": 1.2319, "mean_token_accuracy": 0.6861192613840104, "num_tokens": 235138797.0, "step": 7180 }, { "epoch": 0.5958205489675761, "grad_norm": 0.8939736485481262, "learning_rate": 4.20517901007895e-05, "loss": 1.2209, "mean_token_accuracy": 0.68668133020401, "num_tokens": 235302637.0, "step": 7185 }, { "epoch": 0.5962351770461896, "grad_norm": 0.983397364616394, "learning_rate": 4.198035172497517e-05, "loss": 1.2777, "mean_token_accuracy": 0.6798264905810356, "num_tokens": 235466477.0, "step": 7190 }, { "epoch": 0.596649805124803, "grad_norm": 0.9213573336601257, "learning_rate": 4.190893014862362e-05, "loss": 1.2034, "mean_token_accuracy": 0.6935972616076469, "num_tokens": 235630317.0, "step": 7195 }, { "epoch": 0.5970644332034165, "grad_norm": 0.9221646189689636, "learning_rate": 4.183752552134791e-05, "loss": 1.1677, "mean_token_accuracy": 0.6981610432267189, "num_tokens": 235794157.0, "step": 7200 }, { "epoch": 0.59747906128203, "grad_norm": 1.1523939371109009, "learning_rate": 4.1766137992725576e-05, "loss": 1.1463, "mean_token_accuracy": 0.699657866358757, "num_tokens": 235957997.0, "step": 7205 }, { "epoch": 0.5978936893606435, "grad_norm": 0.926038384437561, "learning_rate": 4.169476771229835e-05, "loss": 1.2636, "mean_token_accuracy": 0.6807978987693787, "num_tokens": 236121837.0, "step": 7210 }, { "epoch": 0.5983083174392569, "grad_norm": 0.9002109169960022, "learning_rate": 4.1623414829571875e-05, "loss": 1.2146, "mean_token_accuracy": 0.6921409219503403, "num_tokens": 236285522.0, "step": 7215 }, { "epoch": 0.5987229455178704, "grad_norm": 0.8745740056037903, "learning_rate": 4.155207949401528e-05, "loss": 1.2097, "mean_token_accuracy": 0.6918316230177879, "num_tokens": 236449362.0, "step": 7220 }, { "epoch": 0.599137573596484, "grad_norm": 0.8774746060371399, "learning_rate": 4.1480761855060974e-05, "loss": 1.1393, "mean_token_accuracy": 0.7037817686796188, "num_tokens": 236613202.0, "step": 7225 }, { "epoch": 0.5995522016750975, "grad_norm": 0.9291770458221436, "learning_rate": 4.14094620621043e-05, "loss": 1.1597, "mean_token_accuracy": 0.7004643231630325, "num_tokens": 236777042.0, "step": 7230 }, { "epoch": 0.599966829753711, "grad_norm": 0.8415595889091492, "learning_rate": 4.133818026450318e-05, "loss": 1.1789, "mean_token_accuracy": 0.6951625302433968, "num_tokens": 236940770.0, "step": 7235 }, { "epoch": 0.6003814578323244, "grad_norm": 0.9406865239143372, "learning_rate": 4.1266916611577886e-05, "loss": 1.1178, "mean_token_accuracy": 0.7086510300636292, "num_tokens": 237104610.0, "step": 7240 }, { "epoch": 0.6007960859109379, "grad_norm": 0.9170796871185303, "learning_rate": 4.119567125261069e-05, "loss": 1.2775, "mean_token_accuracy": 0.6801114067435264, "num_tokens": 237267808.0, "step": 7245 }, { "epoch": 0.6012107139895514, "grad_norm": 0.9180638790130615, "learning_rate": 4.112444433684545e-05, "loss": 1.2122, "mean_token_accuracy": 0.689168743789196, "num_tokens": 237431242.0, "step": 7250 }, { "epoch": 0.6016253420681649, "grad_norm": 0.9154341220855713, "learning_rate": 4.105323601348749e-05, "loss": 1.2155, "mean_token_accuracy": 0.6859420806169509, "num_tokens": 237595082.0, "step": 7255 }, { "epoch": 0.6020399701467783, "grad_norm": 0.9194984436035156, "learning_rate": 4.098204643170316e-05, "loss": 1.1633, "mean_token_accuracy": 0.7001832857728004, "num_tokens": 237758922.0, "step": 7260 }, { "epoch": 0.6024545982253918, "grad_norm": 0.9072752594947815, "learning_rate": 4.091087574061952e-05, "loss": 1.2475, "mean_token_accuracy": 0.6822566166520119, "num_tokens": 237921973.0, "step": 7265 }, { "epoch": 0.6028692263040053, "grad_norm": 0.8539906740188599, "learning_rate": 4.083972408932407e-05, "loss": 1.1633, "mean_token_accuracy": 0.6977392688393593, "num_tokens": 238085383.0, "step": 7270 }, { "epoch": 0.6032838543826188, "grad_norm": 0.8923770785331726, "learning_rate": 4.076859162686446e-05, "loss": 1.1732, "mean_token_accuracy": 0.6982282474637032, "num_tokens": 238249223.0, "step": 7275 }, { "epoch": 0.6036984824612323, "grad_norm": 0.9751062393188477, "learning_rate": 4.069747850224811e-05, "loss": 1.2062, "mean_token_accuracy": 0.691937729716301, "num_tokens": 238412664.0, "step": 7280 }, { "epoch": 0.6041131105398457, "grad_norm": 0.8665412068367004, "learning_rate": 4.0626384864441925e-05, "loss": 1.1271, "mean_token_accuracy": 0.7078262493014336, "num_tokens": 238576504.0, "step": 7285 }, { "epoch": 0.6045277386184592, "grad_norm": 0.9097188711166382, "learning_rate": 4.0555310862372e-05, "loss": 1.2266, "mean_token_accuracy": 0.6905669540166854, "num_tokens": 238740344.0, "step": 7290 }, { "epoch": 0.6049423666970727, "grad_norm": 0.9177653789520264, "learning_rate": 4.0484256644923325e-05, "loss": 1.1188, "mean_token_accuracy": 0.7084032386541367, "num_tokens": 238904059.0, "step": 7295 }, { "epoch": 0.6053569947756862, "grad_norm": 0.8903054594993591, "learning_rate": 4.0413222360939395e-05, "loss": 1.1767, "mean_token_accuracy": 0.6976111918687821, "num_tokens": 239067899.0, "step": 7300 }, { "epoch": 0.6057716228542996, "grad_norm": 0.8641144037246704, "learning_rate": 4.034220815922199e-05, "loss": 1.2182, "mean_token_accuracy": 0.688984602689743, "num_tokens": 239231739.0, "step": 7305 }, { "epoch": 0.6061862509329132, "grad_norm": 0.9189815521240234, "learning_rate": 4.0271214188530804e-05, "loss": 1.1945, "mean_token_accuracy": 0.6914650112390518, "num_tokens": 239394817.0, "step": 7310 }, { "epoch": 0.6066008790115267, "grad_norm": 0.9175986051559448, "learning_rate": 4.020024059758313e-05, "loss": 1.2556, "mean_token_accuracy": 0.6834799647331238, "num_tokens": 239558657.0, "step": 7315 }, { "epoch": 0.6070155070901402, "grad_norm": 0.9445254802703857, "learning_rate": 4.012928753505362e-05, "loss": 1.0622, "mean_token_accuracy": 0.7175769805908203, "num_tokens": 239722497.0, "step": 7320 }, { "epoch": 0.6074301351687537, "grad_norm": 0.9159888625144958, "learning_rate": 4.00583551495739e-05, "loss": 1.2941, "mean_token_accuracy": 0.6788978517055512, "num_tokens": 239886337.0, "step": 7325 }, { "epoch": 0.6078447632473671, "grad_norm": 0.897580087184906, "learning_rate": 3.9987443589732256e-05, "loss": 1.1761, "mean_token_accuracy": 0.6998594805598259, "num_tokens": 240050177.0, "step": 7330 }, { "epoch": 0.6082593913259806, "grad_norm": 0.9680034518241882, "learning_rate": 3.9916553004073376e-05, "loss": 1.1336, "mean_token_accuracy": 0.7039039567112922, "num_tokens": 240214017.0, "step": 7335 }, { "epoch": 0.6086740194045941, "grad_norm": 0.9162238836288452, "learning_rate": 3.9845683541098013e-05, "loss": 1.2331, "mean_token_accuracy": 0.6869929172098637, "num_tokens": 240377857.0, "step": 7340 }, { "epoch": 0.6090886474832076, "grad_norm": 0.8927464485168457, "learning_rate": 3.977483534926267e-05, "loss": 1.1504, "mean_token_accuracy": 0.6996334314346313, "num_tokens": 240541697.0, "step": 7345 }, { "epoch": 0.609503275561821, "grad_norm": 0.9331327080726624, "learning_rate": 3.970400857697929e-05, "loss": 1.2533, "mean_token_accuracy": 0.6825652077794075, "num_tokens": 240704575.0, "step": 7350 }, { "epoch": 0.6099179036404345, "grad_norm": 0.9102768898010254, "learning_rate": 3.963320337261491e-05, "loss": 1.2194, "mean_token_accuracy": 0.6883186668157577, "num_tokens": 240868415.0, "step": 7355 }, { "epoch": 0.610332531719048, "grad_norm": 0.9383589029312134, "learning_rate": 3.9562419884491466e-05, "loss": 1.1959, "mean_token_accuracy": 0.6939088463783264, "num_tokens": 241032255.0, "step": 7360 }, { "epoch": 0.6107471597976615, "grad_norm": 0.9416522979736328, "learning_rate": 3.949165826088533e-05, "loss": 1.2391, "mean_token_accuracy": 0.6832233607769013, "num_tokens": 241196095.0, "step": 7365 }, { "epoch": 0.611161787876275, "grad_norm": 0.8662397265434265, "learning_rate": 3.94209186500271e-05, "loss": 1.2237, "mean_token_accuracy": 0.6945136860013008, "num_tokens": 241359935.0, "step": 7370 }, { "epoch": 0.6115764159548884, "grad_norm": 0.9300234913825989, "learning_rate": 3.93502012001013e-05, "loss": 1.1999, "mean_token_accuracy": 0.6895161271095276, "num_tokens": 241523775.0, "step": 7375 }, { "epoch": 0.6119910440335019, "grad_norm": 0.9370386004447937, "learning_rate": 3.927950605924593e-05, "loss": 1.2483, "mean_token_accuracy": 0.686803525686264, "num_tokens": 241687615.0, "step": 7380 }, { "epoch": 0.6124056721121154, "grad_norm": 0.8926957249641418, "learning_rate": 3.9208833375552366e-05, "loss": 1.1554, "mean_token_accuracy": 0.69775170981884, "num_tokens": 241851455.0, "step": 7385 }, { "epoch": 0.6128203001907289, "grad_norm": 0.9361787438392639, "learning_rate": 3.9138183297064894e-05, "loss": 1.247, "mean_token_accuracy": 0.6879825234413147, "num_tokens": 242014996.0, "step": 7390 }, { "epoch": 0.6132349282693424, "grad_norm": 0.9342592358589172, "learning_rate": 3.9067555971780425e-05, "loss": 1.201, "mean_token_accuracy": 0.6900476559996604, "num_tokens": 242178836.0, "step": 7395 }, { "epoch": 0.6136495563479559, "grad_norm": 0.9027369022369385, "learning_rate": 3.899695154764825e-05, "loss": 1.1817, "mean_token_accuracy": 0.6957172527909279, "num_tokens": 242342676.0, "step": 7400 }, { "epoch": 0.6140641844265694, "grad_norm": 0.890907883644104, "learning_rate": 3.892637017256967e-05, "loss": 1.1623, "mean_token_accuracy": 0.7006231635808945, "num_tokens": 242506516.0, "step": 7405 }, { "epoch": 0.6144788125051829, "grad_norm": 0.8817828297615051, "learning_rate": 3.88558119943977e-05, "loss": 1.1495, "mean_token_accuracy": 0.7012776091694832, "num_tokens": 242670001.0, "step": 7410 }, { "epoch": 0.6148934405837964, "grad_norm": 0.9252811670303345, "learning_rate": 3.878527716093673e-05, "loss": 1.2046, "mean_token_accuracy": 0.6961571365594864, "num_tokens": 242833841.0, "step": 7415 }, { "epoch": 0.6153080686624098, "grad_norm": 0.866140604019165, "learning_rate": 3.871476581994232e-05, "loss": 1.088, "mean_token_accuracy": 0.7118157342076301, "num_tokens": 242997681.0, "step": 7420 }, { "epoch": 0.6157226967410233, "grad_norm": 0.8810722827911377, "learning_rate": 3.864427811912078e-05, "loss": 1.1928, "mean_token_accuracy": 0.6912584364414215, "num_tokens": 243161071.0, "step": 7425 }, { "epoch": 0.6161373248196368, "grad_norm": 0.9337232112884521, "learning_rate": 3.8573814206128874e-05, "loss": 1.2626, "mean_token_accuracy": 0.6843047440052032, "num_tokens": 243324911.0, "step": 7430 }, { "epoch": 0.6165519528982503, "grad_norm": 0.9594038128852844, "learning_rate": 3.8503374228573566e-05, "loss": 1.2071, "mean_token_accuracy": 0.6930954113602639, "num_tokens": 243488240.0, "step": 7435 }, { "epoch": 0.6169665809768637, "grad_norm": 0.9113042950630188, "learning_rate": 3.8432958334011696e-05, "loss": 1.1896, "mean_token_accuracy": 0.6943609498441219, "num_tokens": 243652080.0, "step": 7440 }, { "epoch": 0.6173812090554772, "grad_norm": 0.9461240172386169, "learning_rate": 3.836256666994961e-05, "loss": 1.2719, "mean_token_accuracy": 0.683626589179039, "num_tokens": 243815920.0, "step": 7445 }, { "epoch": 0.6177958371340907, "grad_norm": 0.8677425384521484, "learning_rate": 3.8292199383842904e-05, "loss": 1.1222, "mean_token_accuracy": 0.702608747780323, "num_tokens": 243979760.0, "step": 7450 }, { "epoch": 0.6182104652127042, "grad_norm": 0.8997417688369751, "learning_rate": 3.8221856623096186e-05, "loss": 1.2245, "mean_token_accuracy": 0.6878299131989479, "num_tokens": 244143600.0, "step": 7455 }, { "epoch": 0.6186250932913177, "grad_norm": 0.9174249172210693, "learning_rate": 3.815153853506255e-05, "loss": 1.2138, "mean_token_accuracy": 0.6931146115064621, "num_tokens": 244307440.0, "step": 7460 }, { "epoch": 0.6190397213699311, "grad_norm": 0.8721328973770142, "learning_rate": 3.808124526704352e-05, "loss": 1.1536, "mean_token_accuracy": 0.6994314640760422, "num_tokens": 244470259.0, "step": 7465 }, { "epoch": 0.6194543494485446, "grad_norm": 0.903262197971344, "learning_rate": 3.801097696628859e-05, "loss": 1.2394, "mean_token_accuracy": 0.6869745880365372, "num_tokens": 244634099.0, "step": 7470 }, { "epoch": 0.6198689775271582, "grad_norm": 0.8871456384658813, "learning_rate": 3.7940733779994936e-05, "loss": 1.105, "mean_token_accuracy": 0.708999265730381, "num_tokens": 244797939.0, "step": 7475 }, { "epoch": 0.6202836056057717, "grad_norm": 0.897445559501648, "learning_rate": 3.7870515855307155e-05, "loss": 1.1674, "mean_token_accuracy": 0.6953873410820961, "num_tokens": 244961779.0, "step": 7480 }, { "epoch": 0.6206982336843851, "grad_norm": 0.9039589166641235, "learning_rate": 3.78003233393169e-05, "loss": 1.2352, "mean_token_accuracy": 0.6875746801495553, "num_tokens": 245125126.0, "step": 7485 }, { "epoch": 0.6211128617629986, "grad_norm": 0.9061029553413391, "learning_rate": 3.773015637906263e-05, "loss": 1.119, "mean_token_accuracy": 0.7055691704154015, "num_tokens": 245288633.0, "step": 7490 }, { "epoch": 0.6215274898416121, "grad_norm": 0.889281153678894, "learning_rate": 3.7660015121529214e-05, "loss": 1.1886, "mean_token_accuracy": 0.6930413022637367, "num_tokens": 245452473.0, "step": 7495 }, { "epoch": 0.6219421179202256, "grad_norm": 0.9166731834411621, "learning_rate": 3.758989971364774e-05, "loss": 1.2139, "mean_token_accuracy": 0.6898704841732979, "num_tokens": 245616313.0, "step": 7500 }, { "epoch": 0.6223567459988391, "grad_norm": 0.862187922000885, "learning_rate": 3.7519810302295136e-05, "loss": 1.1025, "mean_token_accuracy": 0.7072091907262802, "num_tokens": 245780153.0, "step": 7505 }, { "epoch": 0.6227713740774525, "grad_norm": 0.8542613387107849, "learning_rate": 3.744974703429382e-05, "loss": 1.2232, "mean_token_accuracy": 0.6925891965627671, "num_tokens": 245943993.0, "step": 7510 }, { "epoch": 0.623186002156066, "grad_norm": 0.9268004894256592, "learning_rate": 3.737971005641149e-05, "loss": 1.1749, "mean_token_accuracy": 0.6948831617832184, "num_tokens": 246107510.0, "step": 7515 }, { "epoch": 0.6236006302346795, "grad_norm": 0.8638924360275269, "learning_rate": 3.730969951536081e-05, "loss": 1.2193, "mean_token_accuracy": 0.6944892466068268, "num_tokens": 246271350.0, "step": 7520 }, { "epoch": 0.624015258313293, "grad_norm": 0.8861833810806274, "learning_rate": 3.723971555779896e-05, "loss": 1.248, "mean_token_accuracy": 0.6873167157173157, "num_tokens": 246435190.0, "step": 7525 }, { "epoch": 0.6244298863919064, "grad_norm": 0.8997588753700256, "learning_rate": 3.716975833032752e-05, "loss": 1.2002, "mean_token_accuracy": 0.69580889493227, "num_tokens": 246599030.0, "step": 7530 }, { "epoch": 0.6248445144705199, "grad_norm": 0.9198819994926453, "learning_rate": 3.7099827979492075e-05, "loss": 1.2284, "mean_token_accuracy": 0.6875794202089309, "num_tokens": 246762870.0, "step": 7535 }, { "epoch": 0.6252591425491334, "grad_norm": 0.9142041802406311, "learning_rate": 3.702992465178182e-05, "loss": 1.2455, "mean_token_accuracy": 0.6857679948210716, "num_tokens": 246926240.0, "step": 7540 }, { "epoch": 0.6256737706277469, "grad_norm": 0.9179061651229858, "learning_rate": 3.696004849362946e-05, "loss": 1.2106, "mean_token_accuracy": 0.6883247837424278, "num_tokens": 247090080.0, "step": 7545 }, { "epoch": 0.6260883987063603, "grad_norm": 0.8800495862960815, "learning_rate": 3.689019965141069e-05, "loss": 1.1264, "mean_token_accuracy": 0.7046978026628494, "num_tokens": 247253601.0, "step": 7550 }, { "epoch": 0.6265030267849738, "grad_norm": 0.9119208455085754, "learning_rate": 3.682037827144409e-05, "loss": 1.1777, "mean_token_accuracy": 0.6985948160290718, "num_tokens": 247417441.0, "step": 7555 }, { "epoch": 0.6269176548635874, "grad_norm": 0.8702818751335144, "learning_rate": 3.675058449999057e-05, "loss": 1.1369, "mean_token_accuracy": 0.7018633931875229, "num_tokens": 247581281.0, "step": 7560 }, { "epoch": 0.6273322829422009, "grad_norm": 0.9180110692977905, "learning_rate": 3.668081848325333e-05, "loss": 1.2648, "mean_token_accuracy": 0.6834799602627755, "num_tokens": 247745121.0, "step": 7565 }, { "epoch": 0.6277469110208144, "grad_norm": 0.9151014685630798, "learning_rate": 3.661108036737737e-05, "loss": 1.1606, "mean_token_accuracy": 0.7009775161743164, "num_tokens": 247908961.0, "step": 7570 }, { "epoch": 0.6281615390994278, "grad_norm": 0.94527268409729, "learning_rate": 3.654137029844924e-05, "loss": 1.213, "mean_token_accuracy": 0.6887867733836174, "num_tokens": 248072066.0, "step": 7575 }, { "epoch": 0.6285761671780413, "grad_norm": 0.9195386171340942, "learning_rate": 3.647168842249679e-05, "loss": 1.1822, "mean_token_accuracy": 0.693835535645485, "num_tokens": 248235906.0, "step": 7580 }, { "epoch": 0.6289907952566548, "grad_norm": 0.8834486603736877, "learning_rate": 3.640203488548876e-05, "loss": 1.1563, "mean_token_accuracy": 0.7015090450644493, "num_tokens": 248399746.0, "step": 7585 }, { "epoch": 0.6294054233352683, "grad_norm": 0.9106930494308472, "learning_rate": 3.633240983333452e-05, "loss": 1.1919, "mean_token_accuracy": 0.6935850411653519, "num_tokens": 248563586.0, "step": 7590 }, { "epoch": 0.6298200514138818, "grad_norm": 0.9122629165649414, "learning_rate": 3.6262813411883814e-05, "loss": 1.1649, "mean_token_accuracy": 0.6981880038976669, "num_tokens": 248726869.0, "step": 7595 }, { "epoch": 0.6302346794924952, "grad_norm": 0.896998941898346, "learning_rate": 3.6193245766926406e-05, "loss": 1.1874, "mean_token_accuracy": 0.6955461889505387, "num_tokens": 248890709.0, "step": 7600 }, { "epoch": 0.6306493075711087, "grad_norm": 0.903556764125824, "learning_rate": 3.612370704419172e-05, "loss": 1.1902, "mean_token_accuracy": 0.6891190126538277, "num_tokens": 249054549.0, "step": 7605 }, { "epoch": 0.6310639356497222, "grad_norm": 0.9326213002204895, "learning_rate": 3.6054197389348665e-05, "loss": 1.2256, "mean_token_accuracy": 0.6877138823270798, "num_tokens": 249216858.0, "step": 7610 }, { "epoch": 0.6314785637283357, "grad_norm": 0.9058736562728882, "learning_rate": 3.598471694800523e-05, "loss": 1.1588, "mean_token_accuracy": 0.7015294149518013, "num_tokens": 249380685.0, "step": 7615 }, { "epoch": 0.6318931918069491, "grad_norm": 0.8819628357887268, "learning_rate": 3.591526586570818e-05, "loss": 1.1887, "mean_token_accuracy": 0.6957048639655113, "num_tokens": 249543792.0, "step": 7620 }, { "epoch": 0.6323078198855626, "grad_norm": 0.9046173095703125, "learning_rate": 3.584584428794284e-05, "loss": 1.1736, "mean_token_accuracy": 0.697366812825203, "num_tokens": 249707632.0, "step": 7625 }, { "epoch": 0.6327224479641761, "grad_norm": 0.9316004514694214, "learning_rate": 3.5776452360132674e-05, "loss": 1.093, "mean_token_accuracy": 0.7076551809906959, "num_tokens": 249871472.0, "step": 7630 }, { "epoch": 0.6331370760427896, "grad_norm": 0.8980688452720642, "learning_rate": 3.57070902276391e-05, "loss": 1.1912, "mean_token_accuracy": 0.6930290833115578, "num_tokens": 250035312.0, "step": 7635 }, { "epoch": 0.633551704121403, "grad_norm": 0.8699662685394287, "learning_rate": 3.563775803576102e-05, "loss": 1.1337, "mean_token_accuracy": 0.7062133401632309, "num_tokens": 250199152.0, "step": 7640 }, { "epoch": 0.6339663322000166, "grad_norm": 0.9415832161903381, "learning_rate": 3.5568455929734703e-05, "loss": 1.2639, "mean_token_accuracy": 0.6828629016876221, "num_tokens": 250362992.0, "step": 7645 }, { "epoch": 0.6343809602786301, "grad_norm": 0.9067273139953613, "learning_rate": 3.549918405473338e-05, "loss": 1.1426, "mean_token_accuracy": 0.7032135829329491, "num_tokens": 250526832.0, "step": 7650 }, { "epoch": 0.6347955883572436, "grad_norm": 0.8815768957138062, "learning_rate": 3.542994255586691e-05, "loss": 1.1942, "mean_token_accuracy": 0.6964015141129494, "num_tokens": 250690672.0, "step": 7655 }, { "epoch": 0.6352102164358571, "grad_norm": 0.890948474407196, "learning_rate": 3.5360731578181586e-05, "loss": 1.1244, "mean_token_accuracy": 0.7033846527338028, "num_tokens": 250854512.0, "step": 7660 }, { "epoch": 0.6356248445144705, "grad_norm": 0.8967106342315674, "learning_rate": 3.529155126665972e-05, "loss": 1.2503, "mean_token_accuracy": 0.6870967745780945, "num_tokens": 251018352.0, "step": 7665 }, { "epoch": 0.636039472593084, "grad_norm": 0.9507739543914795, "learning_rate": 3.522240176621938e-05, "loss": 1.2102, "mean_token_accuracy": 0.6890212625265122, "num_tokens": 251182192.0, "step": 7670 }, { "epoch": 0.6364541006716975, "grad_norm": 0.9710748195648193, "learning_rate": 3.5153283221714114e-05, "loss": 1.1734, "mean_token_accuracy": 0.6927552953362465, "num_tokens": 251345096.0, "step": 7675 }, { "epoch": 0.636868728750311, "grad_norm": 0.881964921951294, "learning_rate": 3.5084195777932655e-05, "loss": 1.1581, "mean_token_accuracy": 0.6974370807409287, "num_tokens": 251508807.0, "step": 7680 }, { "epoch": 0.6372833568289245, "grad_norm": 0.8972564339637756, "learning_rate": 3.5015139579598506e-05, "loss": 1.2058, "mean_token_accuracy": 0.6977211624383927, "num_tokens": 251672647.0, "step": 7685 }, { "epoch": 0.6376979849075379, "grad_norm": 0.8787938356399536, "learning_rate": 3.494611477136978e-05, "loss": 1.2091, "mean_token_accuracy": 0.689198437333107, "num_tokens": 251836487.0, "step": 7690 }, { "epoch": 0.6381126129861514, "grad_norm": 0.9102070927619934, "learning_rate": 3.4877121497838786e-05, "loss": 1.1796, "mean_token_accuracy": 0.6987903207540512, "num_tokens": 252000327.0, "step": 7695 }, { "epoch": 0.6385272410647649, "grad_norm": 0.9172378182411194, "learning_rate": 3.480815990353186e-05, "loss": 1.2567, "mean_token_accuracy": 0.6841764420270919, "num_tokens": 252164167.0, "step": 7700 }, { "epoch": 0.6389418691433784, "grad_norm": 0.9535180330276489, "learning_rate": 3.473923013290887e-05, "loss": 1.1415, "mean_token_accuracy": 0.7059567451477051, "num_tokens": 252328007.0, "step": 7705 }, { "epoch": 0.6393564972219918, "grad_norm": 0.8775321841239929, "learning_rate": 3.467033233036309e-05, "loss": 1.1707, "mean_token_accuracy": 0.698680354654789, "num_tokens": 252491847.0, "step": 7710 }, { "epoch": 0.6397711253006053, "grad_norm": 0.9459161162376404, "learning_rate": 3.4601466640220825e-05, "loss": 1.1673, "mean_token_accuracy": 0.6997861638665199, "num_tokens": 252655687.0, "step": 7715 }, { "epoch": 0.6401857533792188, "grad_norm": 0.9370214343070984, "learning_rate": 3.453263320674105e-05, "loss": 1.2108, "mean_token_accuracy": 0.6920271262526512, "num_tokens": 252819527.0, "step": 7720 }, { "epoch": 0.6406003814578324, "grad_norm": 0.955539882183075, "learning_rate": 3.446383217411526e-05, "loss": 1.1547, "mean_token_accuracy": 0.7013013228774071, "num_tokens": 252983367.0, "step": 7725 }, { "epoch": 0.6410150095364459, "grad_norm": 0.8662726283073425, "learning_rate": 3.439506368646701e-05, "loss": 1.1857, "mean_token_accuracy": 0.6947641745209694, "num_tokens": 253147207.0, "step": 7730 }, { "epoch": 0.6414296376150593, "grad_norm": 0.8785839676856995, "learning_rate": 3.4326327887851686e-05, "loss": 1.1357, "mean_token_accuracy": 0.7032488837838173, "num_tokens": 253310982.0, "step": 7735 }, { "epoch": 0.6418442656936728, "grad_norm": 0.9216163754463196, "learning_rate": 3.4257624922256244e-05, "loss": 1.1401, "mean_token_accuracy": 0.7013868510723114, "num_tokens": 253474822.0, "step": 7740 }, { "epoch": 0.6422588937722863, "grad_norm": 0.9228615164756775, "learning_rate": 3.418895493359882e-05, "loss": 1.2087, "mean_token_accuracy": 0.6918560594320298, "num_tokens": 253638662.0, "step": 7745 }, { "epoch": 0.6426735218508998, "grad_norm": 0.8969170451164246, "learning_rate": 3.412031806572847e-05, "loss": 1.1717, "mean_token_accuracy": 0.6972568422555924, "num_tokens": 253802502.0, "step": 7750 }, { "epoch": 0.6430881499295132, "grad_norm": 0.8896551132202148, "learning_rate": 3.4051714462424874e-05, "loss": 1.2208, "mean_token_accuracy": 0.6876038581132888, "num_tokens": 253966342.0, "step": 7755 }, { "epoch": 0.6435027780081267, "grad_norm": 0.8922243118286133, "learning_rate": 3.398314426739807e-05, "loss": 1.1533, "mean_token_accuracy": 0.7022177428007126, "num_tokens": 254130182.0, "step": 7760 }, { "epoch": 0.6439174060867402, "grad_norm": 0.8776208758354187, "learning_rate": 3.391460762428803e-05, "loss": 1.1752, "mean_token_accuracy": 0.7000305414199829, "num_tokens": 254294022.0, "step": 7765 }, { "epoch": 0.6443320341653537, "grad_norm": 0.8624290227890015, "learning_rate": 3.384610467666453e-05, "loss": 1.1958, "mean_token_accuracy": 0.6932917907834053, "num_tokens": 254457862.0, "step": 7770 }, { "epoch": 0.6447466622439672, "grad_norm": 0.8558380603790283, "learning_rate": 3.377763556802668e-05, "loss": 1.1781, "mean_token_accuracy": 0.6946969717741013, "num_tokens": 254621702.0, "step": 7775 }, { "epoch": 0.6451612903225806, "grad_norm": 0.8980141878128052, "learning_rate": 3.37092004418028e-05, "loss": 1.0717, "mean_token_accuracy": 0.7139019802212715, "num_tokens": 254784915.0, "step": 7780 }, { "epoch": 0.6455759184011941, "grad_norm": 0.9248932600021362, "learning_rate": 3.3640799441349935e-05, "loss": 1.2491, "mean_token_accuracy": 0.6859604060649872, "num_tokens": 254948755.0, "step": 7785 }, { "epoch": 0.6459905464798076, "grad_norm": 0.8932434916496277, "learning_rate": 3.357243270995368e-05, "loss": 1.1461, "mean_token_accuracy": 0.7035068452358246, "num_tokens": 255112595.0, "step": 7790 }, { "epoch": 0.6464051745584211, "grad_norm": 0.8438484072685242, "learning_rate": 3.3504100390827856e-05, "loss": 1.1113, "mean_token_accuracy": 0.7079789817333222, "num_tokens": 255276435.0, "step": 7795 }, { "epoch": 0.6468198026370345, "grad_norm": 0.9122107625007629, "learning_rate": 3.3435802627114146e-05, "loss": 1.1611, "mean_token_accuracy": 0.697667233645916, "num_tokens": 255439429.0, "step": 7800 }, { "epoch": 0.647234430715648, "grad_norm": 0.8767837882041931, "learning_rate": 3.336753956188192e-05, "loss": 1.1916, "mean_token_accuracy": 0.6942631945014, "num_tokens": 255603269.0, "step": 7805 }, { "epoch": 0.6476490587942616, "grad_norm": 0.9044637680053711, "learning_rate": 3.329931133812783e-05, "loss": 1.2343, "mean_token_accuracy": 0.6886852413415909, "num_tokens": 255767109.0, "step": 7810 }, { "epoch": 0.6480636868728751, "grad_norm": 0.9283355474472046, "learning_rate": 3.323111809877552e-05, "loss": 1.2322, "mean_token_accuracy": 0.6858137831091881, "num_tokens": 255930949.0, "step": 7815 }, { "epoch": 0.6484783149514886, "grad_norm": 0.8792169690132141, "learning_rate": 3.3162959986675357e-05, "loss": 1.2008, "mean_token_accuracy": 0.6928946733474731, "num_tokens": 256094789.0, "step": 7820 }, { "epoch": 0.648892943030102, "grad_norm": 0.9532812833786011, "learning_rate": 3.309483714460417e-05, "loss": 1.1927, "mean_token_accuracy": 0.6942570865154266, "num_tokens": 256258629.0, "step": 7825 }, { "epoch": 0.6493075711087155, "grad_norm": 0.8612990379333496, "learning_rate": 3.302674971526485e-05, "loss": 1.1689, "mean_token_accuracy": 0.7000855296850205, "num_tokens": 256422469.0, "step": 7830 }, { "epoch": 0.649722199187329, "grad_norm": 0.8923850059509277, "learning_rate": 3.295869784128611e-05, "loss": 1.1242, "mean_token_accuracy": 0.7061583563685417, "num_tokens": 256586309.0, "step": 7835 }, { "epoch": 0.6501368272659425, "grad_norm": 0.9322280287742615, "learning_rate": 3.2890681665222226e-05, "loss": 1.2528, "mean_token_accuracy": 0.6825391009449959, "num_tokens": 256750149.0, "step": 7840 }, { "epoch": 0.6505514553445559, "grad_norm": 0.9143825173377991, "learning_rate": 3.282270132955266e-05, "loss": 1.1736, "mean_token_accuracy": 0.7003849029541016, "num_tokens": 256913989.0, "step": 7845 }, { "epoch": 0.6509660834231694, "grad_norm": 0.919293224811554, "learning_rate": 3.275475697668178e-05, "loss": 1.229, "mean_token_accuracy": 0.6859237551689148, "num_tokens": 257077829.0, "step": 7850 }, { "epoch": 0.6513807115017829, "grad_norm": 0.9100547432899475, "learning_rate": 3.2686848748938615e-05, "loss": 1.2084, "mean_token_accuracy": 0.6919843584299088, "num_tokens": 257241669.0, "step": 7855 }, { "epoch": 0.6517953395803964, "grad_norm": 0.9073319435119629, "learning_rate": 3.261897678857651e-05, "loss": 1.2239, "mean_token_accuracy": 0.6890473529696465, "num_tokens": 257405149.0, "step": 7860 }, { "epoch": 0.6522099676590098, "grad_norm": 0.8998532891273499, "learning_rate": 3.255114123777282e-05, "loss": 1.1502, "mean_token_accuracy": 0.6979288876056671, "num_tokens": 257568989.0, "step": 7865 }, { "epoch": 0.6526245957376233, "grad_norm": 0.9466356635093689, "learning_rate": 3.2483342238628645e-05, "loss": 1.2448, "mean_token_accuracy": 0.6834188640117645, "num_tokens": 257732829.0, "step": 7870 }, { "epoch": 0.6530392238162368, "grad_norm": 0.9228238463401794, "learning_rate": 3.2415579933168525e-05, "loss": 1.1342, "mean_token_accuracy": 0.7012707725167274, "num_tokens": 257896669.0, "step": 7875 }, { "epoch": 0.6534538518948503, "grad_norm": 0.8968457579612732, "learning_rate": 3.234785446334009e-05, "loss": 1.1727, "mean_token_accuracy": 0.6980144158005714, "num_tokens": 258060509.0, "step": 7880 }, { "epoch": 0.6538684799734638, "grad_norm": 0.9200351238250732, "learning_rate": 3.228016597101387e-05, "loss": 1.1889, "mean_token_accuracy": 0.6939210638403892, "num_tokens": 258224349.0, "step": 7885 }, { "epoch": 0.6542831080520772, "grad_norm": 0.9175406694412231, "learning_rate": 3.221251459798291e-05, "loss": 1.2323, "mean_token_accuracy": 0.6847568452358246, "num_tokens": 258388189.0, "step": 7890 }, { "epoch": 0.6546977361306908, "grad_norm": 0.8689358234405518, "learning_rate": 3.214490048596246e-05, "loss": 1.1572, "mean_token_accuracy": 0.700342133641243, "num_tokens": 258552029.0, "step": 7895 }, { "epoch": 0.6551123642093043, "grad_norm": 0.8892029523849487, "learning_rate": 3.2077323776589766e-05, "loss": 1.1111, "mean_token_accuracy": 0.7068059653043747, "num_tokens": 258715869.0, "step": 7900 }, { "epoch": 0.6555269922879178, "grad_norm": 0.8861289024353027, "learning_rate": 3.200978461142371e-05, "loss": 1.1604, "mean_token_accuracy": 0.6996456414461136, "num_tokens": 258879709.0, "step": 7905 }, { "epoch": 0.6559416203665313, "grad_norm": 0.9374545812606812, "learning_rate": 3.1942283131944525e-05, "loss": 1.1941, "mean_token_accuracy": 0.6943059071898461, "num_tokens": 259042763.0, "step": 7910 }, { "epoch": 0.6563562484451447, "grad_norm": 0.889559268951416, "learning_rate": 3.1874819479553484e-05, "loss": 1.1385, "mean_token_accuracy": 0.7040811315178871, "num_tokens": 259206603.0, "step": 7915 }, { "epoch": 0.6567708765237582, "grad_norm": 0.9265528321266174, "learning_rate": 3.180739379557266e-05, "loss": 1.1488, "mean_token_accuracy": 0.7060422763228417, "num_tokens": 259370443.0, "step": 7920 }, { "epoch": 0.6571855046023717, "grad_norm": 0.9090298414230347, "learning_rate": 3.1740006221244546e-05, "loss": 1.1003, "mean_token_accuracy": 0.7120295718312264, "num_tokens": 259534283.0, "step": 7925 }, { "epoch": 0.6576001326809852, "grad_norm": 0.909954845905304, "learning_rate": 3.1672656897731825e-05, "loss": 1.0904, "mean_token_accuracy": 0.7156341642141342, "num_tokens": 259698123.0, "step": 7930 }, { "epoch": 0.6580147607595986, "grad_norm": 0.8400858044624329, "learning_rate": 3.160534596611704e-05, "loss": 1.152, "mean_token_accuracy": 0.7007453545928002, "num_tokens": 259861963.0, "step": 7935 }, { "epoch": 0.6584293888382121, "grad_norm": 0.9129824042320251, "learning_rate": 3.153807356740235e-05, "loss": 1.1336, "mean_token_accuracy": 0.7012524455785751, "num_tokens": 260025803.0, "step": 7940 }, { "epoch": 0.6588440169168256, "grad_norm": 0.9285549521446228, "learning_rate": 3.147083984250914e-05, "loss": 1.206, "mean_token_accuracy": 0.6904447689652443, "num_tokens": 260189643.0, "step": 7945 }, { "epoch": 0.6592586449954391, "grad_norm": 0.897851824760437, "learning_rate": 3.1403644932277814e-05, "loss": 1.1734, "mean_token_accuracy": 0.6981427192687988, "num_tokens": 260353483.0, "step": 7950 }, { "epoch": 0.6596732730740525, "grad_norm": 0.8849973082542419, "learning_rate": 3.1336488977467484e-05, "loss": 1.1127, "mean_token_accuracy": 0.7087732166051864, "num_tokens": 260517323.0, "step": 7955 }, { "epoch": 0.660087901152666, "grad_norm": 0.924141526222229, "learning_rate": 3.126937211875559e-05, "loss": 1.1991, "mean_token_accuracy": 0.691159576177597, "num_tokens": 260681163.0, "step": 7960 }, { "epoch": 0.6605025292312795, "grad_norm": 0.912146806716919, "learning_rate": 3.1202294496737764e-05, "loss": 1.2283, "mean_token_accuracy": 0.6835349485278129, "num_tokens": 260845003.0, "step": 7965 }, { "epoch": 0.660917157309893, "grad_norm": 0.9447025656700134, "learning_rate": 3.113525625192739e-05, "loss": 1.1328, "mean_token_accuracy": 0.7048875838518143, "num_tokens": 261008843.0, "step": 7970 }, { "epoch": 0.6613317853885065, "grad_norm": 0.936392068862915, "learning_rate": 3.106825752475537e-05, "loss": 1.1901, "mean_token_accuracy": 0.6934811815619468, "num_tokens": 261172683.0, "step": 7975 }, { "epoch": 0.66174641346712, "grad_norm": 0.9893807172775269, "learning_rate": 3.100129845556982e-05, "loss": 1.1741, "mean_token_accuracy": 0.6976661816239357, "num_tokens": 261336523.0, "step": 7980 }, { "epoch": 0.6621610415457335, "grad_norm": 0.9306498169898987, "learning_rate": 3.093437918463582e-05, "loss": 1.1309, "mean_token_accuracy": 0.7051431089639664, "num_tokens": 261499679.0, "step": 7985 }, { "epoch": 0.662575669624347, "grad_norm": 0.9055891036987305, "learning_rate": 3.086749985213506e-05, "loss": 1.1941, "mean_token_accuracy": 0.6949699714779853, "num_tokens": 261663044.0, "step": 7990 }, { "epoch": 0.6629902977029605, "grad_norm": 0.9195916652679443, "learning_rate": 3.0800660598165535e-05, "loss": 1.19, "mean_token_accuracy": 0.697262954711914, "num_tokens": 261826884.0, "step": 7995 }, { "epoch": 0.663404925781574, "grad_norm": 0.8760239481925964, "learning_rate": 3.0733861562741294e-05, "loss": 1.222, "mean_token_accuracy": 0.6897421807050705, "num_tokens": 261990724.0, "step": 8000 }, { "epoch": 0.6638195538601874, "grad_norm": 0.9419845342636108, "learning_rate": 3.066710288579221e-05, "loss": 1.1852, "mean_token_accuracy": 0.695423997938633, "num_tokens": 262154564.0, "step": 8005 }, { "epoch": 0.6642341819388009, "grad_norm": 0.9906468987464905, "learning_rate": 3.0600384707163524e-05, "loss": 1.229, "mean_token_accuracy": 0.6863575249910354, "num_tokens": 262318404.0, "step": 8010 }, { "epoch": 0.6646488100174144, "grad_norm": 0.8945165276527405, "learning_rate": 3.053370716661565e-05, "loss": 1.1661, "mean_token_accuracy": 0.6972568422555924, "num_tokens": 262482244.0, "step": 8015 }, { "epoch": 0.6650634380960279, "grad_norm": 0.909670352935791, "learning_rate": 3.046707040382396e-05, "loss": 1.2078, "mean_token_accuracy": 0.6953690111637115, "num_tokens": 262646084.0, "step": 8020 }, { "epoch": 0.6654780661746413, "grad_norm": 0.9329404234886169, "learning_rate": 3.0400474558378278e-05, "loss": 1.1376, "mean_token_accuracy": 0.7053213611245155, "num_tokens": 262809924.0, "step": 8025 }, { "epoch": 0.6658926942532548, "grad_norm": 0.9019607305526733, "learning_rate": 3.033391976978282e-05, "loss": 1.1496, "mean_token_accuracy": 0.705315251648426, "num_tokens": 262973764.0, "step": 8030 }, { "epoch": 0.6663073223318683, "grad_norm": 0.9323384165763855, "learning_rate": 3.0267406177455758e-05, "loss": 1.1324, "mean_token_accuracy": 0.7034274145960808, "num_tokens": 263137604.0, "step": 8035 }, { "epoch": 0.6667219504104818, "grad_norm": 0.8958032131195068, "learning_rate": 3.0200933920728935e-05, "loss": 1.0873, "mean_token_accuracy": 0.710343350470066, "num_tokens": 263301444.0, "step": 8040 }, { "epoch": 0.6671365784890952, "grad_norm": 0.9433259963989258, "learning_rate": 3.013450313884766e-05, "loss": 1.2628, "mean_token_accuracy": 0.6852602422237396, "num_tokens": 263465171.0, "step": 8045 }, { "epoch": 0.6675512065677087, "grad_norm": 0.9961844682693481, "learning_rate": 3.006811397097033e-05, "loss": 1.169, "mean_token_accuracy": 0.7004521027207374, "num_tokens": 263629011.0, "step": 8050 }, { "epoch": 0.6679658346463222, "grad_norm": 0.9597964882850647, "learning_rate": 3.0001766556168188e-05, "loss": 1.2416, "mean_token_accuracy": 0.6857465758919716, "num_tokens": 263792851.0, "step": 8055 }, { "epoch": 0.6683804627249358, "grad_norm": 0.8648970723152161, "learning_rate": 2.9935461033424972e-05, "loss": 1.1102, "mean_token_accuracy": 0.713789102435112, "num_tokens": 263956691.0, "step": 8060 }, { "epoch": 0.6687950908035493, "grad_norm": 0.9382487535476685, "learning_rate": 2.9869197541636713e-05, "loss": 1.1416, "mean_token_accuracy": 0.7014784932136535, "num_tokens": 264120531.0, "step": 8065 }, { "epoch": 0.6692097188821627, "grad_norm": 0.9381263852119446, "learning_rate": 2.9802976219611388e-05, "loss": 1.2104, "mean_token_accuracy": 0.6897899955511093, "num_tokens": 264284175.0, "step": 8070 }, { "epoch": 0.6696243469607762, "grad_norm": 0.9110860824584961, "learning_rate": 2.97367972060686e-05, "loss": 1.2149, "mean_token_accuracy": 0.6906097263097764, "num_tokens": 264448015.0, "step": 8075 }, { "epoch": 0.6700389750393897, "grad_norm": 0.9007151126861572, "learning_rate": 2.9670660639639354e-05, "loss": 1.1291, "mean_token_accuracy": 0.7080767348408699, "num_tokens": 264611855.0, "step": 8080 }, { "epoch": 0.6704536031180032, "grad_norm": 0.9092766046524048, "learning_rate": 2.9604566658865762e-05, "loss": 1.2117, "mean_token_accuracy": 0.6890212625265122, "num_tokens": 264775695.0, "step": 8085 }, { "epoch": 0.6708682311966166, "grad_norm": 0.9114570617675781, "learning_rate": 2.9538515402200672e-05, "loss": 1.114, "mean_token_accuracy": 0.707172529399395, "num_tokens": 264939535.0, "step": 8090 }, { "epoch": 0.6712828592752301, "grad_norm": 0.9584678411483765, "learning_rate": 2.9472507008007462e-05, "loss": 1.1718, "mean_token_accuracy": 0.6979105576872826, "num_tokens": 265103375.0, "step": 8095 }, { "epoch": 0.6716974873538436, "grad_norm": 0.8790441155433655, "learning_rate": 2.9406541614559757e-05, "loss": 1.2092, "mean_token_accuracy": 0.6879531264305114, "num_tokens": 265266705.0, "step": 8100 }, { "epoch": 0.6721121154324571, "grad_norm": 0.9176095724105835, "learning_rate": 2.934061936004102e-05, "loss": 1.1874, "mean_token_accuracy": 0.6954189702868462, "num_tokens": 265430067.0, "step": 8105 }, { "epoch": 0.6725267435110706, "grad_norm": 0.9208274483680725, "learning_rate": 2.927474038254443e-05, "loss": 1.1584, "mean_token_accuracy": 0.7010539382696152, "num_tokens": 265593552.0, "step": 8110 }, { "epoch": 0.672941371589684, "grad_norm": 0.9400938153266907, "learning_rate": 2.920890482007248e-05, "loss": 1.1014, "mean_token_accuracy": 0.7097007974982261, "num_tokens": 265756985.0, "step": 8115 }, { "epoch": 0.6733559996682975, "grad_norm": 0.9142690300941467, "learning_rate": 2.9143112810536688e-05, "loss": 1.1734, "mean_token_accuracy": 0.6966886594891548, "num_tokens": 265920825.0, "step": 8120 }, { "epoch": 0.673770627746911, "grad_norm": 0.9190114736557007, "learning_rate": 2.9077364491757387e-05, "loss": 1.2588, "mean_token_accuracy": 0.6858443230390548, "num_tokens": 266084665.0, "step": 8125 }, { "epoch": 0.6741852558255245, "grad_norm": 0.9296127557754517, "learning_rate": 2.9011660001463327e-05, "loss": 1.1336, "mean_token_accuracy": 0.7009439036250115, "num_tokens": 266247905.0, "step": 8130 }, { "epoch": 0.6745998839041379, "grad_norm": 1.4097745418548584, "learning_rate": 2.8945999477291564e-05, "loss": 1.1725, "mean_token_accuracy": 0.693255127966404, "num_tokens": 266411745.0, "step": 8135 }, { "epoch": 0.6750145119827514, "grad_norm": 0.8844364881515503, "learning_rate": 2.8880383056786897e-05, "loss": 1.1612, "mean_token_accuracy": 0.6995601192116737, "num_tokens": 266575585.0, "step": 8140 }, { "epoch": 0.675429140061365, "grad_norm": 0.8976459503173828, "learning_rate": 2.8814810877401828e-05, "loss": 1.2036, "mean_token_accuracy": 0.6910068362951278, "num_tokens": 266739425.0, "step": 8145 }, { "epoch": 0.6758437681399785, "grad_norm": 0.9473207592964172, "learning_rate": 2.8749283076496226e-05, "loss": 1.209, "mean_token_accuracy": 0.6893572807312012, "num_tokens": 266903265.0, "step": 8150 }, { "epoch": 0.676258396218592, "grad_norm": 0.8655648827552795, "learning_rate": 2.8683799791336874e-05, "loss": 1.0945, "mean_token_accuracy": 0.7166544482111931, "num_tokens": 267067105.0, "step": 8155 }, { "epoch": 0.6766730242972054, "grad_norm": 0.9129595756530762, "learning_rate": 2.8618361159097396e-05, "loss": 1.1446, "mean_token_accuracy": 0.7012891039252281, "num_tokens": 267230945.0, "step": 8160 }, { "epoch": 0.6770876523758189, "grad_norm": 0.8981407284736633, "learning_rate": 2.8552967316857847e-05, "loss": 1.2631, "mean_token_accuracy": 0.6817509770393372, "num_tokens": 267394785.0, "step": 8165 }, { "epoch": 0.6775022804544324, "grad_norm": 0.9718103408813477, "learning_rate": 2.848761840160447e-05, "loss": 1.2535, "mean_token_accuracy": 0.6781463786959648, "num_tokens": 267558625.0, "step": 8170 }, { "epoch": 0.6779169085330459, "grad_norm": 0.8931992053985596, "learning_rate": 2.842231455022938e-05, "loss": 1.1518, "mean_token_accuracy": 0.7029997572302819, "num_tokens": 267722465.0, "step": 8175 }, { "epoch": 0.6783315366116593, "grad_norm": 0.8879521489143372, "learning_rate": 2.8357055899530305e-05, "loss": 1.1829, "mean_token_accuracy": 0.6984604090452194, "num_tokens": 267886305.0, "step": 8180 }, { "epoch": 0.6787461646902728, "grad_norm": 0.9124054312705994, "learning_rate": 2.8291842586210284e-05, "loss": 1.2134, "mean_token_accuracy": 0.6883736565709114, "num_tokens": 268050145.0, "step": 8185 }, { "epoch": 0.6791607927688863, "grad_norm": 0.9303012490272522, "learning_rate": 2.8226674746877363e-05, "loss": 1.2363, "mean_token_accuracy": 0.6868585079908371, "num_tokens": 268213985.0, "step": 8190 }, { "epoch": 0.6795754208474998, "grad_norm": 0.9055581092834473, "learning_rate": 2.8161552518044365e-05, "loss": 1.114, "mean_token_accuracy": 0.7071786433458328, "num_tokens": 268377825.0, "step": 8195 }, { "epoch": 0.6799900489261133, "grad_norm": 0.9333614706993103, "learning_rate": 2.809647603612855e-05, "loss": 1.1981, "mean_token_accuracy": 0.6952223852276802, "num_tokens": 268541665.0, "step": 8200 }, { "epoch": 0.6804046770047267, "grad_norm": 0.8959496021270752, "learning_rate": 2.8031445437451352e-05, "loss": 1.1051, "mean_token_accuracy": 0.711937926709652, "num_tokens": 268705505.0, "step": 8205 }, { "epoch": 0.6808193050833402, "grad_norm": 0.8913466930389404, "learning_rate": 2.7966460858238076e-05, "loss": 1.1174, "mean_token_accuracy": 0.7062072291970253, "num_tokens": 268869345.0, "step": 8210 }, { "epoch": 0.6812339331619537, "grad_norm": 0.8874464631080627, "learning_rate": 2.790152243461765e-05, "loss": 1.1403, "mean_token_accuracy": 0.7017900750041008, "num_tokens": 269033185.0, "step": 8215 }, { "epoch": 0.6816485612405672, "grad_norm": 0.9019583463668823, "learning_rate": 2.783663030262229e-05, "loss": 1.1616, "mean_token_accuracy": 0.6987719938158989, "num_tokens": 269197025.0, "step": 8220 }, { "epoch": 0.6820631893191806, "grad_norm": 0.8874391913414001, "learning_rate": 2.7771784598187268e-05, "loss": 1.156, "mean_token_accuracy": 0.6987349942326546, "num_tokens": 269360192.0, "step": 8225 }, { "epoch": 0.6824778173977942, "grad_norm": 0.8777316808700562, "learning_rate": 2.7706985457150597e-05, "loss": 1.1307, "mean_token_accuracy": 0.7065371468663215, "num_tokens": 269524032.0, "step": 8230 }, { "epoch": 0.6828924454764077, "grad_norm": 0.9662749171257019, "learning_rate": 2.7642233015252683e-05, "loss": 1.2603, "mean_token_accuracy": 0.6818487271666527, "num_tokens": 269687872.0, "step": 8235 }, { "epoch": 0.6833070735550212, "grad_norm": 0.9199495315551758, "learning_rate": 2.7577527408136217e-05, "loss": 1.18, "mean_token_accuracy": 0.6951735138893127, "num_tokens": 269851712.0, "step": 8240 }, { "epoch": 0.6837217016336347, "grad_norm": 0.9042023420333862, "learning_rate": 2.7512868771345723e-05, "loss": 1.1748, "mean_token_accuracy": 0.7006720416247845, "num_tokens": 270015552.0, "step": 8245 }, { "epoch": 0.6841363297122481, "grad_norm": 0.9222347140312195, "learning_rate": 2.744825724032731e-05, "loss": 1.146, "mean_token_accuracy": 0.698130975663662, "num_tokens": 270178894.0, "step": 8250 }, { "epoch": 0.6845509577908616, "grad_norm": 0.9103004336357117, "learning_rate": 2.738369295042843e-05, "loss": 1.2015, "mean_token_accuracy": 0.6903470143675804, "num_tokens": 270342734.0, "step": 8255 }, { "epoch": 0.6849655858694751, "grad_norm": 0.9190084934234619, "learning_rate": 2.731917603689763e-05, "loss": 1.1617, "mean_token_accuracy": 0.7091275677084923, "num_tokens": 270506574.0, "step": 8260 }, { "epoch": 0.6853802139480886, "grad_norm": 0.9093053936958313, "learning_rate": 2.7254706634884125e-05, "loss": 1.1341, "mean_token_accuracy": 0.7053946748375892, "num_tokens": 270670414.0, "step": 8265 }, { "epoch": 0.685794842026702, "grad_norm": 0.8801981210708618, "learning_rate": 2.719028487943763e-05, "loss": 1.1643, "mean_token_accuracy": 0.7008492186665535, "num_tokens": 270834254.0, "step": 8270 }, { "epoch": 0.6862094701053155, "grad_norm": 0.9740326404571533, "learning_rate": 2.7125910905508102e-05, "loss": 1.2164, "mean_token_accuracy": 0.6922470659017563, "num_tokens": 270998094.0, "step": 8275 }, { "epoch": 0.686624098183929, "grad_norm": 0.8929706811904907, "learning_rate": 2.7061584847945376e-05, "loss": 1.0785, "mean_token_accuracy": 0.7141678869724274, "num_tokens": 271161934.0, "step": 8280 }, { "epoch": 0.6870387262625425, "grad_norm": 0.8799037337303162, "learning_rate": 2.699730684149886e-05, "loss": 1.1977, "mean_token_accuracy": 0.6915811315178871, "num_tokens": 271325774.0, "step": 8285 }, { "epoch": 0.687453354341156, "grad_norm": 0.9009878039360046, "learning_rate": 2.6933077020817344e-05, "loss": 1.2165, "mean_token_accuracy": 0.6927935481071472, "num_tokens": 271488598.0, "step": 8290 }, { "epoch": 0.6878679824197694, "grad_norm": 0.9551199674606323, "learning_rate": 2.686889552044875e-05, "loss": 1.1944, "mean_token_accuracy": 0.6921798631548881, "num_tokens": 271652438.0, "step": 8295 }, { "epoch": 0.6882826104983829, "grad_norm": 0.9601017236709595, "learning_rate": 2.680476247483965e-05, "loss": 1.2383, "mean_token_accuracy": 0.6872434094548225, "num_tokens": 271816278.0, "step": 8300 }, { "epoch": 0.6886972385769964, "grad_norm": 0.9434542059898376, "learning_rate": 2.6740678018335207e-05, "loss": 1.1403, "mean_token_accuracy": 0.7043584361672401, "num_tokens": 271979753.0, "step": 8305 }, { "epoch": 0.68911186665561, "grad_norm": 0.9041075706481934, "learning_rate": 2.6676642285178754e-05, "loss": 1.1983, "mean_token_accuracy": 0.6933589935302734, "num_tokens": 272143593.0, "step": 8310 }, { "epoch": 0.6895264947342234, "grad_norm": 0.9166638255119324, "learning_rate": 2.6612655409511584e-05, "loss": 1.2031, "mean_token_accuracy": 0.6886542037129402, "num_tokens": 272307373.0, "step": 8315 }, { "epoch": 0.6899411228128369, "grad_norm": 0.8909661769866943, "learning_rate": 2.6548717525372635e-05, "loss": 1.2413, "mean_token_accuracy": 0.6819937512278557, "num_tokens": 272470536.0, "step": 8320 }, { "epoch": 0.6903557508914504, "grad_norm": 0.9195128679275513, "learning_rate": 2.6484828766698212e-05, "loss": 1.1438, "mean_token_accuracy": 0.7059733361005783, "num_tokens": 272633354.0, "step": 8325 }, { "epoch": 0.6907703789700639, "grad_norm": 0.8586157560348511, "learning_rate": 2.642098926732172e-05, "loss": 1.0929, "mean_token_accuracy": 0.7127057388424873, "num_tokens": 272796585.0, "step": 8330 }, { "epoch": 0.6911850070486774, "grad_norm": 0.930367648601532, "learning_rate": 2.6357199160973377e-05, "loss": 1.2428, "mean_token_accuracy": 0.6862353429198265, "num_tokens": 272960425.0, "step": 8335 }, { "epoch": 0.6915996351272908, "grad_norm": 0.9563949704170227, "learning_rate": 2.6293458581279938e-05, "loss": 1.2148, "mean_token_accuracy": 0.6948497071862221, "num_tokens": 273124265.0, "step": 8340 }, { "epoch": 0.6920142632059043, "grad_norm": 0.9224836230278015, "learning_rate": 2.6229767661764392e-05, "loss": 1.173, "mean_token_accuracy": 0.6997882291674614, "num_tokens": 273287618.0, "step": 8345 }, { "epoch": 0.6924288912845178, "grad_norm": 0.945375919342041, "learning_rate": 2.6166126535845715e-05, "loss": 1.1715, "mean_token_accuracy": 0.6987353324890136, "num_tokens": 273451458.0, "step": 8350 }, { "epoch": 0.6928435193631313, "grad_norm": 0.9309601187705994, "learning_rate": 2.6102535336838564e-05, "loss": 1.1151, "mean_token_accuracy": 0.7070197939872742, "num_tokens": 273615298.0, "step": 8355 }, { "epoch": 0.6932581474417447, "grad_norm": 0.9397746324539185, "learning_rate": 2.6038994197953036e-05, "loss": 1.1602, "mean_token_accuracy": 0.6988819643855095, "num_tokens": 273779138.0, "step": 8360 }, { "epoch": 0.6936727755203582, "grad_norm": 0.8752930760383606, "learning_rate": 2.597550325229433e-05, "loss": 1.1543, "mean_token_accuracy": 0.6997800603508949, "num_tokens": 273942978.0, "step": 8365 }, { "epoch": 0.6940874035989717, "grad_norm": 0.9306272864341736, "learning_rate": 2.591206263286252e-05, "loss": 1.1689, "mean_token_accuracy": 0.697134654223919, "num_tokens": 274106818.0, "step": 8370 }, { "epoch": 0.6945020316775852, "grad_norm": 0.9521862864494324, "learning_rate": 2.5848672472552253e-05, "loss": 1.2119, "mean_token_accuracy": 0.692894670367241, "num_tokens": 274270658.0, "step": 8375 }, { "epoch": 0.6949166597561987, "grad_norm": 0.8978354930877686, "learning_rate": 2.5785332904152475e-05, "loss": 1.1133, "mean_token_accuracy": 0.7113086462020874, "num_tokens": 274434498.0, "step": 8380 }, { "epoch": 0.6953312878348121, "grad_norm": 0.8853723406791687, "learning_rate": 2.572204406034615e-05, "loss": 1.1119, "mean_token_accuracy": 0.707936218380928, "num_tokens": 274598338.0, "step": 8385 }, { "epoch": 0.6957459159134256, "grad_norm": 0.9029449224472046, "learning_rate": 2.565880607371002e-05, "loss": 1.1074, "mean_token_accuracy": 0.7122495099902153, "num_tokens": 274762178.0, "step": 8390 }, { "epoch": 0.6961605439920392, "grad_norm": 0.9149707555770874, "learning_rate": 2.5595619076714173e-05, "loss": 1.1772, "mean_token_accuracy": 0.6978861212730407, "num_tokens": 274926018.0, "step": 8395 }, { "epoch": 0.6965751720706527, "grad_norm": 0.966590166091919, "learning_rate": 2.5532483201722052e-05, "loss": 1.1965, "mean_token_accuracy": 0.6957539081573486, "num_tokens": 275089858.0, "step": 8400 }, { "epoch": 0.6969898001492661, "grad_norm": 0.9344898462295532, "learning_rate": 2.5469398580989902e-05, "loss": 1.1724, "mean_token_accuracy": 0.6987475574016571, "num_tokens": 275253698.0, "step": 8405 }, { "epoch": 0.6974044282278796, "grad_norm": 0.9215954542160034, "learning_rate": 2.540636534666664e-05, "loss": 1.1521, "mean_token_accuracy": 0.6996761962771416, "num_tokens": 275417538.0, "step": 8410 }, { "epoch": 0.6978190563064931, "grad_norm": 0.8877071142196655, "learning_rate": 2.534338363079348e-05, "loss": 1.0889, "mean_token_accuracy": 0.7142167612910271, "num_tokens": 275581378.0, "step": 8415 }, { "epoch": 0.6982336843851066, "grad_norm": 0.9456600546836853, "learning_rate": 2.528045356530382e-05, "loss": 1.1511, "mean_token_accuracy": 0.69794110506773, "num_tokens": 275745218.0, "step": 8420 }, { "epoch": 0.6986483124637201, "grad_norm": 0.9057779908180237, "learning_rate": 2.5217575282022803e-05, "loss": 1.1854, "mean_token_accuracy": 0.6958027854561806, "num_tokens": 275909058.0, "step": 8425 }, { "epoch": 0.6990629405423335, "grad_norm": 0.8589238524436951, "learning_rate": 2.5154748912667036e-05, "loss": 1.111, "mean_token_accuracy": 0.7077101662755012, "num_tokens": 276072898.0, "step": 8430 }, { "epoch": 0.699477568620947, "grad_norm": 0.9326671957969666, "learning_rate": 2.5091974588844513e-05, "loss": 1.2035, "mean_token_accuracy": 0.6897669479250907, "num_tokens": 276236612.0, "step": 8435 }, { "epoch": 0.6998921966995605, "grad_norm": 0.927275538444519, "learning_rate": 2.5029252442054118e-05, "loss": 1.1811, "mean_token_accuracy": 0.6972201868891716, "num_tokens": 276400452.0, "step": 8440 }, { "epoch": 0.700306824778174, "grad_norm": 0.8658373951911926, "learning_rate": 2.4966582603685423e-05, "loss": 1.2059, "mean_token_accuracy": 0.6914638042449951, "num_tokens": 276563347.0, "step": 8445 }, { "epoch": 0.7007214528567874, "grad_norm": 0.9162196516990662, "learning_rate": 2.4903965205018448e-05, "loss": 1.151, "mean_token_accuracy": 0.6998167142271996, "num_tokens": 276727187.0, "step": 8450 }, { "epoch": 0.7011360809354009, "grad_norm": 0.9280889630317688, "learning_rate": 2.4841400377223422e-05, "loss": 1.2022, "mean_token_accuracy": 0.6918499559164047, "num_tokens": 276891027.0, "step": 8455 }, { "epoch": 0.7015507090140144, "grad_norm": 0.937462329864502, "learning_rate": 2.477888825136034e-05, "loss": 1.1621, "mean_token_accuracy": 0.6965909153223038, "num_tokens": 277054867.0, "step": 8460 }, { "epoch": 0.7019653370926279, "grad_norm": 0.9051955342292786, "learning_rate": 2.4716428958378866e-05, "loss": 1.1774, "mean_token_accuracy": 0.6993951588869095, "num_tokens": 277218707.0, "step": 8465 }, { "epoch": 0.7023799651712413, "grad_norm": 0.9229522347450256, "learning_rate": 2.4654022629117985e-05, "loss": 1.1614, "mean_token_accuracy": 0.6999572351574898, "num_tokens": 277382547.0, "step": 8470 }, { "epoch": 0.7027945932498548, "grad_norm": 0.9330226182937622, "learning_rate": 2.459166939430571e-05, "loss": 1.2024, "mean_token_accuracy": 0.6869904205203057, "num_tokens": 277546102.0, "step": 8475 }, { "epoch": 0.7032092213284684, "grad_norm": 0.8841614127159119, "learning_rate": 2.4529369384558865e-05, "loss": 1.165, "mean_token_accuracy": 0.701994250714779, "num_tokens": 277709789.0, "step": 8480 }, { "epoch": 0.7036238494070819, "grad_norm": 0.8814997673034668, "learning_rate": 2.4467122730382746e-05, "loss": 1.1611, "mean_token_accuracy": 0.7013685226440429, "num_tokens": 277873629.0, "step": 8485 }, { "epoch": 0.7040384774856954, "grad_norm": 0.8852235674858093, "learning_rate": 2.4404929562170902e-05, "loss": 1.2209, "mean_token_accuracy": 0.6892045468091965, "num_tokens": 278037469.0, "step": 8490 }, { "epoch": 0.7044531055643088, "grad_norm": 0.9323071241378784, "learning_rate": 2.4342790010204842e-05, "loss": 1.215, "mean_token_accuracy": 0.687761814892292, "num_tokens": 278200563.0, "step": 8495 }, { "epoch": 0.7048677336429223, "grad_norm": 0.9252358675003052, "learning_rate": 2.4280704204653738e-05, "loss": 1.1064, "mean_token_accuracy": 0.7062988758087159, "num_tokens": 278364403.0, "step": 8500 }, { "epoch": 0.7052823617215358, "grad_norm": 0.8709287643432617, "learning_rate": 2.4218672275574196e-05, "loss": 1.1719, "mean_token_accuracy": 0.6959982931613922, "num_tokens": 278528243.0, "step": 8505 }, { "epoch": 0.7056969898001493, "grad_norm": 0.8557941913604736, "learning_rate": 2.4156694352909957e-05, "loss": 1.148, "mean_token_accuracy": 0.7060178413987159, "num_tokens": 278692083.0, "step": 8510 }, { "epoch": 0.7061116178787628, "grad_norm": 0.9327511787414551, "learning_rate": 2.4094770566491627e-05, "loss": 1.1671, "mean_token_accuracy": 0.7007617250084877, "num_tokens": 278855386.0, "step": 8515 }, { "epoch": 0.7065262459573762, "grad_norm": 0.9525324702262878, "learning_rate": 2.4032901046036404e-05, "loss": 1.1744, "mean_token_accuracy": 0.6984604060649872, "num_tokens": 279019226.0, "step": 8520 }, { "epoch": 0.7069408740359897, "grad_norm": 0.9035742282867432, "learning_rate": 2.397108592114782e-05, "loss": 1.1417, "mean_token_accuracy": 0.7000122129917145, "num_tokens": 279183066.0, "step": 8525 }, { "epoch": 0.7073555021146032, "grad_norm": 0.9326964616775513, "learning_rate": 2.390932532131545e-05, "loss": 1.1748, "mean_token_accuracy": 0.6967069894075394, "num_tokens": 279346906.0, "step": 8530 }, { "epoch": 0.7077701301932167, "grad_norm": 0.9051055312156677, "learning_rate": 2.3847619375914686e-05, "loss": 1.1475, "mean_token_accuracy": 0.7067754149436951, "num_tokens": 279510746.0, "step": 8535 }, { "epoch": 0.7081847582718301, "grad_norm": 0.8472020626068115, "learning_rate": 2.378596821420634e-05, "loss": 1.2593, "mean_token_accuracy": 0.6837243407964706, "num_tokens": 279674586.0, "step": 8540 }, { "epoch": 0.7085993863504436, "grad_norm": 0.9158118367195129, "learning_rate": 2.3724371965336572e-05, "loss": 1.2266, "mean_token_accuracy": 0.6914833828806877, "num_tokens": 279838426.0, "step": 8545 }, { "epoch": 0.7090140144290571, "grad_norm": 0.9467946290969849, "learning_rate": 2.3662830758336453e-05, "loss": 1.1529, "mean_token_accuracy": 0.7002810403704643, "num_tokens": 280002266.0, "step": 8550 }, { "epoch": 0.7094286425076706, "grad_norm": 1.0291293859481812, "learning_rate": 2.360134472212176e-05, "loss": 1.2379, "mean_token_accuracy": 0.6841214552521706, "num_tokens": 280166106.0, "step": 8555 }, { "epoch": 0.7098432705862842, "grad_norm": 0.9345492124557495, "learning_rate": 2.35399139854927e-05, "loss": 1.2474, "mean_token_accuracy": 0.6842130973935128, "num_tokens": 280329946.0, "step": 8560 }, { "epoch": 0.7102578986648976, "grad_norm": 0.9244153499603271, "learning_rate": 2.347853867713365e-05, "loss": 1.1364, "mean_token_accuracy": 0.7075217753648758, "num_tokens": 280493579.0, "step": 8565 }, { "epoch": 0.7106725267435111, "grad_norm": 0.897363543510437, "learning_rate": 2.3417218925612877e-05, "loss": 1.1711, "mean_token_accuracy": 0.7002993687987328, "num_tokens": 280657419.0, "step": 8570 }, { "epoch": 0.7110871548221246, "grad_norm": 0.9287620186805725, "learning_rate": 2.3355954859382212e-05, "loss": 1.202, "mean_token_accuracy": 0.6938294187188149, "num_tokens": 280821259.0, "step": 8575 }, { "epoch": 0.7115017829007381, "grad_norm": 0.9229029417037964, "learning_rate": 2.329474660677693e-05, "loss": 1.2341, "mean_token_accuracy": 0.6841947734355927, "num_tokens": 280985099.0, "step": 8580 }, { "epoch": 0.7119164109793515, "grad_norm": 0.9154161810874939, "learning_rate": 2.3233594296015353e-05, "loss": 1.1241, "mean_token_accuracy": 0.7083195969462395, "num_tokens": 281148608.0, "step": 8585 }, { "epoch": 0.712331039057965, "grad_norm": 0.8730635643005371, "learning_rate": 2.317249805519856e-05, "loss": 1.1923, "mean_token_accuracy": 0.697110216319561, "num_tokens": 281312448.0, "step": 8590 }, { "epoch": 0.7127456671365785, "grad_norm": 0.9459541440010071, "learning_rate": 2.3111458012310227e-05, "loss": 1.1998, "mean_token_accuracy": 0.6961143687367439, "num_tokens": 281476288.0, "step": 8595 }, { "epoch": 0.713160295215192, "grad_norm": 0.9326469302177429, "learning_rate": 2.3050474295216364e-05, "loss": 1.1534, "mean_token_accuracy": 0.700861431658268, "num_tokens": 281640128.0, "step": 8600 }, { "epoch": 0.7135749232938055, "grad_norm": 0.8979618549346924, "learning_rate": 2.2989547031664856e-05, "loss": 1.108, "mean_token_accuracy": 0.7087976559996605, "num_tokens": 281803968.0, "step": 8605 }, { "epoch": 0.7139895513724189, "grad_norm": 0.9168345928192139, "learning_rate": 2.292867634928541e-05, "loss": 1.1346, "mean_token_accuracy": 0.7058467760682106, "num_tokens": 281967808.0, "step": 8610 }, { "epoch": 0.7144041794510324, "grad_norm": 0.9103457927703857, "learning_rate": 2.286786237558926e-05, "loss": 1.1073, "mean_token_accuracy": 0.7085715994238854, "num_tokens": 282131648.0, "step": 8615 }, { "epoch": 0.7148188075296459, "grad_norm": 0.967501699924469, "learning_rate": 2.2807105237968724e-05, "loss": 1.094, "mean_token_accuracy": 0.7083488360047341, "num_tokens": 282294691.0, "step": 8620 }, { "epoch": 0.7152334356082594, "grad_norm": 0.9350996017456055, "learning_rate": 2.2746405063697145e-05, "loss": 1.1911, "mean_token_accuracy": 0.6956928178668023, "num_tokens": 282458531.0, "step": 8625 }, { "epoch": 0.7156480636868728, "grad_norm": 0.9099265336990356, "learning_rate": 2.2685761979928506e-05, "loss": 1.2773, "mean_token_accuracy": 0.6796940915286541, "num_tokens": 282621927.0, "step": 8630 }, { "epoch": 0.7160626917654863, "grad_norm": 0.8858419060707092, "learning_rate": 2.2625176113697255e-05, "loss": 1.1896, "mean_token_accuracy": 0.6928777754306793, "num_tokens": 282785739.0, "step": 8635 }, { "epoch": 0.7164773198440998, "grad_norm": 0.8882941603660583, "learning_rate": 2.256464759191788e-05, "loss": 1.1034, "mean_token_accuracy": 0.7116385638713837, "num_tokens": 282949579.0, "step": 8640 }, { "epoch": 0.7168919479227134, "grad_norm": 0.9124554395675659, "learning_rate": 2.250417654138483e-05, "loss": 1.2252, "mean_token_accuracy": 0.6885630503296852, "num_tokens": 283113419.0, "step": 8645 }, { "epoch": 0.7173065760013269, "grad_norm": 0.885246217250824, "learning_rate": 2.2443763088772125e-05, "loss": 1.1348, "mean_token_accuracy": 0.7050769805908204, "num_tokens": 283277259.0, "step": 8650 }, { "epoch": 0.7177212040799403, "grad_norm": 0.9887017607688904, "learning_rate": 2.238340736063314e-05, "loss": 1.176, "mean_token_accuracy": 0.6959293812513352, "num_tokens": 283440874.0, "step": 8655 }, { "epoch": 0.7181358321585538, "grad_norm": 0.9638161659240723, "learning_rate": 2.2323109483400335e-05, "loss": 1.108, "mean_token_accuracy": 0.7070320084691047, "num_tokens": 283604714.0, "step": 8660 }, { "epoch": 0.7185504602371673, "grad_norm": 0.8681931495666504, "learning_rate": 2.2262869583384972e-05, "loss": 1.0963, "mean_token_accuracy": 0.7093477725982666, "num_tokens": 283767659.0, "step": 8665 }, { "epoch": 0.7189650883157808, "grad_norm": 0.8647438883781433, "learning_rate": 2.220268778677687e-05, "loss": 1.0793, "mean_token_accuracy": 0.7119557321071625, "num_tokens": 283931453.0, "step": 8670 }, { "epoch": 0.7193797163943942, "grad_norm": 0.9072765111923218, "learning_rate": 2.2142564219644136e-05, "loss": 1.2217, "mean_token_accuracy": 0.6886974617838859, "num_tokens": 284095293.0, "step": 8675 }, { "epoch": 0.7197943444730077, "grad_norm": 0.8717702627182007, "learning_rate": 2.208249900793289e-05, "loss": 1.176, "mean_token_accuracy": 0.7004154488444329, "num_tokens": 284259133.0, "step": 8680 }, { "epoch": 0.7202089725516212, "grad_norm": 0.9505860209465027, "learning_rate": 2.202249227746702e-05, "loss": 1.1514, "mean_token_accuracy": 0.7013074263930321, "num_tokens": 284422973.0, "step": 8685 }, { "epoch": 0.7206236006302347, "grad_norm": 0.9417359828948975, "learning_rate": 2.19625441539479e-05, "loss": 1.1373, "mean_token_accuracy": 0.7046844303607941, "num_tokens": 284586516.0, "step": 8690 }, { "epoch": 0.7210382287088482, "grad_norm": 0.8849696516990662, "learning_rate": 2.1902654762954143e-05, "loss": 1.1799, "mean_token_accuracy": 0.6978750191628933, "num_tokens": 284749368.0, "step": 8695 }, { "epoch": 0.7214528567874616, "grad_norm": 0.9590681791305542, "learning_rate": 2.1842824229941323e-05, "loss": 1.2275, "mean_token_accuracy": 0.6849340155720711, "num_tokens": 284913208.0, "step": 8700 }, { "epoch": 0.7218674848660751, "grad_norm": 0.9124518632888794, "learning_rate": 2.1783052680241718e-05, "loss": 1.1227, "mean_token_accuracy": 0.7079606577754021, "num_tokens": 285077048.0, "step": 8705 }, { "epoch": 0.7222821129446886, "grad_norm": 0.9214940667152405, "learning_rate": 2.1723340239064053e-05, "loss": 1.1775, "mean_token_accuracy": 0.695729473233223, "num_tokens": 285240888.0, "step": 8710 }, { "epoch": 0.7226967410233021, "grad_norm": 0.8921946287155151, "learning_rate": 2.1663687031493253e-05, "loss": 1.1235, "mean_token_accuracy": 0.7074291318655014, "num_tokens": 285404728.0, "step": 8715 }, { "epoch": 0.7231113691019155, "grad_norm": 0.8673714995384216, "learning_rate": 2.160409318249008e-05, "loss": 1.065, "mean_token_accuracy": 0.7187194541096688, "num_tokens": 285568568.0, "step": 8720 }, { "epoch": 0.723525997180529, "grad_norm": 0.9239243268966675, "learning_rate": 2.1544558816891075e-05, "loss": 1.1746, "mean_token_accuracy": 0.6981548488140106, "num_tokens": 285731942.0, "step": 8725 }, { "epoch": 0.7239406252591426, "grad_norm": 0.9728556871414185, "learning_rate": 2.14850840594081e-05, "loss": 1.127, "mean_token_accuracy": 0.7067754194140434, "num_tokens": 285895782.0, "step": 8730 }, { "epoch": 0.7243552533377561, "grad_norm": 0.9516790509223938, "learning_rate": 2.1425669034628122e-05, "loss": 1.0735, "mean_token_accuracy": 0.7161901235580445, "num_tokens": 286059622.0, "step": 8735 }, { "epoch": 0.7247698814163696, "grad_norm": 0.9084863662719727, "learning_rate": 2.136631386701306e-05, "loss": 1.0954, "mean_token_accuracy": 0.7144672483205795, "num_tokens": 286223462.0, "step": 8740 }, { "epoch": 0.725184509494983, "grad_norm": 0.9737844467163086, "learning_rate": 2.130701868089941e-05, "loss": 1.1406, "mean_token_accuracy": 0.7053213596343995, "num_tokens": 286387302.0, "step": 8745 }, { "epoch": 0.7255991375735965, "grad_norm": 0.9189149141311646, "learning_rate": 2.1247783600497984e-05, "loss": 1.181, "mean_token_accuracy": 0.6972201809287071, "num_tokens": 286551142.0, "step": 8750 }, { "epoch": 0.72601376565221, "grad_norm": 0.9415842890739441, "learning_rate": 2.1188608749893712e-05, "loss": 1.2636, "mean_token_accuracy": 0.6842436477541923, "num_tokens": 286714982.0, "step": 8755 }, { "epoch": 0.7264283937308235, "grad_norm": 0.9185128808021545, "learning_rate": 2.1129494253045396e-05, "loss": 1.1508, "mean_token_accuracy": 0.7027309387922287, "num_tokens": 286878822.0, "step": 8760 }, { "epoch": 0.7268430218094369, "grad_norm": 0.8949790000915527, "learning_rate": 2.1070440233785373e-05, "loss": 1.2114, "mean_token_accuracy": 0.6933162316679955, "num_tokens": 287042662.0, "step": 8765 }, { "epoch": 0.7272576498880504, "grad_norm": 0.9467188715934753, "learning_rate": 2.1011446815819257e-05, "loss": 1.1337, "mean_token_accuracy": 0.7046248733997345, "num_tokens": 287206502.0, "step": 8770 }, { "epoch": 0.7276722779666639, "grad_norm": 0.9147824048995972, "learning_rate": 2.0952514122725748e-05, "loss": 1.1576, "mean_token_accuracy": 0.7011363670229912, "num_tokens": 287370342.0, "step": 8775 }, { "epoch": 0.7280869060452774, "grad_norm": 0.8961211442947388, "learning_rate": 2.0893642277956404e-05, "loss": 1.2025, "mean_token_accuracy": 0.6929313272237778, "num_tokens": 287534182.0, "step": 8780 }, { "epoch": 0.7285015341238908, "grad_norm": 0.9158641695976257, "learning_rate": 2.0834831404835193e-05, "loss": 1.2276, "mean_token_accuracy": 0.6882025882601738, "num_tokens": 287698022.0, "step": 8785 }, { "epoch": 0.7289161622025043, "grad_norm": 0.9232227206230164, "learning_rate": 2.0776081626558437e-05, "loss": 1.2649, "mean_token_accuracy": 0.6843108534812927, "num_tokens": 287861862.0, "step": 8790 }, { "epoch": 0.7293307902811178, "grad_norm": 0.8580409288406372, "learning_rate": 2.0717393066194507e-05, "loss": 1.1158, "mean_token_accuracy": 0.7081439360976219, "num_tokens": 288025702.0, "step": 8795 }, { "epoch": 0.7297454183597313, "grad_norm": 0.9285319447517395, "learning_rate": 2.065876584668344e-05, "loss": 1.1391, "mean_token_accuracy": 0.7008980959653854, "num_tokens": 288189542.0, "step": 8800 }, { "epoch": 0.7301600464383448, "grad_norm": 0.962729275226593, "learning_rate": 2.0600200090836863e-05, "loss": 1.1618, "mean_token_accuracy": 0.6994929105043411, "num_tokens": 288353382.0, "step": 8805 }, { "epoch": 0.7305746745169582, "grad_norm": 0.9175639748573303, "learning_rate": 2.0541695921337605e-05, "loss": 1.1075, "mean_token_accuracy": 0.7102028340101242, "num_tokens": 288517222.0, "step": 8810 }, { "epoch": 0.7309893025955718, "grad_norm": 0.8901187181472778, "learning_rate": 2.0483253460739498e-05, "loss": 1.104, "mean_token_accuracy": 0.710874879360199, "num_tokens": 288681062.0, "step": 8815 }, { "epoch": 0.7314039306741853, "grad_norm": 0.9340787529945374, "learning_rate": 2.0424872831467106e-05, "loss": 1.1506, "mean_token_accuracy": 0.7019116520881653, "num_tokens": 288844722.0, "step": 8820 }, { "epoch": 0.7318185587527988, "grad_norm": 0.9090827703475952, "learning_rate": 2.0366554155815475e-05, "loss": 1.1108, "mean_token_accuracy": 0.7115530282258987, "num_tokens": 289008562.0, "step": 8825 }, { "epoch": 0.7322331868314123, "grad_norm": 0.8902605175971985, "learning_rate": 2.0308297555949857e-05, "loss": 1.1804, "mean_token_accuracy": 0.6916605569422245, "num_tokens": 289172402.0, "step": 8830 }, { "epoch": 0.7326478149100257, "grad_norm": 0.9276025891304016, "learning_rate": 2.025010315390548e-05, "loss": 1.1186, "mean_token_accuracy": 0.7068426206707954, "num_tokens": 289336242.0, "step": 8835 }, { "epoch": 0.7330624429886392, "grad_norm": 0.9215508103370667, "learning_rate": 2.0191971071587277e-05, "loss": 1.1487, "mean_token_accuracy": 0.7047592863440514, "num_tokens": 289500082.0, "step": 8840 }, { "epoch": 0.7334770710672527, "grad_norm": 0.9058172106742859, "learning_rate": 2.013390143076964e-05, "loss": 1.1378, "mean_token_accuracy": 0.6999755576252937, "num_tokens": 289663922.0, "step": 8845 }, { "epoch": 0.7338916991458662, "grad_norm": 0.9303228259086609, "learning_rate": 2.007589435309615e-05, "loss": 1.1803, "mean_token_accuracy": 0.6976417407393456, "num_tokens": 289827762.0, "step": 8850 }, { "epoch": 0.7343063272244796, "grad_norm": 0.9356986284255981, "learning_rate": 2.0017949960079334e-05, "loss": 1.1782, "mean_token_accuracy": 0.6956195041537285, "num_tokens": 289991602.0, "step": 8855 }, { "epoch": 0.7347209553030931, "grad_norm": 0.9283430576324463, "learning_rate": 1.9960068373100417e-05, "loss": 1.2187, "mean_token_accuracy": 0.6888685241341591, "num_tokens": 290155442.0, "step": 8860 }, { "epoch": 0.7351355833817066, "grad_norm": 0.9447711110115051, "learning_rate": 1.990224971340904e-05, "loss": 1.1276, "mean_token_accuracy": 0.7045149073004723, "num_tokens": 290319282.0, "step": 8865 }, { "epoch": 0.7355502114603201, "grad_norm": 0.9082170128822327, "learning_rate": 1.9844494102123045e-05, "loss": 1.0518, "mean_token_accuracy": 0.7191776633262634, "num_tokens": 290483122.0, "step": 8870 }, { "epoch": 0.7359648395389335, "grad_norm": 0.9077820777893066, "learning_rate": 1.9786801660228204e-05, "loss": 1.1465, "mean_token_accuracy": 0.7008919849991798, "num_tokens": 290646962.0, "step": 8875 }, { "epoch": 0.736379467617547, "grad_norm": 0.9205976128578186, "learning_rate": 1.9729172508577905e-05, "loss": 1.2184, "mean_token_accuracy": 0.6890579193830491, "num_tokens": 290810802.0, "step": 8880 }, { "epoch": 0.7367940956961605, "grad_norm": 0.9017412066459656, "learning_rate": 1.9671606767893046e-05, "loss": 1.1275, "mean_token_accuracy": 0.7070197999477387, "num_tokens": 290974642.0, "step": 8885 }, { "epoch": 0.737208723774774, "grad_norm": 0.8834683895111084, "learning_rate": 1.961410455876166e-05, "loss": 1.1985, "mean_token_accuracy": 0.6970735609531402, "num_tokens": 291138482.0, "step": 8890 }, { "epoch": 0.7376233518533876, "grad_norm": 0.8970118165016174, "learning_rate": 1.9556666001638635e-05, "loss": 1.0953, "mean_token_accuracy": 0.7111803531646729, "num_tokens": 291302322.0, "step": 8895 }, { "epoch": 0.738037979932001, "grad_norm": 0.8732209205627441, "learning_rate": 1.9499291216845578e-05, "loss": 1.1188, "mean_token_accuracy": 0.7073191553354263, "num_tokens": 291466162.0, "step": 8900 }, { "epoch": 0.7384526080106145, "grad_norm": 0.9126554131507874, "learning_rate": 1.944198032457053e-05, "loss": 1.1583, "mean_token_accuracy": 0.6986304759979248, "num_tokens": 291629209.0, "step": 8905 }, { "epoch": 0.738867236089228, "grad_norm": 0.933233380317688, "learning_rate": 1.9384733444867665e-05, "loss": 1.1907, "mean_token_accuracy": 0.696230448782444, "num_tokens": 291793049.0, "step": 8910 }, { "epoch": 0.7392818641678415, "grad_norm": 0.8659321665763855, "learning_rate": 1.9327550697656994e-05, "loss": 1.1368, "mean_token_accuracy": 0.7059353232383728, "num_tokens": 291956639.0, "step": 8915 }, { "epoch": 0.739696492246455, "grad_norm": 0.9404541254043579, "learning_rate": 1.927043220272431e-05, "loss": 1.2015, "mean_token_accuracy": 0.6902857288718224, "num_tokens": 292120389.0, "step": 8920 }, { "epoch": 0.7401111203250684, "grad_norm": 0.9204649925231934, "learning_rate": 1.9213378079720747e-05, "loss": 1.1809, "mean_token_accuracy": 0.6947702825069427, "num_tokens": 292284229.0, "step": 8925 }, { "epoch": 0.7405257484036819, "grad_norm": 0.9498615264892578, "learning_rate": 1.915638844816256e-05, "loss": 1.1896, "mean_token_accuracy": 0.6952529326081276, "num_tokens": 292448069.0, "step": 8930 }, { "epoch": 0.7409403764822954, "grad_norm": 0.835673987865448, "learning_rate": 1.9099463427430943e-05, "loss": 1.0706, "mean_token_accuracy": 0.714534455537796, "num_tokens": 292611909.0, "step": 8935 }, { "epoch": 0.7413550045609089, "grad_norm": 0.9525404572486877, "learning_rate": 1.9042603136771797e-05, "loss": 1.1649, "mean_token_accuracy": 0.6959555223584175, "num_tokens": 292775749.0, "step": 8940 }, { "epoch": 0.7417696326395223, "grad_norm": 0.98179692029953, "learning_rate": 1.8985807695295332e-05, "loss": 1.1578, "mean_token_accuracy": 0.7004763603210449, "num_tokens": 292939308.0, "step": 8945 }, { "epoch": 0.7421842607181358, "grad_norm": 0.9175640940666199, "learning_rate": 1.892907722197596e-05, "loss": 1.1706, "mean_token_accuracy": 0.7012402236461639, "num_tokens": 293103148.0, "step": 8950 }, { "epoch": 0.7425988887967493, "grad_norm": 0.8898962736129761, "learning_rate": 1.8872411835652005e-05, "loss": 1.1008, "mean_token_accuracy": 0.7081744864583015, "num_tokens": 293266988.0, "step": 8955 }, { "epoch": 0.7430135168753628, "grad_norm": 0.9193590879440308, "learning_rate": 1.881581165502543e-05, "loss": 1.1883, "mean_token_accuracy": 0.7016434505581856, "num_tokens": 293430828.0, "step": 8960 }, { "epoch": 0.7434281449539762, "grad_norm": 0.8816352486610413, "learning_rate": 1.8759276798661612e-05, "loss": 1.1365, "mean_token_accuracy": 0.7058162286877632, "num_tokens": 293594668.0, "step": 8965 }, { "epoch": 0.7438427730325897, "grad_norm": 0.8981600999832153, "learning_rate": 1.870280738498909e-05, "loss": 1.2026, "mean_token_accuracy": 0.6937744334340096, "num_tokens": 293758508.0, "step": 8970 }, { "epoch": 0.7442574011112032, "grad_norm": 0.9175354838371277, "learning_rate": 1.8646403532299316e-05, "loss": 1.1356, "mean_token_accuracy": 0.7080461919307709, "num_tokens": 293922348.0, "step": 8975 }, { "epoch": 0.7446720291898168, "grad_norm": 0.9029030799865723, "learning_rate": 1.8590065358746406e-05, "loss": 1.1872, "mean_token_accuracy": 0.6982954576611519, "num_tokens": 294086188.0, "step": 8980 }, { "epoch": 0.7450866572684303, "grad_norm": 0.9207746982574463, "learning_rate": 1.8533792982346877e-05, "loss": 1.0657, "mean_token_accuracy": 0.7148093849420547, "num_tokens": 294250028.0, "step": 8985 }, { "epoch": 0.7455012853470437, "grad_norm": 0.9070084691047668, "learning_rate": 1.8477586520979435e-05, "loss": 1.1542, "mean_token_accuracy": 0.7049059122800827, "num_tokens": 294413868.0, "step": 8990 }, { "epoch": 0.7459159134256572, "grad_norm": 0.9451984167098999, "learning_rate": 1.8421446092384693e-05, "loss": 1.14, "mean_token_accuracy": 0.7041361212730408, "num_tokens": 294577708.0, "step": 8995 }, { "epoch": 0.7463305415042707, "grad_norm": 0.879784882068634, "learning_rate": 1.836537181416495e-05, "loss": 1.1751, "mean_token_accuracy": 0.6972507297992706, "num_tokens": 294741548.0, "step": 9000 }, { "epoch": 0.7467451695828842, "grad_norm": 0.9778941869735718, "learning_rate": 1.830936380378393e-05, "loss": 1.1636, "mean_token_accuracy": 0.7022543981671333, "num_tokens": 294905388.0, "step": 9005 }, { "epoch": 0.7471597976614976, "grad_norm": 0.8739100098609924, "learning_rate": 1.8253422178566543e-05, "loss": 1.0982, "mean_token_accuracy": 0.7134714052081108, "num_tokens": 295069228.0, "step": 9010 }, { "epoch": 0.7475744257401111, "grad_norm": 0.9578571915626526, "learning_rate": 1.8197547055698622e-05, "loss": 1.2054, "mean_token_accuracy": 0.6938294231891632, "num_tokens": 295233068.0, "step": 9015 }, { "epoch": 0.7479890538187246, "grad_norm": 0.8959738612174988, "learning_rate": 1.814173855222671e-05, "loss": 1.1437, "mean_token_accuracy": 0.7040872409939766, "num_tokens": 295396908.0, "step": 9020 }, { "epoch": 0.7484036818973381, "grad_norm": 0.8838170766830444, "learning_rate": 1.808599678505779e-05, "loss": 1.1555, "mean_token_accuracy": 0.7020955502986908, "num_tokens": 295560748.0, "step": 9025 }, { "epoch": 0.7488183099759516, "grad_norm": 0.9568129181861877, "learning_rate": 1.8030321870959043e-05, "loss": 1.1724, "mean_token_accuracy": 0.6985031768679619, "num_tokens": 295724588.0, "step": 9030 }, { "epoch": 0.749232938054565, "grad_norm": 0.9240115284919739, "learning_rate": 1.797471392655763e-05, "loss": 1.1411, "mean_token_accuracy": 0.7033174470067024, "num_tokens": 295888428.0, "step": 9035 }, { "epoch": 0.7496475661331785, "grad_norm": 0.8960933089256287, "learning_rate": 1.7919173068340345e-05, "loss": 1.1646, "mean_token_accuracy": 0.7023196890950203, "num_tokens": 296051910.0, "step": 9040 }, { "epoch": 0.750062194211792, "grad_norm": 0.8688474297523499, "learning_rate": 1.7863699412653568e-05, "loss": 1.1728, "mean_token_accuracy": 0.6987781018018723, "num_tokens": 296215750.0, "step": 9045 }, { "epoch": 0.7504768222904055, "grad_norm": 0.9132707118988037, "learning_rate": 1.7808293075702832e-05, "loss": 1.1421, "mean_token_accuracy": 0.7063782960176468, "num_tokens": 296379590.0, "step": 9050 }, { "epoch": 0.7508914503690189, "grad_norm": 0.9477134346961975, "learning_rate": 1.7752954173552672e-05, "loss": 1.1669, "mean_token_accuracy": 0.6979960918426513, "num_tokens": 296543430.0, "step": 9055 }, { "epoch": 0.7513060784476324, "grad_norm": 0.9153250455856323, "learning_rate": 1.7697682822126312e-05, "loss": 1.2199, "mean_token_accuracy": 0.6940127104520798, "num_tokens": 296707270.0, "step": 9060 }, { "epoch": 0.751720706526246, "grad_norm": 0.9431553483009338, "learning_rate": 1.764247913720556e-05, "loss": 1.0696, "mean_token_accuracy": 0.7168010741472244, "num_tokens": 296871110.0, "step": 9065 }, { "epoch": 0.7521353346048595, "grad_norm": 0.9085553884506226, "learning_rate": 1.758734323443043e-05, "loss": 1.2406, "mean_token_accuracy": 0.6867057695984841, "num_tokens": 297034950.0, "step": 9070 }, { "epoch": 0.752549962683473, "grad_norm": 0.9111844897270203, "learning_rate": 1.7532275229298927e-05, "loss": 1.1737, "mean_token_accuracy": 0.6989797174930572, "num_tokens": 297198790.0, "step": 9075 }, { "epoch": 0.7529645907620864, "grad_norm": 0.9182848334312439, "learning_rate": 1.7477275237166834e-05, "loss": 1.1336, "mean_token_accuracy": 0.7052480444312096, "num_tokens": 297362630.0, "step": 9080 }, { "epoch": 0.7533792188406999, "grad_norm": 0.9506045579910278, "learning_rate": 1.742234337324753e-05, "loss": 1.1941, "mean_token_accuracy": 0.6953690081834794, "num_tokens": 297526470.0, "step": 9085 }, { "epoch": 0.7537938469193134, "grad_norm": 1.0101375579833984, "learning_rate": 1.7367479752611564e-05, "loss": 1.1485, "mean_token_accuracy": 0.7034396350383758, "num_tokens": 297690310.0, "step": 9090 }, { "epoch": 0.7542084749979269, "grad_norm": 0.9124699234962463, "learning_rate": 1.7312684490186597e-05, "loss": 1.2135, "mean_token_accuracy": 0.6892900764942169, "num_tokens": 297854150.0, "step": 9095 }, { "epoch": 0.7546231030765403, "grad_norm": 0.8840845823287964, "learning_rate": 1.7257957700757132e-05, "loss": 1.1423, "mean_token_accuracy": 0.7055901750922203, "num_tokens": 298017990.0, "step": 9100 }, { "epoch": 0.7550377311551538, "grad_norm": 0.9478710293769836, "learning_rate": 1.7203299498964143e-05, "loss": 1.2587, "mean_token_accuracy": 0.6825635373592377, "num_tokens": 298181830.0, "step": 9105 }, { "epoch": 0.7554523592337673, "grad_norm": 0.9232124090194702, "learning_rate": 1.7148709999304984e-05, "loss": 1.0747, "mean_token_accuracy": 0.7147482916712761, "num_tokens": 298345670.0, "step": 9110 }, { "epoch": 0.7558669873123808, "grad_norm": 0.9229820370674133, "learning_rate": 1.7094189316133075e-05, "loss": 1.1342, "mean_token_accuracy": 0.7042583122849464, "num_tokens": 298509510.0, "step": 9115 }, { "epoch": 0.7562816153909943, "grad_norm": 0.9188677668571472, "learning_rate": 1.7039737563657733e-05, "loss": 1.1156, "mean_token_accuracy": 0.705241933465004, "num_tokens": 298673350.0, "step": 9120 }, { "epoch": 0.7566962434696077, "grad_norm": 0.9146133661270142, "learning_rate": 1.698535485594378e-05, "loss": 1.0953, "mean_token_accuracy": 0.7126527383923531, "num_tokens": 298837190.0, "step": 9125 }, { "epoch": 0.7571108715482212, "grad_norm": 0.9183034300804138, "learning_rate": 1.693104130691148e-05, "loss": 1.1759, "mean_token_accuracy": 0.694373169541359, "num_tokens": 299001030.0, "step": 9130 }, { "epoch": 0.7575254996268347, "grad_norm": 0.9065912961959839, "learning_rate": 1.68767970303362e-05, "loss": 1.1027, "mean_token_accuracy": 0.7120051354169845, "num_tokens": 299164870.0, "step": 9135 }, { "epoch": 0.7579401277054482, "grad_norm": 0.9084035754203796, "learning_rate": 1.68226221398482e-05, "loss": 1.1369, "mean_token_accuracy": 0.6996546849608422, "num_tokens": 299327973.0, "step": 9140 }, { "epoch": 0.7583547557840618, "grad_norm": 0.942358136177063, "learning_rate": 1.6768516748932387e-05, "loss": 1.1368, "mean_token_accuracy": 0.702773705124855, "num_tokens": 299491813.0, "step": 9145 }, { "epoch": 0.7587693838626752, "grad_norm": 0.9485670328140259, "learning_rate": 1.6714480970928086e-05, "loss": 1.0961, "mean_token_accuracy": 0.7088871866464614, "num_tokens": 299655234.0, "step": 9150 }, { "epoch": 0.7591840119412887, "grad_norm": 0.9354593753814697, "learning_rate": 1.6660514919028795e-05, "loss": 1.1791, "mean_token_accuracy": 0.6979960888624192, "num_tokens": 299819074.0, "step": 9155 }, { "epoch": 0.7595986400199022, "grad_norm": 0.9166666865348816, "learning_rate": 1.660661870628195e-05, "loss": 1.1574, "mean_token_accuracy": 0.7055657356977463, "num_tokens": 299982914.0, "step": 9160 }, { "epoch": 0.7600132680985157, "grad_norm": 0.9199892282485962, "learning_rate": 1.655279244558869e-05, "loss": 1.1039, "mean_token_accuracy": 0.7136839970946312, "num_tokens": 300145961.0, "step": 9165 }, { "epoch": 0.7604278961771291, "grad_norm": 0.8845294117927551, "learning_rate": 1.649903624970361e-05, "loss": 1.0821, "mean_token_accuracy": 0.714443401992321, "num_tokens": 300309584.0, "step": 9170 }, { "epoch": 0.7608425242557426, "grad_norm": 0.9437231421470642, "learning_rate": 1.6445350231234557e-05, "loss": 1.0407, "mean_token_accuracy": 0.7223851427435874, "num_tokens": 300473424.0, "step": 9175 }, { "epoch": 0.7612571523343561, "grad_norm": 0.9445368647575378, "learning_rate": 1.6391734502642365e-05, "loss": 1.1781, "mean_token_accuracy": 0.699159836769104, "num_tokens": 300636243.0, "step": 9180 }, { "epoch": 0.7616717804129696, "grad_norm": 0.965095043182373, "learning_rate": 1.6338189176240565e-05, "loss": 1.0827, "mean_token_accuracy": 0.7107005223631859, "num_tokens": 300799759.0, "step": 9185 }, { "epoch": 0.762086408491583, "grad_norm": 0.917914628982544, "learning_rate": 1.628471436419532e-05, "loss": 1.1011, "mean_token_accuracy": 0.7113697439432144, "num_tokens": 300963599.0, "step": 9190 }, { "epoch": 0.7625010365701965, "grad_norm": 0.9634137749671936, "learning_rate": 1.6231310178525006e-05, "loss": 1.1234, "mean_token_accuracy": 0.7091825500130653, "num_tokens": 301127439.0, "step": 9195 }, { "epoch": 0.76291566464881, "grad_norm": 0.9900341033935547, "learning_rate": 1.6177976731100064e-05, "loss": 1.1586, "mean_token_accuracy": 0.6953079164028168, "num_tokens": 301291279.0, "step": 9200 }, { "epoch": 0.7633302927274235, "grad_norm": 0.9216656684875488, "learning_rate": 1.612471413364276e-05, "loss": 1.2361, "mean_token_accuracy": 0.6890090435743332, "num_tokens": 301455119.0, "step": 9205 }, { "epoch": 0.763744920806037, "grad_norm": 0.8715385794639587, "learning_rate": 1.607152249772694e-05, "loss": 1.0902, "mean_token_accuracy": 0.7106819450855255, "num_tokens": 301618391.0, "step": 9210 }, { "epoch": 0.7641595488846504, "grad_norm": 0.8799657225608826, "learning_rate": 1.6018401934777834e-05, "loss": 1.1037, "mean_token_accuracy": 0.7106854885816574, "num_tokens": 301782231.0, "step": 9215 }, { "epoch": 0.7645741769632639, "grad_norm": 0.9428990483283997, "learning_rate": 1.5965352556071695e-05, "loss": 1.1853, "mean_token_accuracy": 0.6966214567422867, "num_tokens": 301946071.0, "step": 9220 }, { "epoch": 0.7649888050418774, "grad_norm": 0.9601016640663147, "learning_rate": 1.5912374472735775e-05, "loss": 1.1461, "mean_token_accuracy": 0.7017289862036705, "num_tokens": 302109911.0, "step": 9225 }, { "epoch": 0.765403433120491, "grad_norm": 0.9618821144104004, "learning_rate": 1.5859467795747924e-05, "loss": 1.2581, "mean_token_accuracy": 0.6823558151721955, "num_tokens": 302273751.0, "step": 9230 }, { "epoch": 0.7658180611991044, "grad_norm": 0.9125745892524719, "learning_rate": 1.5806632635936385e-05, "loss": 1.196, "mean_token_accuracy": 0.6952101662755013, "num_tokens": 302437591.0, "step": 9235 }, { "epoch": 0.7662326892777179, "grad_norm": 0.9249573349952698, "learning_rate": 1.5753869103979617e-05, "loss": 1.1281, "mean_token_accuracy": 0.708146370947361, "num_tokens": 302600764.0, "step": 9240 }, { "epoch": 0.7666473173563314, "grad_norm": 0.9272013306617737, "learning_rate": 1.5701177310406074e-05, "loss": 1.22, "mean_token_accuracy": 0.6942634865641594, "num_tokens": 302763132.0, "step": 9245 }, { "epoch": 0.7670619454349449, "grad_norm": 0.8903987407684326, "learning_rate": 1.5648557365593847e-05, "loss": 1.1602, "mean_token_accuracy": 0.7022116348147392, "num_tokens": 302926972.0, "step": 9250 }, { "epoch": 0.7674765735135584, "grad_norm": 0.8905354738235474, "learning_rate": 1.5596009379770582e-05, "loss": 1.0595, "mean_token_accuracy": 0.7200260147452354, "num_tokens": 303090115.0, "step": 9255 }, { "epoch": 0.7678912015921718, "grad_norm": 0.9127805233001709, "learning_rate": 1.554353346301315e-05, "loss": 1.1398, "mean_token_accuracy": 0.7073802530765534, "num_tokens": 303253955.0, "step": 9260 }, { "epoch": 0.7683058296707853, "grad_norm": 0.8905320167541504, "learning_rate": 1.5491129725247517e-05, "loss": 1.1412, "mean_token_accuracy": 0.7063364312052727, "num_tokens": 303417455.0, "step": 9265 }, { "epoch": 0.7687204577493988, "grad_norm": 0.8764287233352661, "learning_rate": 1.5438798276248357e-05, "loss": 1.1024, "mean_token_accuracy": 0.7102937892079353, "num_tokens": 303580409.0, "step": 9270 }, { "epoch": 0.7691350858280123, "grad_norm": 0.9234013557434082, "learning_rate": 1.538653922563895e-05, "loss": 1.1668, "mean_token_accuracy": 0.6989613935351372, "num_tokens": 303744249.0, "step": 9275 }, { "epoch": 0.7695497139066257, "grad_norm": 0.9448634386062622, "learning_rate": 1.5334352682890995e-05, "loss": 1.1547, "mean_token_accuracy": 0.7002504870295525, "num_tokens": 303908089.0, "step": 9280 }, { "epoch": 0.7699643419852392, "grad_norm": 0.9277756214141846, "learning_rate": 1.528223875732417e-05, "loss": 1.1288, "mean_token_accuracy": 0.7041738271713257, "num_tokens": 304071694.0, "step": 9285 }, { "epoch": 0.7703789700638527, "grad_norm": 0.913982629776001, "learning_rate": 1.5230197558106118e-05, "loss": 1.0845, "mean_token_accuracy": 0.7101600661873817, "num_tokens": 304235534.0, "step": 9290 }, { "epoch": 0.7707935981424662, "grad_norm": 0.9111948013305664, "learning_rate": 1.5178229194252125e-05, "loss": 1.1944, "mean_token_accuracy": 0.6978800103068352, "num_tokens": 304399374.0, "step": 9295 }, { "epoch": 0.7712082262210797, "grad_norm": 0.9338374733924866, "learning_rate": 1.5126333774624884e-05, "loss": 1.1735, "mean_token_accuracy": 0.6949230194091797, "num_tokens": 304563214.0, "step": 9300 }, { "epoch": 0.7716228542996931, "grad_norm": 0.8922032713890076, "learning_rate": 1.5074511407934306e-05, "loss": 1.0864, "mean_token_accuracy": 0.7133431106805801, "num_tokens": 304727054.0, "step": 9305 }, { "epoch": 0.7720374823783066, "grad_norm": 0.9665189385414124, "learning_rate": 1.502276220273725e-05, "loss": 1.1437, "mean_token_accuracy": 0.7035278007388115, "num_tokens": 304890209.0, "step": 9310 }, { "epoch": 0.7724521104569202, "grad_norm": 0.9113141894340515, "learning_rate": 1.497108626743734e-05, "loss": 1.1442, "mean_token_accuracy": 0.6997678428888321, "num_tokens": 305054049.0, "step": 9315 }, { "epoch": 0.7728667385355337, "grad_norm": 0.9439712166786194, "learning_rate": 1.49194837102847e-05, "loss": 1.2242, "mean_token_accuracy": 0.6925625205039978, "num_tokens": 305217143.0, "step": 9320 }, { "epoch": 0.7732813666141471, "grad_norm": 0.8958566188812256, "learning_rate": 1.4867954639375747e-05, "loss": 1.0832, "mean_token_accuracy": 0.7134441658854485, "num_tokens": 305380436.0, "step": 9325 }, { "epoch": 0.7736959946927606, "grad_norm": 0.9114380478858948, "learning_rate": 1.4816499162652952e-05, "loss": 1.1208, "mean_token_accuracy": 0.7076368540525436, "num_tokens": 305544276.0, "step": 9330 }, { "epoch": 0.7741106227713741, "grad_norm": 0.9351910352706909, "learning_rate": 1.4765117387904642e-05, "loss": 1.1473, "mean_token_accuracy": 0.7046065524220466, "num_tokens": 305708116.0, "step": 9335 }, { "epoch": 0.7745252508499876, "grad_norm": 0.8757506608963013, "learning_rate": 1.471380942276473e-05, "loss": 1.0575, "mean_token_accuracy": 0.7192754164338112, "num_tokens": 305871956.0, "step": 9340 }, { "epoch": 0.7749398789286011, "grad_norm": 0.8893118500709534, "learning_rate": 1.4662575374712528e-05, "loss": 1.1157, "mean_token_accuracy": 0.7113424986600876, "num_tokens": 306035652.0, "step": 9345 }, { "epoch": 0.7753545070072145, "grad_norm": 0.8937063813209534, "learning_rate": 1.4611415351072505e-05, "loss": 1.0923, "mean_token_accuracy": 0.7138135403394699, "num_tokens": 306199492.0, "step": 9350 }, { "epoch": 0.775769135085828, "grad_norm": 0.9636370539665222, "learning_rate": 1.456032945901406e-05, "loss": 1.1465, "mean_token_accuracy": 0.7007453605532646, "num_tokens": 306363332.0, "step": 9355 }, { "epoch": 0.7761837631644415, "grad_norm": 0.9361772537231445, "learning_rate": 1.4509317805551326e-05, "loss": 1.1454, "mean_token_accuracy": 0.7041972175240516, "num_tokens": 306527172.0, "step": 9360 }, { "epoch": 0.776598391243055, "grad_norm": 0.9029082655906677, "learning_rate": 1.4458380497542851e-05, "loss": 1.1787, "mean_token_accuracy": 0.6970857784152031, "num_tokens": 306691012.0, "step": 9365 }, { "epoch": 0.7770130193216684, "grad_norm": 0.865684449672699, "learning_rate": 1.4407517641691543e-05, "loss": 1.1602, "mean_token_accuracy": 0.7019611462950707, "num_tokens": 306854852.0, "step": 9370 }, { "epoch": 0.7774276474002819, "grad_norm": 0.8580654263496399, "learning_rate": 1.4356729344544296e-05, "loss": 1.0224, "mean_token_accuracy": 0.7245845541357994, "num_tokens": 307018692.0, "step": 9375 }, { "epoch": 0.7778422754788954, "grad_norm": 0.9202730655670166, "learning_rate": 1.4306015712491788e-05, "loss": 1.0776, "mean_token_accuracy": 0.716000734269619, "num_tokens": 307182532.0, "step": 9380 }, { "epoch": 0.7782569035575089, "grad_norm": 0.8806677460670471, "learning_rate": 1.425537685176836e-05, "loss": 1.1717, "mean_token_accuracy": 0.6999755635857582, "num_tokens": 307346372.0, "step": 9385 }, { "epoch": 0.7786715316361223, "grad_norm": 0.9154284596443176, "learning_rate": 1.42048128684517e-05, "loss": 1.0968, "mean_token_accuracy": 0.7116507768630982, "num_tokens": 307510212.0, "step": 9390 }, { "epoch": 0.7790861597147359, "grad_norm": 0.9176647663116455, "learning_rate": 1.4154323868462593e-05, "loss": 1.1401, "mean_token_accuracy": 0.7049975529313087, "num_tokens": 307674052.0, "step": 9395 }, { "epoch": 0.7795007877933494, "grad_norm": 0.916608989238739, "learning_rate": 1.4103909957564792e-05, "loss": 1.1176, "mean_token_accuracy": 0.7100989744067192, "num_tokens": 307837892.0, "step": 9400 }, { "epoch": 0.7799154158719629, "grad_norm": 0.8862534761428833, "learning_rate": 1.4053571241364787e-05, "loss": 1.2014, "mean_token_accuracy": 0.6926545411348343, "num_tokens": 308001618.0, "step": 9405 }, { "epoch": 0.7803300439505764, "grad_norm": 0.888766348361969, "learning_rate": 1.4003307825311507e-05, "loss": 1.1785, "mean_token_accuracy": 0.6983260050415993, "num_tokens": 308165458.0, "step": 9410 }, { "epoch": 0.7807446720291898, "grad_norm": 0.934119701385498, "learning_rate": 1.3953119814696125e-05, "loss": 1.1837, "mean_token_accuracy": 0.6947892278432846, "num_tokens": 308328216.0, "step": 9415 }, { "epoch": 0.7811593001078033, "grad_norm": 0.9300495982170105, "learning_rate": 1.3903007314651877e-05, "loss": 1.1776, "mean_token_accuracy": 0.6941837728023529, "num_tokens": 308492056.0, "step": 9420 }, { "epoch": 0.7815739281864168, "grad_norm": 0.9354580044746399, "learning_rate": 1.3852970430153884e-05, "loss": 1.1448, "mean_token_accuracy": 0.703286899626255, "num_tokens": 308655896.0, "step": 9425 }, { "epoch": 0.7819885562650303, "grad_norm": 0.8823285698890686, "learning_rate": 1.3803009266018752e-05, "loss": 1.1344, "mean_token_accuracy": 0.705547408759594, "num_tokens": 308819736.0, "step": 9430 }, { "epoch": 0.7824031843436438, "grad_norm": 0.901211142539978, "learning_rate": 1.3753123926904527e-05, "loss": 1.1336, "mean_token_accuracy": 0.706414957344532, "num_tokens": 308983576.0, "step": 9435 }, { "epoch": 0.7828178124222572, "grad_norm": 0.9550814628601074, "learning_rate": 1.3703314517310473e-05, "loss": 1.1562, "mean_token_accuracy": 0.699321848154068, "num_tokens": 309147416.0, "step": 9440 }, { "epoch": 0.7832324405008707, "grad_norm": 0.9058507680892944, "learning_rate": 1.3653581141576687e-05, "loss": 1.128, "mean_token_accuracy": 0.709035924077034, "num_tokens": 309311256.0, "step": 9445 }, { "epoch": 0.7836470685794842, "grad_norm": 0.9165582656860352, "learning_rate": 1.3603923903884069e-05, "loss": 1.1574, "mean_token_accuracy": 0.7024167910218239, "num_tokens": 309474909.0, "step": 9450 }, { "epoch": 0.7840616966580977, "grad_norm": 0.9098576307296753, "learning_rate": 1.3554342908253998e-05, "loss": 1.0992, "mean_token_accuracy": 0.712964317202568, "num_tokens": 309638749.0, "step": 9455 }, { "epoch": 0.7844763247367111, "grad_norm": 0.9143006801605225, "learning_rate": 1.3504838258548148e-05, "loss": 1.1601, "mean_token_accuracy": 0.699217988550663, "num_tokens": 309802589.0, "step": 9460 }, { "epoch": 0.7848909528153246, "grad_norm": 0.9183697700500488, "learning_rate": 1.3455410058468266e-05, "loss": 1.1909, "mean_token_accuracy": 0.6964198410511017, "num_tokens": 309966429.0, "step": 9465 }, { "epoch": 0.7853055808939381, "grad_norm": 0.9047302007675171, "learning_rate": 1.340605841155595e-05, "loss": 1.1261, "mean_token_accuracy": 0.7058406680822372, "num_tokens": 310130269.0, "step": 9470 }, { "epoch": 0.7857202089725516, "grad_norm": 0.9284247756004333, "learning_rate": 1.3356783421192436e-05, "loss": 1.1477, "mean_token_accuracy": 0.702303272485733, "num_tokens": 310294109.0, "step": 9475 }, { "epoch": 0.7861348370511652, "grad_norm": 0.9016001224517822, "learning_rate": 1.3307585190598387e-05, "loss": 1.1685, "mean_token_accuracy": 0.6992851883172989, "num_tokens": 310457949.0, "step": 9480 }, { "epoch": 0.7865494651297786, "grad_norm": 0.9556524157524109, "learning_rate": 1.3258463822833655e-05, "loss": 1.1535, "mean_token_accuracy": 0.7016067937016487, "num_tokens": 310621789.0, "step": 9485 }, { "epoch": 0.7869640932083921, "grad_norm": 0.920761227607727, "learning_rate": 1.3209419420797098e-05, "loss": 1.1588, "mean_token_accuracy": 0.7001093402504921, "num_tokens": 310784417.0, "step": 9490 }, { "epoch": 0.7873787212870056, "grad_norm": 0.9084736704826355, "learning_rate": 1.3160452087226332e-05, "loss": 1.115, "mean_token_accuracy": 0.709023705124855, "num_tokens": 310948257.0, "step": 9495 }, { "epoch": 0.7877933493656191, "grad_norm": 0.8487734794616699, "learning_rate": 1.3111561924697552e-05, "loss": 1.138, "mean_token_accuracy": 0.7044367000460625, "num_tokens": 311111442.0, "step": 9500 }, { "epoch": 0.7882079774442325, "grad_norm": 0.9419161081314087, "learning_rate": 1.306274903562527e-05, "loss": 1.1104, "mean_token_accuracy": 0.7091336712241173, "num_tokens": 311275282.0, "step": 9505 }, { "epoch": 0.788622605522846, "grad_norm": 0.9123713970184326, "learning_rate": 1.3014013522262141e-05, "loss": 1.1511, "mean_token_accuracy": 0.7031280517578125, "num_tokens": 311439122.0, "step": 9510 }, { "epoch": 0.7890372336014595, "grad_norm": 0.911185622215271, "learning_rate": 1.2965355486698738e-05, "loss": 1.1777, "mean_token_accuracy": 0.6968230694532395, "num_tokens": 311602962.0, "step": 9515 }, { "epoch": 0.789451861680073, "grad_norm": 0.9397674202919006, "learning_rate": 1.2916775030863337e-05, "loss": 1.1689, "mean_token_accuracy": 0.6983198881149292, "num_tokens": 311766802.0, "step": 9520 }, { "epoch": 0.7898664897586865, "grad_norm": 0.9195385575294495, "learning_rate": 1.2868272256521657e-05, "loss": 1.151, "mean_token_accuracy": 0.7044660344719886, "num_tokens": 311930642.0, "step": 9525 }, { "epoch": 0.7902811178372999, "grad_norm": 0.9403318762779236, "learning_rate": 1.2819847265276757e-05, "loss": 1.1681, "mean_token_accuracy": 0.696065491437912, "num_tokens": 312094482.0, "step": 9530 }, { "epoch": 0.7906957459159134, "grad_norm": 0.9081764221191406, "learning_rate": 1.2771500158568745e-05, "loss": 1.2203, "mean_token_accuracy": 0.6894367069005967, "num_tokens": 312258322.0, "step": 9535 }, { "epoch": 0.7911103739945269, "grad_norm": 0.8648397326469421, "learning_rate": 1.272323103767451e-05, "loss": 1.0278, "mean_token_accuracy": 0.725103859603405, "num_tokens": 312422162.0, "step": 9540 }, { "epoch": 0.7915250020731404, "grad_norm": 0.9654725790023804, "learning_rate": 1.2675040003707639e-05, "loss": 1.0897, "mean_token_accuracy": 0.7063783004879951, "num_tokens": 312586002.0, "step": 9545 }, { "epoch": 0.7919396301517538, "grad_norm": 0.9537203311920166, "learning_rate": 1.2626927157618157e-05, "loss": 1.1461, "mean_token_accuracy": 0.7018206283450127, "num_tokens": 312749842.0, "step": 9550 }, { "epoch": 0.7923542582303673, "grad_norm": 0.9430860280990601, "learning_rate": 1.2578892600192272e-05, "loss": 1.1302, "mean_token_accuracy": 0.7051625117659569, "num_tokens": 312913682.0, "step": 9555 }, { "epoch": 0.7927688863089808, "grad_norm": 0.9268710017204285, "learning_rate": 1.2530936432052154e-05, "loss": 1.1222, "mean_token_accuracy": 0.709597997367382, "num_tokens": 313077522.0, "step": 9560 }, { "epoch": 0.7931835143875944, "grad_norm": 0.9255589246749878, "learning_rate": 1.2483058753655858e-05, "loss": 1.1696, "mean_token_accuracy": 0.6983382239937782, "num_tokens": 313241362.0, "step": 9565 }, { "epoch": 0.7935981424662079, "grad_norm": 0.955251157283783, "learning_rate": 1.243525966529696e-05, "loss": 1.1111, "mean_token_accuracy": 0.7081378310918808, "num_tokens": 313405202.0, "step": 9570 }, { "epoch": 0.7940127705448213, "grad_norm": 0.9550685882568359, "learning_rate": 1.2387539267104392e-05, "loss": 1.2153, "mean_token_accuracy": 0.6888379767537117, "num_tokens": 313569042.0, "step": 9575 }, { "epoch": 0.7944273986234348, "grad_norm": 0.9419229030609131, "learning_rate": 1.2339897659042266e-05, "loss": 1.1419, "mean_token_accuracy": 0.704093350470066, "num_tokens": 313732882.0, "step": 9580 }, { "epoch": 0.7948420267020483, "grad_norm": 0.8668137192726135, "learning_rate": 1.2292334940909699e-05, "loss": 1.1148, "mean_token_accuracy": 0.7130742907524109, "num_tokens": 313896722.0, "step": 9585 }, { "epoch": 0.7952566547806618, "grad_norm": 0.9046217203140259, "learning_rate": 1.2244851212340453e-05, "loss": 1.0689, "mean_token_accuracy": 0.716208453476429, "num_tokens": 314060562.0, "step": 9590 }, { "epoch": 0.7956712828592752, "grad_norm": 0.940582811832428, "learning_rate": 1.219744657280289e-05, "loss": 1.2395, "mean_token_accuracy": 0.6899193555116654, "num_tokens": 314224402.0, "step": 9595 }, { "epoch": 0.7960859109378887, "grad_norm": 0.9407996535301208, "learning_rate": 1.2150121121599672e-05, "loss": 1.1317, "mean_token_accuracy": 0.7041361182928085, "num_tokens": 314388242.0, "step": 9600 }, { "epoch": 0.7965005390165022, "grad_norm": 0.8708174824714661, "learning_rate": 1.2102874957867587e-05, "loss": 1.1043, "mean_token_accuracy": 0.7086939424276352, "num_tokens": 314551308.0, "step": 9605 }, { "epoch": 0.7969151670951157, "grad_norm": 0.891569197177887, "learning_rate": 1.205570818057734e-05, "loss": 1.1586, "mean_token_accuracy": 0.7046187713742256, "num_tokens": 314715148.0, "step": 9610 }, { "epoch": 0.7973297951737292, "grad_norm": 0.9294144511222839, "learning_rate": 1.2008620888533306e-05, "loss": 1.1247, "mean_token_accuracy": 0.7054679840803146, "num_tokens": 314878988.0, "step": 9615 }, { "epoch": 0.7977444232523426, "grad_norm": 0.9487144947052002, "learning_rate": 1.1961613180373421e-05, "loss": 1.1141, "mean_token_accuracy": 0.706329420208931, "num_tokens": 315042828.0, "step": 9620 }, { "epoch": 0.7981590513309561, "grad_norm": 0.941530168056488, "learning_rate": 1.1914685154568822e-05, "loss": 1.1742, "mean_token_accuracy": 0.6969696968793869, "num_tokens": 315206668.0, "step": 9625 }, { "epoch": 0.7985736794095696, "grad_norm": 0.9285726547241211, "learning_rate": 1.1867836909423797e-05, "loss": 1.2046, "mean_token_accuracy": 0.6940921306610107, "num_tokens": 315370508.0, "step": 9630 }, { "epoch": 0.7989883074881831, "grad_norm": 0.9117364287376404, "learning_rate": 1.1821068543075481e-05, "loss": 1.1294, "mean_token_accuracy": 0.7027370512485505, "num_tokens": 315534348.0, "step": 9635 }, { "epoch": 0.7994029355667965, "grad_norm": 0.9249398112297058, "learning_rate": 1.177438015349368e-05, "loss": 1.0664, "mean_token_accuracy": 0.7145039081573487, "num_tokens": 315698188.0, "step": 9640 }, { "epoch": 0.79981756364541, "grad_norm": 0.9028452634811401, "learning_rate": 1.1727771838480678e-05, "loss": 1.1752, "mean_token_accuracy": 0.696138808131218, "num_tokens": 315862028.0, "step": 9645 }, { "epoch": 0.8002321917240236, "grad_norm": 0.9576271772384644, "learning_rate": 1.1681243695671013e-05, "loss": 1.156, "mean_token_accuracy": 0.6987964272499084, "num_tokens": 316025868.0, "step": 9650 }, { "epoch": 0.8006468198026371, "grad_norm": 0.9440005421638489, "learning_rate": 1.1634795822531275e-05, "loss": 1.1531, "mean_token_accuracy": 0.7014662772417068, "num_tokens": 316189708.0, "step": 9655 }, { "epoch": 0.8010614478812506, "grad_norm": 0.9253191351890564, "learning_rate": 1.1588428316359912e-05, "loss": 1.0808, "mean_token_accuracy": 0.7155119732022286, "num_tokens": 316353548.0, "step": 9660 }, { "epoch": 0.801476075959864, "grad_norm": 0.9329873323440552, "learning_rate": 1.1542141274287032e-05, "loss": 1.1319, "mean_token_accuracy": 0.7086255982518196, "num_tokens": 316516714.0, "step": 9665 }, { "epoch": 0.8018907040384775, "grad_norm": 0.9451928734779358, "learning_rate": 1.1495934793274132e-05, "loss": 1.2161, "mean_token_accuracy": 0.69285189807415, "num_tokens": 316680554.0, "step": 9670 }, { "epoch": 0.802305332117091, "grad_norm": 0.9663540124893188, "learning_rate": 1.144980897011404e-05, "loss": 1.1314, "mean_token_accuracy": 0.7076063051819801, "num_tokens": 316844394.0, "step": 9675 }, { "epoch": 0.8027199601957045, "grad_norm": 0.967907190322876, "learning_rate": 1.140376390143057e-05, "loss": 1.1521, "mean_token_accuracy": 0.699883921444416, "num_tokens": 317008234.0, "step": 9680 }, { "epoch": 0.8031345882743179, "grad_norm": 0.9527965784072876, "learning_rate": 1.1357799683678332e-05, "loss": 1.1615, "mean_token_accuracy": 0.7014907151460648, "num_tokens": 317172074.0, "step": 9685 }, { "epoch": 0.8035492163529314, "grad_norm": 0.8742443323135376, "learning_rate": 1.1311916413142671e-05, "loss": 1.184, "mean_token_accuracy": 0.6934322997927665, "num_tokens": 317335914.0, "step": 9690 }, { "epoch": 0.8039638444315449, "grad_norm": 0.9260457158088684, "learning_rate": 1.1266114185939286e-05, "loss": 1.1367, "mean_token_accuracy": 0.7050342127680779, "num_tokens": 317499754.0, "step": 9695 }, { "epoch": 0.8043784725101584, "grad_norm": 0.8898655772209167, "learning_rate": 1.1220393098014147e-05, "loss": 1.1426, "mean_token_accuracy": 0.7027431607246399, "num_tokens": 317663594.0, "step": 9700 }, { "epoch": 0.8047931005887718, "grad_norm": 0.8611737489700317, "learning_rate": 1.1174753245143205e-05, "loss": 1.0749, "mean_token_accuracy": 0.7163734138011932, "num_tokens": 317827434.0, "step": 9705 }, { "epoch": 0.8052077286673853, "grad_norm": 0.9127941131591797, "learning_rate": 1.1129194722932307e-05, "loss": 1.1924, "mean_token_accuracy": 0.6959433034062386, "num_tokens": 317991274.0, "step": 9710 }, { "epoch": 0.8056223567459988, "grad_norm": 0.9587748646736145, "learning_rate": 1.1083717626816904e-05, "loss": 1.1248, "mean_token_accuracy": 0.7029631018638611, "num_tokens": 318155114.0, "step": 9715 }, { "epoch": 0.8060369848246123, "grad_norm": 0.9331362843513489, "learning_rate": 1.1038322052061834e-05, "loss": 1.2249, "mean_token_accuracy": 0.6885263949632645, "num_tokens": 318318954.0, "step": 9720 }, { "epoch": 0.8064516129032258, "grad_norm": 0.9582976698875427, "learning_rate": 1.0993008093761214e-05, "loss": 1.2495, "mean_token_accuracy": 0.6804313257336616, "num_tokens": 318482794.0, "step": 9725 }, { "epoch": 0.8068662409818393, "grad_norm": 0.9009600281715393, "learning_rate": 1.094777584683821e-05, "loss": 1.1759, "mean_token_accuracy": 0.6934811815619468, "num_tokens": 318646634.0, "step": 9730 }, { "epoch": 0.8072808690604528, "grad_norm": 0.9420273303985596, "learning_rate": 1.0902625406044753e-05, "loss": 1.0941, "mean_token_accuracy": 0.7095948219299316, "num_tokens": 318809703.0, "step": 9735 }, { "epoch": 0.8076954971390663, "grad_norm": 0.9314038157463074, "learning_rate": 1.0857556865961437e-05, "loss": 1.0884, "mean_token_accuracy": 0.7102150544524193, "num_tokens": 318973543.0, "step": 9740 }, { "epoch": 0.8081101252176798, "grad_norm": 0.8720875382423401, "learning_rate": 1.081257032099735e-05, "loss": 1.0702, "mean_token_accuracy": 0.7190852329134941, "num_tokens": 319137036.0, "step": 9745 }, { "epoch": 0.8085247532962933, "grad_norm": 0.9193700551986694, "learning_rate": 1.0767665865389714e-05, "loss": 1.1556, "mean_token_accuracy": 0.7058703511953354, "num_tokens": 319300542.0, "step": 9750 }, { "epoch": 0.8089393813749067, "grad_norm": 0.9067955017089844, "learning_rate": 1.0722843593203862e-05, "loss": 1.1711, "mean_token_accuracy": 0.6960166156291961, "num_tokens": 319464382.0, "step": 9755 }, { "epoch": 0.8093540094535202, "grad_norm": 0.9262675642967224, "learning_rate": 1.0678103598332939e-05, "loss": 1.2394, "mean_token_accuracy": 0.6870135754346848, "num_tokens": 319627902.0, "step": 9760 }, { "epoch": 0.8097686375321337, "grad_norm": 0.9415069818496704, "learning_rate": 1.063344597449778e-05, "loss": 1.102, "mean_token_accuracy": 0.7079545423388481, "num_tokens": 319791742.0, "step": 9765 }, { "epoch": 0.8101832656107472, "grad_norm": 0.9466578960418701, "learning_rate": 1.0588870815246604e-05, "loss": 1.1496, "mean_token_accuracy": 0.7043682783842087, "num_tokens": 319955582.0, "step": 9770 }, { "epoch": 0.8105978936893606, "grad_norm": 0.9340462684631348, "learning_rate": 1.0544378213954935e-05, "loss": 1.1309, "mean_token_accuracy": 0.705266372859478, "num_tokens": 320119422.0, "step": 9775 }, { "epoch": 0.8110125217679741, "grad_norm": 0.9780790209770203, "learning_rate": 1.0499968263825332e-05, "loss": 1.2006, "mean_token_accuracy": 0.6927358239889145, "num_tokens": 320283262.0, "step": 9780 }, { "epoch": 0.8114271498465876, "grad_norm": 0.9603679776191711, "learning_rate": 1.0455641057887229e-05, "loss": 1.1339, "mean_token_accuracy": 0.7017289817333221, "num_tokens": 320447102.0, "step": 9785 }, { "epoch": 0.8118417779252011, "grad_norm": 0.9297229051589966, "learning_rate": 1.0411396688996722e-05, "loss": 1.1352, "mean_token_accuracy": 0.7051136359572411, "num_tokens": 320610942.0, "step": 9790 }, { "epoch": 0.8122564060038145, "grad_norm": 0.9165574312210083, "learning_rate": 1.0367235249836383e-05, "loss": 1.1071, "mean_token_accuracy": 0.7109909549355506, "num_tokens": 320774782.0, "step": 9795 }, { "epoch": 0.812671034082428, "grad_norm": 0.9278814792633057, "learning_rate": 1.0323156832915066e-05, "loss": 1.1511, "mean_token_accuracy": 0.7008975803852081, "num_tokens": 320937748.0, "step": 9800 }, { "epoch": 0.8130856621610415, "grad_norm": 0.9207982420921326, "learning_rate": 1.0279161530567711e-05, "loss": 1.1416, "mean_token_accuracy": 0.7050158873200416, "num_tokens": 321101588.0, "step": 9805 }, { "epoch": 0.813500290239655, "grad_norm": 0.9000398516654968, "learning_rate": 1.0235249434955141e-05, "loss": 1.1846, "mean_token_accuracy": 0.6955950632691383, "num_tokens": 321265428.0, "step": 9810 }, { "epoch": 0.8139149183182686, "grad_norm": 0.8642020225524902, "learning_rate": 1.0191420638063887e-05, "loss": 1.0791, "mean_token_accuracy": 0.7153164729475975, "num_tokens": 321429268.0, "step": 9815 }, { "epoch": 0.814329546396882, "grad_norm": 0.9205776453018188, "learning_rate": 1.0147675231705989e-05, "loss": 1.2521, "mean_token_accuracy": 0.6874633371829987, "num_tokens": 321593108.0, "step": 9820 }, { "epoch": 0.8147441744754955, "grad_norm": 0.8823779225349426, "learning_rate": 1.01040133075188e-05, "loss": 1.1504, "mean_token_accuracy": 0.697543989121914, "num_tokens": 321756948.0, "step": 9825 }, { "epoch": 0.815158802554109, "grad_norm": 0.9273008108139038, "learning_rate": 1.0060434956964792e-05, "loss": 1.1397, "mean_token_accuracy": 0.7007820159196854, "num_tokens": 321920788.0, "step": 9830 }, { "epoch": 0.8155734306327225, "grad_norm": 0.9190025925636292, "learning_rate": 1.0016940271331365e-05, "loss": 1.1571, "mean_token_accuracy": 0.7008614405989647, "num_tokens": 322084628.0, "step": 9835 }, { "epoch": 0.815988058711336, "grad_norm": 0.9909818172454834, "learning_rate": 9.97352934173067e-06, "loss": 1.1285, "mean_token_accuracy": 0.705999507009983, "num_tokens": 322248468.0, "step": 9840 }, { "epoch": 0.8164026867899494, "grad_norm": 0.9337329268455505, "learning_rate": 9.930202259099397e-06, "loss": 1.1454, "mean_token_accuracy": 0.7040017083287239, "num_tokens": 322412308.0, "step": 9845 }, { "epoch": 0.8168173148685629, "grad_norm": 0.952125072479248, "learning_rate": 9.886959114198601e-06, "loss": 1.1633, "mean_token_accuracy": 0.7001160800457, "num_tokens": 322576148.0, "step": 9850 }, { "epoch": 0.8172319429471764, "grad_norm": 0.9478073716163635, "learning_rate": 9.843799997613495e-06, "loss": 1.1494, "mean_token_accuracy": 0.7000427633523941, "num_tokens": 322739988.0, "step": 9855 }, { "epoch": 0.8176465710257899, "grad_norm": 0.9033485651016235, "learning_rate": 9.800724999753298e-06, "loss": 1.1313, "mean_token_accuracy": 0.7057612419128418, "num_tokens": 322903828.0, "step": 9860 }, { "epoch": 0.8180611991044033, "grad_norm": 0.8981077075004578, "learning_rate": 9.757734210850956e-06, "loss": 1.0984, "mean_token_accuracy": 0.7132087007164956, "num_tokens": 323067668.0, "step": 9865 }, { "epoch": 0.8184758271830168, "grad_norm": 0.8958315849304199, "learning_rate": 9.714827720963089e-06, "loss": 1.0902, "mean_token_accuracy": 0.7091581091284752, "num_tokens": 323231508.0, "step": 9870 }, { "epoch": 0.8188904552616303, "grad_norm": 0.9041430950164795, "learning_rate": 9.672005619969705e-06, "loss": 1.1608, "mean_token_accuracy": 0.7004704251885414, "num_tokens": 323395348.0, "step": 9875 }, { "epoch": 0.8193050833402438, "grad_norm": 0.9167628884315491, "learning_rate": 9.629267997573998e-06, "loss": 1.1195, "mean_token_accuracy": 0.7073863670229912, "num_tokens": 323559188.0, "step": 9880 }, { "epoch": 0.8197197114188572, "grad_norm": 0.948016881942749, "learning_rate": 9.586614943302225e-06, "loss": 1.1701, "mean_token_accuracy": 0.6989162027835846, "num_tokens": 323722759.0, "step": 9885 }, { "epoch": 0.8201343394974707, "grad_norm": 0.9600058197975159, "learning_rate": 9.544046546503526e-06, "loss": 1.1119, "mean_token_accuracy": 0.7122678458690643, "num_tokens": 323886599.0, "step": 9890 }, { "epoch": 0.8205489675760842, "grad_norm": 0.9428344368934631, "learning_rate": 9.501562896349636e-06, "loss": 1.1537, "mean_token_accuracy": 0.7019672557711601, "num_tokens": 324050439.0, "step": 9895 }, { "epoch": 0.8209635956546978, "grad_norm": 0.8840602040290833, "learning_rate": 9.459164081834803e-06, "loss": 1.0664, "mean_token_accuracy": 0.7209860756993294, "num_tokens": 324214279.0, "step": 9900 }, { "epoch": 0.8213782237333113, "grad_norm": 0.9207726120948792, "learning_rate": 9.41685019177554e-06, "loss": 1.1189, "mean_token_accuracy": 0.7070136874914169, "num_tokens": 324378119.0, "step": 9905 }, { "epoch": 0.8217928518119247, "grad_norm": 0.9566503763198853, "learning_rate": 9.374621314810517e-06, "loss": 1.1231, "mean_token_accuracy": 0.7043621718883515, "num_tokens": 324541959.0, "step": 9910 }, { "epoch": 0.8222074798905382, "grad_norm": 0.9196707606315613, "learning_rate": 9.332477539400237e-06, "loss": 1.1587, "mean_token_accuracy": 0.7004154473543167, "num_tokens": 324705799.0, "step": 9915 }, { "epoch": 0.8226221079691517, "grad_norm": 0.9198716282844543, "learning_rate": 9.290418953826969e-06, "loss": 1.1419, "mean_token_accuracy": 0.6994196027517319, "num_tokens": 324869639.0, "step": 9920 }, { "epoch": 0.8230367360477652, "grad_norm": 0.9049525260925293, "learning_rate": 9.248445646194575e-06, "loss": 1.1427, "mean_token_accuracy": 0.7043212160468102, "num_tokens": 325032620.0, "step": 9925 }, { "epoch": 0.8234513641263786, "grad_norm": 0.8652673959732056, "learning_rate": 9.206557704428203e-06, "loss": 1.0587, "mean_token_accuracy": 0.7168009787797928, "num_tokens": 325195322.0, "step": 9930 }, { "epoch": 0.8238659922049921, "grad_norm": 0.9393934607505798, "learning_rate": 9.164755216274213e-06, "loss": 1.119, "mean_token_accuracy": 0.7064393922686577, "num_tokens": 325359162.0, "step": 9935 }, { "epoch": 0.8242806202836056, "grad_norm": 0.8978515267372131, "learning_rate": 9.123038269299961e-06, "loss": 1.0676, "mean_token_accuracy": 0.7135019555687905, "num_tokens": 325523002.0, "step": 9940 }, { "epoch": 0.8246952483622191, "grad_norm": 0.9161264300346375, "learning_rate": 9.0814069508936e-06, "loss": 1.1641, "mean_token_accuracy": 0.6990469247102737, "num_tokens": 325686842.0, "step": 9945 }, { "epoch": 0.8251098764408326, "grad_norm": 0.97547447681427, "learning_rate": 9.039861348263916e-06, "loss": 1.2432, "mean_token_accuracy": 0.688801321387291, "num_tokens": 325850682.0, "step": 9950 }, { "epoch": 0.825524504519446, "grad_norm": 0.9172536730766296, "learning_rate": 8.99840154844015e-06, "loss": 1.1863, "mean_token_accuracy": 0.700428931415081, "num_tokens": 326013695.0, "step": 9955 }, { "epoch": 0.8259391325980595, "grad_norm": 0.8994693160057068, "learning_rate": 8.957027638271775e-06, "loss": 1.0593, "mean_token_accuracy": 0.7163795173168183, "num_tokens": 326177535.0, "step": 9960 }, { "epoch": 0.826353760676673, "grad_norm": 0.8992630243301392, "learning_rate": 8.915739704428366e-06, "loss": 1.1719, "mean_token_accuracy": 0.6994257062673569, "num_tokens": 326341375.0, "step": 9965 }, { "epoch": 0.8267683887552865, "grad_norm": 0.9343917965888977, "learning_rate": 8.874537833399399e-06, "loss": 1.1711, "mean_token_accuracy": 0.6989876970648765, "num_tokens": 326504772.0, "step": 9970 }, { "epoch": 0.8271830168338999, "grad_norm": 0.9663552045822144, "learning_rate": 8.833422111494043e-06, "loss": 1.1592, "mean_token_accuracy": 0.7007820174098015, "num_tokens": 326668612.0, "step": 9975 }, { "epoch": 0.8275976449125135, "grad_norm": 0.8932210803031921, "learning_rate": 8.792392624841034e-06, "loss": 1.1281, "mean_token_accuracy": 0.7049425706267357, "num_tokens": 326832452.0, "step": 9980 }, { "epoch": 0.828012272991127, "grad_norm": 0.888608992099762, "learning_rate": 8.751449459388434e-06, "loss": 1.1138, "mean_token_accuracy": 0.707203084230423, "num_tokens": 326996292.0, "step": 9985 }, { "epoch": 0.8284269010697405, "grad_norm": 0.915816068649292, "learning_rate": 8.710592700903496e-06, "loss": 1.1424, "mean_token_accuracy": 0.7073291972279548, "num_tokens": 327159314.0, "step": 9990 }, { "epoch": 0.828841529148354, "grad_norm": 0.9738427996635437, "learning_rate": 8.669822434972474e-06, "loss": 1.1868, "mean_token_accuracy": 0.6964626044034958, "num_tokens": 327323154.0, "step": 9995 }, { "epoch": 0.8292561572269674, "grad_norm": 0.9736239314079285, "learning_rate": 8.629138747000425e-06, "loss": 1.1859, "mean_token_accuracy": 0.6947519570589066, "num_tokens": 327486994.0, "step": 10000 }, { "epoch": 0.8296707853055809, "grad_norm": 0.9018882513046265, "learning_rate": 8.588541722211063e-06, "loss": 1.0624, "mean_token_accuracy": 0.7181940361857414, "num_tokens": 327650834.0, "step": 10005 }, { "epoch": 0.8300854133841944, "grad_norm": 0.9313610792160034, "learning_rate": 8.548031445646509e-06, "loss": 1.1422, "mean_token_accuracy": 0.7015029296278954, "num_tokens": 327814674.0, "step": 10010 }, { "epoch": 0.8305000414628079, "grad_norm": 0.9230836033821106, "learning_rate": 8.507608002167244e-06, "loss": 1.1125, "mean_token_accuracy": 0.7078018069267273, "num_tokens": 327978514.0, "step": 10015 }, { "epoch": 0.8309146695414213, "grad_norm": 0.9039062857627869, "learning_rate": 8.46727147645181e-06, "loss": 1.0786, "mean_token_accuracy": 0.7102129101753235, "num_tokens": 328142017.0, "step": 10020 }, { "epoch": 0.8313292976200348, "grad_norm": 0.8902859091758728, "learning_rate": 8.427021952996633e-06, "loss": 1.1766, "mean_token_accuracy": 0.6976967275142669, "num_tokens": 328305857.0, "step": 10025 }, { "epoch": 0.8317439256986483, "grad_norm": 0.9402782917022705, "learning_rate": 8.386859516115974e-06, "loss": 1.0782, "mean_token_accuracy": 0.7144122704863548, "num_tokens": 328469697.0, "step": 10030 }, { "epoch": 0.8321585537772618, "grad_norm": 0.9468030333518982, "learning_rate": 8.346784249941619e-06, "loss": 1.1426, "mean_token_accuracy": 0.7015090376138687, "num_tokens": 328633537.0, "step": 10035 }, { "epoch": 0.8325731818558753, "grad_norm": 0.9220185875892639, "learning_rate": 8.306796238422736e-06, "loss": 1.2338, "mean_token_accuracy": 0.6866507828235626, "num_tokens": 328797377.0, "step": 10040 }, { "epoch": 0.8329878099344887, "grad_norm": 0.9200132489204407, "learning_rate": 8.266895565325722e-06, "loss": 1.1577, "mean_token_accuracy": 0.6988514199852943, "num_tokens": 328961217.0, "step": 10045 }, { "epoch": 0.8334024380131022, "grad_norm": 0.9153368473052979, "learning_rate": 8.227082314234058e-06, "loss": 1.1172, "mean_token_accuracy": 0.7106427147984504, "num_tokens": 329125057.0, "step": 10050 }, { "epoch": 0.8338170660917157, "grad_norm": 0.8476196527481079, "learning_rate": 8.18735656854806e-06, "loss": 1.1204, "mean_token_accuracy": 0.7102761447429657, "num_tokens": 329288897.0, "step": 10055 }, { "epoch": 0.8342316941703292, "grad_norm": 0.8842123746871948, "learning_rate": 8.147718411484717e-06, "loss": 1.1358, "mean_token_accuracy": 0.707978980243206, "num_tokens": 329452737.0, "step": 10060 }, { "epoch": 0.8346463222489428, "grad_norm": 0.9294978976249695, "learning_rate": 8.10816792607757e-06, "loss": 1.083, "mean_token_accuracy": 0.7122824415564537, "num_tokens": 329615930.0, "step": 10065 }, { "epoch": 0.8350609503275562, "grad_norm": 0.915131151676178, "learning_rate": 8.068705195176535e-06, "loss": 1.1108, "mean_token_accuracy": 0.7099340170621872, "num_tokens": 329779770.0, "step": 10070 }, { "epoch": 0.8354755784061697, "grad_norm": 0.9168100357055664, "learning_rate": 8.029330301447618e-06, "loss": 1.1126, "mean_token_accuracy": 0.7066104561090469, "num_tokens": 329943610.0, "step": 10075 }, { "epoch": 0.8358902064847832, "grad_norm": 0.939510703086853, "learning_rate": 7.990043327372904e-06, "loss": 1.1494, "mean_token_accuracy": 0.7025109991431236, "num_tokens": 330107450.0, "step": 10080 }, { "epoch": 0.8363048345633967, "grad_norm": 0.9343424439430237, "learning_rate": 7.950844355250259e-06, "loss": 1.1409, "mean_token_accuracy": 0.7024132460355759, "num_tokens": 330271290.0, "step": 10085 }, { "epoch": 0.8367194626420101, "grad_norm": 0.9184180498123169, "learning_rate": 7.911733467193227e-06, "loss": 1.0922, "mean_token_accuracy": 0.7142900779843331, "num_tokens": 330435130.0, "step": 10090 }, { "epoch": 0.8371340907206236, "grad_norm": 0.9067801833152771, "learning_rate": 7.872710745130824e-06, "loss": 1.0844, "mean_token_accuracy": 0.7128604590892792, "num_tokens": 330598970.0, "step": 10095 }, { "epoch": 0.8375487187992371, "grad_norm": 0.9362291693687439, "learning_rate": 7.833776270807374e-06, "loss": 1.0784, "mean_token_accuracy": 0.7090097352862358, "num_tokens": 330762716.0, "step": 10100 }, { "epoch": 0.8379633468778506, "grad_norm": 0.9227790832519531, "learning_rate": 7.794930125782352e-06, "loss": 1.1974, "mean_token_accuracy": 0.6948191598057747, "num_tokens": 330926556.0, "step": 10105 }, { "epoch": 0.838377974956464, "grad_norm": 0.9292659759521484, "learning_rate": 7.756172391430188e-06, "loss": 1.0708, "mean_token_accuracy": 0.7169965773820877, "num_tokens": 331090396.0, "step": 10110 }, { "epoch": 0.8387926030350775, "grad_norm": 0.9163357615470886, "learning_rate": 7.717503148940125e-06, "loss": 1.1436, "mean_token_accuracy": 0.7077346071600914, "num_tokens": 331254236.0, "step": 10115 }, { "epoch": 0.839207231113691, "grad_norm": 0.9387226104736328, "learning_rate": 7.678922479316025e-06, "loss": 1.129, "mean_token_accuracy": 0.7058895394206047, "num_tokens": 331418076.0, "step": 10120 }, { "epoch": 0.8396218591923045, "grad_norm": 0.9172439575195312, "learning_rate": 7.640430463376214e-06, "loss": 1.1417, "mean_token_accuracy": 0.7037695482373237, "num_tokens": 331581916.0, "step": 10125 }, { "epoch": 0.840036487270918, "grad_norm": 0.8944951891899109, "learning_rate": 7.602027181753302e-06, "loss": 1.14, "mean_token_accuracy": 0.7064210638403893, "num_tokens": 331745756.0, "step": 10130 }, { "epoch": 0.8404511153495314, "grad_norm": 0.9381096959114075, "learning_rate": 7.5637127148940164e-06, "loss": 1.0607, "mean_token_accuracy": 0.7156891539692879, "num_tokens": 331909596.0, "step": 10135 }, { "epoch": 0.8408657434281449, "grad_norm": 0.9678494334220886, "learning_rate": 7.525487143059046e-06, "loss": 1.1832, "mean_token_accuracy": 0.6973851352930069, "num_tokens": 332073436.0, "step": 10140 }, { "epoch": 0.8412803715067584, "grad_norm": 0.9401522874832153, "learning_rate": 7.487350546322858e-06, "loss": 1.1335, "mean_token_accuracy": 0.7039284020662308, "num_tokens": 332237276.0, "step": 10145 }, { "epoch": 0.841694999585372, "grad_norm": 0.9201824069023132, "learning_rate": 7.449303004573538e-06, "loss": 1.1038, "mean_token_accuracy": 0.7088831886649132, "num_tokens": 332401116.0, "step": 10150 }, { "epoch": 0.8421096276639854, "grad_norm": 0.9344897866249084, "learning_rate": 7.4113445975126205e-06, "loss": 1.1722, "mean_token_accuracy": 0.7006353884935379, "num_tokens": 332564956.0, "step": 10155 }, { "epoch": 0.8425242557425989, "grad_norm": 0.9061588644981384, "learning_rate": 7.373475404654917e-06, "loss": 1.0887, "mean_token_accuracy": 0.7113880768418313, "num_tokens": 332728796.0, "step": 10160 }, { "epoch": 0.8429388838212124, "grad_norm": 0.913648784160614, "learning_rate": 7.335695505328366e-06, "loss": 1.2308, "mean_token_accuracy": 0.6860153913497925, "num_tokens": 332892636.0, "step": 10165 }, { "epoch": 0.8433535118998259, "grad_norm": 0.9222296476364136, "learning_rate": 7.298004978673817e-06, "loss": 1.1789, "mean_token_accuracy": 0.6969758063554764, "num_tokens": 333056476.0, "step": 10170 }, { "epoch": 0.8437681399784394, "grad_norm": 0.9109228253364563, "learning_rate": 7.260403903644969e-06, "loss": 1.0364, "mean_token_accuracy": 0.7219635859131813, "num_tokens": 333220316.0, "step": 10175 }, { "epoch": 0.8441827680570528, "grad_norm": 0.9047378301620483, "learning_rate": 7.222892359008082e-06, "loss": 1.1158, "mean_token_accuracy": 0.709744618833065, "num_tokens": 333384156.0, "step": 10180 }, { "epoch": 0.8445973961356663, "grad_norm": 0.8748897910118103, "learning_rate": 7.185470423341906e-06, "loss": 1.0768, "mean_token_accuracy": 0.7150476530194283, "num_tokens": 333547996.0, "step": 10185 }, { "epoch": 0.8450120242142798, "grad_norm": 0.916329026222229, "learning_rate": 7.148138175037427e-06, "loss": 1.133, "mean_token_accuracy": 0.7057490259408951, "num_tokens": 333711836.0, "step": 10190 }, { "epoch": 0.8454266522928933, "grad_norm": 0.8952322006225586, "learning_rate": 7.110895692297825e-06, "loss": 1.0881, "mean_token_accuracy": 0.7132270276546478, "num_tokens": 333875676.0, "step": 10195 }, { "epoch": 0.8458412803715067, "grad_norm": 0.9101585149765015, "learning_rate": 7.0737430531381984e-06, "loss": 1.1821, "mean_token_accuracy": 0.6984176456928253, "num_tokens": 334039516.0, "step": 10200 }, { "epoch": 0.8462559084501202, "grad_norm": 0.9506310820579529, "learning_rate": 7.036680335385426e-06, "loss": 1.1783, "mean_token_accuracy": 0.6965603157877922, "num_tokens": 334202508.0, "step": 10205 }, { "epoch": 0.8466705365287337, "grad_norm": 0.9565568566322327, "learning_rate": 6.999707616678064e-06, "loss": 1.1938, "mean_token_accuracy": 0.693558556586504, "num_tokens": 334366062.0, "step": 10210 }, { "epoch": 0.8470851646073472, "grad_norm": 0.9096312522888184, "learning_rate": 6.962824974466131e-06, "loss": 1.0899, "mean_token_accuracy": 0.7146627560257912, "num_tokens": 334529902.0, "step": 10215 }, { "epoch": 0.8474997926859607, "grad_norm": 0.9432851672172546, "learning_rate": 6.926032486010909e-06, "loss": 1.1197, "mean_token_accuracy": 0.707447449862957, "num_tokens": 334693742.0, "step": 10220 }, { "epoch": 0.8479144207645741, "grad_norm": 0.9380450248718262, "learning_rate": 6.889330228384872e-06, "loss": 1.1292, "mean_token_accuracy": 0.7084066480398178, "num_tokens": 334857582.0, "step": 10225 }, { "epoch": 0.8483290488431876, "grad_norm": 0.9398831725120544, "learning_rate": 6.8527182784714925e-06, "loss": 1.1497, "mean_token_accuracy": 0.7017961874604225, "num_tokens": 335021422.0, "step": 10230 }, { "epoch": 0.8487436769218012, "grad_norm": 0.9589568376541138, "learning_rate": 6.816196712965012e-06, "loss": 1.0939, "mean_token_accuracy": 0.7131842628121376, "num_tokens": 335185262.0, "step": 10235 }, { "epoch": 0.8491583050004147, "grad_norm": 0.9200323820114136, "learning_rate": 6.779765608370381e-06, "loss": 1.1777, "mean_token_accuracy": 0.6959616333246231, "num_tokens": 335349102.0, "step": 10240 }, { "epoch": 0.8495729330790281, "grad_norm": 0.8970324993133545, "learning_rate": 6.743425041003032e-06, "loss": 1.1821, "mean_token_accuracy": 0.6969391494989395, "num_tokens": 335512942.0, "step": 10245 }, { "epoch": 0.8499875611576416, "grad_norm": 0.9432082176208496, "learning_rate": 6.70717508698876e-06, "loss": 1.1291, "mean_token_accuracy": 0.7087915413081646, "num_tokens": 335676782.0, "step": 10250 }, { "epoch": 0.8504021892362551, "grad_norm": 0.9453873038291931, "learning_rate": 6.6710158222635214e-06, "loss": 1.0524, "mean_token_accuracy": 0.71732037961483, "num_tokens": 335840622.0, "step": 10255 }, { "epoch": 0.8508168173148686, "grad_norm": 0.9753464460372925, "learning_rate": 6.634947322573315e-06, "loss": 1.1827, "mean_token_accuracy": 0.6988941878080368, "num_tokens": 336004462.0, "step": 10260 }, { "epoch": 0.8512314453934821, "grad_norm": 0.8873828649520874, "learning_rate": 6.5989696634739975e-06, "loss": 1.1724, "mean_token_accuracy": 0.6980716556310653, "num_tokens": 336167735.0, "step": 10265 }, { "epoch": 0.8516460734720955, "grad_norm": 0.843457818031311, "learning_rate": 6.563082920331143e-06, "loss": 1.061, "mean_token_accuracy": 0.7177908137440682, "num_tokens": 336331575.0, "step": 10270 }, { "epoch": 0.852060701550709, "grad_norm": 0.9610382318496704, "learning_rate": 6.527287168319857e-06, "loss": 1.1429, "mean_token_accuracy": 0.7072542741894722, "num_tokens": 336494655.0, "step": 10275 }, { "epoch": 0.8524753296293225, "grad_norm": 0.9118860960006714, "learning_rate": 6.491582482424663e-06, "loss": 1.1218, "mean_token_accuracy": 0.7077346026897431, "num_tokens": 336658495.0, "step": 10280 }, { "epoch": 0.852889957707936, "grad_norm": 0.9611846208572388, "learning_rate": 6.455968937439299e-06, "loss": 1.1314, "mean_token_accuracy": 0.7065094336867332, "num_tokens": 336821488.0, "step": 10285 }, { "epoch": 0.8533045857865494, "grad_norm": 0.9499552845954895, "learning_rate": 6.4204466079666016e-06, "loss": 1.1761, "mean_token_accuracy": 0.7020955517888069, "num_tokens": 336985328.0, "step": 10290 }, { "epoch": 0.8537192138651629, "grad_norm": 0.8805335164070129, "learning_rate": 6.385015568418307e-06, "loss": 1.1255, "mean_token_accuracy": 0.7115713566541672, "num_tokens": 337149168.0, "step": 10295 }, { "epoch": 0.8541338419437764, "grad_norm": 0.9237547516822815, "learning_rate": 6.349675893014933e-06, "loss": 1.1945, "mean_token_accuracy": 0.6932795748114586, "num_tokens": 337313008.0, "step": 10300 }, { "epoch": 0.8545484700223899, "grad_norm": 0.9319496750831604, "learning_rate": 6.3144276557856155e-06, "loss": 1.1413, "mean_token_accuracy": 0.7031341657042504, "num_tokens": 337476848.0, "step": 10305 }, { "epoch": 0.8549630981010033, "grad_norm": 0.9150850176811218, "learning_rate": 6.279270930567943e-06, "loss": 1.1264, "mean_token_accuracy": 0.7067143246531487, "num_tokens": 337640688.0, "step": 10310 }, { "epoch": 0.8553777261796169, "grad_norm": 0.9051622152328491, "learning_rate": 6.244205791007768e-06, "loss": 1.1661, "mean_token_accuracy": 0.699865597486496, "num_tokens": 337804528.0, "step": 10315 }, { "epoch": 0.8557923542582304, "grad_norm": 0.8876291513442993, "learning_rate": 6.209232310559149e-06, "loss": 1.0941, "mean_token_accuracy": 0.7114613935351372, "num_tokens": 337968368.0, "step": 10320 }, { "epoch": 0.8562069823368439, "grad_norm": 0.9366981983184814, "learning_rate": 6.1743505624841155e-06, "loss": 1.0545, "mean_token_accuracy": 0.719440370798111, "num_tokens": 338132208.0, "step": 10325 }, { "epoch": 0.8566216104154574, "grad_norm": 0.9815709590911865, "learning_rate": 6.139560619852524e-06, "loss": 1.1019, "mean_token_accuracy": 0.7128482386469841, "num_tokens": 338296048.0, "step": 10330 }, { "epoch": 0.8570362384940708, "grad_norm": 0.9242997765541077, "learning_rate": 6.104862555541935e-06, "loss": 1.0545, "mean_token_accuracy": 0.7165017127990723, "num_tokens": 338459888.0, "step": 10335 }, { "epoch": 0.8574508665726843, "grad_norm": 0.9416835308074951, "learning_rate": 6.070256442237426e-06, "loss": 1.1271, "mean_token_accuracy": 0.7058345556259156, "num_tokens": 338623728.0, "step": 10340 }, { "epoch": 0.8578654946512978, "grad_norm": 0.9095317125320435, "learning_rate": 6.0357423524314896e-06, "loss": 1.0937, "mean_token_accuracy": 0.712644311785698, "num_tokens": 338786808.0, "step": 10345 }, { "epoch": 0.8582801227299113, "grad_norm": 0.9168542623519897, "learning_rate": 6.001320358423784e-06, "loss": 1.1875, "mean_token_accuracy": 0.6952815085649491, "num_tokens": 338950140.0, "step": 10350 }, { "epoch": 0.8586947508085248, "grad_norm": 0.9308287501335144, "learning_rate": 5.966990532321126e-06, "loss": 1.155, "mean_token_accuracy": 0.6985923990607261, "num_tokens": 339113721.0, "step": 10355 }, { "epoch": 0.8591093788871382, "grad_norm": 0.8683541417121887, "learning_rate": 5.932752946037223e-06, "loss": 1.0512, "mean_token_accuracy": 0.7197947204113007, "num_tokens": 339277561.0, "step": 10360 }, { "epoch": 0.8595240069657517, "grad_norm": 0.9227983951568604, "learning_rate": 5.898607671292533e-06, "loss": 1.2245, "mean_token_accuracy": 0.68935117572546, "num_tokens": 339441401.0, "step": 10365 }, { "epoch": 0.8599386350443652, "grad_norm": 0.8903299570083618, "learning_rate": 5.864554779614173e-06, "loss": 1.1656, "mean_token_accuracy": 0.7003726795315742, "num_tokens": 339605241.0, "step": 10370 }, { "epoch": 0.8603532631229787, "grad_norm": 0.9057297110557556, "learning_rate": 5.83059434233576e-06, "loss": 1.0949, "mean_token_accuracy": 0.7124938845634461, "num_tokens": 339769081.0, "step": 10375 }, { "epoch": 0.8607678912015921, "grad_norm": 0.9160807728767395, "learning_rate": 5.796726430597177e-06, "loss": 1.1551, "mean_token_accuracy": 0.7034396409988404, "num_tokens": 339932921.0, "step": 10380 }, { "epoch": 0.8611825192802056, "grad_norm": 0.9203912615776062, "learning_rate": 5.762951115344517e-06, "loss": 1.1418, "mean_token_accuracy": 0.7056757092475892, "num_tokens": 340096761.0, "step": 10385 }, { "epoch": 0.8615971473588191, "grad_norm": 0.9417601823806763, "learning_rate": 5.72926846732994e-06, "loss": 1.2017, "mean_token_accuracy": 0.6915078178048134, "num_tokens": 340260601.0, "step": 10390 }, { "epoch": 0.8620117754374326, "grad_norm": 0.9596326351165771, "learning_rate": 5.695678557111417e-06, "loss": 1.2143, "mean_token_accuracy": 0.6903286874294281, "num_tokens": 340424441.0, "step": 10395 }, { "epoch": 0.8624264035160462, "grad_norm": 0.9278862476348877, "learning_rate": 5.6621814550526955e-06, "loss": 1.1223, "mean_token_accuracy": 0.7098545983433724, "num_tokens": 340588281.0, "step": 10400 }, { "epoch": 0.8628410315946596, "grad_norm": 0.9150340557098389, "learning_rate": 5.628777231323101e-06, "loss": 1.1154, "mean_token_accuracy": 0.7060789301991462, "num_tokens": 340752121.0, "step": 10405 }, { "epoch": 0.8632556596732731, "grad_norm": 0.9144976139068604, "learning_rate": 5.5954659558974275e-06, "loss": 1.1716, "mean_token_accuracy": 0.6999450191855431, "num_tokens": 340915961.0, "step": 10410 }, { "epoch": 0.8636702877518866, "grad_norm": 0.960284948348999, "learning_rate": 5.562247698555695e-06, "loss": 1.0815, "mean_token_accuracy": 0.7087146058678627, "num_tokens": 341079573.0, "step": 10415 }, { "epoch": 0.8640849158305001, "grad_norm": 0.8988045454025269, "learning_rate": 5.52912252888313e-06, "loss": 1.1377, "mean_token_accuracy": 0.7064760476350784, "num_tokens": 341243413.0, "step": 10420 }, { "epoch": 0.8644995439091135, "grad_norm": 0.905921459197998, "learning_rate": 5.496090516269936e-06, "loss": 1.1608, "mean_token_accuracy": 0.7029386609792709, "num_tokens": 341407253.0, "step": 10425 }, { "epoch": 0.864914171987727, "grad_norm": 0.901429295539856, "learning_rate": 5.4631517299111755e-06, "loss": 1.2206, "mean_token_accuracy": 0.691397850215435, "num_tokens": 341571093.0, "step": 10430 }, { "epoch": 0.8653288000663405, "grad_norm": 0.8878471255302429, "learning_rate": 5.430306238806626e-06, "loss": 1.029, "mean_token_accuracy": 0.7252299129962921, "num_tokens": 341734914.0, "step": 10435 }, { "epoch": 0.865743428144954, "grad_norm": 0.9485891461372375, "learning_rate": 5.397554111760617e-06, "loss": 1.1296, "mean_token_accuracy": 0.7043988257646561, "num_tokens": 341898754.0, "step": 10440 }, { "epoch": 0.8661580562235675, "grad_norm": 0.8997884392738342, "learning_rate": 5.364895417381921e-06, "loss": 1.1871, "mean_token_accuracy": 0.6963770732283592, "num_tokens": 342062594.0, "step": 10445 }, { "epoch": 0.8665726843021809, "grad_norm": 0.9660028219223022, "learning_rate": 5.33233022408357e-06, "loss": 1.1105, "mean_token_accuracy": 0.7105938419699669, "num_tokens": 342226434.0, "step": 10450 }, { "epoch": 0.8669873123807944, "grad_norm": 0.9538041353225708, "learning_rate": 5.299858600082752e-06, "loss": 1.1144, "mean_token_accuracy": 0.7105266362428665, "num_tokens": 342390274.0, "step": 10455 }, { "epoch": 0.8674019404594079, "grad_norm": 0.8947070240974426, "learning_rate": 5.267480613400616e-06, "loss": 1.1334, "mean_token_accuracy": 0.7077346041798591, "num_tokens": 342554114.0, "step": 10460 }, { "epoch": 0.8678165685380214, "grad_norm": 0.8957281112670898, "learning_rate": 5.235196331862196e-06, "loss": 1.0513, "mean_token_accuracy": 0.7214931592345237, "num_tokens": 342717954.0, "step": 10465 }, { "epoch": 0.8682311966166348, "grad_norm": 0.920192301273346, "learning_rate": 5.203005823096207e-06, "loss": 1.1912, "mean_token_accuracy": 0.6949596747756004, "num_tokens": 342881794.0, "step": 10470 }, { "epoch": 0.8686458246952483, "grad_norm": 0.934984564781189, "learning_rate": 5.170909154534942e-06, "loss": 1.1365, "mean_token_accuracy": 0.7070564493536949, "num_tokens": 343045634.0, "step": 10475 }, { "epoch": 0.8690604527738618, "grad_norm": 0.8826227784156799, "learning_rate": 5.138906393414123e-06, "loss": 1.0759, "mean_token_accuracy": 0.7142350882291794, "num_tokens": 343209474.0, "step": 10480 }, { "epoch": 0.8694750808524754, "grad_norm": 0.9156180024147034, "learning_rate": 5.106997606772734e-06, "loss": 1.1517, "mean_token_accuracy": 0.7029875323176384, "num_tokens": 343373314.0, "step": 10485 }, { "epoch": 0.8698897089310889, "grad_norm": 0.9311761260032654, "learning_rate": 5.075182861452943e-06, "loss": 1.1354, "mean_token_accuracy": 0.7048264876008034, "num_tokens": 343537154.0, "step": 10490 }, { "epoch": 0.8703043370097023, "grad_norm": 0.9276612401008606, "learning_rate": 5.0434622240998595e-06, "loss": 1.1191, "mean_token_accuracy": 0.7078445747494697, "num_tokens": 343700994.0, "step": 10495 }, { "epoch": 0.8707189650883158, "grad_norm": 0.9433877468109131, "learning_rate": 5.011835761161521e-06, "loss": 1.1138, "mean_token_accuracy": 0.7077651530504226, "num_tokens": 343864834.0, "step": 10500 }, { "epoch": 0.8711335931669293, "grad_norm": 0.9424065351486206, "learning_rate": 4.980303538888664e-06, "loss": 1.1321, "mean_token_accuracy": 0.7079362139105797, "num_tokens": 344028674.0, "step": 10505 }, { "epoch": 0.8715482212455428, "grad_norm": 0.9527302384376526, "learning_rate": 4.948865623334581e-06, "loss": 1.1602, "mean_token_accuracy": 0.7004765406250953, "num_tokens": 344192514.0, "step": 10510 }, { "epoch": 0.8719628493241562, "grad_norm": 0.9701392650604248, "learning_rate": 4.917522080355064e-06, "loss": 1.1372, "mean_token_accuracy": 0.7040017083287239, "num_tokens": 344356354.0, "step": 10515 }, { "epoch": 0.8723774774027697, "grad_norm": 0.9226775169372559, "learning_rate": 4.886272975608197e-06, "loss": 1.1048, "mean_token_accuracy": 0.7104777619242668, "num_tokens": 344520194.0, "step": 10520 }, { "epoch": 0.8727921054813832, "grad_norm": 0.9319620728492737, "learning_rate": 4.855118374554202e-06, "loss": 1.1621, "mean_token_accuracy": 0.7017656415700912, "num_tokens": 344684034.0, "step": 10525 }, { "epoch": 0.8732067335599967, "grad_norm": 0.9370831251144409, "learning_rate": 4.8240583424553674e-06, "loss": 1.1468, "mean_token_accuracy": 0.7070992231369019, "num_tokens": 344847874.0, "step": 10530 }, { "epoch": 0.8736213616386101, "grad_norm": 0.9271456599235535, "learning_rate": 4.7930929443758935e-06, "loss": 1.1367, "mean_token_accuracy": 0.7005070865154266, "num_tokens": 345011714.0, "step": 10535 }, { "epoch": 0.8740359897172236, "grad_norm": 0.8970178365707397, "learning_rate": 4.762222245181719e-06, "loss": 1.1148, "mean_token_accuracy": 0.7075757578015327, "num_tokens": 345175554.0, "step": 10540 }, { "epoch": 0.8744506177958371, "grad_norm": 0.9386332035064697, "learning_rate": 4.7314463095404e-06, "loss": 1.1563, "mean_token_accuracy": 0.699101909995079, "num_tokens": 345339394.0, "step": 10545 }, { "epoch": 0.8748652458744506, "grad_norm": 0.9848224520683289, "learning_rate": 4.700765201920998e-06, "loss": 1.2436, "mean_token_accuracy": 0.685410051047802, "num_tokens": 345502571.0, "step": 10550 }, { "epoch": 0.8752798739530641, "grad_norm": 0.929392397403717, "learning_rate": 4.670178986593948e-06, "loss": 1.1694, "mean_token_accuracy": 0.700164957344532, "num_tokens": 345666411.0, "step": 10555 }, { "epoch": 0.8756945020316775, "grad_norm": 0.9089635610580444, "learning_rate": 4.639687727630865e-06, "loss": 1.1217, "mean_token_accuracy": 0.7107526905834675, "num_tokens": 345830251.0, "step": 10560 }, { "epoch": 0.8761091301102911, "grad_norm": 0.9183827638626099, "learning_rate": 4.609291488904472e-06, "loss": 1.1095, "mean_token_accuracy": 0.7111681342124939, "num_tokens": 345994091.0, "step": 10565 }, { "epoch": 0.8765237581889046, "grad_norm": 0.9086915850639343, "learning_rate": 4.578990334088468e-06, "loss": 1.0955, "mean_token_accuracy": 0.7092069864273072, "num_tokens": 346157931.0, "step": 10570 }, { "epoch": 0.8769383862675181, "grad_norm": 0.8890146613121033, "learning_rate": 4.5487843266573235e-06, "loss": 1.0894, "mean_token_accuracy": 0.7138990744948387, "num_tokens": 346321771.0, "step": 10575 }, { "epoch": 0.8773530143461316, "grad_norm": 0.9215315580368042, "learning_rate": 4.518673529886231e-06, "loss": 1.0649, "mean_token_accuracy": 0.7202284976840019, "num_tokens": 346485611.0, "step": 10580 }, { "epoch": 0.877767642424745, "grad_norm": 0.9251281023025513, "learning_rate": 4.488658006850915e-06, "loss": 1.1543, "mean_token_accuracy": 0.7058259814977645, "num_tokens": 346649398.0, "step": 10585 }, { "epoch": 0.8781822705033585, "grad_norm": 0.8954192996025085, "learning_rate": 4.458737820427545e-06, "loss": 1.1213, "mean_token_accuracy": 0.7065554738044739, "num_tokens": 346813238.0, "step": 10590 }, { "epoch": 0.878596898581972, "grad_norm": 0.9286209940910339, "learning_rate": 4.428913033292559e-06, "loss": 1.1571, "mean_token_accuracy": 0.6992729738354683, "num_tokens": 346977078.0, "step": 10595 }, { "epoch": 0.8790115266605855, "grad_norm": 0.9251374006271362, "learning_rate": 4.399183707922566e-06, "loss": 1.1796, "mean_token_accuracy": 0.6961632415652275, "num_tokens": 347140918.0, "step": 10600 }, { "epoch": 0.8794261547391989, "grad_norm": 0.9257077574729919, "learning_rate": 4.369549906594195e-06, "loss": 1.1771, "mean_token_accuracy": 0.6977761521935463, "num_tokens": 347304758.0, "step": 10605 }, { "epoch": 0.8798407828178124, "grad_norm": 0.9449937343597412, "learning_rate": 4.340011691383983e-06, "loss": 1.1267, "mean_token_accuracy": 0.7064149603247643, "num_tokens": 347468598.0, "step": 10610 }, { "epoch": 0.8802554108964259, "grad_norm": 0.9059110283851624, "learning_rate": 4.310569124168229e-06, "loss": 1.164, "mean_token_accuracy": 0.7002016142010689, "num_tokens": 347632438.0, "step": 10615 }, { "epoch": 0.8806700389750394, "grad_norm": 0.9088716506958008, "learning_rate": 4.281222266622864e-06, "loss": 1.0961, "mean_token_accuracy": 0.7130865097045899, "num_tokens": 347796278.0, "step": 10620 }, { "epoch": 0.8810846670536528, "grad_norm": 0.8918694257736206, "learning_rate": 4.25197118022333e-06, "loss": 1.1256, "mean_token_accuracy": 0.7093108505010605, "num_tokens": 347960118.0, "step": 10625 }, { "epoch": 0.8814992951322663, "grad_norm": 0.925315797328949, "learning_rate": 4.222815926244455e-06, "loss": 1.2178, "mean_token_accuracy": 0.690927417576313, "num_tokens": 348123958.0, "step": 10630 }, { "epoch": 0.8819139232108798, "grad_norm": 0.8862684965133667, "learning_rate": 4.193756565760315e-06, "loss": 1.1023, "mean_token_accuracy": 0.711528591811657, "num_tokens": 348287798.0, "step": 10635 }, { "epoch": 0.8823285512894933, "grad_norm": 0.9163962602615356, "learning_rate": 4.164793159644109e-06, "loss": 1.0738, "mean_token_accuracy": 0.7229288846254349, "num_tokens": 348451638.0, "step": 10640 }, { "epoch": 0.8827431793681068, "grad_norm": 0.9359307289123535, "learning_rate": 4.135925768568028e-06, "loss": 1.1612, "mean_token_accuracy": 0.7009225338697433, "num_tokens": 348615478.0, "step": 10645 }, { "epoch": 0.8831578074467203, "grad_norm": 0.9241249561309814, "learning_rate": 4.107154453003148e-06, "loss": 1.1954, "mean_token_accuracy": 0.6945578485727311, "num_tokens": 348778327.0, "step": 10650 }, { "epoch": 0.8835724355253338, "grad_norm": 0.875142514705658, "learning_rate": 4.078479273219249e-06, "loss": 1.0893, "mean_token_accuracy": 0.7108382225036621, "num_tokens": 348942167.0, "step": 10655 }, { "epoch": 0.8839870636039473, "grad_norm": 0.9249883890151978, "learning_rate": 4.049900289284781e-06, "loss": 1.1509, "mean_token_accuracy": 0.7056634902954102, "num_tokens": 349106007.0, "step": 10660 }, { "epoch": 0.8844016916825608, "grad_norm": 0.9233696460723877, "learning_rate": 4.021417561066649e-06, "loss": 1.0809, "mean_token_accuracy": 0.7163734093308449, "num_tokens": 349269847.0, "step": 10665 }, { "epoch": 0.8848163197611743, "grad_norm": 0.8704918026924133, "learning_rate": 3.993031148230114e-06, "loss": 1.1255, "mean_token_accuracy": 0.7117546945810318, "num_tokens": 349433611.0, "step": 10670 }, { "epoch": 0.8852309478397877, "grad_norm": 0.9347735643386841, "learning_rate": 3.964741110238695e-06, "loss": 1.1009, "mean_token_accuracy": 0.7121028825640678, "num_tokens": 349597451.0, "step": 10675 }, { "epoch": 0.8856455759184012, "grad_norm": 0.9115481972694397, "learning_rate": 3.936547506354038e-06, "loss": 1.1182, "mean_token_accuracy": 0.7080034166574478, "num_tokens": 349761291.0, "step": 10680 }, { "epoch": 0.8860602039970147, "grad_norm": 0.92775958776474, "learning_rate": 3.908450395635771e-06, "loss": 1.1146, "mean_token_accuracy": 0.7106427147984504, "num_tokens": 349925131.0, "step": 10685 }, { "epoch": 0.8864748320756282, "grad_norm": 0.8857362270355225, "learning_rate": 3.880449836941352e-06, "loss": 1.0615, "mean_token_accuracy": 0.7175005540251732, "num_tokens": 350088720.0, "step": 10690 }, { "epoch": 0.8868894601542416, "grad_norm": 0.9340742230415344, "learning_rate": 3.8525458889260454e-06, "loss": 1.0995, "mean_token_accuracy": 0.712231183052063, "num_tokens": 350252560.0, "step": 10695 }, { "epoch": 0.8873040882328551, "grad_norm": 0.9503607153892517, "learning_rate": 3.8247386100427e-06, "loss": 1.1441, "mean_token_accuracy": 0.7080706223845482, "num_tokens": 350416400.0, "step": 10700 }, { "epoch": 0.8877187163114686, "grad_norm": 0.9132021069526672, "learning_rate": 3.7970280585416574e-06, "loss": 1.1013, "mean_token_accuracy": 0.7119199618697166, "num_tokens": 350579808.0, "step": 10705 }, { "epoch": 0.8881333443900821, "grad_norm": 0.9462399482727051, "learning_rate": 3.7694142924706467e-06, "loss": 1.0906, "mean_token_accuracy": 0.7143477022647857, "num_tokens": 350743000.0, "step": 10710 }, { "epoch": 0.8885479724686955, "grad_norm": 0.9100886583328247, "learning_rate": 3.741897369674674e-06, "loss": 1.1007, "mean_token_accuracy": 0.7157502487301827, "num_tokens": 350906840.0, "step": 10715 }, { "epoch": 0.888962600547309, "grad_norm": 0.9081605672836304, "learning_rate": 3.714477347795836e-06, "loss": 1.1059, "mean_token_accuracy": 0.7109359741210938, "num_tokens": 351070680.0, "step": 10720 }, { "epoch": 0.8893772286259225, "grad_norm": 0.883823037147522, "learning_rate": 3.6871542842732755e-06, "loss": 1.0441, "mean_token_accuracy": 0.7192143186926842, "num_tokens": 351234520.0, "step": 10725 }, { "epoch": 0.889791856704536, "grad_norm": 0.9276925325393677, "learning_rate": 3.659928236343013e-06, "loss": 1.0591, "mean_token_accuracy": 0.7205584034323692, "num_tokens": 351398360.0, "step": 10730 }, { "epoch": 0.8902064847831496, "grad_norm": 0.9058579802513123, "learning_rate": 3.6327992610378505e-06, "loss": 1.139, "mean_token_accuracy": 0.704841522872448, "num_tokens": 351561184.0, "step": 10735 }, { "epoch": 0.890621112861763, "grad_norm": 0.9753690361976624, "learning_rate": 3.6057674151872336e-06, "loss": 1.1236, "mean_token_accuracy": 0.70912756472826, "num_tokens": 351725024.0, "step": 10740 }, { "epoch": 0.8910357409403765, "grad_norm": 0.9092490673065186, "learning_rate": 3.578832755417155e-06, "loss": 1.0853, "mean_token_accuracy": 0.7153225794434548, "num_tokens": 351888864.0, "step": 10745 }, { "epoch": 0.89145036901899, "grad_norm": 0.8844450116157532, "learning_rate": 3.5519953381500157e-06, "loss": 1.1252, "mean_token_accuracy": 0.7079545482993126, "num_tokens": 352052704.0, "step": 10750 }, { "epoch": 0.8918649970976035, "grad_norm": 0.8999067544937134, "learning_rate": 3.5252552196045065e-06, "loss": 1.0841, "mean_token_accuracy": 0.7133003413677216, "num_tokens": 352216544.0, "step": 10755 }, { "epoch": 0.892279625176217, "grad_norm": 0.9216794967651367, "learning_rate": 3.4986124557955137e-06, "loss": 1.1149, "mean_token_accuracy": 0.7114186227321625, "num_tokens": 352380384.0, "step": 10760 }, { "epoch": 0.8926942532548304, "grad_norm": 0.9328907132148743, "learning_rate": 3.472067102533977e-06, "loss": 1.2421, "mean_token_accuracy": 0.6862288989126683, "num_tokens": 352543574.0, "step": 10765 }, { "epoch": 0.8931088813334439, "grad_norm": 0.9407025575637817, "learning_rate": 3.445619215426782e-06, "loss": 1.1621, "mean_token_accuracy": 0.6999144695699215, "num_tokens": 352707414.0, "step": 10770 }, { "epoch": 0.8935235094120574, "grad_norm": 0.9398549199104309, "learning_rate": 3.4192688498766444e-06, "loss": 1.1392, "mean_token_accuracy": 0.7009042024612426, "num_tokens": 352871254.0, "step": 10775 }, { "epoch": 0.8939381374906709, "grad_norm": 0.9020984768867493, "learning_rate": 3.3930160610819937e-06, "loss": 1.196, "mean_token_accuracy": 0.6935483887791634, "num_tokens": 353035094.0, "step": 10780 }, { "epoch": 0.8943527655692843, "grad_norm": 0.8914738893508911, "learning_rate": 3.366860904036856e-06, "loss": 1.0662, "mean_token_accuracy": 0.7212732180953025, "num_tokens": 353198934.0, "step": 10785 }, { "epoch": 0.8947673936478978, "grad_norm": 0.9360032677650452, "learning_rate": 3.340803433530737e-06, "loss": 1.1005, "mean_token_accuracy": 0.7113025352358818, "num_tokens": 353362774.0, "step": 10790 }, { "epoch": 0.8951820217265113, "grad_norm": 0.9081243872642517, "learning_rate": 3.3148437041485236e-06, "loss": 1.1593, "mean_token_accuracy": 0.7005865097045898, "num_tokens": 353526614.0, "step": 10795 }, { "epoch": 0.8955966498051248, "grad_norm": 0.9637793302536011, "learning_rate": 3.288981770270333e-06, "loss": 1.1613, "mean_token_accuracy": 0.7011241421103478, "num_tokens": 353690454.0, "step": 10800 }, { "epoch": 0.8960112778837382, "grad_norm": 0.9164398908615112, "learning_rate": 3.263217686071435e-06, "loss": 1.1734, "mean_token_accuracy": 0.6994990259408951, "num_tokens": 353854294.0, "step": 10805 }, { "epoch": 0.8964259059623517, "grad_norm": 0.8988476395606995, "learning_rate": 3.237551505522135e-06, "loss": 1.1431, "mean_token_accuracy": 0.7030303031206131, "num_tokens": 354018134.0, "step": 10810 }, { "epoch": 0.8968405340409653, "grad_norm": 0.8833062648773193, "learning_rate": 3.211983282387615e-06, "loss": 1.1183, "mean_token_accuracy": 0.7093902751803398, "num_tokens": 354181974.0, "step": 10815 }, { "epoch": 0.8972551621195788, "grad_norm": 0.9248128533363342, "learning_rate": 3.1865130702278977e-06, "loss": 1.1218, "mean_token_accuracy": 0.7110092908143997, "num_tokens": 354345814.0, "step": 10820 }, { "epoch": 0.8976697901981923, "grad_norm": 0.9079005122184753, "learning_rate": 3.1611409223976817e-06, "loss": 1.1326, "mean_token_accuracy": 0.7085654929280281, "num_tokens": 354509654.0, "step": 10825 }, { "epoch": 0.8980844182768057, "grad_norm": 0.8842004537582397, "learning_rate": 3.135866892046241e-06, "loss": 1.1492, "mean_token_accuracy": 0.7079117774963379, "num_tokens": 354673494.0, "step": 10830 }, { "epoch": 0.8984990463554192, "grad_norm": 0.8912933468818665, "learning_rate": 3.11069103211728e-06, "loss": 1.1562, "mean_token_accuracy": 0.7026881724596024, "num_tokens": 354837334.0, "step": 10835 }, { "epoch": 0.8989136744340327, "grad_norm": 0.9348641633987427, "learning_rate": 3.0856133953489184e-06, "loss": 1.233, "mean_token_accuracy": 0.6882453575730324, "num_tokens": 355001174.0, "step": 10840 }, { "epoch": 0.8993283025126462, "grad_norm": 0.9215148687362671, "learning_rate": 3.0606340342734853e-06, "loss": 1.1124, "mean_token_accuracy": 0.7089076235890388, "num_tokens": 355165014.0, "step": 10845 }, { "epoch": 0.8997429305912596, "grad_norm": 0.9471371173858643, "learning_rate": 3.035753001217423e-06, "loss": 1.1136, "mean_token_accuracy": 0.7102561622858048, "num_tokens": 355327852.0, "step": 10850 }, { "epoch": 0.9001575586698731, "grad_norm": 0.9406538009643555, "learning_rate": 3.0109703483012452e-06, "loss": 1.1024, "mean_token_accuracy": 0.7098362594842911, "num_tokens": 355491692.0, "step": 10855 }, { "epoch": 0.9005721867484866, "grad_norm": 0.8713740110397339, "learning_rate": 2.9862861274393474e-06, "loss": 1.1124, "mean_token_accuracy": 0.7118707180023194, "num_tokens": 355655532.0, "step": 10860 }, { "epoch": 0.9009868148271001, "grad_norm": 0.9907150268554688, "learning_rate": 2.9617003903399333e-06, "loss": 1.1423, "mean_token_accuracy": 0.704325507581234, "num_tokens": 355819372.0, "step": 10865 }, { "epoch": 0.9014014429057136, "grad_norm": 0.9606465101242065, "learning_rate": 2.9372131885049058e-06, "loss": 1.1436, "mean_token_accuracy": 0.7018633931875229, "num_tokens": 355983212.0, "step": 10870 }, { "epoch": 0.901816070984327, "grad_norm": 0.9174510836601257, "learning_rate": 2.912824573229783e-06, "loss": 1.1764, "mean_token_accuracy": 0.7021016642451287, "num_tokens": 356147052.0, "step": 10875 }, { "epoch": 0.9022306990629405, "grad_norm": 0.9237547516822815, "learning_rate": 2.8885345956035205e-06, "loss": 1.1415, "mean_token_accuracy": 0.7037756577134132, "num_tokens": 356310892.0, "step": 10880 }, { "epoch": 0.902645327141554, "grad_norm": 0.929188072681427, "learning_rate": 2.8643433065084824e-06, "loss": 1.1255, "mean_token_accuracy": 0.7043377324938774, "num_tokens": 356474732.0, "step": 10885 }, { "epoch": 0.9030599552201675, "grad_norm": 0.9025934338569641, "learning_rate": 2.840250756620272e-06, "loss": 1.1304, "mean_token_accuracy": 0.704802057147026, "num_tokens": 356638572.0, "step": 10890 }, { "epoch": 0.9034745832987809, "grad_norm": 0.920540988445282, "learning_rate": 2.816256996407707e-06, "loss": 1.1484, "mean_token_accuracy": 0.702421247959137, "num_tokens": 356801251.0, "step": 10895 }, { "epoch": 0.9038892113773945, "grad_norm": 0.935437798500061, "learning_rate": 2.7923620761325986e-06, "loss": 1.0632, "mean_token_accuracy": 0.7161219537258148, "num_tokens": 356964438.0, "step": 10900 }, { "epoch": 0.904303839456008, "grad_norm": 0.9342902898788452, "learning_rate": 2.768566045849752e-06, "loss": 1.1382, "mean_token_accuracy": 0.7036717966198921, "num_tokens": 357128278.0, "step": 10905 }, { "epoch": 0.9047184675346215, "grad_norm": 0.979550302028656, "learning_rate": 2.7448689554067985e-06, "loss": 1.2227, "mean_token_accuracy": 0.6927358254790306, "num_tokens": 357292118.0, "step": 10910 }, { "epoch": 0.905133095613235, "grad_norm": 0.9403480291366577, "learning_rate": 2.7212708544441244e-06, "loss": 1.1697, "mean_token_accuracy": 0.7031769335269928, "num_tokens": 357455958.0, "step": 10915 }, { "epoch": 0.9055477236918484, "grad_norm": 0.8986895084381104, "learning_rate": 2.697771792394743e-06, "loss": 1.1182, "mean_token_accuracy": 0.7077284932136536, "num_tokens": 357619798.0, "step": 10920 }, { "epoch": 0.9059623517704619, "grad_norm": 0.9211082458496094, "learning_rate": 2.6743718184842058e-06, "loss": 1.0705, "mean_token_accuracy": 0.7168255105614663, "num_tokens": 357783638.0, "step": 10925 }, { "epoch": 0.9063769798490754, "grad_norm": 0.8845340609550476, "learning_rate": 2.6510709817305024e-06, "loss": 1.1135, "mean_token_accuracy": 0.7054383754730225, "num_tokens": 357946818.0, "step": 10930 }, { "epoch": 0.9067916079276889, "grad_norm": 0.9751585125923157, "learning_rate": 2.627869330943944e-06, "loss": 1.1206, "mean_token_accuracy": 0.7069648057222366, "num_tokens": 358110658.0, "step": 10935 }, { "epoch": 0.9072062360063023, "grad_norm": 0.9455736875534058, "learning_rate": 2.6047669147270635e-06, "loss": 1.1558, "mean_token_accuracy": 0.7025293216109276, "num_tokens": 358274498.0, "step": 10940 }, { "epoch": 0.9076208640849158, "grad_norm": 0.9326270222663879, "learning_rate": 2.581763781474533e-06, "loss": 1.1423, "mean_token_accuracy": 0.7030913949012756, "num_tokens": 358438338.0, "step": 10945 }, { "epoch": 0.9080354921635293, "grad_norm": 0.9129369258880615, "learning_rate": 2.5588599793730405e-06, "loss": 1.1019, "mean_token_accuracy": 0.7141312330961227, "num_tokens": 358602178.0, "step": 10950 }, { "epoch": 0.9084501202421428, "grad_norm": 0.9351323246955872, "learning_rate": 2.5360555564011903e-06, "loss": 1.1645, "mean_token_accuracy": 0.7004582151770592, "num_tokens": 358766018.0, "step": 10955 }, { "epoch": 0.9088647483207563, "grad_norm": 0.8951199054718018, "learning_rate": 2.513350560329403e-06, "loss": 1.0588, "mean_token_accuracy": 0.7205950617790222, "num_tokens": 358929858.0, "step": 10960 }, { "epoch": 0.9092793763993697, "grad_norm": 0.8859832882881165, "learning_rate": 2.4907450387198495e-06, "loss": 1.0844, "mean_token_accuracy": 0.7139895841479301, "num_tokens": 359092807.0, "step": 10965 }, { "epoch": 0.9096940044779832, "grad_norm": 0.9251822829246521, "learning_rate": 2.4682390389262956e-06, "loss": 1.0616, "mean_token_accuracy": 0.7172715052962303, "num_tokens": 359256647.0, "step": 10970 }, { "epoch": 0.9101086325565967, "grad_norm": 0.9385135173797607, "learning_rate": 2.4458326080940398e-06, "loss": 1.0476, "mean_token_accuracy": 0.7187072306871414, "num_tokens": 359420487.0, "step": 10975 }, { "epoch": 0.9105232606352102, "grad_norm": 0.9257782101631165, "learning_rate": 2.423525793159809e-06, "loss": 1.1778, "mean_token_accuracy": 0.6981915965676307, "num_tokens": 359584327.0, "step": 10980 }, { "epoch": 0.9109378887138238, "grad_norm": 0.9558985233306885, "learning_rate": 2.401318640851641e-06, "loss": 1.2147, "mean_token_accuracy": 0.6936339169740677, "num_tokens": 359748167.0, "step": 10985 }, { "epoch": 0.9113525167924372, "grad_norm": 0.9049248099327087, "learning_rate": 2.37921119768883e-06, "loss": 1.1101, "mean_token_accuracy": 0.7097690656781197, "num_tokens": 359912007.0, "step": 10990 }, { "epoch": 0.9117671448710507, "grad_norm": 0.9582362174987793, "learning_rate": 2.3572035099817535e-06, "loss": 1.06, "mean_token_accuracy": 0.720039102435112, "num_tokens": 360075847.0, "step": 10995 }, { "epoch": 0.9121817729496642, "grad_norm": 0.9491180181503296, "learning_rate": 2.335295623831868e-06, "loss": 1.1178, "mean_token_accuracy": 0.7037878766655922, "num_tokens": 360239687.0, "step": 11000 }, { "epoch": 0.9125964010282777, "grad_norm": 0.9045689702033997, "learning_rate": 2.313487585131563e-06, "loss": 1.1507, "mean_token_accuracy": 0.7044049352407455, "num_tokens": 360403527.0, "step": 11005 }, { "epoch": 0.9130110291068911, "grad_norm": 0.8995153307914734, "learning_rate": 2.291779439564029e-06, "loss": 1.1369, "mean_token_accuracy": 0.7027065053582191, "num_tokens": 360567367.0, "step": 11010 }, { "epoch": 0.9134256571855046, "grad_norm": 0.9227888584136963, "learning_rate": 2.270171232603241e-06, "loss": 1.066, "mean_token_accuracy": 0.7193304002285004, "num_tokens": 360731207.0, "step": 11015 }, { "epoch": 0.9138402852641181, "grad_norm": 0.9289051294326782, "learning_rate": 2.2486630095138184e-06, "loss": 1.1494, "mean_token_accuracy": 0.7053235292434692, "num_tokens": 360893382.0, "step": 11020 }, { "epoch": 0.9142549133427316, "grad_norm": 0.9562925100326538, "learning_rate": 2.2272548153509155e-06, "loss": 1.1945, "mean_token_accuracy": 0.694709187746048, "num_tokens": 361057222.0, "step": 11025 }, { "epoch": 0.914669541421345, "grad_norm": 0.9172911643981934, "learning_rate": 2.2059466949601594e-06, "loss": 1.1179, "mean_token_accuracy": 0.7094469025731087, "num_tokens": 361220686.0, "step": 11030 }, { "epoch": 0.9150841694999585, "grad_norm": 0.9196988940238953, "learning_rate": 2.184738692977556e-06, "loss": 1.1029, "mean_token_accuracy": 0.7084433034062385, "num_tokens": 361384526.0, "step": 11035 }, { "epoch": 0.915498797578572, "grad_norm": 0.959909975528717, "learning_rate": 2.1636308538293794e-06, "loss": 1.1153, "mean_token_accuracy": 0.7096346527338028, "num_tokens": 361548366.0, "step": 11040 }, { "epoch": 0.9159134256571855, "grad_norm": 0.9608067870140076, "learning_rate": 2.142623221732054e-06, "loss": 1.1618, "mean_token_accuracy": 0.698104539513588, "num_tokens": 361711105.0, "step": 11045 }, { "epoch": 0.916328053735799, "grad_norm": 0.8869704604148865, "learning_rate": 2.1217158406921176e-06, "loss": 1.0719, "mean_token_accuracy": 0.7176208943128586, "num_tokens": 361874425.0, "step": 11050 }, { "epoch": 0.9167426818144124, "grad_norm": 0.9388926029205322, "learning_rate": 2.1009087545061258e-06, "loss": 1.0922, "mean_token_accuracy": 0.7119195967912674, "num_tokens": 362038265.0, "step": 11055 }, { "epoch": 0.9171573098930259, "grad_norm": 0.9204244613647461, "learning_rate": 2.0802020067604843e-06, "loss": 1.0974, "mean_token_accuracy": 0.7133064493536949, "num_tokens": 362202105.0, "step": 11060 }, { "epoch": 0.9175719379716394, "grad_norm": 0.9380967617034912, "learning_rate": 2.059595640831452e-06, "loss": 1.1215, "mean_token_accuracy": 0.7061278134584427, "num_tokens": 362365945.0, "step": 11065 }, { "epoch": 0.917986566050253, "grad_norm": 0.8923094868659973, "learning_rate": 2.0390896998849996e-06, "loss": 1.1248, "mean_token_accuracy": 0.7078488484025002, "num_tokens": 362529202.0, "step": 11070 }, { "epoch": 0.9184011941288664, "grad_norm": 0.9815731644630432, "learning_rate": 2.018684226876716e-06, "loss": 1.1595, "mean_token_accuracy": 0.6996273174881935, "num_tokens": 362693042.0, "step": 11075 }, { "epoch": 0.9188158222074799, "grad_norm": 0.9461018443107605, "learning_rate": 1.9983792645517475e-06, "loss": 1.1783, "mean_token_accuracy": 0.6971441462635994, "num_tokens": 362856493.0, "step": 11080 }, { "epoch": 0.9192304502860934, "grad_norm": 0.9883350133895874, "learning_rate": 1.9781748554446867e-06, "loss": 1.1202, "mean_token_accuracy": 0.7085654959082603, "num_tokens": 363020333.0, "step": 11085 }, { "epoch": 0.9196450783647069, "grad_norm": 0.933786928653717, "learning_rate": 1.958071041879478e-06, "loss": 1.2037, "mean_token_accuracy": 0.6949841171503067, "num_tokens": 363184173.0, "step": 11090 }, { "epoch": 0.9200597064433204, "grad_norm": 0.8661555647850037, "learning_rate": 1.9380678659693563e-06, "loss": 1.1058, "mean_token_accuracy": 0.7145405665040017, "num_tokens": 363348013.0, "step": 11095 }, { "epoch": 0.9204743345219338, "grad_norm": 0.8983901143074036, "learning_rate": 1.9181653696167312e-06, "loss": 1.1112, "mean_token_accuracy": 0.7085105136036873, "num_tokens": 363511853.0, "step": 11100 }, { "epoch": 0.9208889626005473, "grad_norm": 0.9139328598976135, "learning_rate": 1.898363594513114e-06, "loss": 1.0769, "mean_token_accuracy": 0.7180901765823364, "num_tokens": 363675693.0, "step": 11105 }, { "epoch": 0.9213035906791608, "grad_norm": 0.9308121800422668, "learning_rate": 1.8786625821390236e-06, "loss": 1.1448, "mean_token_accuracy": 0.7064027398824692, "num_tokens": 363839533.0, "step": 11110 }, { "epoch": 0.9217182187577743, "grad_norm": 0.8937799334526062, "learning_rate": 1.8590623737639035e-06, "loss": 1.1182, "mean_token_accuracy": 0.7109054252505302, "num_tokens": 364003373.0, "step": 11115 }, { "epoch": 0.9221328468363877, "grad_norm": 0.8960309028625488, "learning_rate": 1.8395630104460327e-06, "loss": 1.1596, "mean_token_accuracy": 0.7020527794957161, "num_tokens": 364167213.0, "step": 11120 }, { "epoch": 0.9225474749150012, "grad_norm": 0.954889714717865, "learning_rate": 1.8201645330324479e-06, "loss": 1.0809, "mean_token_accuracy": 0.7125427678227425, "num_tokens": 364331053.0, "step": 11125 }, { "epoch": 0.9229621029936147, "grad_norm": 0.9314802885055542, "learning_rate": 1.8008669821588497e-06, "loss": 1.0989, "mean_token_accuracy": 0.7106121718883515, "num_tokens": 364494893.0, "step": 11130 }, { "epoch": 0.9233767310722282, "grad_norm": 0.9300594329833984, "learning_rate": 1.7816703982495075e-06, "loss": 1.0691, "mean_token_accuracy": 0.7181207224726677, "num_tokens": 364658733.0, "step": 11135 }, { "epoch": 0.9237913591508417, "grad_norm": 0.8461613059043884, "learning_rate": 1.7625748215171878e-06, "loss": 1.012, "mean_token_accuracy": 0.7287390038371087, "num_tokens": 364822573.0, "step": 11140 }, { "epoch": 0.9242059872294551, "grad_norm": 0.9067208766937256, "learning_rate": 1.7435802919630929e-06, "loss": 1.0838, "mean_token_accuracy": 0.7154997497797012, "num_tokens": 364986413.0, "step": 11145 }, { "epoch": 0.9246206153080687, "grad_norm": 0.9463154673576355, "learning_rate": 1.7246868493767277e-06, "loss": 1.1697, "mean_token_accuracy": 0.6987170085310936, "num_tokens": 365150253.0, "step": 11150 }, { "epoch": 0.9250352433866822, "grad_norm": 0.8903513550758362, "learning_rate": 1.7058945333358388e-06, "loss": 1.0472, "mean_token_accuracy": 0.7249511256814003, "num_tokens": 365314093.0, "step": 11155 }, { "epoch": 0.9254498714652957, "grad_norm": 0.9678393006324768, "learning_rate": 1.6872033832063538e-06, "loss": 1.0955, "mean_token_accuracy": 0.7143267408013344, "num_tokens": 365477933.0, "step": 11160 }, { "epoch": 0.9258644995439091, "grad_norm": 0.8709613680839539, "learning_rate": 1.6686134381422802e-06, "loss": 1.0566, "mean_token_accuracy": 0.7199596792459488, "num_tokens": 365641773.0, "step": 11165 }, { "epoch": 0.9262791276225226, "grad_norm": 0.9399612545967102, "learning_rate": 1.6501247370855844e-06, "loss": 1.122, "mean_token_accuracy": 0.7127504914999008, "num_tokens": 365805613.0, "step": 11170 }, { "epoch": 0.9266937557011361, "grad_norm": 0.9037859439849854, "learning_rate": 1.631737318766191e-06, "loss": 1.1935, "mean_token_accuracy": 0.6953629046678543, "num_tokens": 365969453.0, "step": 11175 }, { "epoch": 0.9271083837797496, "grad_norm": 0.925135612487793, "learning_rate": 1.613451221701845e-06, "loss": 1.1299, "mean_token_accuracy": 0.7003421306610107, "num_tokens": 366133293.0, "step": 11180 }, { "epoch": 0.9275230118583631, "grad_norm": 0.9519877433776855, "learning_rate": 1.5952664841980437e-06, "loss": 1.2125, "mean_token_accuracy": 0.6950879722833634, "num_tokens": 366297133.0, "step": 11185 }, { "epoch": 0.9279376399369765, "grad_norm": 0.9566942453384399, "learning_rate": 1.5771831443479435e-06, "loss": 1.1731, "mean_token_accuracy": 0.6997311800718308, "num_tokens": 366460973.0, "step": 11190 }, { "epoch": 0.92835226801559, "grad_norm": 0.9131047129631042, "learning_rate": 1.5592012400323152e-06, "loss": 1.1506, "mean_token_accuracy": 0.7048509269952774, "num_tokens": 366624813.0, "step": 11195 }, { "epoch": 0.9287668960942035, "grad_norm": 0.8717803359031677, "learning_rate": 1.5413208089194387e-06, "loss": 1.0545, "mean_token_accuracy": 0.7235703766345978, "num_tokens": 366788653.0, "step": 11200 }, { "epoch": 0.929181524172817, "grad_norm": 0.9159350991249084, "learning_rate": 1.5235418884650243e-06, "loss": 1.0669, "mean_token_accuracy": 0.7212060108780861, "num_tokens": 366952493.0, "step": 11205 }, { "epoch": 0.9295961522514304, "grad_norm": 0.9394529461860657, "learning_rate": 1.5058645159121365e-06, "loss": 1.097, "mean_token_accuracy": 0.7103250250220299, "num_tokens": 367116333.0, "step": 11210 }, { "epoch": 0.9300107803300439, "grad_norm": 0.8994800448417664, "learning_rate": 1.4882887282911318e-06, "loss": 1.0163, "mean_token_accuracy": 0.7267896369099617, "num_tokens": 367279606.0, "step": 11215 }, { "epoch": 0.9304254084086574, "grad_norm": 0.9277415871620178, "learning_rate": 1.470814562419548e-06, "loss": 1.0836, "mean_token_accuracy": 0.7113575249910354, "num_tokens": 367443446.0, "step": 11220 }, { "epoch": 0.9308400364872709, "grad_norm": 0.9330523610115051, "learning_rate": 1.4534420549020655e-06, "loss": 1.1591, "mean_token_accuracy": 0.7000794216990471, "num_tokens": 367607286.0, "step": 11225 }, { "epoch": 0.9312546645658843, "grad_norm": 0.928530216217041, "learning_rate": 1.4361712421303963e-06, "loss": 1.1629, "mean_token_accuracy": 0.7005498617887497, "num_tokens": 367771126.0, "step": 11230 }, { "epoch": 0.9316692926444979, "grad_norm": 0.8880189061164856, "learning_rate": 1.419002160283245e-06, "loss": 1.1284, "mean_token_accuracy": 0.7053824573755264, "num_tokens": 367934966.0, "step": 11235 }, { "epoch": 0.9320839207231114, "grad_norm": 0.8915965557098389, "learning_rate": 1.4019348453261805e-06, "loss": 1.1011, "mean_token_accuracy": 0.7101661771535873, "num_tokens": 368098806.0, "step": 11240 }, { "epoch": 0.9324985488017249, "grad_norm": 0.936050295829773, "learning_rate": 1.384969333011621e-06, "loss": 1.1442, "mean_token_accuracy": 0.7046293050050736, "num_tokens": 368262527.0, "step": 11245 }, { "epoch": 0.9329131768803384, "grad_norm": 0.8871857523918152, "learning_rate": 1.3681056588787156e-06, "loss": 1.1522, "mean_token_accuracy": 0.7038062065839767, "num_tokens": 368426367.0, "step": 11250 }, { "epoch": 0.9333278049589518, "grad_norm": 0.9392543435096741, "learning_rate": 1.3513438582532844e-06, "loss": 1.151, "mean_token_accuracy": 0.7028775662183762, "num_tokens": 368590207.0, "step": 11255 }, { "epoch": 0.9337424330375653, "grad_norm": 0.9720740914344788, "learning_rate": 1.3346839662477406e-06, "loss": 1.2145, "mean_token_accuracy": 0.6947580620646476, "num_tokens": 368754047.0, "step": 11260 }, { "epoch": 0.9341570611161788, "grad_norm": 0.942632257938385, "learning_rate": 1.31812601776104e-06, "loss": 1.1088, "mean_token_accuracy": 0.7081132680177689, "num_tokens": 368917863.0, "step": 11265 }, { "epoch": 0.9345716891947923, "grad_norm": 0.940656304359436, "learning_rate": 1.3016700474785593e-06, "loss": 1.1047, "mean_token_accuracy": 0.7101172998547554, "num_tokens": 369081703.0, "step": 11270 }, { "epoch": 0.9349863172734058, "grad_norm": 0.9205859303474426, "learning_rate": 1.285316089872074e-06, "loss": 1.051, "mean_token_accuracy": 0.7213526412844657, "num_tokens": 369245543.0, "step": 11275 }, { "epoch": 0.9354009453520192, "grad_norm": 0.9171337485313416, "learning_rate": 1.2690641791996582e-06, "loss": 1.0978, "mean_token_accuracy": 0.7125855311751366, "num_tokens": 369409383.0, "step": 11280 }, { "epoch": 0.9358155734306327, "grad_norm": 1.0124074220657349, "learning_rate": 1.2529143495056183e-06, "loss": 1.1558, "mean_token_accuracy": 0.702278833091259, "num_tokens": 369573223.0, "step": 11285 }, { "epoch": 0.9362302015092462, "grad_norm": 0.9083593487739563, "learning_rate": 1.2368666346204206e-06, "loss": 1.154, "mean_token_accuracy": 0.7014479473233223, "num_tokens": 369737063.0, "step": 11290 }, { "epoch": 0.9366448295878597, "grad_norm": 0.9585278630256653, "learning_rate": 1.2209210681606299e-06, "loss": 1.1594, "mean_token_accuracy": 0.7005681827664375, "num_tokens": 369900903.0, "step": 11295 }, { "epoch": 0.9370594576664731, "grad_norm": 0.9256246089935303, "learning_rate": 1.2050776835288213e-06, "loss": 1.1185, "mean_token_accuracy": 0.7068426176905632, "num_tokens": 370064743.0, "step": 11300 }, { "epoch": 0.9374740857450866, "grad_norm": 0.9040131568908691, "learning_rate": 1.1893365139135303e-06, "loss": 1.1276, "mean_token_accuracy": 0.707337486743927, "num_tokens": 370228583.0, "step": 11305 }, { "epoch": 0.9378887138237001, "grad_norm": 0.9167429208755493, "learning_rate": 1.1736975922891745e-06, "loss": 1.0737, "mean_token_accuracy": 0.7120051324367523, "num_tokens": 370392423.0, "step": 11310 }, { "epoch": 0.9383033419023136, "grad_norm": 0.9664468169212341, "learning_rate": 1.1581609514159653e-06, "loss": 1.1443, "mean_token_accuracy": 0.706650273501873, "num_tokens": 370555558.0, "step": 11315 }, { "epoch": 0.9387179699809272, "grad_norm": 0.9230952858924866, "learning_rate": 1.1427266238398793e-06, "loss": 1.0584, "mean_token_accuracy": 0.7179007872939109, "num_tokens": 370719398.0, "step": 11320 }, { "epoch": 0.9391325980595406, "grad_norm": 0.8887954354286194, "learning_rate": 1.1273946418925651e-06, "loss": 1.0938, "mean_token_accuracy": 0.7143939375877381, "num_tokens": 370883238.0, "step": 11325 }, { "epoch": 0.9395472261381541, "grad_norm": 0.9098343253135681, "learning_rate": 1.1121650376912706e-06, "loss": 1.0811, "mean_token_accuracy": 0.7132759064435958, "num_tokens": 371047078.0, "step": 11330 }, { "epoch": 0.9399618542167676, "grad_norm": 0.9233249425888062, "learning_rate": 1.0970378431387817e-06, "loss": 1.0538, "mean_token_accuracy": 0.7183834314346313, "num_tokens": 371210918.0, "step": 11335 }, { "epoch": 0.9403764822953811, "grad_norm": 0.9147783517837524, "learning_rate": 1.082013089923367e-06, "loss": 1.0673, "mean_token_accuracy": 0.7187866598367691, "num_tokens": 371374758.0, "step": 11340 }, { "epoch": 0.9407911103739945, "grad_norm": 0.9527497887611389, "learning_rate": 1.0670908095187115e-06, "loss": 1.1868, "mean_token_accuracy": 0.6949596792459488, "num_tokens": 371538598.0, "step": 11345 }, { "epoch": 0.941205738452608, "grad_norm": 0.926953911781311, "learning_rate": 1.0522710331838048e-06, "loss": 1.067, "mean_token_accuracy": 0.7175158828496933, "num_tokens": 371702438.0, "step": 11350 }, { "epoch": 0.9416203665312215, "grad_norm": 0.9394188523292542, "learning_rate": 1.037553791962953e-06, "loss": 1.1346, "mean_token_accuracy": 0.7028897821903228, "num_tokens": 371866278.0, "step": 11355 }, { "epoch": 0.942034994609835, "grad_norm": 0.9399611353874207, "learning_rate": 1.022939116685656e-06, "loss": 1.1925, "mean_token_accuracy": 0.697794483602047, "num_tokens": 372030118.0, "step": 11360 }, { "epoch": 0.9424496226884485, "grad_norm": 0.9226964712142944, "learning_rate": 1.0084270379665473e-06, "loss": 1.1273, "mean_token_accuracy": 0.7038580939173699, "num_tokens": 372193521.0, "step": 11365 }, { "epoch": 0.9428642507670619, "grad_norm": 0.9384846091270447, "learning_rate": 9.940175862053703e-07, "loss": 1.1222, "mean_token_accuracy": 0.7093412965536118, "num_tokens": 372356909.0, "step": 11370 }, { "epoch": 0.9432788788456754, "grad_norm": 0.9163985252380371, "learning_rate": 9.797107915868574e-07, "loss": 1.0488, "mean_token_accuracy": 0.7222774296998977, "num_tokens": 372519837.0, "step": 11375 }, { "epoch": 0.9436935069242889, "grad_norm": 0.9299659132957458, "learning_rate": 9.655066840807193e-07, "loss": 1.1404, "mean_token_accuracy": 0.7039345040917396, "num_tokens": 372683677.0, "step": 11380 }, { "epoch": 0.9441081350029024, "grad_norm": 0.9414932131767273, "learning_rate": 9.514052934415485e-07, "loss": 1.194, "mean_token_accuracy": 0.7013196542859077, "num_tokens": 372847517.0, "step": 11385 }, { "epoch": 0.9445227630815158, "grad_norm": 0.960216224193573, "learning_rate": 9.374066492087608e-07, "loss": 1.1444, "mean_token_accuracy": 0.7023032769560814, "num_tokens": 373011357.0, "step": 11390 }, { "epoch": 0.9449373911601293, "grad_norm": 0.922818124294281, "learning_rate": 9.235107807065657e-07, "loss": 1.1822, "mean_token_accuracy": 0.7004215568304062, "num_tokens": 373175197.0, "step": 11395 }, { "epoch": 0.9453520192387429, "grad_norm": 0.9108608365058899, "learning_rate": 9.097177170438453e-07, "loss": 1.1108, "mean_token_accuracy": 0.7077773675322533, "num_tokens": 373339037.0, "step": 11400 }, { "epoch": 0.9457666473173564, "grad_norm": 0.9480197429656982, "learning_rate": 8.960274871141427e-07, "loss": 1.1504, "mean_token_accuracy": 0.704331623017788, "num_tokens": 373502877.0, "step": 11405 }, { "epoch": 0.9461812753959699, "grad_norm": 0.886017918586731, "learning_rate": 8.824401195955956e-07, "loss": 1.102, "mean_token_accuracy": 0.7145711123943329, "num_tokens": 373666717.0, "step": 11410 }, { "epoch": 0.9465959034745833, "grad_norm": 0.9021741151809692, "learning_rate": 8.689556429508583e-07, "loss": 1.1941, "mean_token_accuracy": 0.6929985351860524, "num_tokens": 373830557.0, "step": 11415 }, { "epoch": 0.9470105315531968, "grad_norm": 0.9444058537483215, "learning_rate": 8.555740854270411e-07, "loss": 1.1666, "mean_token_accuracy": 0.7004154458642006, "num_tokens": 373994397.0, "step": 11420 }, { "epoch": 0.9474251596318103, "grad_norm": 0.9039308428764343, "learning_rate": 8.422954750556766e-07, "loss": 1.0472, "mean_token_accuracy": 0.7199108004570007, "num_tokens": 374158237.0, "step": 11425 }, { "epoch": 0.9478397877104238, "grad_norm": 0.9307836294174194, "learning_rate": 8.291198396526368e-07, "loss": 1.1729, "mean_token_accuracy": 0.6965970203280449, "num_tokens": 374322077.0, "step": 11430 }, { "epoch": 0.9482544157890372, "grad_norm": 0.9643678665161133, "learning_rate": 8.16047206818088e-07, "loss": 1.1457, "mean_token_accuracy": 0.7040200352668762, "num_tokens": 374485917.0, "step": 11435 }, { "epoch": 0.9486690438676507, "grad_norm": 0.9190859794616699, "learning_rate": 8.030776039364196e-07, "loss": 1.0938, "mean_token_accuracy": 0.7135202825069428, "num_tokens": 374649757.0, "step": 11440 }, { "epoch": 0.9490836719462642, "grad_norm": 0.9101454019546509, "learning_rate": 7.90211058176199e-07, "loss": 1.1289, "mean_token_accuracy": 0.7079789832234382, "num_tokens": 374813597.0, "step": 11445 }, { "epoch": 0.9494983000248777, "grad_norm": 0.9434904456138611, "learning_rate": 7.774475964901107e-07, "loss": 1.1151, "mean_token_accuracy": 0.7092314288020134, "num_tokens": 374977437.0, "step": 11450 }, { "epoch": 0.9499129281034911, "grad_norm": 0.9281089901924133, "learning_rate": 7.647872456149119e-07, "loss": 1.0864, "mean_token_accuracy": 0.7115163713693619, "num_tokens": 375141277.0, "step": 11455 }, { "epoch": 0.9503275561821046, "grad_norm": 0.9672658443450928, "learning_rate": 7.522300320713382e-07, "loss": 1.1032, "mean_token_accuracy": 0.7090237036347389, "num_tokens": 375305117.0, "step": 11460 }, { "epoch": 0.9507421842607181, "grad_norm": 0.9243296980857849, "learning_rate": 7.397759821640981e-07, "loss": 1.1414, "mean_token_accuracy": 0.7054802104830742, "num_tokens": 375468957.0, "step": 11465 }, { "epoch": 0.9511568123393316, "grad_norm": 0.9196622371673584, "learning_rate": 7.274251219817785e-07, "loss": 1.1123, "mean_token_accuracy": 0.7119990184903144, "num_tokens": 375632797.0, "step": 11470 }, { "epoch": 0.9515714404179451, "grad_norm": 0.8937596678733826, "learning_rate": 7.151774773968278e-07, "loss": 1.0793, "mean_token_accuracy": 0.7141190156340599, "num_tokens": 375796637.0, "step": 11475 }, { "epoch": 0.9519860684965585, "grad_norm": 0.9182631969451904, "learning_rate": 7.030330740654456e-07, "loss": 1.1309, "mean_token_accuracy": 0.7044477075338363, "num_tokens": 375960477.0, "step": 11480 }, { "epoch": 0.9524006965751721, "grad_norm": 0.9258006811141968, "learning_rate": 6.909919374275987e-07, "loss": 1.0804, "mean_token_accuracy": 0.716819404065609, "num_tokens": 376124317.0, "step": 11485 }, { "epoch": 0.9528153246537856, "grad_norm": 0.9379932284355164, "learning_rate": 6.7905409270691e-07, "loss": 1.147, "mean_token_accuracy": 0.7027187243103981, "num_tokens": 376288157.0, "step": 11490 }, { "epoch": 0.9532299527323991, "grad_norm": 0.9158267378807068, "learning_rate": 6.672195649106205e-07, "loss": 1.1284, "mean_token_accuracy": 0.7096346557140351, "num_tokens": 376451997.0, "step": 11495 }, { "epoch": 0.9536445808110126, "grad_norm": 0.886914074420929, "learning_rate": 6.554883788295718e-07, "loss": 1.0926, "mean_token_accuracy": 0.7122641801834106, "num_tokens": 376615813.0, "step": 11500 }, { "epoch": 0.954059208889626, "grad_norm": 0.9485747814178467, "learning_rate": 6.438605590381119e-07, "loss": 1.1302, "mean_token_accuracy": 0.7081500500440597, "num_tokens": 376779653.0, "step": 11505 }, { "epoch": 0.9544738369682395, "grad_norm": 0.9509932398796082, "learning_rate": 6.323361298940455e-07, "loss": 1.0504, "mean_token_accuracy": 0.7201918348670006, "num_tokens": 376943493.0, "step": 11510 }, { "epoch": 0.954888465046853, "grad_norm": 1.0838360786437988, "learning_rate": 6.209151155386173e-07, "loss": 1.0738, "mean_token_accuracy": 0.7173374041914939, "num_tokens": 377106821.0, "step": 11515 }, { "epoch": 0.9553030931254665, "grad_norm": 0.9364581108093262, "learning_rate": 6.095975398964337e-07, "loss": 1.0641, "mean_token_accuracy": 0.7187194496393203, "num_tokens": 377270661.0, "step": 11520 }, { "epoch": 0.9557177212040799, "grad_norm": 0.9208771586418152, "learning_rate": 5.983834266754029e-07, "loss": 1.1313, "mean_token_accuracy": 0.7060850396752357, "num_tokens": 377434501.0, "step": 11525 }, { "epoch": 0.9561323492826934, "grad_norm": 0.9197729229927063, "learning_rate": 5.872727993667282e-07, "loss": 1.1365, "mean_token_accuracy": 0.7076735079288483, "num_tokens": 377598341.0, "step": 11530 }, { "epoch": 0.9565469773613069, "grad_norm": 0.9293209910392761, "learning_rate": 5.762656812448086e-07, "loss": 1.0914, "mean_token_accuracy": 0.7135997027158737, "num_tokens": 377762181.0, "step": 11535 }, { "epoch": 0.9569616054399204, "grad_norm": 0.9614062309265137, "learning_rate": 5.653620953672334e-07, "loss": 1.1484, "mean_token_accuracy": 0.7025354325771331, "num_tokens": 377926021.0, "step": 11540 }, { "epoch": 0.9573762335185338, "grad_norm": 0.8375124335289001, "learning_rate": 5.545620645746985e-07, "loss": 1.0751, "mean_token_accuracy": 0.7167216524481773, "num_tokens": 378089861.0, "step": 11545 }, { "epoch": 0.9577908615971473, "grad_norm": 0.9335710406303406, "learning_rate": 5.438656114909679e-07, "loss": 1.0971, "mean_token_accuracy": 0.7122006356716156, "num_tokens": 378253701.0, "step": 11550 }, { "epoch": 0.9582054896757608, "grad_norm": 0.9336086511611938, "learning_rate": 5.332727585228569e-07, "loss": 1.2236, "mean_token_accuracy": 0.6912817701697349, "num_tokens": 378417541.0, "step": 11555 }, { "epoch": 0.9586201177543743, "grad_norm": 0.8826514482498169, "learning_rate": 5.227835278601379e-07, "loss": 1.1248, "mean_token_accuracy": 0.7095991492271423, "num_tokens": 378581044.0, "step": 11560 }, { "epoch": 0.9590347458329878, "grad_norm": 0.940909743309021, "learning_rate": 5.123979414755343e-07, "loss": 1.081, "mean_token_accuracy": 0.7155180796980858, "num_tokens": 378744884.0, "step": 11565 }, { "epoch": 0.9594493739116013, "grad_norm": 0.9472241401672363, "learning_rate": 5.021160211246378e-07, "loss": 1.168, "mean_token_accuracy": 0.6959921777248382, "num_tokens": 378908724.0, "step": 11570 }, { "epoch": 0.9598640019902148, "grad_norm": 0.9369872808456421, "learning_rate": 4.919377883458975e-07, "loss": 1.1965, "mean_token_accuracy": 0.6996044397354126, "num_tokens": 379072325.0, "step": 11575 }, { "epoch": 0.9602786300688283, "grad_norm": 0.9338463544845581, "learning_rate": 4.81863264460547e-07, "loss": 1.2003, "mean_token_accuracy": 0.6928030341863632, "num_tokens": 379236165.0, "step": 11580 }, { "epoch": 0.9606932581474418, "grad_norm": 0.9180485606193542, "learning_rate": 4.71892470572588e-07, "loss": 1.2313, "mean_token_accuracy": 0.6894122660160065, "num_tokens": 379400005.0, "step": 11585 }, { "epoch": 0.9611078862260553, "grad_norm": 0.9087678790092468, "learning_rate": 4.620254275687075e-07, "loss": 1.0501, "mean_token_accuracy": 0.7211693525314331, "num_tokens": 379563845.0, "step": 11590 }, { "epoch": 0.9615225143046687, "grad_norm": 0.9397708177566528, "learning_rate": 4.522621561182772e-07, "loss": 1.1268, "mean_token_accuracy": 0.7062866583466529, "num_tokens": 379727685.0, "step": 11595 }, { "epoch": 0.9619371423832822, "grad_norm": 0.9346681833267212, "learning_rate": 4.426026766732816e-07, "loss": 1.0802, "mean_token_accuracy": 0.7151800289750099, "num_tokens": 379890601.0, "step": 11600 }, { "epoch": 0.9623517704618957, "grad_norm": 0.8871059417724609, "learning_rate": 4.3304700946827373e-07, "loss": 1.0478, "mean_token_accuracy": 0.7215909063816071, "num_tokens": 380054441.0, "step": 11605 }, { "epoch": 0.9627663985405092, "grad_norm": 0.9635825157165527, "learning_rate": 4.2359517452035815e-07, "loss": 1.201, "mean_token_accuracy": 0.6936033710837364, "num_tokens": 380218281.0, "step": 11610 }, { "epoch": 0.9631810266191226, "grad_norm": 0.9270527958869934, "learning_rate": 4.1424719162912464e-07, "loss": 1.2035, "mean_token_accuracy": 0.6924303472042084, "num_tokens": 380382121.0, "step": 11615 }, { "epoch": 0.9635956546977361, "grad_norm": 0.9289128184318542, "learning_rate": 4.0500308037660915e-07, "loss": 1.1745, "mean_token_accuracy": 0.6984359726309777, "num_tokens": 380545961.0, "step": 11620 }, { "epoch": 0.9640102827763496, "grad_norm": 0.9154837131500244, "learning_rate": 3.958628601272663e-07, "loss": 1.1345, "mean_token_accuracy": 0.7014968231320381, "num_tokens": 380709801.0, "step": 11625 }, { "epoch": 0.9644249108549631, "grad_norm": 0.9176326990127563, "learning_rate": 3.8682655002792446e-07, "loss": 1.098, "mean_token_accuracy": 0.7111820951104164, "num_tokens": 380872907.0, "step": 11630 }, { "epoch": 0.9648395389335765, "grad_norm": 0.885033130645752, "learning_rate": 3.7789416900773647e-07, "loss": 1.1397, "mean_token_accuracy": 0.7058101162314415, "num_tokens": 381036747.0, "step": 11635 }, { "epoch": 0.96525416701219, "grad_norm": 0.9226931929588318, "learning_rate": 3.690657357781402e-07, "loss": 1.2085, "mean_token_accuracy": 0.6952407151460648, "num_tokens": 381200587.0, "step": 11640 }, { "epoch": 0.9656687950908035, "grad_norm": 0.9441694617271423, "learning_rate": 3.603412688328367e-07, "loss": 1.1136, "mean_token_accuracy": 0.7085050046443939, "num_tokens": 381364285.0, "step": 11645 }, { "epoch": 0.9660834231694171, "grad_norm": 0.968190610408783, "learning_rate": 3.517207864477401e-07, "loss": 1.1527, "mean_token_accuracy": 0.6979771569371224, "num_tokens": 381527844.0, "step": 11650 }, { "epoch": 0.9664980512480306, "grad_norm": 0.9461910724639893, "learning_rate": 3.4320430668092206e-07, "loss": 1.1371, "mean_token_accuracy": 0.705865104496479, "num_tokens": 381691684.0, "step": 11655 }, { "epoch": 0.966912679326644, "grad_norm": 0.9138695597648621, "learning_rate": 3.347918473726064e-07, "loss": 1.0254, "mean_token_accuracy": 0.726417401432991, "num_tokens": 381855524.0, "step": 11660 }, { "epoch": 0.9673273074052575, "grad_norm": 0.8957282900810242, "learning_rate": 3.264834261451133e-07, "loss": 1.1628, "mean_token_accuracy": 0.7030608490109443, "num_tokens": 382019364.0, "step": 11665 }, { "epoch": 0.967741935483871, "grad_norm": 0.9572055339813232, "learning_rate": 3.182790604028263e-07, "loss": 1.1871, "mean_token_accuracy": 0.6945931106805802, "num_tokens": 382183204.0, "step": 11670 }, { "epoch": 0.9681565635624845, "grad_norm": 0.9100860357284546, "learning_rate": 3.101787673321421e-07, "loss": 1.2264, "mean_token_accuracy": 0.6919979244470597, "num_tokens": 382346008.0, "step": 11675 }, { "epoch": 0.968571191641098, "grad_norm": 0.9562139511108398, "learning_rate": 3.0218256390146525e-07, "loss": 1.1402, "mean_token_accuracy": 0.7035862639546394, "num_tokens": 382509848.0, "step": 11680 }, { "epoch": 0.9689858197197114, "grad_norm": 0.9326156973838806, "learning_rate": 2.942904668611468e-07, "loss": 1.0916, "mean_token_accuracy": 0.7109237551689148, "num_tokens": 382673688.0, "step": 11685 }, { "epoch": 0.9694004477983249, "grad_norm": 0.9341801404953003, "learning_rate": 2.865024927434512e-07, "loss": 1.1373, "mean_token_accuracy": 0.7055073603987694, "num_tokens": 382836954.0, "step": 11690 }, { "epoch": 0.9698150758769384, "grad_norm": 0.907103955745697, "learning_rate": 2.788186578625396e-07, "loss": 1.0982, "mean_token_accuracy": 0.7117118775844574, "num_tokens": 383000794.0, "step": 11695 }, { "epoch": 0.9702297039555519, "grad_norm": 0.8543350696563721, "learning_rate": 2.7123897831441427e-07, "loss": 1.1194, "mean_token_accuracy": 0.7077773705124855, "num_tokens": 383164634.0, "step": 11700 }, { "epoch": 0.9706443320341653, "grad_norm": 0.9193892478942871, "learning_rate": 2.637634699768965e-07, "loss": 1.0522, "mean_token_accuracy": 0.7232038155198097, "num_tokens": 383328474.0, "step": 11705 }, { "epoch": 0.9710589601127788, "grad_norm": 0.9411940574645996, "learning_rate": 2.563921485095877e-07, "loss": 1.202, "mean_token_accuracy": 0.6897971585392952, "num_tokens": 383492314.0, "step": 11710 }, { "epoch": 0.9714735881913923, "grad_norm": 0.9080677628517151, "learning_rate": 2.491250293538472e-07, "loss": 1.1203, "mean_token_accuracy": 0.7104960888624191, "num_tokens": 383656154.0, "step": 11715 }, { "epoch": 0.9718882162700058, "grad_norm": 0.9072685241699219, "learning_rate": 2.4196212773274773e-07, "loss": 1.1119, "mean_token_accuracy": 0.7090437635779381, "num_tokens": 383819428.0, "step": 11720 }, { "epoch": 0.9723028443486192, "grad_norm": 0.8989502191543579, "learning_rate": 2.3490345865105344e-07, "loss": 1.1031, "mean_token_accuracy": 0.7131559088826179, "num_tokens": 383982803.0, "step": 11725 }, { "epoch": 0.9727174724272327, "grad_norm": 0.9268701672554016, "learning_rate": 2.2794903689517533e-07, "loss": 1.1365, "mean_token_accuracy": 0.7080437660217285, "num_tokens": 384146211.0, "step": 11730 }, { "epoch": 0.9731321005058463, "grad_norm": 0.9161537289619446, "learning_rate": 2.2109887703315458e-07, "loss": 1.1482, "mean_token_accuracy": 0.7007575735449791, "num_tokens": 384310051.0, "step": 11735 }, { "epoch": 0.9735467285844598, "grad_norm": 0.9547184705734253, "learning_rate": 2.1435299341461822e-07, "loss": 1.0889, "mean_token_accuracy": 0.7124229952692985, "num_tokens": 384473824.0, "step": 11740 }, { "epoch": 0.9739613566630733, "grad_norm": 0.9535466432571411, "learning_rate": 2.0771140017076806e-07, "loss": 1.2288, "mean_token_accuracy": 0.6856060579419136, "num_tokens": 384637664.0, "step": 11745 }, { "epoch": 0.9743759847416867, "grad_norm": 0.9268850684165955, "learning_rate": 2.0117411121433616e-07, "loss": 1.1047, "mean_token_accuracy": 0.712927196919918, "num_tokens": 384801452.0, "step": 11750 }, { "epoch": 0.9747906128203002, "grad_norm": 0.9306238889694214, "learning_rate": 1.9474114023954604e-07, "loss": 1.1047, "mean_token_accuracy": 0.7088220864534378, "num_tokens": 384965292.0, "step": 11755 }, { "epoch": 0.9752052408989137, "grad_norm": 0.9298808574676514, "learning_rate": 1.8841250072211824e-07, "loss": 1.0884, "mean_token_accuracy": 0.7143695011734963, "num_tokens": 385129132.0, "step": 11760 }, { "epoch": 0.9756198689775272, "grad_norm": 0.9119917750358582, "learning_rate": 1.8218820591920372e-07, "loss": 1.1635, "mean_token_accuracy": 0.7002871423959732, "num_tokens": 385292972.0, "step": 11765 }, { "epoch": 0.9760344970561406, "grad_norm": 0.9063863158226013, "learning_rate": 1.7606826886938933e-07, "loss": 1.0596, "mean_token_accuracy": 0.7162512198090554, "num_tokens": 385456812.0, "step": 11770 }, { "epoch": 0.9764491251347541, "grad_norm": 0.9217805862426758, "learning_rate": 1.7005270239263683e-07, "loss": 1.1521, "mean_token_accuracy": 0.704728738963604, "num_tokens": 385620652.0, "step": 11775 }, { "epoch": 0.9768637532133676, "grad_norm": 0.9595069289207458, "learning_rate": 1.641415190902884e-07, "loss": 1.1743, "mean_token_accuracy": 0.6987774550914765, "num_tokens": 385784488.0, "step": 11780 }, { "epoch": 0.9772783812919811, "grad_norm": 0.9089632034301758, "learning_rate": 1.583347313450112e-07, "loss": 1.1317, "mean_token_accuracy": 0.7037695482373237, "num_tokens": 385948328.0, "step": 11785 }, { "epoch": 0.9776930093705946, "grad_norm": 0.9190571308135986, "learning_rate": 1.5263235132080279e-07, "loss": 1.1105, "mean_token_accuracy": 0.7096163287758828, "num_tokens": 386112168.0, "step": 11790 }, { "epoch": 0.978107637449208, "grad_norm": 0.899294376373291, "learning_rate": 1.4703439096294126e-07, "loss": 1.0591, "mean_token_accuracy": 0.7189324885606766, "num_tokens": 386274464.0, "step": 11795 }, { "epoch": 0.9785222655278215, "grad_norm": 0.9493892788887024, "learning_rate": 1.4154086199795747e-07, "loss": 1.1429, "mean_token_accuracy": 0.7037820160388947, "num_tokens": 386437291.0, "step": 11800 }, { "epoch": 0.978936893606435, "grad_norm": 0.9160469174385071, "learning_rate": 1.361517759336406e-07, "loss": 1.0717, "mean_token_accuracy": 0.7159518584609031, "num_tokens": 386601131.0, "step": 11805 }, { "epoch": 0.9793515216850485, "grad_norm": 0.9399734735488892, "learning_rate": 1.3086714405897705e-07, "loss": 1.1195, "mean_token_accuracy": 0.7086204752326012, "num_tokens": 386764971.0, "step": 11810 }, { "epoch": 0.9797661497636619, "grad_norm": 0.9285438060760498, "learning_rate": 1.256869774441505e-07, "loss": 1.1302, "mean_token_accuracy": 0.707080890238285, "num_tokens": 386928811.0, "step": 11815 }, { "epoch": 0.9801807778422755, "grad_norm": 0.9658012986183167, "learning_rate": 1.2061128694050848e-07, "loss": 1.221, "mean_token_accuracy": 0.6887096747756004, "num_tokens": 387092651.0, "step": 11820 }, { "epoch": 0.980595405920889, "grad_norm": 0.9252815246582031, "learning_rate": 1.1564008318055708e-07, "loss": 1.1807, "mean_token_accuracy": 0.7000767543911934, "num_tokens": 387255540.0, "step": 11825 }, { "epoch": 0.9810100339995025, "grad_norm": 0.9168857932090759, "learning_rate": 1.107733765779051e-07, "loss": 1.057, "mean_token_accuracy": 0.717179861664772, "num_tokens": 387419380.0, "step": 11830 }, { "epoch": 0.981424662078116, "grad_norm": 0.9354410171508789, "learning_rate": 1.0601117732727539e-07, "loss": 1.1421, "mean_token_accuracy": 0.7040261492133141, "num_tokens": 387583220.0, "step": 11835 }, { "epoch": 0.9818392901567294, "grad_norm": 0.9230213165283203, "learning_rate": 1.0135349540446038e-07, "loss": 1.0855, "mean_token_accuracy": 0.7141923293471336, "num_tokens": 387747060.0, "step": 11840 }, { "epoch": 0.9822539182353429, "grad_norm": 0.9315711259841919, "learning_rate": 9.680034056632203e-08, "loss": 1.205, "mean_token_accuracy": 0.6943181812763214, "num_tokens": 387910900.0, "step": 11845 }, { "epoch": 0.9826685463139564, "grad_norm": 0.882120668888092, "learning_rate": 9.235172235074752e-08, "loss": 1.1033, "mean_token_accuracy": 0.7119501441717148, "num_tokens": 388074740.0, "step": 11850 }, { "epoch": 0.9830831743925699, "grad_norm": 0.8793638944625854, "learning_rate": 8.800765007665469e-08, "loss": 1.1175, "mean_token_accuracy": 0.709995111823082, "num_tokens": 388238580.0, "step": 11855 }, { "epoch": 0.9834978024711833, "grad_norm": 0.9546449780464172, "learning_rate": 8.376813284395324e-08, "loss": 1.119, "mean_token_accuracy": 0.7083150029182435, "num_tokens": 388402420.0, "step": 11860 }, { "epoch": 0.9839124305497968, "grad_norm": 0.9052698016166687, "learning_rate": 7.963317953353366e-08, "loss": 1.1325, "mean_token_accuracy": 0.7061889037489891, "num_tokens": 388566260.0, "step": 11865 }, { "epoch": 0.9843270586284103, "grad_norm": 0.9273753762245178, "learning_rate": 7.560279880723942e-08, "loss": 1.1656, "mean_token_accuracy": 0.7007209196686744, "num_tokens": 388730100.0, "step": 11870 }, { "epoch": 0.9847416867070238, "grad_norm": 0.9197583794593811, "learning_rate": 7.167699910787251e-08, "loss": 1.222, "mean_token_accuracy": 0.6885202795267105, "num_tokens": 388893940.0, "step": 11875 }, { "epoch": 0.9851563147856373, "grad_norm": 0.9500606656074524, "learning_rate": 6.78557886591491e-08, "loss": 1.1255, "mean_token_accuracy": 0.7084921777248383, "num_tokens": 389057780.0, "step": 11880 }, { "epoch": 0.9855709428642507, "grad_norm": 0.9366825819015503, "learning_rate": 6.413917546569393e-08, "loss": 1.1387, "mean_token_accuracy": 0.7038248598575592, "num_tokens": 389220815.0, "step": 11885 }, { "epoch": 0.9859855709428642, "grad_norm": 0.9425362348556519, "learning_rate": 6.052716731301811e-08, "loss": 1.1231, "mean_token_accuracy": 0.7050708711147309, "num_tokens": 389384655.0, "step": 11890 }, { "epoch": 0.9864001990214777, "grad_norm": 0.854095458984375, "learning_rate": 5.701977176751916e-08, "loss": 1.0811, "mean_token_accuracy": 0.7167277589440346, "num_tokens": 389548495.0, "step": 11895 }, { "epoch": 0.9868148271000912, "grad_norm": 0.9569870233535767, "learning_rate": 5.361699617644211e-08, "loss": 1.076, "mean_token_accuracy": 0.7142534211277962, "num_tokens": 389712335.0, "step": 11900 }, { "epoch": 0.9872294551787048, "grad_norm": 0.9416723251342773, "learning_rate": 5.031884766789064e-08, "loss": 1.1414, "mean_token_accuracy": 0.7019489198923111, "num_tokens": 389876175.0, "step": 11905 }, { "epoch": 0.9876440832573182, "grad_norm": 0.894721269607544, "learning_rate": 4.712533315077705e-08, "loss": 1.1264, "mean_token_accuracy": 0.703176936507225, "num_tokens": 390040015.0, "step": 11910 }, { "epoch": 0.9880587113359317, "grad_norm": 0.9371886253356934, "learning_rate": 4.403645931483902e-08, "loss": 1.1667, "mean_token_accuracy": 0.6983622968196869, "num_tokens": 390203804.0, "step": 11915 }, { "epoch": 0.9884733394145452, "grad_norm": 0.9553115963935852, "learning_rate": 4.105223263061175e-08, "loss": 1.0813, "mean_token_accuracy": 0.7137829884886742, "num_tokens": 390367644.0, "step": 11920 }, { "epoch": 0.9888879674931587, "grad_norm": 0.9798563718795776, "learning_rate": 3.817265934941694e-08, "loss": 1.1887, "mean_token_accuracy": 0.6951429590582847, "num_tokens": 390531484.0, "step": 11925 }, { "epoch": 0.9893025955717721, "grad_norm": 0.91167151927948, "learning_rate": 3.539774550335717e-08, "loss": 1.1323, "mean_token_accuracy": 0.7083638802170753, "num_tokens": 390695324.0, "step": 11930 }, { "epoch": 0.9897172236503856, "grad_norm": 0.9019158482551575, "learning_rate": 3.2727496905282654e-08, "loss": 1.2026, "mean_token_accuracy": 0.695654584467411, "num_tokens": 390859155.0, "step": 11935 }, { "epoch": 0.9901318517289991, "grad_norm": 0.9292673468589783, "learning_rate": 3.0161919148796736e-08, "loss": 1.1376, "mean_token_accuracy": 0.7033602118492126, "num_tokens": 391022995.0, "step": 11940 }, { "epoch": 0.9905464798076126, "grad_norm": 0.9041401743888855, "learning_rate": 2.7701017608239288e-08, "loss": 1.1124, "mean_token_accuracy": 0.7084732592105866, "num_tokens": 391186174.0, "step": 11945 }, { "epoch": 0.990961107886226, "grad_norm": 0.9249791502952576, "learning_rate": 2.5344797438686673e-08, "loss": 1.1751, "mean_token_accuracy": 0.7000488787889481, "num_tokens": 391350014.0, "step": 11950 }, { "epoch": 0.9913757359648395, "grad_norm": 0.9407515525817871, "learning_rate": 2.3093263575912906e-08, "loss": 1.1524, "mean_token_accuracy": 0.7073802500963211, "num_tokens": 391513854.0, "step": 11955 }, { "epoch": 0.991790364043453, "grad_norm": 0.9295756220817566, "learning_rate": 2.094642073640629e-08, "loss": 1.1698, "mean_token_accuracy": 0.6980205267667771, "num_tokens": 391677694.0, "step": 11960 }, { "epoch": 0.9922049921220665, "grad_norm": 0.8901556134223938, "learning_rate": 1.890427341734724e-08, "loss": 1.1572, "mean_token_accuracy": 0.7028382167220115, "num_tokens": 391840868.0, "step": 11965 }, { "epoch": 0.99261962020068, "grad_norm": 0.9549967646598816, "learning_rate": 1.696682589659715e-08, "loss": 1.1822, "mean_token_accuracy": 0.6990713581442833, "num_tokens": 392004708.0, "step": 11970 }, { "epoch": 0.9930342482792934, "grad_norm": 0.9046486020088196, "learning_rate": 1.513408223270396e-08, "loss": 1.0956, "mean_token_accuracy": 0.7138257578015328, "num_tokens": 392168548.0, "step": 11975 }, { "epoch": 0.9934488763579069, "grad_norm": 0.9164446592330933, "learning_rate": 1.3406046264874405e-08, "loss": 1.156, "mean_token_accuracy": 0.7049792245030403, "num_tokens": 392332388.0, "step": 11980 }, { "epoch": 0.9938635044365205, "grad_norm": 0.8822214007377625, "learning_rate": 1.1782721612979553e-08, "loss": 1.1395, "mean_token_accuracy": 0.7028164654970169, "num_tokens": 392496228.0, "step": 11985 }, { "epoch": 0.994278132515134, "grad_norm": 0.9024549126625061, "learning_rate": 1.0264111677538158e-08, "loss": 1.0855, "mean_token_accuracy": 0.7141373410820961, "num_tokens": 392660068.0, "step": 11990 }, { "epoch": 0.9946927605937474, "grad_norm": 0.9262228012084961, "learning_rate": 8.850219639716662e-09, "loss": 1.1692, "mean_token_accuracy": 0.6999022468924523, "num_tokens": 392823908.0, "step": 11995 }, { "epoch": 0.9951073886723609, "grad_norm": 0.9171954393386841, "learning_rate": 7.54104846131809e-09, "loss": 1.0827, "mean_token_accuracy": 0.7156158372759819, "num_tokens": 392987748.0, "step": 12000 }, { "epoch": 0.9955220167509744, "grad_norm": 0.9143432974815369, "learning_rate": 6.3366008847820515e-09, "loss": 1.0833, "mean_token_accuracy": 0.7156586021184921, "num_tokens": 393151588.0, "step": 12005 }, { "epoch": 0.9959366448295879, "grad_norm": 0.858116865158081, "learning_rate": 5.236879433162534e-09, "loss": 1.0369, "mean_token_accuracy": 0.7210471659898758, "num_tokens": 393315428.0, "step": 12010 }, { "epoch": 0.9963512729082014, "grad_norm": 0.9210796356201172, "learning_rate": 4.2418864101501085e-09, "loss": 1.1426, "mean_token_accuracy": 0.7030425250530243, "num_tokens": 393479268.0, "step": 12015 }, { "epoch": 0.9967659009868148, "grad_norm": 0.9030014276504517, "learning_rate": 3.351623900044176e-09, "loss": 1.1529, "mean_token_accuracy": 0.7069403648376464, "num_tokens": 393643108.0, "step": 12020 }, { "epoch": 0.9971805290654283, "grad_norm": 0.9101301431655884, "learning_rate": 2.566093767758515e-09, "loss": 1.1693, "mean_token_accuracy": 0.7013722419738769, "num_tokens": 393806152.0, "step": 12025 }, { "epoch": 0.9975951571440418, "grad_norm": 0.9940186738967896, "learning_rate": 1.88529765880463e-09, "loss": 1.1615, "mean_token_accuracy": 0.6981671527028084, "num_tokens": 393969992.0, "step": 12030 }, { "epoch": 0.9980097852226553, "grad_norm": 0.9276906251907349, "learning_rate": 1.3092369993084052e-09, "loss": 1.162, "mean_token_accuracy": 0.7017839699983597, "num_tokens": 394133832.0, "step": 12035 }, { "epoch": 0.9984244133012687, "grad_norm": 0.903628945350647, "learning_rate": 8.379129959934506e-10, "loss": 1.1023, "mean_token_accuracy": 0.7125794261693954, "num_tokens": 394297672.0, "step": 12040 }, { "epoch": 0.9988390413798822, "grad_norm": 0.9416557550430298, "learning_rate": 4.713266361866531e-10, "loss": 1.2072, "mean_token_accuracy": 0.6941043511033058, "num_tokens": 394461512.0, "step": 12045 }, { "epoch": 0.9992536694584957, "grad_norm": 0.9321599006652832, "learning_rate": 2.0947868781262537e-10, "loss": 1.0976, "mean_token_accuracy": 0.7115285903215408, "num_tokens": 394625352.0, "step": 12050 }, { "epoch": 0.9996682975371092, "grad_norm": 0.883375883102417, "learning_rate": 5.236969937705283e-11, "loss": 1.0519, "mean_token_accuracy": 0.7237719938158989, "num_tokens": 394789192.0, "step": 12055 }, { "epoch": 1.0, "eval_loss": 1.127059817314148, "eval_mean_token_accuracy": 0.707993041371355, "eval_num_tokens": 394920264.0, "eval_runtime": 843.2579, "eval_samples_per_second": 23.955, "eval_steps_per_second": 5.989, "step": 12059 }, { "epoch": 1.0, "mean_token_accuracy": 0.7099141571670771, "num_tokens": 394920264.0, "step": 12059, "total_flos": 1.0443024731807416e+18, "train_loss": 1.2318119363747622, "train_runtime": 64706.2591, "train_samples_per_second": 5.964, "train_steps_per_second": 0.186 } ], "logging_steps": 5, "max_steps": 12059, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0443024731807416e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }