{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.999796257937451, "eval_steps": 500, "global_step": 44172, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013582804169920881, "grad_norm": 4.332608222961426, "learning_rate": 2.2634676324128564e-07, "loss": 2.1396, "step": 10 }, { "epoch": 0.0027165608339841762, "grad_norm": 4.334485054016113, "learning_rate": 4.526935264825713e-07, "loss": 2.1411, "step": 20 }, { "epoch": 0.004074841250976264, "grad_norm": 4.4394659996032715, "learning_rate": 6.790402897238569e-07, "loss": 2.1096, "step": 30 }, { "epoch": 0.0054331216679683525, "grad_norm": 3.910888195037842, "learning_rate": 9.053870529651426e-07, "loss": 2.0141, "step": 40 }, { "epoch": 0.00679140208496044, "grad_norm": 3.170830011367798, "learning_rate": 1.1317338162064282e-06, "loss": 1.8657, "step": 50 }, { "epoch": 0.008149682501952528, "grad_norm": 3.0281105041503906, "learning_rate": 1.3580805794477138e-06, "loss": 1.6997, "step": 60 }, { "epoch": 0.009507962918944617, "grad_norm": 2.563246726989746, "learning_rate": 1.5844273426889996e-06, "loss": 1.4437, "step": 70 }, { "epoch": 0.010866243335936705, "grad_norm": 2.062389612197876, "learning_rate": 1.8107741059302851e-06, "loss": 1.2096, "step": 80 }, { "epoch": 0.012224523752928792, "grad_norm": 2.082141637802124, "learning_rate": 2.037120869171571e-06, "loss": 0.9929, "step": 90 }, { "epoch": 0.01358280416992088, "grad_norm": 1.8081134557724, "learning_rate": 2.2634676324128565e-06, "loss": 0.7628, "step": 100 }, { "epoch": 0.014941084586912968, "grad_norm": 1.58918035030365, "learning_rate": 2.489814395654142e-06, "loss": 0.5687, "step": 110 }, { "epoch": 0.016299365003905057, "grad_norm": 1.717963695526123, "learning_rate": 2.7161611588954276e-06, "loss": 0.47, "step": 120 }, { "epoch": 0.017657645420897143, "grad_norm": 1.5857183933258057, "learning_rate": 2.9425079221367136e-06, "loss": 0.3996, "step": 130 }, { "epoch": 0.019015925837889233, "grad_norm": 1.321204423904419, "learning_rate": 3.168854685377999e-06, "loss": 0.3776, "step": 140 }, { "epoch": 0.02037420625488132, "grad_norm": 1.5279806852340698, "learning_rate": 3.3952014486192847e-06, "loss": 0.3724, "step": 150 }, { "epoch": 0.02173248667187341, "grad_norm": 1.3158127069473267, "learning_rate": 3.6215482118605703e-06, "loss": 0.3579, "step": 160 }, { "epoch": 0.023090767088865496, "grad_norm": 1.6571372747421265, "learning_rate": 3.847894975101856e-06, "loss": 0.3461, "step": 170 }, { "epoch": 0.024449047505857583, "grad_norm": 1.563489317893982, "learning_rate": 4.074241738343142e-06, "loss": 0.3408, "step": 180 }, { "epoch": 0.025807327922849673, "grad_norm": 1.5311400890350342, "learning_rate": 4.300588501584427e-06, "loss": 0.3362, "step": 190 }, { "epoch": 0.02716560833984176, "grad_norm": 1.9872287511825562, "learning_rate": 4.526935264825713e-06, "loss": 0.3269, "step": 200 }, { "epoch": 0.02852388875683385, "grad_norm": 1.3148654699325562, "learning_rate": 4.753282028066999e-06, "loss": 0.3287, "step": 210 }, { "epoch": 0.029882169173825936, "grad_norm": 2.864900588989258, "learning_rate": 4.979628791308284e-06, "loss": 0.3296, "step": 220 }, { "epoch": 0.031240449590818023, "grad_norm": 1.3058042526245117, "learning_rate": 5.20597555454957e-06, "loss": 0.3171, "step": 230 }, { "epoch": 0.03259873000781011, "grad_norm": 1.6011741161346436, "learning_rate": 5.432322317790855e-06, "loss": 0.3161, "step": 240 }, { "epoch": 0.0339570104248022, "grad_norm": 2.3401198387145996, "learning_rate": 5.658669081032141e-06, "loss": 0.3112, "step": 250 }, { "epoch": 0.035315290841794286, "grad_norm": 1.6047661304473877, "learning_rate": 5.885015844273427e-06, "loss": 0.3181, "step": 260 }, { "epoch": 0.036673571258786376, "grad_norm": 1.356286883354187, "learning_rate": 6.111362607514713e-06, "loss": 0.3019, "step": 270 }, { "epoch": 0.038031851675778466, "grad_norm": 2.707825183868408, "learning_rate": 6.337709370755998e-06, "loss": 0.3003, "step": 280 }, { "epoch": 0.03939013209277055, "grad_norm": 1.4180113077163696, "learning_rate": 6.564056133997284e-06, "loss": 0.3002, "step": 290 }, { "epoch": 0.04074841250976264, "grad_norm": 1.0536142587661743, "learning_rate": 6.7904028972385694e-06, "loss": 0.3022, "step": 300 }, { "epoch": 0.04210669292675473, "grad_norm": 1.2422553300857544, "learning_rate": 7.016749660479855e-06, "loss": 0.2973, "step": 310 }, { "epoch": 0.04346497334374682, "grad_norm": 1.0227172374725342, "learning_rate": 7.2430964237211406e-06, "loss": 0.2979, "step": 320 }, { "epoch": 0.0448232537607389, "grad_norm": 1.6890310049057007, "learning_rate": 7.469443186962427e-06, "loss": 0.2901, "step": 330 }, { "epoch": 0.04618153417773099, "grad_norm": 1.4753016233444214, "learning_rate": 7.695789950203712e-06, "loss": 0.294, "step": 340 }, { "epoch": 0.04753981459472308, "grad_norm": 1.386568307876587, "learning_rate": 7.922136713444998e-06, "loss": 0.2908, "step": 350 }, { "epoch": 0.048898095011715166, "grad_norm": 1.3728959560394287, "learning_rate": 8.148483476686284e-06, "loss": 0.2881, "step": 360 }, { "epoch": 0.050256375428707256, "grad_norm": 2.7356626987457275, "learning_rate": 8.37483023992757e-06, "loss": 0.2849, "step": 370 }, { "epoch": 0.051614655845699346, "grad_norm": 1.2492156028747559, "learning_rate": 8.601177003168854e-06, "loss": 0.287, "step": 380 }, { "epoch": 0.05297293626269143, "grad_norm": 1.424012541770935, "learning_rate": 8.827523766410142e-06, "loss": 0.2858, "step": 390 }, { "epoch": 0.05433121667968352, "grad_norm": 1.0016438961029053, "learning_rate": 9.053870529651426e-06, "loss": 0.2806, "step": 400 }, { "epoch": 0.05568949709667561, "grad_norm": 1.1349787712097168, "learning_rate": 9.280217292892712e-06, "loss": 0.2942, "step": 410 }, { "epoch": 0.0570477775136677, "grad_norm": 1.2387032508850098, "learning_rate": 9.506564056133998e-06, "loss": 0.2776, "step": 420 }, { "epoch": 0.05840605793065978, "grad_norm": 1.8727020025253296, "learning_rate": 9.732910819375284e-06, "loss": 0.2773, "step": 430 }, { "epoch": 0.05976433834765187, "grad_norm": 1.298897385597229, "learning_rate": 9.959257582616568e-06, "loss": 0.2678, "step": 440 }, { "epoch": 0.06112261876464396, "grad_norm": 1.5061310529708862, "learning_rate": 1.0185604345857854e-05, "loss": 0.2764, "step": 450 }, { "epoch": 0.062480899181636046, "grad_norm": 0.982003927230835, "learning_rate": 1.041195110909914e-05, "loss": 0.2809, "step": 460 }, { "epoch": 0.06383917959862814, "grad_norm": 1.0416069030761719, "learning_rate": 1.0638297872340426e-05, "loss": 0.2681, "step": 470 }, { "epoch": 0.06519746001562023, "grad_norm": 1.0330541133880615, "learning_rate": 1.086464463558171e-05, "loss": 0.2668, "step": 480 }, { "epoch": 0.06655574043261231, "grad_norm": 1.0588951110839844, "learning_rate": 1.1090991398822998e-05, "loss": 0.2688, "step": 490 }, { "epoch": 0.0679140208496044, "grad_norm": 1.053652286529541, "learning_rate": 1.1317338162064282e-05, "loss": 0.2633, "step": 500 }, { "epoch": 0.06927230126659649, "grad_norm": 2.054144859313965, "learning_rate": 1.1543684925305568e-05, "loss": 0.2713, "step": 510 }, { "epoch": 0.07063058168358857, "grad_norm": 1.0282090902328491, "learning_rate": 1.1770031688546854e-05, "loss": 0.2687, "step": 520 }, { "epoch": 0.07198886210058067, "grad_norm": 0.9309536814689636, "learning_rate": 1.199637845178814e-05, "loss": 0.2779, "step": 530 }, { "epoch": 0.07334714251757275, "grad_norm": 0.8386070132255554, "learning_rate": 1.2222725215029426e-05, "loss": 0.2728, "step": 540 }, { "epoch": 0.07470542293456484, "grad_norm": 0.8954309225082397, "learning_rate": 1.244907197827071e-05, "loss": 0.2781, "step": 550 }, { "epoch": 0.07606370335155693, "grad_norm": 2.715548515319824, "learning_rate": 1.2675418741511997e-05, "loss": 0.2674, "step": 560 }, { "epoch": 0.07742198376854902, "grad_norm": 1.3553210496902466, "learning_rate": 1.2901765504753283e-05, "loss": 0.2638, "step": 570 }, { "epoch": 0.0787802641855411, "grad_norm": 0.840496301651001, "learning_rate": 1.3128112267994569e-05, "loss": 0.2566, "step": 580 }, { "epoch": 0.0801385446025332, "grad_norm": 1.0644850730895996, "learning_rate": 1.3354459031235855e-05, "loss": 0.2606, "step": 590 }, { "epoch": 0.08149682501952528, "grad_norm": 0.9570043683052063, "learning_rate": 1.3580805794477139e-05, "loss": 0.2613, "step": 600 }, { "epoch": 0.08285510543651736, "grad_norm": 0.7820966839790344, "learning_rate": 1.3807152557718425e-05, "loss": 0.2568, "step": 610 }, { "epoch": 0.08421338585350946, "grad_norm": 0.9967162609100342, "learning_rate": 1.403349932095971e-05, "loss": 0.2606, "step": 620 }, { "epoch": 0.08557166627050154, "grad_norm": 2.2971596717834473, "learning_rate": 1.4259846084200999e-05, "loss": 0.2607, "step": 630 }, { "epoch": 0.08692994668749364, "grad_norm": 0.8225627541542053, "learning_rate": 1.4486192847442281e-05, "loss": 0.2603, "step": 640 }, { "epoch": 0.08828822710448572, "grad_norm": 0.7975953817367554, "learning_rate": 1.4712539610683567e-05, "loss": 0.2557, "step": 650 }, { "epoch": 0.0896465075214778, "grad_norm": 0.8600499629974365, "learning_rate": 1.4938886373924855e-05, "loss": 0.26, "step": 660 }, { "epoch": 0.0910047879384699, "grad_norm": 0.7347790598869324, "learning_rate": 1.516523313716614e-05, "loss": 0.2546, "step": 670 }, { "epoch": 0.09236306835546199, "grad_norm": 0.8944090604782104, "learning_rate": 1.5391579900407423e-05, "loss": 0.2462, "step": 680 }, { "epoch": 0.09372134877245407, "grad_norm": 1.213087797164917, "learning_rate": 1.561792666364871e-05, "loss": 0.251, "step": 690 }, { "epoch": 0.09507962918944617, "grad_norm": 2.5634043216705322, "learning_rate": 1.5844273426889995e-05, "loss": 0.2518, "step": 700 }, { "epoch": 0.09643790960643825, "grad_norm": 0.862399697303772, "learning_rate": 1.6070620190131283e-05, "loss": 0.2562, "step": 710 }, { "epoch": 0.09779619002343033, "grad_norm": 0.8064952492713928, "learning_rate": 1.6296966953372567e-05, "loss": 0.2374, "step": 720 }, { "epoch": 0.09915447044042243, "grad_norm": 1.0488232374191284, "learning_rate": 1.652331371661385e-05, "loss": 0.2605, "step": 730 }, { "epoch": 0.10051275085741451, "grad_norm": 0.889559805393219, "learning_rate": 1.674966047985514e-05, "loss": 0.2538, "step": 740 }, { "epoch": 0.1018710312744066, "grad_norm": 1.4817384481430054, "learning_rate": 1.6976007243096427e-05, "loss": 0.2473, "step": 750 }, { "epoch": 0.10322931169139869, "grad_norm": 0.6944621801376343, "learning_rate": 1.7202354006337708e-05, "loss": 0.2499, "step": 760 }, { "epoch": 0.10458759210839078, "grad_norm": 1.182078242301941, "learning_rate": 1.7428700769578996e-05, "loss": 0.2567, "step": 770 }, { "epoch": 0.10594587252538286, "grad_norm": 0.7537926435470581, "learning_rate": 1.7655047532820283e-05, "loss": 0.2526, "step": 780 }, { "epoch": 0.10730415294237496, "grad_norm": 1.0597002506256104, "learning_rate": 1.7881394296061568e-05, "loss": 0.2491, "step": 790 }, { "epoch": 0.10866243335936704, "grad_norm": 0.87506502866745, "learning_rate": 1.8107741059302852e-05, "loss": 0.2529, "step": 800 }, { "epoch": 0.11002071377635914, "grad_norm": 0.6898545622825623, "learning_rate": 1.833408782254414e-05, "loss": 0.259, "step": 810 }, { "epoch": 0.11137899419335122, "grad_norm": 0.8834750056266785, "learning_rate": 1.8560434585785424e-05, "loss": 0.2518, "step": 820 }, { "epoch": 0.1127372746103433, "grad_norm": 0.9087128043174744, "learning_rate": 1.878678134902671e-05, "loss": 0.2456, "step": 830 }, { "epoch": 0.1140955550273354, "grad_norm": 1.0379595756530762, "learning_rate": 1.9013128112267996e-05, "loss": 0.2443, "step": 840 }, { "epoch": 0.11545383544432748, "grad_norm": 0.6750545501708984, "learning_rate": 1.923947487550928e-05, "loss": 0.2583, "step": 850 }, { "epoch": 0.11681211586131957, "grad_norm": 0.6066490411758423, "learning_rate": 1.9465821638750568e-05, "loss": 0.2427, "step": 860 }, { "epoch": 0.11817039627831166, "grad_norm": 0.842696487903595, "learning_rate": 1.9692168401991852e-05, "loss": 0.242, "step": 870 }, { "epoch": 0.11952867669530375, "grad_norm": 0.6928762197494507, "learning_rate": 1.9918515165233136e-05, "loss": 0.2441, "step": 880 }, { "epoch": 0.12088695711229583, "grad_norm": 0.7964233160018921, "learning_rate": 2.0144861928474424e-05, "loss": 0.2615, "step": 890 }, { "epoch": 0.12224523752928793, "grad_norm": 0.8001169562339783, "learning_rate": 2.0371208691715708e-05, "loss": 0.2466, "step": 900 }, { "epoch": 0.12360351794628001, "grad_norm": 0.6021906137466431, "learning_rate": 2.0597555454956996e-05, "loss": 0.2527, "step": 910 }, { "epoch": 0.12496179836327209, "grad_norm": 0.6645887494087219, "learning_rate": 2.082390221819828e-05, "loss": 0.2382, "step": 920 }, { "epoch": 0.1263200787802642, "grad_norm": 0.8555026054382324, "learning_rate": 2.1050248981439565e-05, "loss": 0.2536, "step": 930 }, { "epoch": 0.12767835919725629, "grad_norm": 0.8588036298751831, "learning_rate": 2.1276595744680852e-05, "loss": 0.2558, "step": 940 }, { "epoch": 0.12903663961424836, "grad_norm": 0.6873536109924316, "learning_rate": 2.150294250792214e-05, "loss": 0.2492, "step": 950 }, { "epoch": 0.13039492003124045, "grad_norm": 0.7976923584938049, "learning_rate": 2.172928927116342e-05, "loss": 0.2489, "step": 960 }, { "epoch": 0.13175320044823255, "grad_norm": 2.7427146434783936, "learning_rate": 2.195563603440471e-05, "loss": 0.239, "step": 970 }, { "epoch": 0.13311148086522462, "grad_norm": 0.6412870287895203, "learning_rate": 2.2181982797645996e-05, "loss": 0.2538, "step": 980 }, { "epoch": 0.13446976128221672, "grad_norm": 0.6794038414955139, "learning_rate": 2.240832956088728e-05, "loss": 0.2392, "step": 990 }, { "epoch": 0.1358280416992088, "grad_norm": 0.9705951809883118, "learning_rate": 2.2634676324128565e-05, "loss": 0.2537, "step": 1000 }, { "epoch": 0.13718632211620088, "grad_norm": 0.6528685688972473, "learning_rate": 2.2861023087369852e-05, "loss": 0.2503, "step": 1010 }, { "epoch": 0.13854460253319298, "grad_norm": 0.757220983505249, "learning_rate": 2.3087369850611137e-05, "loss": 0.2444, "step": 1020 }, { "epoch": 0.13990288295018508, "grad_norm": 0.8307459950447083, "learning_rate": 2.3313716613852424e-05, "loss": 0.2471, "step": 1030 }, { "epoch": 0.14126116336717714, "grad_norm": 0.5947362184524536, "learning_rate": 2.354006337709371e-05, "loss": 0.2488, "step": 1040 }, { "epoch": 0.14261944378416924, "grad_norm": 0.6661319732666016, "learning_rate": 2.3766410140334993e-05, "loss": 0.242, "step": 1050 }, { "epoch": 0.14397772420116134, "grad_norm": 1.0111156702041626, "learning_rate": 2.399275690357628e-05, "loss": 0.2493, "step": 1060 }, { "epoch": 0.1453360046181534, "grad_norm": 0.644440770149231, "learning_rate": 2.4219103666817565e-05, "loss": 0.2367, "step": 1070 }, { "epoch": 0.1466942850351455, "grad_norm": 0.7669150829315186, "learning_rate": 2.4445450430058853e-05, "loss": 0.2418, "step": 1080 }, { "epoch": 0.1480525654521376, "grad_norm": 0.5541858077049255, "learning_rate": 2.4671797193300137e-05, "loss": 0.2483, "step": 1090 }, { "epoch": 0.14941084586912967, "grad_norm": 0.9453285336494446, "learning_rate": 2.489814395654142e-05, "loss": 0.2491, "step": 1100 }, { "epoch": 0.15076912628612177, "grad_norm": 0.6971938014030457, "learning_rate": 2.5124490719782706e-05, "loss": 0.2462, "step": 1110 }, { "epoch": 0.15212740670311387, "grad_norm": 0.8671134114265442, "learning_rate": 2.5350837483023993e-05, "loss": 0.2453, "step": 1120 }, { "epoch": 0.15348568712010593, "grad_norm": 0.821707010269165, "learning_rate": 2.5577184246265277e-05, "loss": 0.253, "step": 1130 }, { "epoch": 0.15484396753709803, "grad_norm": 0.5413023233413696, "learning_rate": 2.5803531009506565e-05, "loss": 0.2418, "step": 1140 }, { "epoch": 0.15620224795409013, "grad_norm": 0.6273642778396606, "learning_rate": 2.6029877772747853e-05, "loss": 0.2464, "step": 1150 }, { "epoch": 0.1575605283710822, "grad_norm": 0.6397319436073303, "learning_rate": 2.6256224535989137e-05, "loss": 0.2457, "step": 1160 }, { "epoch": 0.1589188087880743, "grad_norm": 1.0248420238494873, "learning_rate": 2.6482571299230425e-05, "loss": 0.2434, "step": 1170 }, { "epoch": 0.1602770892050664, "grad_norm": 0.765489935874939, "learning_rate": 2.670891806247171e-05, "loss": 0.2436, "step": 1180 }, { "epoch": 0.16163536962205846, "grad_norm": 0.7257832288742065, "learning_rate": 2.693526482571299e-05, "loss": 0.2421, "step": 1190 }, { "epoch": 0.16299365003905056, "grad_norm": 0.6136078238487244, "learning_rate": 2.7161611588954278e-05, "loss": 0.2446, "step": 1200 }, { "epoch": 0.16435193045604266, "grad_norm": 1.3877404928207397, "learning_rate": 2.7387958352195565e-05, "loss": 0.2502, "step": 1210 }, { "epoch": 0.16571021087303472, "grad_norm": 0.5170011520385742, "learning_rate": 2.761430511543685e-05, "loss": 0.245, "step": 1220 }, { "epoch": 0.16706849129002682, "grad_norm": 0.6201781034469604, "learning_rate": 2.7840651878678137e-05, "loss": 0.2442, "step": 1230 }, { "epoch": 0.16842677170701892, "grad_norm": 0.5909647941589355, "learning_rate": 2.806699864191942e-05, "loss": 0.246, "step": 1240 }, { "epoch": 0.16978505212401102, "grad_norm": 0.7007549405097961, "learning_rate": 2.829334540516071e-05, "loss": 0.2503, "step": 1250 }, { "epoch": 0.17114333254100308, "grad_norm": 0.6708640456199646, "learning_rate": 2.8519692168401997e-05, "loss": 0.2407, "step": 1260 }, { "epoch": 0.17250161295799518, "grad_norm": 1.2493112087249756, "learning_rate": 2.8746038931643278e-05, "loss": 0.2359, "step": 1270 }, { "epoch": 0.17385989337498728, "grad_norm": 0.46546387672424316, "learning_rate": 2.8972385694884562e-05, "loss": 0.2385, "step": 1280 }, { "epoch": 0.17521817379197935, "grad_norm": 0.5315385460853577, "learning_rate": 2.919873245812585e-05, "loss": 0.2437, "step": 1290 }, { "epoch": 0.17657645420897145, "grad_norm": 0.6751018762588501, "learning_rate": 2.9425079221367134e-05, "loss": 0.2321, "step": 1300 }, { "epoch": 0.17793473462596354, "grad_norm": 0.8895365595817566, "learning_rate": 2.9651425984608422e-05, "loss": 0.2445, "step": 1310 }, { "epoch": 0.1792930150429556, "grad_norm": 0.6249046325683594, "learning_rate": 2.987777274784971e-05, "loss": 0.2465, "step": 1320 }, { "epoch": 0.1806512954599477, "grad_norm": 0.6597574949264526, "learning_rate": 3.0104119511090994e-05, "loss": 0.2323, "step": 1330 }, { "epoch": 0.1820095758769398, "grad_norm": 0.6391541361808777, "learning_rate": 3.033046627433228e-05, "loss": 0.2433, "step": 1340 }, { "epoch": 0.18336785629393187, "grad_norm": 0.6053256392478943, "learning_rate": 3.055681303757356e-05, "loss": 0.2454, "step": 1350 }, { "epoch": 0.18472613671092397, "grad_norm": 0.7838967442512512, "learning_rate": 3.078315980081485e-05, "loss": 0.2462, "step": 1360 }, { "epoch": 0.18608441712791607, "grad_norm": 0.5283857583999634, "learning_rate": 3.100950656405614e-05, "loss": 0.2529, "step": 1370 }, { "epoch": 0.18744269754490814, "grad_norm": 0.6863192319869995, "learning_rate": 3.123585332729742e-05, "loss": 0.2387, "step": 1380 }, { "epoch": 0.18880097796190023, "grad_norm": 0.5115373134613037, "learning_rate": 3.1462200090538706e-05, "loss": 0.2355, "step": 1390 }, { "epoch": 0.19015925837889233, "grad_norm": 0.4958800971508026, "learning_rate": 3.168854685377999e-05, "loss": 0.2353, "step": 1400 }, { "epoch": 0.1915175387958844, "grad_norm": 0.6991377472877502, "learning_rate": 3.191489361702128e-05, "loss": 0.2396, "step": 1410 }, { "epoch": 0.1928758192128765, "grad_norm": 0.698701798915863, "learning_rate": 3.2141240380262566e-05, "loss": 0.2363, "step": 1420 }, { "epoch": 0.1942340996298686, "grad_norm": 0.43657001852989197, "learning_rate": 3.2367587143503844e-05, "loss": 0.24, "step": 1430 }, { "epoch": 0.19559238004686066, "grad_norm": 0.5303202271461487, "learning_rate": 3.2593933906745135e-05, "loss": 0.2437, "step": 1440 }, { "epoch": 0.19695066046385276, "grad_norm": 0.82850581407547, "learning_rate": 3.282028066998642e-05, "loss": 0.2392, "step": 1450 }, { "epoch": 0.19830894088084486, "grad_norm": 0.6485626697540283, "learning_rate": 3.30466274332277e-05, "loss": 0.2397, "step": 1460 }, { "epoch": 0.19966722129783693, "grad_norm": 0.56908118724823, "learning_rate": 3.3272974196468994e-05, "loss": 0.23, "step": 1470 }, { "epoch": 0.20102550171482902, "grad_norm": 0.4578416645526886, "learning_rate": 3.349932095971028e-05, "loss": 0.2316, "step": 1480 }, { "epoch": 0.20238378213182112, "grad_norm": 0.7648049592971802, "learning_rate": 3.372566772295156e-05, "loss": 0.2444, "step": 1490 }, { "epoch": 0.2037420625488132, "grad_norm": 0.540086030960083, "learning_rate": 3.3952014486192854e-05, "loss": 0.2311, "step": 1500 }, { "epoch": 0.2051003429658053, "grad_norm": 0.5862252712249756, "learning_rate": 3.417836124943413e-05, "loss": 0.2431, "step": 1510 }, { "epoch": 0.20645862338279738, "grad_norm": 0.7461491227149963, "learning_rate": 3.4404708012675416e-05, "loss": 0.242, "step": 1520 }, { "epoch": 0.20781690379978945, "grad_norm": 0.6063190698623657, "learning_rate": 3.463105477591671e-05, "loss": 0.2381, "step": 1530 }, { "epoch": 0.20917518421678155, "grad_norm": 0.6737818121910095, "learning_rate": 3.485740153915799e-05, "loss": 0.2381, "step": 1540 }, { "epoch": 0.21053346463377365, "grad_norm": 0.5183326601982117, "learning_rate": 3.5083748302399275e-05, "loss": 0.2467, "step": 1550 }, { "epoch": 0.21189174505076572, "grad_norm": 0.6733895540237427, "learning_rate": 3.5310095065640566e-05, "loss": 0.249, "step": 1560 }, { "epoch": 0.21325002546775781, "grad_norm": 0.49622488021850586, "learning_rate": 3.553644182888185e-05, "loss": 0.235, "step": 1570 }, { "epoch": 0.2146083058847499, "grad_norm": 0.5387190580368042, "learning_rate": 3.5762788592123135e-05, "loss": 0.2385, "step": 1580 }, { "epoch": 0.215966586301742, "grad_norm": 0.4926791191101074, "learning_rate": 3.598913535536442e-05, "loss": 0.2385, "step": 1590 }, { "epoch": 0.21732486671873408, "grad_norm": 0.5796099901199341, "learning_rate": 3.6215482118605704e-05, "loss": 0.2388, "step": 1600 }, { "epoch": 0.21868314713572617, "grad_norm": 0.49090734124183655, "learning_rate": 3.644182888184699e-05, "loss": 0.2536, "step": 1610 }, { "epoch": 0.22004142755271827, "grad_norm": 0.41109010577201843, "learning_rate": 3.666817564508828e-05, "loss": 0.2485, "step": 1620 }, { "epoch": 0.22139970796971034, "grad_norm": 0.42971354722976685, "learning_rate": 3.689452240832956e-05, "loss": 0.2271, "step": 1630 }, { "epoch": 0.22275798838670244, "grad_norm": 0.641934871673584, "learning_rate": 3.712086917157085e-05, "loss": 0.2334, "step": 1640 }, { "epoch": 0.22411626880369453, "grad_norm": 0.47131165862083435, "learning_rate": 3.734721593481214e-05, "loss": 0.2413, "step": 1650 }, { "epoch": 0.2254745492206866, "grad_norm": 0.49795448780059814, "learning_rate": 3.757356269805342e-05, "loss": 0.236, "step": 1660 }, { "epoch": 0.2268328296376787, "grad_norm": 0.5687565207481384, "learning_rate": 3.779990946129471e-05, "loss": 0.2359, "step": 1670 }, { "epoch": 0.2281911100546708, "grad_norm": 0.9793988466262817, "learning_rate": 3.802625622453599e-05, "loss": 0.2392, "step": 1680 }, { "epoch": 0.22954939047166287, "grad_norm": 0.6737297177314758, "learning_rate": 3.8252602987777276e-05, "loss": 0.2364, "step": 1690 }, { "epoch": 0.23090767088865496, "grad_norm": 0.7096961736679077, "learning_rate": 3.847894975101856e-05, "loss": 0.243, "step": 1700 }, { "epoch": 0.23226595130564706, "grad_norm": 0.46155887842178345, "learning_rate": 3.870529651425985e-05, "loss": 0.2438, "step": 1710 }, { "epoch": 0.23362423172263913, "grad_norm": 0.8529092669487, "learning_rate": 3.8931643277501135e-05, "loss": 0.2398, "step": 1720 }, { "epoch": 0.23498251213963123, "grad_norm": 0.5858421921730042, "learning_rate": 3.915799004074242e-05, "loss": 0.2458, "step": 1730 }, { "epoch": 0.23634079255662332, "grad_norm": 0.43549036979675293, "learning_rate": 3.9384336803983704e-05, "loss": 0.2358, "step": 1740 }, { "epoch": 0.2376990729736154, "grad_norm": 0.49200713634490967, "learning_rate": 3.9610683567224995e-05, "loss": 0.2399, "step": 1750 }, { "epoch": 0.2390573533906075, "grad_norm": 0.56099933385849, "learning_rate": 3.983703033046627e-05, "loss": 0.2329, "step": 1760 }, { "epoch": 0.2404156338075996, "grad_norm": 0.5640586614608765, "learning_rate": 4.0063377093707564e-05, "loss": 0.2381, "step": 1770 }, { "epoch": 0.24177391422459166, "grad_norm": 0.7880319356918335, "learning_rate": 4.028972385694885e-05, "loss": 0.2345, "step": 1780 }, { "epoch": 0.24313219464158375, "grad_norm": 0.42779436707496643, "learning_rate": 4.051607062019013e-05, "loss": 0.2435, "step": 1790 }, { "epoch": 0.24449047505857585, "grad_norm": 0.6199722290039062, "learning_rate": 4.0742417383431417e-05, "loss": 0.2367, "step": 1800 }, { "epoch": 0.24584875547556792, "grad_norm": 0.396634042263031, "learning_rate": 4.096876414667271e-05, "loss": 0.2319, "step": 1810 }, { "epoch": 0.24720703589256002, "grad_norm": 0.5526453852653503, "learning_rate": 4.119511090991399e-05, "loss": 0.2366, "step": 1820 }, { "epoch": 0.24856531630955211, "grad_norm": 0.539271891117096, "learning_rate": 4.1421457673155276e-05, "loss": 0.2386, "step": 1830 }, { "epoch": 0.24992359672654418, "grad_norm": 0.4604705572128296, "learning_rate": 4.164780443639656e-05, "loss": 0.2412, "step": 1840 }, { "epoch": 0.2512818771435363, "grad_norm": 0.6201371550559998, "learning_rate": 4.1874151199637845e-05, "loss": 0.2392, "step": 1850 }, { "epoch": 0.2526401575605284, "grad_norm": 0.894784152507782, "learning_rate": 4.210049796287913e-05, "loss": 0.2351, "step": 1860 }, { "epoch": 0.2539984379775205, "grad_norm": 0.4338095486164093, "learning_rate": 4.232684472612042e-05, "loss": 0.2267, "step": 1870 }, { "epoch": 0.25535671839451257, "grad_norm": 0.5191648602485657, "learning_rate": 4.2553191489361704e-05, "loss": 0.2351, "step": 1880 }, { "epoch": 0.2567149988115046, "grad_norm": 0.4038718342781067, "learning_rate": 4.277953825260299e-05, "loss": 0.243, "step": 1890 }, { "epoch": 0.2580732792284967, "grad_norm": 0.7034184336662292, "learning_rate": 4.300588501584428e-05, "loss": 0.2477, "step": 1900 }, { "epoch": 0.2594315596454888, "grad_norm": 0.38926178216934204, "learning_rate": 4.3232231779085564e-05, "loss": 0.2447, "step": 1910 }, { "epoch": 0.2607898400624809, "grad_norm": 1.5272995233535767, "learning_rate": 4.345857854232684e-05, "loss": 0.233, "step": 1920 }, { "epoch": 0.262148120479473, "grad_norm": 0.39782124757766724, "learning_rate": 4.368492530556813e-05, "loss": 0.2382, "step": 1930 }, { "epoch": 0.2635064008964651, "grad_norm": 0.5238626599311829, "learning_rate": 4.391127206880942e-05, "loss": 0.2436, "step": 1940 }, { "epoch": 0.26486468131345714, "grad_norm": 0.4123516082763672, "learning_rate": 4.41376188320507e-05, "loss": 0.2395, "step": 1950 }, { "epoch": 0.26622296173044924, "grad_norm": 0.6417278051376343, "learning_rate": 4.436396559529199e-05, "loss": 0.2498, "step": 1960 }, { "epoch": 0.26758124214744133, "grad_norm": 0.7081544399261475, "learning_rate": 4.459031235853328e-05, "loss": 0.2385, "step": 1970 }, { "epoch": 0.26893952256443343, "grad_norm": 0.5928105115890503, "learning_rate": 4.481665912177456e-05, "loss": 0.2397, "step": 1980 }, { "epoch": 0.27029780298142553, "grad_norm": 0.5397465825080872, "learning_rate": 4.504300588501585e-05, "loss": 0.2335, "step": 1990 }, { "epoch": 0.2716560833984176, "grad_norm": 0.5259700417518616, "learning_rate": 4.526935264825713e-05, "loss": 0.2484, "step": 2000 }, { "epoch": 0.27301436381540967, "grad_norm": 0.5188269019126892, "learning_rate": 4.5495699411498414e-05, "loss": 0.2281, "step": 2010 }, { "epoch": 0.27437264423240176, "grad_norm": 0.5917236804962158, "learning_rate": 4.5722046174739705e-05, "loss": 0.2438, "step": 2020 }, { "epoch": 0.27573092464939386, "grad_norm": 0.3486921191215515, "learning_rate": 4.594839293798099e-05, "loss": 0.2299, "step": 2030 }, { "epoch": 0.27708920506638596, "grad_norm": 0.48005345463752747, "learning_rate": 4.6174739701222273e-05, "loss": 0.245, "step": 2040 }, { "epoch": 0.27844748548337805, "grad_norm": 0.5464040637016296, "learning_rate": 4.6401086464463565e-05, "loss": 0.2337, "step": 2050 }, { "epoch": 0.27980576590037015, "grad_norm": 0.4074684977531433, "learning_rate": 4.662743322770485e-05, "loss": 0.2511, "step": 2060 }, { "epoch": 0.2811640463173622, "grad_norm": 1.2195255756378174, "learning_rate": 4.685377999094613e-05, "loss": 0.2326, "step": 2070 }, { "epoch": 0.2825223267343543, "grad_norm": 0.5130909085273743, "learning_rate": 4.708012675418742e-05, "loss": 0.2448, "step": 2080 }, { "epoch": 0.2838806071513464, "grad_norm": 0.5406199097633362, "learning_rate": 4.73064735174287e-05, "loss": 0.2409, "step": 2090 }, { "epoch": 0.2852388875683385, "grad_norm": 0.9959255456924438, "learning_rate": 4.7532820280669986e-05, "loss": 0.2412, "step": 2100 }, { "epoch": 0.2865971679853306, "grad_norm": 0.5617865324020386, "learning_rate": 4.775916704391128e-05, "loss": 0.2362, "step": 2110 }, { "epoch": 0.2879554484023227, "grad_norm": 0.4168139696121216, "learning_rate": 4.798551380715256e-05, "loss": 0.2248, "step": 2120 }, { "epoch": 0.2893137288193148, "grad_norm": 0.4500597417354584, "learning_rate": 4.8211860570393846e-05, "loss": 0.2279, "step": 2130 }, { "epoch": 0.2906720092363068, "grad_norm": 0.47042763233184814, "learning_rate": 4.843820733363513e-05, "loss": 0.2398, "step": 2140 }, { "epoch": 0.2920302896532989, "grad_norm": 0.511652946472168, "learning_rate": 4.866455409687642e-05, "loss": 0.2332, "step": 2150 }, { "epoch": 0.293388570070291, "grad_norm": 0.48689916729927063, "learning_rate": 4.8890900860117705e-05, "loss": 0.2322, "step": 2160 }, { "epoch": 0.2947468504872831, "grad_norm": 0.47885528206825256, "learning_rate": 4.911724762335899e-05, "loss": 0.2446, "step": 2170 }, { "epoch": 0.2961051309042752, "grad_norm": 0.5884508490562439, "learning_rate": 4.9343594386600274e-05, "loss": 0.2349, "step": 2180 }, { "epoch": 0.2974634113212673, "grad_norm": 0.965680718421936, "learning_rate": 4.956994114984156e-05, "loss": 0.2359, "step": 2190 }, { "epoch": 0.29882169173825934, "grad_norm": 0.7696114778518677, "learning_rate": 4.979628791308284e-05, "loss": 0.231, "step": 2200 }, { "epoch": 0.30017997215525144, "grad_norm": 0.4431217312812805, "learning_rate": 5.0022634676324134e-05, "loss": 0.2394, "step": 2210 }, { "epoch": 0.30153825257224354, "grad_norm": 0.5985506176948547, "learning_rate": 5.024898143956541e-05, "loss": 0.2364, "step": 2220 }, { "epoch": 0.30289653298923563, "grad_norm": 0.508510172367096, "learning_rate": 5.04753282028067e-05, "loss": 0.2467, "step": 2230 }, { "epoch": 0.30425481340622773, "grad_norm": 0.5461332201957703, "learning_rate": 5.0701674966047986e-05, "loss": 0.2317, "step": 2240 }, { "epoch": 0.30561309382321983, "grad_norm": 0.3439137637615204, "learning_rate": 5.092802172928928e-05, "loss": 0.2346, "step": 2250 }, { "epoch": 0.30697137424021187, "grad_norm": 0.48549193143844604, "learning_rate": 5.1154368492530555e-05, "loss": 0.2347, "step": 2260 }, { "epoch": 0.30832965465720397, "grad_norm": 0.5423714518547058, "learning_rate": 5.138071525577185e-05, "loss": 0.2359, "step": 2270 }, { "epoch": 0.30968793507419606, "grad_norm": 0.6127755641937256, "learning_rate": 5.160706201901313e-05, "loss": 0.2392, "step": 2280 }, { "epoch": 0.31104621549118816, "grad_norm": 0.5976607203483582, "learning_rate": 5.183340878225441e-05, "loss": 0.2437, "step": 2290 }, { "epoch": 0.31240449590818026, "grad_norm": 0.7950578927993774, "learning_rate": 5.2059755545495706e-05, "loss": 0.2429, "step": 2300 }, { "epoch": 0.31376277632517235, "grad_norm": 0.43977001309394836, "learning_rate": 5.228610230873698e-05, "loss": 0.2296, "step": 2310 }, { "epoch": 0.3151210567421644, "grad_norm": 1.0626436471939087, "learning_rate": 5.2512449071978274e-05, "loss": 0.2269, "step": 2320 }, { "epoch": 0.3164793371591565, "grad_norm": 0.42409637570381165, "learning_rate": 5.273879583521956e-05, "loss": 0.2343, "step": 2330 }, { "epoch": 0.3178376175761486, "grad_norm": 0.7139838933944702, "learning_rate": 5.296514259846085e-05, "loss": 0.2432, "step": 2340 }, { "epoch": 0.3191958979931407, "grad_norm": 0.40164828300476074, "learning_rate": 5.319148936170213e-05, "loss": 0.2351, "step": 2350 }, { "epoch": 0.3205541784101328, "grad_norm": 0.3874810039997101, "learning_rate": 5.341783612494342e-05, "loss": 0.2285, "step": 2360 }, { "epoch": 0.3219124588271249, "grad_norm": 0.5761833786964417, "learning_rate": 5.36441828881847e-05, "loss": 0.2343, "step": 2370 }, { "epoch": 0.3232707392441169, "grad_norm": 0.32223182916641235, "learning_rate": 5.387052965142598e-05, "loss": 0.2349, "step": 2380 }, { "epoch": 0.324629019661109, "grad_norm": 0.4948557913303375, "learning_rate": 5.409687641466728e-05, "loss": 0.2276, "step": 2390 }, { "epoch": 0.3259873000781011, "grad_norm": 1.2757432460784912, "learning_rate": 5.4323223177908555e-05, "loss": 0.2392, "step": 2400 }, { "epoch": 0.3273455804950932, "grad_norm": 0.7249438166618347, "learning_rate": 5.4549569941149846e-05, "loss": 0.2463, "step": 2410 }, { "epoch": 0.3287038609120853, "grad_norm": 0.6693064570426941, "learning_rate": 5.477591670439113e-05, "loss": 0.2338, "step": 2420 }, { "epoch": 0.3300621413290774, "grad_norm": 0.5241517424583435, "learning_rate": 5.500226346763242e-05, "loss": 0.2386, "step": 2430 }, { "epoch": 0.33142042174606945, "grad_norm": 0.4753365218639374, "learning_rate": 5.52286102308737e-05, "loss": 0.2446, "step": 2440 }, { "epoch": 0.33277870216306155, "grad_norm": 0.42271506786346436, "learning_rate": 5.5454956994114984e-05, "loss": 0.2473, "step": 2450 }, { "epoch": 0.33413698258005364, "grad_norm": 0.4124278426170349, "learning_rate": 5.5681303757356275e-05, "loss": 0.2427, "step": 2460 }, { "epoch": 0.33549526299704574, "grad_norm": 0.3566555380821228, "learning_rate": 5.590765052059755e-05, "loss": 0.2381, "step": 2470 }, { "epoch": 0.33685354341403784, "grad_norm": 0.4008844494819641, "learning_rate": 5.613399728383884e-05, "loss": 0.2379, "step": 2480 }, { "epoch": 0.33821182383102993, "grad_norm": 0.3966061770915985, "learning_rate": 5.636034404708013e-05, "loss": 0.2465, "step": 2490 }, { "epoch": 0.33957010424802203, "grad_norm": 0.32933351397514343, "learning_rate": 5.658669081032142e-05, "loss": 0.2405, "step": 2500 }, { "epoch": 0.3409283846650141, "grad_norm": 0.6496679186820984, "learning_rate": 5.68130375735627e-05, "loss": 0.2277, "step": 2510 }, { "epoch": 0.34228666508200617, "grad_norm": 0.3383789658546448, "learning_rate": 5.7039384336803994e-05, "loss": 0.2408, "step": 2520 }, { "epoch": 0.34364494549899827, "grad_norm": 0.4984326958656311, "learning_rate": 5.726573110004527e-05, "loss": 0.2394, "step": 2530 }, { "epoch": 0.34500322591599036, "grad_norm": 0.641692042350769, "learning_rate": 5.7492077863286556e-05, "loss": 0.237, "step": 2540 }, { "epoch": 0.34636150633298246, "grad_norm": 0.7406136393547058, "learning_rate": 5.771842462652785e-05, "loss": 0.2411, "step": 2550 }, { "epoch": 0.34771978674997456, "grad_norm": 0.32972589135169983, "learning_rate": 5.7944771389769124e-05, "loss": 0.2367, "step": 2560 }, { "epoch": 0.3490780671669666, "grad_norm": 0.4483562707901001, "learning_rate": 5.8171118153010415e-05, "loss": 0.2317, "step": 2570 }, { "epoch": 0.3504363475839587, "grad_norm": 0.40283137559890747, "learning_rate": 5.83974649162517e-05, "loss": 0.236, "step": 2580 }, { "epoch": 0.3517946280009508, "grad_norm": 0.33451423048973083, "learning_rate": 5.862381167949299e-05, "loss": 0.238, "step": 2590 }, { "epoch": 0.3531529084179429, "grad_norm": 0.404154509305954, "learning_rate": 5.885015844273427e-05, "loss": 0.2458, "step": 2600 }, { "epoch": 0.354511188834935, "grad_norm": 0.8510772585868835, "learning_rate": 5.907650520597555e-05, "loss": 0.2403, "step": 2610 }, { "epoch": 0.3558694692519271, "grad_norm": 0.5136832594871521, "learning_rate": 5.9302851969216844e-05, "loss": 0.2293, "step": 2620 }, { "epoch": 0.3572277496689191, "grad_norm": 0.36652278900146484, "learning_rate": 5.952919873245812e-05, "loss": 0.238, "step": 2630 }, { "epoch": 0.3585860300859112, "grad_norm": 0.3559562861919403, "learning_rate": 5.975554549569942e-05, "loss": 0.241, "step": 2640 }, { "epoch": 0.3599443105029033, "grad_norm": 0.3530553877353668, "learning_rate": 5.9981892258940697e-05, "loss": 0.242, "step": 2650 }, { "epoch": 0.3613025909198954, "grad_norm": 0.6754847764968872, "learning_rate": 6.020823902218199e-05, "loss": 0.2384, "step": 2660 }, { "epoch": 0.3626608713368875, "grad_norm": 1.0385140180587769, "learning_rate": 6.043458578542327e-05, "loss": 0.2413, "step": 2670 }, { "epoch": 0.3640191517538796, "grad_norm": 0.36458244919776917, "learning_rate": 6.066093254866456e-05, "loss": 0.2312, "step": 2680 }, { "epoch": 0.36537743217087165, "grad_norm": 0.40099698305130005, "learning_rate": 6.088727931190584e-05, "loss": 0.2347, "step": 2690 }, { "epoch": 0.36673571258786375, "grad_norm": 0.3237636089324951, "learning_rate": 6.111362607514712e-05, "loss": 0.2408, "step": 2700 }, { "epoch": 0.36809399300485585, "grad_norm": 0.36819812655448914, "learning_rate": 6.133997283838841e-05, "loss": 0.2484, "step": 2710 }, { "epoch": 0.36945227342184794, "grad_norm": 0.5458301901817322, "learning_rate": 6.15663196016297e-05, "loss": 0.2447, "step": 2720 }, { "epoch": 0.37081055383884004, "grad_norm": 0.3975117802619934, "learning_rate": 6.179266636487099e-05, "loss": 0.2418, "step": 2730 }, { "epoch": 0.37216883425583214, "grad_norm": 0.5429337024688721, "learning_rate": 6.201901312811228e-05, "loss": 0.242, "step": 2740 }, { "epoch": 0.3735271146728242, "grad_norm": 0.731829822063446, "learning_rate": 6.224535989135356e-05, "loss": 0.2399, "step": 2750 }, { "epoch": 0.3748853950898163, "grad_norm": 0.7816576361656189, "learning_rate": 6.247170665459484e-05, "loss": 0.2547, "step": 2760 }, { "epoch": 0.3762436755068084, "grad_norm": 0.35735172033309937, "learning_rate": 6.269805341783613e-05, "loss": 0.2397, "step": 2770 }, { "epoch": 0.37760195592380047, "grad_norm": 0.432102769613266, "learning_rate": 6.292440018107741e-05, "loss": 0.2469, "step": 2780 }, { "epoch": 0.37896023634079257, "grad_norm": 0.5073881149291992, "learning_rate": 6.31507469443187e-05, "loss": 0.2347, "step": 2790 }, { "epoch": 0.38031851675778466, "grad_norm": 0.371686190366745, "learning_rate": 6.337709370755998e-05, "loss": 0.2367, "step": 2800 }, { "epoch": 0.3816767971747767, "grad_norm": 0.5288065075874329, "learning_rate": 6.360344047080127e-05, "loss": 0.2421, "step": 2810 }, { "epoch": 0.3830350775917688, "grad_norm": 0.5315120816230774, "learning_rate": 6.382978723404256e-05, "loss": 0.2385, "step": 2820 }, { "epoch": 0.3843933580087609, "grad_norm": 0.361716628074646, "learning_rate": 6.405613399728383e-05, "loss": 0.2398, "step": 2830 }, { "epoch": 0.385751638425753, "grad_norm": 0.41051751375198364, "learning_rate": 6.428248076052513e-05, "loss": 0.2373, "step": 2840 }, { "epoch": 0.3871099188427451, "grad_norm": 0.5072987079620361, "learning_rate": 6.450882752376642e-05, "loss": 0.2355, "step": 2850 }, { "epoch": 0.3884681992597372, "grad_norm": 0.4059125781059265, "learning_rate": 6.473517428700769e-05, "loss": 0.237, "step": 2860 }, { "epoch": 0.3898264796767293, "grad_norm": 0.3442498743534088, "learning_rate": 6.496152105024898e-05, "loss": 0.2358, "step": 2870 }, { "epoch": 0.39118476009372133, "grad_norm": 0.3778746724128723, "learning_rate": 6.518786781349027e-05, "loss": 0.2441, "step": 2880 }, { "epoch": 0.3925430405107134, "grad_norm": 1.010395884513855, "learning_rate": 6.541421457673155e-05, "loss": 0.2398, "step": 2890 }, { "epoch": 0.3939013209277055, "grad_norm": 0.39121153950691223, "learning_rate": 6.564056133997284e-05, "loss": 0.2269, "step": 2900 }, { "epoch": 0.3952596013446976, "grad_norm": 0.7784329056739807, "learning_rate": 6.586690810321414e-05, "loss": 0.2269, "step": 2910 }, { "epoch": 0.3966178817616897, "grad_norm": 0.30037087202072144, "learning_rate": 6.60932548664554e-05, "loss": 0.2414, "step": 2920 }, { "epoch": 0.3979761621786818, "grad_norm": 0.45715004205703735, "learning_rate": 6.63196016296967e-05, "loss": 0.2344, "step": 2930 }, { "epoch": 0.39933444259567386, "grad_norm": 4.130895137786865, "learning_rate": 6.654594839293799e-05, "loss": 0.2358, "step": 2940 }, { "epoch": 0.40069272301266595, "grad_norm": 0.38412022590637207, "learning_rate": 6.677229515617926e-05, "loss": 0.249, "step": 2950 }, { "epoch": 0.40205100342965805, "grad_norm": 0.4622412323951721, "learning_rate": 6.699864191942056e-05, "loss": 0.2326, "step": 2960 }, { "epoch": 0.40340928384665015, "grad_norm": 0.3459821939468384, "learning_rate": 6.722498868266184e-05, "loss": 0.2452, "step": 2970 }, { "epoch": 0.40476756426364224, "grad_norm": 0.3937090039253235, "learning_rate": 6.745133544590313e-05, "loss": 0.2342, "step": 2980 }, { "epoch": 0.40612584468063434, "grad_norm": 0.3459404408931732, "learning_rate": 6.767768220914441e-05, "loss": 0.236, "step": 2990 }, { "epoch": 0.4074841250976264, "grad_norm": 1.4856414794921875, "learning_rate": 6.790402897238571e-05, "loss": 0.2316, "step": 3000 }, { "epoch": 0.4088424055146185, "grad_norm": 0.4535047113895416, "learning_rate": 6.813037573562698e-05, "loss": 0.2388, "step": 3010 }, { "epoch": 0.4102006859316106, "grad_norm": 0.5312026143074036, "learning_rate": 6.835672249886826e-05, "loss": 0.2516, "step": 3020 }, { "epoch": 0.4115589663486027, "grad_norm": 0.6093303561210632, "learning_rate": 6.858306926210956e-05, "loss": 0.2356, "step": 3030 }, { "epoch": 0.41291724676559477, "grad_norm": 0.5334001779556274, "learning_rate": 6.880941602535083e-05, "loss": 0.2473, "step": 3040 }, { "epoch": 0.41427552718258687, "grad_norm": 0.3756815791130066, "learning_rate": 6.903576278859213e-05, "loss": 0.2481, "step": 3050 }, { "epoch": 0.4156338075995789, "grad_norm": 0.3304213285446167, "learning_rate": 6.926210955183341e-05, "loss": 0.2388, "step": 3060 }, { "epoch": 0.416992088016571, "grad_norm": 0.43518051505088806, "learning_rate": 6.94884563150747e-05, "loss": 0.245, "step": 3070 }, { "epoch": 0.4183503684335631, "grad_norm": 0.507654070854187, "learning_rate": 6.971480307831598e-05, "loss": 0.2519, "step": 3080 }, { "epoch": 0.4197086488505552, "grad_norm": 0.5116168856620789, "learning_rate": 6.994114984155728e-05, "loss": 0.2416, "step": 3090 }, { "epoch": 0.4210669292675473, "grad_norm": 0.47795939445495605, "learning_rate": 7.016749660479855e-05, "loss": 0.2388, "step": 3100 }, { "epoch": 0.4224252096845394, "grad_norm": 0.4212625026702881, "learning_rate": 7.039384336803984e-05, "loss": 0.2327, "step": 3110 }, { "epoch": 0.42378349010153143, "grad_norm": 0.4932740032672882, "learning_rate": 7.062019013128113e-05, "loss": 0.2361, "step": 3120 }, { "epoch": 0.42514177051852353, "grad_norm": 0.36035752296447754, "learning_rate": 7.08465368945224e-05, "loss": 0.2293, "step": 3130 }, { "epoch": 0.42650005093551563, "grad_norm": 0.37398573756217957, "learning_rate": 7.10728836577637e-05, "loss": 0.2396, "step": 3140 }, { "epoch": 0.4278583313525077, "grad_norm": 0.3661610186100006, "learning_rate": 7.129923042100499e-05, "loss": 0.246, "step": 3150 }, { "epoch": 0.4292166117694998, "grad_norm": 0.46308425068855286, "learning_rate": 7.152557718424627e-05, "loss": 0.2403, "step": 3160 }, { "epoch": 0.4305748921864919, "grad_norm": 0.46572229266166687, "learning_rate": 7.175192394748755e-05, "loss": 0.2498, "step": 3170 }, { "epoch": 0.431933172603484, "grad_norm": 0.4191354215145111, "learning_rate": 7.197827071072884e-05, "loss": 0.2416, "step": 3180 }, { "epoch": 0.43329145302047606, "grad_norm": 0.41681990027427673, "learning_rate": 7.220461747397012e-05, "loss": 0.2334, "step": 3190 }, { "epoch": 0.43464973343746816, "grad_norm": 1.0335698127746582, "learning_rate": 7.243096423721141e-05, "loss": 0.239, "step": 3200 }, { "epoch": 0.43600801385446025, "grad_norm": 0.3487764298915863, "learning_rate": 7.26573110004527e-05, "loss": 0.2483, "step": 3210 }, { "epoch": 0.43736629427145235, "grad_norm": 0.36458444595336914, "learning_rate": 7.288365776369398e-05, "loss": 0.2479, "step": 3220 }, { "epoch": 0.43872457468844445, "grad_norm": 0.6315724849700928, "learning_rate": 7.311000452693527e-05, "loss": 0.2364, "step": 3230 }, { "epoch": 0.44008285510543654, "grad_norm": 0.795318603515625, "learning_rate": 7.333635129017656e-05, "loss": 0.2467, "step": 3240 }, { "epoch": 0.4414411355224286, "grad_norm": 0.32811155915260315, "learning_rate": 7.356269805341784e-05, "loss": 0.2368, "step": 3250 }, { "epoch": 0.4427994159394207, "grad_norm": 0.38429826498031616, "learning_rate": 7.378904481665913e-05, "loss": 0.2337, "step": 3260 }, { "epoch": 0.4441576963564128, "grad_norm": 0.33426088094711304, "learning_rate": 7.401539157990041e-05, "loss": 0.2426, "step": 3270 }, { "epoch": 0.4455159767734049, "grad_norm": 0.44243502616882324, "learning_rate": 7.42417383431417e-05, "loss": 0.2352, "step": 3280 }, { "epoch": 0.446874257190397, "grad_norm": 0.3277808129787445, "learning_rate": 7.446808510638298e-05, "loss": 0.2367, "step": 3290 }, { "epoch": 0.44823253760738907, "grad_norm": 0.9782993793487549, "learning_rate": 7.469443186962428e-05, "loss": 0.2444, "step": 3300 }, { "epoch": 0.4495908180243811, "grad_norm": 0.4226798415184021, "learning_rate": 7.492077863286555e-05, "loss": 0.2399, "step": 3310 }, { "epoch": 0.4509490984413732, "grad_norm": 0.41981738805770874, "learning_rate": 7.514712539610685e-05, "loss": 0.2305, "step": 3320 }, { "epoch": 0.4523073788583653, "grad_norm": 0.29633334279060364, "learning_rate": 7.537347215934813e-05, "loss": 0.2505, "step": 3330 }, { "epoch": 0.4536656592753574, "grad_norm": 0.5532662868499756, "learning_rate": 7.559981892258941e-05, "loss": 0.2416, "step": 3340 }, { "epoch": 0.4550239396923495, "grad_norm": 0.32161685824394226, "learning_rate": 7.58261656858307e-05, "loss": 0.2341, "step": 3350 }, { "epoch": 0.4563822201093416, "grad_norm": 0.41319361329078674, "learning_rate": 7.605251244907198e-05, "loss": 0.2368, "step": 3360 }, { "epoch": 0.45774050052633364, "grad_norm": 0.43355077505111694, "learning_rate": 7.627885921231327e-05, "loss": 0.2422, "step": 3370 }, { "epoch": 0.45909878094332573, "grad_norm": 0.45644256472587585, "learning_rate": 7.650520597555455e-05, "loss": 0.2437, "step": 3380 }, { "epoch": 0.46045706136031783, "grad_norm": 0.4174960255622864, "learning_rate": 7.673155273879584e-05, "loss": 0.2503, "step": 3390 }, { "epoch": 0.46181534177730993, "grad_norm": 0.33323749899864197, "learning_rate": 7.695789950203712e-05, "loss": 0.2466, "step": 3400 }, { "epoch": 0.463173622194302, "grad_norm": 0.31461480259895325, "learning_rate": 7.718424626527842e-05, "loss": 0.2407, "step": 3410 }, { "epoch": 0.4645319026112941, "grad_norm": 0.49118146300315857, "learning_rate": 7.74105930285197e-05, "loss": 0.2362, "step": 3420 }, { "epoch": 0.46589018302828616, "grad_norm": 0.43850240111351013, "learning_rate": 7.763693979176097e-05, "loss": 0.2432, "step": 3430 }, { "epoch": 0.46724846344527826, "grad_norm": 0.5156286358833313, "learning_rate": 7.786328655500227e-05, "loss": 0.2367, "step": 3440 }, { "epoch": 0.46860674386227036, "grad_norm": 0.3263431489467621, "learning_rate": 7.808963331824356e-05, "loss": 0.245, "step": 3450 }, { "epoch": 0.46996502427926246, "grad_norm": 0.3053440749645233, "learning_rate": 7.831598008148484e-05, "loss": 0.2329, "step": 3460 }, { "epoch": 0.47132330469625455, "grad_norm": 0.34962204098701477, "learning_rate": 7.854232684472612e-05, "loss": 0.2411, "step": 3470 }, { "epoch": 0.47268158511324665, "grad_norm": 0.3352089822292328, "learning_rate": 7.876867360796741e-05, "loss": 0.2416, "step": 3480 }, { "epoch": 0.4740398655302387, "grad_norm": 0.31437620520591736, "learning_rate": 7.899502037120869e-05, "loss": 0.2383, "step": 3490 }, { "epoch": 0.4753981459472308, "grad_norm": 0.36700060963630676, "learning_rate": 7.922136713444999e-05, "loss": 0.2471, "step": 3500 }, { "epoch": 0.4767564263642229, "grad_norm": 0.3413242995738983, "learning_rate": 7.944771389769126e-05, "loss": 0.2517, "step": 3510 }, { "epoch": 0.478114706781215, "grad_norm": 0.3104558289051056, "learning_rate": 7.967406066093255e-05, "loss": 0.2408, "step": 3520 }, { "epoch": 0.4794729871982071, "grad_norm": 0.5462628602981567, "learning_rate": 7.990040742417384e-05, "loss": 0.2341, "step": 3530 }, { "epoch": 0.4808312676151992, "grad_norm": 0.5931156277656555, "learning_rate": 8.012675418741513e-05, "loss": 0.246, "step": 3540 }, { "epoch": 0.4821895480321913, "grad_norm": 0.3639640808105469, "learning_rate": 8.035310095065641e-05, "loss": 0.2374, "step": 3550 }, { "epoch": 0.4835478284491833, "grad_norm": 0.31788602471351624, "learning_rate": 8.05794477138977e-05, "loss": 0.2418, "step": 3560 }, { "epoch": 0.4849061088661754, "grad_norm": 0.36856967210769653, "learning_rate": 8.080579447713898e-05, "loss": 0.2436, "step": 3570 }, { "epoch": 0.4862643892831675, "grad_norm": 0.39972272515296936, "learning_rate": 8.103214124038026e-05, "loss": 0.2454, "step": 3580 }, { "epoch": 0.4876226697001596, "grad_norm": 0.4053763449192047, "learning_rate": 8.125848800362155e-05, "loss": 0.2413, "step": 3590 }, { "epoch": 0.4889809501171517, "grad_norm": 0.5275582671165466, "learning_rate": 8.148483476686283e-05, "loss": 0.243, "step": 3600 }, { "epoch": 0.4903392305341438, "grad_norm": 0.3535505533218384, "learning_rate": 8.171118153010412e-05, "loss": 0.2504, "step": 3610 }, { "epoch": 0.49169751095113584, "grad_norm": 0.5302631258964539, "learning_rate": 8.193752829334542e-05, "loss": 0.2335, "step": 3620 }, { "epoch": 0.49305579136812794, "grad_norm": 0.3826453685760498, "learning_rate": 8.216387505658669e-05, "loss": 0.2441, "step": 3630 }, { "epoch": 0.49441407178512004, "grad_norm": 0.3383348286151886, "learning_rate": 8.239022181982798e-05, "loss": 0.2311, "step": 3640 }, { "epoch": 0.49577235220211213, "grad_norm": 0.37670835852622986, "learning_rate": 8.261656858306927e-05, "loss": 0.2416, "step": 3650 }, { "epoch": 0.49713063261910423, "grad_norm": 0.3722772002220154, "learning_rate": 8.284291534631055e-05, "loss": 0.2477, "step": 3660 }, { "epoch": 0.4984889130360963, "grad_norm": 0.3182002902030945, "learning_rate": 8.306926210955184e-05, "loss": 0.2542, "step": 3670 }, { "epoch": 0.49984719345308837, "grad_norm": 0.36047887802124023, "learning_rate": 8.329560887279312e-05, "loss": 0.2397, "step": 3680 }, { "epoch": 0.5012054738700805, "grad_norm": 0.4944659471511841, "learning_rate": 8.35219556360344e-05, "loss": 0.2437, "step": 3690 }, { "epoch": 0.5025637542870726, "grad_norm": 0.6708717942237854, "learning_rate": 8.374830239927569e-05, "loss": 0.2495, "step": 3700 }, { "epoch": 0.5039220347040646, "grad_norm": 0.344917356967926, "learning_rate": 8.397464916251699e-05, "loss": 0.2475, "step": 3710 }, { "epoch": 0.5052803151210568, "grad_norm": 0.332320898771286, "learning_rate": 8.420099592575826e-05, "loss": 0.2413, "step": 3720 }, { "epoch": 0.5066385955380488, "grad_norm": 0.32127082347869873, "learning_rate": 8.442734268899956e-05, "loss": 0.2416, "step": 3730 }, { "epoch": 0.507996875955041, "grad_norm": 0.42068928480148315, "learning_rate": 8.465368945224084e-05, "loss": 0.2418, "step": 3740 }, { "epoch": 0.509355156372033, "grad_norm": 0.3096528649330139, "learning_rate": 8.488003621548211e-05, "loss": 0.2572, "step": 3750 }, { "epoch": 0.5107134367890251, "grad_norm": 0.44093209505081177, "learning_rate": 8.510638297872341e-05, "loss": 0.2399, "step": 3760 }, { "epoch": 0.5120717172060172, "grad_norm": 0.4476853013038635, "learning_rate": 8.533272974196469e-05, "loss": 0.2461, "step": 3770 }, { "epoch": 0.5134299976230092, "grad_norm": 0.5141857266426086, "learning_rate": 8.555907650520598e-05, "loss": 0.2356, "step": 3780 }, { "epoch": 0.5147882780400014, "grad_norm": 0.32433292269706726, "learning_rate": 8.578542326844726e-05, "loss": 0.2399, "step": 3790 }, { "epoch": 0.5161465584569934, "grad_norm": 0.413129985332489, "learning_rate": 8.601177003168856e-05, "loss": 0.2404, "step": 3800 }, { "epoch": 0.5175048388739856, "grad_norm": 0.4676617681980133, "learning_rate": 8.623811679492983e-05, "loss": 0.2412, "step": 3810 }, { "epoch": 0.5188631192909776, "grad_norm": 0.3669430911540985, "learning_rate": 8.646446355817113e-05, "loss": 0.2413, "step": 3820 }, { "epoch": 0.5202213997079697, "grad_norm": 0.8355333209037781, "learning_rate": 8.669081032141241e-05, "loss": 0.2394, "step": 3830 }, { "epoch": 0.5215796801249618, "grad_norm": 0.31101301312446594, "learning_rate": 8.691715708465368e-05, "loss": 0.2401, "step": 3840 }, { "epoch": 0.5229379605419539, "grad_norm": 0.27916184067726135, "learning_rate": 8.714350384789498e-05, "loss": 0.2395, "step": 3850 }, { "epoch": 0.524296240958946, "grad_norm": 0.35946857929229736, "learning_rate": 8.736985061113627e-05, "loss": 0.2367, "step": 3860 }, { "epoch": 0.525654521375938, "grad_norm": 0.4156729578971863, "learning_rate": 8.759619737437755e-05, "loss": 0.2384, "step": 3870 }, { "epoch": 0.5270128017929302, "grad_norm": 0.41875219345092773, "learning_rate": 8.782254413761883e-05, "loss": 0.2403, "step": 3880 }, { "epoch": 0.5283710822099222, "grad_norm": 0.4341963231563568, "learning_rate": 8.804889090086013e-05, "loss": 0.2439, "step": 3890 }, { "epoch": 0.5297293626269143, "grad_norm": 0.5678895115852356, "learning_rate": 8.82752376641014e-05, "loss": 0.2451, "step": 3900 }, { "epoch": 0.5310876430439064, "grad_norm": 0.561272144317627, "learning_rate": 8.85015844273427e-05, "loss": 0.2375, "step": 3910 }, { "epoch": 0.5324459234608985, "grad_norm": 0.45090624690055847, "learning_rate": 8.872793119058398e-05, "loss": 0.2406, "step": 3920 }, { "epoch": 0.5338042038778906, "grad_norm": 0.3993671238422394, "learning_rate": 8.895427795382526e-05, "loss": 0.2434, "step": 3930 }, { "epoch": 0.5351624842948827, "grad_norm": 0.4161466658115387, "learning_rate": 8.918062471706655e-05, "loss": 0.2412, "step": 3940 }, { "epoch": 0.5365207647118748, "grad_norm": 0.8329044580459595, "learning_rate": 8.940697148030784e-05, "loss": 0.2409, "step": 3950 }, { "epoch": 0.5378790451288669, "grad_norm": 0.5138368606567383, "learning_rate": 8.963331824354912e-05, "loss": 0.2349, "step": 3960 }, { "epoch": 0.5392373255458589, "grad_norm": 0.39746806025505066, "learning_rate": 8.98596650067904e-05, "loss": 0.2464, "step": 3970 }, { "epoch": 0.5405956059628511, "grad_norm": 1.3628511428833008, "learning_rate": 9.00860117700317e-05, "loss": 0.2351, "step": 3980 }, { "epoch": 0.5419538863798431, "grad_norm": 0.34771066904067993, "learning_rate": 9.031235853327297e-05, "loss": 0.2505, "step": 3990 }, { "epoch": 0.5433121667968352, "grad_norm": 0.3667686879634857, "learning_rate": 9.053870529651426e-05, "loss": 0.2453, "step": 4000 }, { "epoch": 0.5446704472138273, "grad_norm": 0.29015713930130005, "learning_rate": 9.076505205975556e-05, "loss": 0.2476, "step": 4010 }, { "epoch": 0.5460287276308193, "grad_norm": 0.4081239700317383, "learning_rate": 9.099139882299683e-05, "loss": 0.2521, "step": 4020 }, { "epoch": 0.5473870080478115, "grad_norm": 0.41040000319480896, "learning_rate": 9.121774558623813e-05, "loss": 0.245, "step": 4030 }, { "epoch": 0.5487452884648035, "grad_norm": 0.3289068639278412, "learning_rate": 9.144409234947941e-05, "loss": 0.2482, "step": 4040 }, { "epoch": 0.5501035688817957, "grad_norm": 0.43900078535079956, "learning_rate": 9.16704391127207e-05, "loss": 0.2437, "step": 4050 }, { "epoch": 0.5514618492987877, "grad_norm": 0.5150415301322937, "learning_rate": 9.189678587596198e-05, "loss": 0.2428, "step": 4060 }, { "epoch": 0.5528201297157799, "grad_norm": 0.4778790771961212, "learning_rate": 9.212313263920326e-05, "loss": 0.235, "step": 4070 }, { "epoch": 0.5541784101327719, "grad_norm": 0.3917141258716583, "learning_rate": 9.234947940244455e-05, "loss": 0.2521, "step": 4080 }, { "epoch": 0.555536690549764, "grad_norm": 0.5191987156867981, "learning_rate": 9.257582616568583e-05, "loss": 0.2442, "step": 4090 }, { "epoch": 0.5568949709667561, "grad_norm": 0.3180747628211975, "learning_rate": 9.280217292892713e-05, "loss": 0.2424, "step": 4100 }, { "epoch": 0.5582532513837482, "grad_norm": 0.44960424304008484, "learning_rate": 9.30285196921684e-05, "loss": 0.2381, "step": 4110 }, { "epoch": 0.5596115318007403, "grad_norm": 0.3191477656364441, "learning_rate": 9.32548664554097e-05, "loss": 0.2361, "step": 4120 }, { "epoch": 0.5609698122177323, "grad_norm": 0.38165682554244995, "learning_rate": 9.348121321865098e-05, "loss": 0.2427, "step": 4130 }, { "epoch": 0.5623280926347244, "grad_norm": 0.48823222517967224, "learning_rate": 9.370755998189227e-05, "loss": 0.2348, "step": 4140 }, { "epoch": 0.5636863730517165, "grad_norm": 0.6220352053642273, "learning_rate": 9.393390674513355e-05, "loss": 0.2416, "step": 4150 }, { "epoch": 0.5650446534687086, "grad_norm": 0.3477441370487213, "learning_rate": 9.416025350837483e-05, "loss": 0.2387, "step": 4160 }, { "epoch": 0.5664029338857007, "grad_norm": 0.5098167657852173, "learning_rate": 9.438660027161612e-05, "loss": 0.2411, "step": 4170 }, { "epoch": 0.5677612143026928, "grad_norm": 0.5033051371574402, "learning_rate": 9.46129470348574e-05, "loss": 0.2539, "step": 4180 }, { "epoch": 0.5691194947196849, "grad_norm": 0.32857057452201843, "learning_rate": 9.483929379809869e-05, "loss": 0.2526, "step": 4190 }, { "epoch": 0.570477775136677, "grad_norm": 0.4129979610443115, "learning_rate": 9.506564056133997e-05, "loss": 0.2438, "step": 4200 }, { "epoch": 0.571836055553669, "grad_norm": 0.3565722703933716, "learning_rate": 9.529198732458127e-05, "loss": 0.2523, "step": 4210 }, { "epoch": 0.5731943359706612, "grad_norm": 0.4729892909526825, "learning_rate": 9.551833408782255e-05, "loss": 0.2402, "step": 4220 }, { "epoch": 0.5745526163876532, "grad_norm": 0.33927279710769653, "learning_rate": 9.574468085106384e-05, "loss": 0.2517, "step": 4230 }, { "epoch": 0.5759108968046454, "grad_norm": 0.4262118935585022, "learning_rate": 9.597102761430512e-05, "loss": 0.2532, "step": 4240 }, { "epoch": 0.5772691772216374, "grad_norm": 0.3570193648338318, "learning_rate": 9.619737437754641e-05, "loss": 0.2449, "step": 4250 }, { "epoch": 0.5786274576386295, "grad_norm": 0.3058898150920868, "learning_rate": 9.642372114078769e-05, "loss": 0.2415, "step": 4260 }, { "epoch": 0.5799857380556216, "grad_norm": 0.5027090907096863, "learning_rate": 9.665006790402898e-05, "loss": 0.24, "step": 4270 }, { "epoch": 0.5813440184726136, "grad_norm": 0.33633390069007874, "learning_rate": 9.687641466727026e-05, "loss": 0.2514, "step": 4280 }, { "epoch": 0.5827022988896058, "grad_norm": 0.3312252163887024, "learning_rate": 9.710276143051154e-05, "loss": 0.2482, "step": 4290 }, { "epoch": 0.5840605793065978, "grad_norm": 0.30275124311447144, "learning_rate": 9.732910819375284e-05, "loss": 0.2419, "step": 4300 }, { "epoch": 0.58541885972359, "grad_norm": 0.3391088843345642, "learning_rate": 9.755545495699411e-05, "loss": 0.2556, "step": 4310 }, { "epoch": 0.586777140140582, "grad_norm": 0.44849511981010437, "learning_rate": 9.778180172023541e-05, "loss": 0.2442, "step": 4320 }, { "epoch": 0.5881354205575741, "grad_norm": 0.4353494942188263, "learning_rate": 9.80081484834767e-05, "loss": 0.2441, "step": 4330 }, { "epoch": 0.5894937009745662, "grad_norm": 0.3884948492050171, "learning_rate": 9.823449524671798e-05, "loss": 0.2374, "step": 4340 }, { "epoch": 0.5908519813915583, "grad_norm": 0.42669016122817993, "learning_rate": 9.846084200995926e-05, "loss": 0.2416, "step": 4350 }, { "epoch": 0.5922102618085504, "grad_norm": 0.5052133202552795, "learning_rate": 9.868718877320055e-05, "loss": 0.2422, "step": 4360 }, { "epoch": 0.5935685422255425, "grad_norm": 0.383476585149765, "learning_rate": 9.891353553644183e-05, "loss": 0.2513, "step": 4370 }, { "epoch": 0.5949268226425346, "grad_norm": 0.318293958902359, "learning_rate": 9.913988229968312e-05, "loss": 0.2562, "step": 4380 }, { "epoch": 0.5962851030595266, "grad_norm": 0.48555293679237366, "learning_rate": 9.936622906292441e-05, "loss": 0.2388, "step": 4390 }, { "epoch": 0.5976433834765187, "grad_norm": 0.34200236201286316, "learning_rate": 9.959257582616568e-05, "loss": 0.25, "step": 4400 }, { "epoch": 0.5990016638935108, "grad_norm": 0.45821714401245117, "learning_rate": 9.981892258940697e-05, "loss": 0.2495, "step": 4410 }, { "epoch": 0.6003599443105029, "grad_norm": 0.5355885624885559, "learning_rate": 9.999999937549191e-05, "loss": 0.2357, "step": 4420 }, { "epoch": 0.601718224727495, "grad_norm": 0.3656679093837738, "learning_rate": 9.999997751770993e-05, "loss": 0.2445, "step": 4430 }, { "epoch": 0.6030765051444871, "grad_norm": 0.3463793694972992, "learning_rate": 9.999992443453842e-05, "loss": 0.2418, "step": 4440 }, { "epoch": 0.6044347855614791, "grad_norm": 0.359862357378006, "learning_rate": 9.99998401260105e-05, "loss": 0.2417, "step": 4450 }, { "epoch": 0.6057930659784713, "grad_norm": 0.5530717372894287, "learning_rate": 9.999972459217885e-05, "loss": 0.2458, "step": 4460 }, { "epoch": 0.6071513463954633, "grad_norm": 0.27854397892951965, "learning_rate": 9.999957783311559e-05, "loss": 0.2452, "step": 4470 }, { "epoch": 0.6085096268124555, "grad_norm": 0.34041205048561096, "learning_rate": 9.999939984891239e-05, "loss": 0.2458, "step": 4480 }, { "epoch": 0.6098679072294475, "grad_norm": 0.2987198233604431, "learning_rate": 9.99991906396804e-05, "loss": 0.248, "step": 4490 }, { "epoch": 0.6112261876464397, "grad_norm": 0.38563501834869385, "learning_rate": 9.999895020555027e-05, "loss": 0.254, "step": 4500 }, { "epoch": 0.6125844680634317, "grad_norm": 0.3378385603427887, "learning_rate": 9.999867854667215e-05, "loss": 0.2399, "step": 4510 }, { "epoch": 0.6139427484804237, "grad_norm": 0.7777177095413208, "learning_rate": 9.999837566321572e-05, "loss": 0.2386, "step": 4520 }, { "epoch": 0.6153010288974159, "grad_norm": 0.3116154670715332, "learning_rate": 9.99980415553701e-05, "loss": 0.2372, "step": 4530 }, { "epoch": 0.6166593093144079, "grad_norm": 0.4889775514602661, "learning_rate": 9.999767622334395e-05, "loss": 0.2421, "step": 4540 }, { "epoch": 0.6180175897314001, "grad_norm": 0.2842913269996643, "learning_rate": 9.999727966736542e-05, "loss": 0.248, "step": 4550 }, { "epoch": 0.6193758701483921, "grad_norm": 0.3226028084754944, "learning_rate": 9.999685188768219e-05, "loss": 0.243, "step": 4560 }, { "epoch": 0.6207341505653842, "grad_norm": 0.3192172646522522, "learning_rate": 9.999639288456137e-05, "loss": 0.2483, "step": 4570 }, { "epoch": 0.6220924309823763, "grad_norm": 0.30197593569755554, "learning_rate": 9.999590265828966e-05, "loss": 0.239, "step": 4580 }, { "epoch": 0.6234507113993684, "grad_norm": 0.4451127052307129, "learning_rate": 9.999538120917316e-05, "loss": 0.2428, "step": 4590 }, { "epoch": 0.6248089918163605, "grad_norm": 0.2990131080150604, "learning_rate": 9.999482853753756e-05, "loss": 0.2414, "step": 4600 }, { "epoch": 0.6261672722333526, "grad_norm": 0.4426460564136505, "learning_rate": 9.999424464372797e-05, "loss": 0.2454, "step": 4610 }, { "epoch": 0.6275255526503447, "grad_norm": 0.6033828258514404, "learning_rate": 9.999362952810907e-05, "loss": 0.2507, "step": 4620 }, { "epoch": 0.6288838330673368, "grad_norm": 0.36733752489089966, "learning_rate": 9.999298319106501e-05, "loss": 0.2605, "step": 4630 }, { "epoch": 0.6302421134843288, "grad_norm": 0.4766634404659271, "learning_rate": 9.999230563299939e-05, "loss": 0.2511, "step": 4640 }, { "epoch": 0.631600393901321, "grad_norm": 0.33961543440818787, "learning_rate": 9.999159685433538e-05, "loss": 0.2439, "step": 4650 }, { "epoch": 0.632958674318313, "grad_norm": 0.33434340357780457, "learning_rate": 9.999085685551561e-05, "loss": 0.252, "step": 4660 }, { "epoch": 0.6343169547353051, "grad_norm": 0.37442994117736816, "learning_rate": 9.999008563700222e-05, "loss": 0.2484, "step": 4670 }, { "epoch": 0.6356752351522972, "grad_norm": 0.3704357445240021, "learning_rate": 9.998928319927684e-05, "loss": 0.241, "step": 4680 }, { "epoch": 0.6370335155692893, "grad_norm": 0.48296162486076355, "learning_rate": 9.998844954284061e-05, "loss": 0.2389, "step": 4690 }, { "epoch": 0.6383917959862814, "grad_norm": 0.31681951880455017, "learning_rate": 9.998758466821412e-05, "loss": 0.2392, "step": 4700 }, { "epoch": 0.6397500764032734, "grad_norm": 0.47707727551460266, "learning_rate": 9.998668857593753e-05, "loss": 0.234, "step": 4710 }, { "epoch": 0.6411083568202656, "grad_norm": 0.6801664233207703, "learning_rate": 9.998576126657044e-05, "loss": 0.2509, "step": 4720 }, { "epoch": 0.6424666372372576, "grad_norm": 0.322346955537796, "learning_rate": 9.998480274069198e-05, "loss": 0.2358, "step": 4730 }, { "epoch": 0.6438249176542498, "grad_norm": 0.2722484767436981, "learning_rate": 9.998381299890071e-05, "loss": 0.2397, "step": 4740 }, { "epoch": 0.6451831980712418, "grad_norm": 0.34327027201652527, "learning_rate": 9.998279204181478e-05, "loss": 0.2323, "step": 4750 }, { "epoch": 0.6465414784882338, "grad_norm": 0.556814968585968, "learning_rate": 9.998173987007177e-05, "loss": 0.2432, "step": 4760 }, { "epoch": 0.647899758905226, "grad_norm": 0.3654954135417938, "learning_rate": 9.998065648432877e-05, "loss": 0.2477, "step": 4770 }, { "epoch": 0.649258039322218, "grad_norm": 0.419805109500885, "learning_rate": 9.997954188526236e-05, "loss": 0.2396, "step": 4780 }, { "epoch": 0.6506163197392102, "grad_norm": 0.3656160831451416, "learning_rate": 9.997839607356862e-05, "loss": 0.239, "step": 4790 }, { "epoch": 0.6519746001562022, "grad_norm": 0.36111393570899963, "learning_rate": 9.997721904996311e-05, "loss": 0.2507, "step": 4800 }, { "epoch": 0.6533328805731944, "grad_norm": 0.3709546625614166, "learning_rate": 9.997601081518091e-05, "loss": 0.2433, "step": 4810 }, { "epoch": 0.6546911609901864, "grad_norm": 0.7201614379882812, "learning_rate": 9.997477136997655e-05, "loss": 0.246, "step": 4820 }, { "epoch": 0.6560494414071785, "grad_norm": 0.3621259033679962, "learning_rate": 9.997350071512408e-05, "loss": 0.2476, "step": 4830 }, { "epoch": 0.6574077218241706, "grad_norm": 0.3774379789829254, "learning_rate": 9.997219885141705e-05, "loss": 0.2388, "step": 4840 }, { "epoch": 0.6587660022411627, "grad_norm": 0.4363941252231598, "learning_rate": 9.997086577966846e-05, "loss": 0.234, "step": 4850 }, { "epoch": 0.6601242826581548, "grad_norm": 0.4726136028766632, "learning_rate": 9.996950150071085e-05, "loss": 0.2465, "step": 4860 }, { "epoch": 0.6614825630751469, "grad_norm": 0.5062173008918762, "learning_rate": 9.99681060153962e-05, "loss": 0.2511, "step": 4870 }, { "epoch": 0.6628408434921389, "grad_norm": 0.45381689071655273, "learning_rate": 9.996667932459602e-05, "loss": 0.2394, "step": 4880 }, { "epoch": 0.664199123909131, "grad_norm": 0.3236313462257385, "learning_rate": 9.996522142920126e-05, "loss": 0.2468, "step": 4890 }, { "epoch": 0.6655574043261231, "grad_norm": 0.37791240215301514, "learning_rate": 9.996373233012242e-05, "loss": 0.2497, "step": 4900 }, { "epoch": 0.6669156847431152, "grad_norm": 0.42276760935783386, "learning_rate": 9.996221202828946e-05, "loss": 0.2565, "step": 4910 }, { "epoch": 0.6682739651601073, "grad_norm": 0.4305911064147949, "learning_rate": 9.996066052465178e-05, "loss": 0.2454, "step": 4920 }, { "epoch": 0.6696322455770994, "grad_norm": 0.9241501092910767, "learning_rate": 9.995907782017834e-05, "loss": 0.2535, "step": 4930 }, { "epoch": 0.6709905259940915, "grad_norm": 0.373008131980896, "learning_rate": 9.995746391585754e-05, "loss": 0.244, "step": 4940 }, { "epoch": 0.6723488064110835, "grad_norm": 0.4501871168613434, "learning_rate": 9.995581881269729e-05, "loss": 0.2379, "step": 4950 }, { "epoch": 0.6737070868280757, "grad_norm": 0.4394315183162689, "learning_rate": 9.995414251172494e-05, "loss": 0.2472, "step": 4960 }, { "epoch": 0.6750653672450677, "grad_norm": 0.4668097198009491, "learning_rate": 9.995243501398739e-05, "loss": 0.2464, "step": 4970 }, { "epoch": 0.6764236476620599, "grad_norm": 0.3379581868648529, "learning_rate": 9.995069632055095e-05, "loss": 0.2374, "step": 4980 }, { "epoch": 0.6777819280790519, "grad_norm": 0.35374245047569275, "learning_rate": 9.994892643250147e-05, "loss": 0.2469, "step": 4990 }, { "epoch": 0.6791402084960441, "grad_norm": 0.4228232502937317, "learning_rate": 9.994712535094426e-05, "loss": 0.2407, "step": 5000 }, { "epoch": 0.6804984889130361, "grad_norm": 0.7559335231781006, "learning_rate": 9.99452930770041e-05, "loss": 0.2489, "step": 5010 }, { "epoch": 0.6818567693300281, "grad_norm": 0.3815743923187256, "learning_rate": 9.994342961182528e-05, "loss": 0.2464, "step": 5020 }, { "epoch": 0.6832150497470203, "grad_norm": 0.4869216978549957, "learning_rate": 9.994153495657152e-05, "loss": 0.2409, "step": 5030 }, { "epoch": 0.6845733301640123, "grad_norm": 0.31721049547195435, "learning_rate": 9.993960911242607e-05, "loss": 0.247, "step": 5040 }, { "epoch": 0.6859316105810045, "grad_norm": 0.996837317943573, "learning_rate": 9.993765208059162e-05, "loss": 0.2464, "step": 5050 }, { "epoch": 0.6872898909979965, "grad_norm": 0.41157132387161255, "learning_rate": 9.993566386229036e-05, "loss": 0.2501, "step": 5060 }, { "epoch": 0.6886481714149886, "grad_norm": 0.3574490249156952, "learning_rate": 9.993364445876396e-05, "loss": 0.2533, "step": 5070 }, { "epoch": 0.6900064518319807, "grad_norm": 0.40719348192214966, "learning_rate": 9.993159387127353e-05, "loss": 0.2344, "step": 5080 }, { "epoch": 0.6913647322489728, "grad_norm": 0.48582857847213745, "learning_rate": 9.99295121010997e-05, "loss": 0.2457, "step": 5090 }, { "epoch": 0.6927230126659649, "grad_norm": 0.3486964702606201, "learning_rate": 9.992739914954253e-05, "loss": 0.2407, "step": 5100 }, { "epoch": 0.694081293082957, "grad_norm": 0.4454726576805115, "learning_rate": 9.992525501792159e-05, "loss": 0.253, "step": 5110 }, { "epoch": 0.6954395734999491, "grad_norm": 0.4705508351325989, "learning_rate": 9.99230797075759e-05, "loss": 0.2532, "step": 5120 }, { "epoch": 0.6967978539169412, "grad_norm": 0.30512502789497375, "learning_rate": 9.992087321986399e-05, "loss": 0.2438, "step": 5130 }, { "epoch": 0.6981561343339332, "grad_norm": 0.3000001907348633, "learning_rate": 9.991863555616377e-05, "loss": 0.2458, "step": 5140 }, { "epoch": 0.6995144147509254, "grad_norm": 0.43206772208213806, "learning_rate": 9.991636671787274e-05, "loss": 0.2422, "step": 5150 }, { "epoch": 0.7008726951679174, "grad_norm": 0.4188641607761383, "learning_rate": 9.991406670640776e-05, "loss": 0.2447, "step": 5160 }, { "epoch": 0.7022309755849095, "grad_norm": 1.1176211833953857, "learning_rate": 9.991173552320523e-05, "loss": 0.2464, "step": 5170 }, { "epoch": 0.7035892560019016, "grad_norm": 0.3489271104335785, "learning_rate": 9.9909373169721e-05, "loss": 0.2538, "step": 5180 }, { "epoch": 0.7049475364188936, "grad_norm": 0.41792792081832886, "learning_rate": 9.990697964743036e-05, "loss": 0.2468, "step": 5190 }, { "epoch": 0.7063058168358858, "grad_norm": 0.40551429986953735, "learning_rate": 9.990455495782809e-05, "loss": 0.2429, "step": 5200 }, { "epoch": 0.7076640972528778, "grad_norm": 0.3205186426639557, "learning_rate": 9.990209910242843e-05, "loss": 0.2381, "step": 5210 }, { "epoch": 0.70902237766987, "grad_norm": 0.35722675919532776, "learning_rate": 9.989961208276509e-05, "loss": 0.2523, "step": 5220 }, { "epoch": 0.710380658086862, "grad_norm": 0.34950554370880127, "learning_rate": 9.98970939003912e-05, "loss": 0.255, "step": 5230 }, { "epoch": 0.7117389385038542, "grad_norm": 0.4854452908039093, "learning_rate": 9.989454455687944e-05, "loss": 0.2365, "step": 5240 }, { "epoch": 0.7130972189208462, "grad_norm": 0.43008875846862793, "learning_rate": 9.989196405382184e-05, "loss": 0.239, "step": 5250 }, { "epoch": 0.7144554993378383, "grad_norm": 0.39063185453414917, "learning_rate": 9.988935239282998e-05, "loss": 0.2412, "step": 5260 }, { "epoch": 0.7158137797548304, "grad_norm": 0.3474505543708801, "learning_rate": 9.988670957553486e-05, "loss": 0.2396, "step": 5270 }, { "epoch": 0.7171720601718224, "grad_norm": 0.6352435350418091, "learning_rate": 9.988403560358694e-05, "loss": 0.2458, "step": 5280 }, { "epoch": 0.7185303405888146, "grad_norm": 0.30608150362968445, "learning_rate": 9.98813304786561e-05, "loss": 0.2413, "step": 5290 }, { "epoch": 0.7198886210058066, "grad_norm": 0.323609858751297, "learning_rate": 9.987859420243179e-05, "loss": 0.2412, "step": 5300 }, { "epoch": 0.7212469014227988, "grad_norm": 0.3578800857067108, "learning_rate": 9.987582677662277e-05, "loss": 0.2401, "step": 5310 }, { "epoch": 0.7226051818397908, "grad_norm": 0.6265963912010193, "learning_rate": 9.987302820295734e-05, "loss": 0.2461, "step": 5320 }, { "epoch": 0.7239634622567829, "grad_norm": 0.5625576972961426, "learning_rate": 9.987019848318324e-05, "loss": 0.2398, "step": 5330 }, { "epoch": 0.725321742673775, "grad_norm": 0.3313911557197571, "learning_rate": 9.986733761906763e-05, "loss": 0.2385, "step": 5340 }, { "epoch": 0.7266800230907671, "grad_norm": 0.28852254152297974, "learning_rate": 9.986444561239717e-05, "loss": 0.2405, "step": 5350 }, { "epoch": 0.7280383035077592, "grad_norm": 0.3409920930862427, "learning_rate": 9.986152246497792e-05, "loss": 0.2343, "step": 5360 }, { "epoch": 0.7293965839247513, "grad_norm": 0.4291243255138397, "learning_rate": 9.985856817863541e-05, "loss": 0.2431, "step": 5370 }, { "epoch": 0.7307548643417433, "grad_norm": 0.3431280255317688, "learning_rate": 9.985558275521465e-05, "loss": 0.236, "step": 5380 }, { "epoch": 0.7321131447587355, "grad_norm": 0.3831133246421814, "learning_rate": 9.985256619658004e-05, "loss": 0.2524, "step": 5390 }, { "epoch": 0.7334714251757275, "grad_norm": 0.32188960909843445, "learning_rate": 9.984951850461541e-05, "loss": 0.2526, "step": 5400 }, { "epoch": 0.7348297055927197, "grad_norm": 0.37936219573020935, "learning_rate": 9.98464396812241e-05, "loss": 0.2443, "step": 5410 }, { "epoch": 0.7361879860097117, "grad_norm": 0.5681051015853882, "learning_rate": 9.984332972832888e-05, "loss": 0.2488, "step": 5420 }, { "epoch": 0.7375462664267038, "grad_norm": 0.39168909192085266, "learning_rate": 9.984018864787192e-05, "loss": 0.2393, "step": 5430 }, { "epoch": 0.7389045468436959, "grad_norm": 0.47878298163414, "learning_rate": 9.983701644181484e-05, "loss": 0.2452, "step": 5440 }, { "epoch": 0.7402628272606879, "grad_norm": 0.5150695443153381, "learning_rate": 9.983381311213873e-05, "loss": 0.2382, "step": 5450 }, { "epoch": 0.7416211076776801, "grad_norm": 0.6049244999885559, "learning_rate": 9.983057866084407e-05, "loss": 0.2438, "step": 5460 }, { "epoch": 0.7429793880946721, "grad_norm": 0.48798322677612305, "learning_rate": 9.982731308995081e-05, "loss": 0.2397, "step": 5470 }, { "epoch": 0.7443376685116643, "grad_norm": 0.36459213495254517, "learning_rate": 9.982401640149835e-05, "loss": 0.243, "step": 5480 }, { "epoch": 0.7456959489286563, "grad_norm": 0.3920586407184601, "learning_rate": 9.982068859754546e-05, "loss": 0.2522, "step": 5490 }, { "epoch": 0.7470542293456484, "grad_norm": 0.3867689073085785, "learning_rate": 9.981732968017041e-05, "loss": 0.2585, "step": 5500 }, { "epoch": 0.7484125097626405, "grad_norm": 0.30405476689338684, "learning_rate": 9.981393965147085e-05, "loss": 0.2485, "step": 5510 }, { "epoch": 0.7497707901796326, "grad_norm": 0.33099398016929626, "learning_rate": 9.98105185135639e-05, "loss": 0.2469, "step": 5520 }, { "epoch": 0.7511290705966247, "grad_norm": 0.4071502387523651, "learning_rate": 9.980706626858607e-05, "loss": 0.2456, "step": 5530 }, { "epoch": 0.7524873510136167, "grad_norm": 0.4201465845108032, "learning_rate": 9.980358291869332e-05, "loss": 0.2394, "step": 5540 }, { "epoch": 0.7538456314306089, "grad_norm": 0.2686833441257477, "learning_rate": 9.980006846606104e-05, "loss": 0.2423, "step": 5550 }, { "epoch": 0.7552039118476009, "grad_norm": 0.5115419626235962, "learning_rate": 9.979652291288402e-05, "loss": 0.245, "step": 5560 }, { "epoch": 0.756562192264593, "grad_norm": 0.3374570906162262, "learning_rate": 9.97929462613765e-05, "loss": 0.2431, "step": 5570 }, { "epoch": 0.7579204726815851, "grad_norm": 0.2973199188709259, "learning_rate": 9.978933851377212e-05, "loss": 0.26, "step": 5580 }, { "epoch": 0.7592787530985772, "grad_norm": 0.6901597380638123, "learning_rate": 9.978569967232394e-05, "loss": 0.2473, "step": 5590 }, { "epoch": 0.7606370335155693, "grad_norm": 0.5757603645324707, "learning_rate": 9.978202973930447e-05, "loss": 0.2386, "step": 5600 }, { "epoch": 0.7619953139325614, "grad_norm": 0.38110312819480896, "learning_rate": 9.97783287170056e-05, "loss": 0.2474, "step": 5610 }, { "epoch": 0.7633535943495534, "grad_norm": 0.29937678575515747, "learning_rate": 9.977459660773862e-05, "loss": 0.2449, "step": 5620 }, { "epoch": 0.7647118747665456, "grad_norm": 0.8653799891471863, "learning_rate": 9.977083341383433e-05, "loss": 0.2481, "step": 5630 }, { "epoch": 0.7660701551835376, "grad_norm": 0.3930273652076721, "learning_rate": 9.97670391376428e-05, "loss": 0.2507, "step": 5640 }, { "epoch": 0.7674284356005298, "grad_norm": 0.8426046967506409, "learning_rate": 9.976321378153363e-05, "loss": 0.2497, "step": 5650 }, { "epoch": 0.7687867160175218, "grad_norm": 0.34208494424819946, "learning_rate": 9.975935734789576e-05, "loss": 0.243, "step": 5660 }, { "epoch": 0.770144996434514, "grad_norm": 0.5973019003868103, "learning_rate": 9.975546983913759e-05, "loss": 0.2414, "step": 5670 }, { "epoch": 0.771503276851506, "grad_norm": 0.3836170732975006, "learning_rate": 9.97515512576869e-05, "loss": 0.2499, "step": 5680 }, { "epoch": 0.772861557268498, "grad_norm": 0.5062615871429443, "learning_rate": 9.974760160599086e-05, "loss": 0.2499, "step": 5690 }, { "epoch": 0.7742198376854902, "grad_norm": 0.33686432242393494, "learning_rate": 9.974362088651607e-05, "loss": 0.2434, "step": 5700 }, { "epoch": 0.7755781181024822, "grad_norm": 0.3658219575881958, "learning_rate": 9.97396091017485e-05, "loss": 0.2475, "step": 5710 }, { "epoch": 0.7769363985194744, "grad_norm": 0.4787846505641937, "learning_rate": 9.973556625419357e-05, "loss": 0.2446, "step": 5720 }, { "epoch": 0.7782946789364664, "grad_norm": 0.29139378666877747, "learning_rate": 9.973149234637606e-05, "loss": 0.2478, "step": 5730 }, { "epoch": 0.7796529593534586, "grad_norm": 1.0890406370162964, "learning_rate": 9.972738738084015e-05, "loss": 0.2546, "step": 5740 }, { "epoch": 0.7810112397704506, "grad_norm": 0.5712840557098389, "learning_rate": 9.972325136014944e-05, "loss": 0.2347, "step": 5750 }, { "epoch": 0.7823695201874427, "grad_norm": 0.6613163352012634, "learning_rate": 9.971908428688689e-05, "loss": 0.2467, "step": 5760 }, { "epoch": 0.7837278006044348, "grad_norm": 0.67340087890625, "learning_rate": 9.971488616365488e-05, "loss": 0.2492, "step": 5770 }, { "epoch": 0.7850860810214269, "grad_norm": 0.32583826780319214, "learning_rate": 9.971065699307518e-05, "loss": 0.2381, "step": 5780 }, { "epoch": 0.786444361438419, "grad_norm": 0.3280521631240845, "learning_rate": 9.970639677778893e-05, "loss": 0.2468, "step": 5790 }, { "epoch": 0.787802641855411, "grad_norm": 0.3107339143753052, "learning_rate": 9.970210552045667e-05, "loss": 0.2581, "step": 5800 }, { "epoch": 0.7891609222724031, "grad_norm": 0.3663097321987152, "learning_rate": 9.969778322375833e-05, "loss": 0.2454, "step": 5810 }, { "epoch": 0.7905192026893952, "grad_norm": 0.2708544433116913, "learning_rate": 9.969342989039323e-05, "loss": 0.2513, "step": 5820 }, { "epoch": 0.7918774831063873, "grad_norm": 0.27981331944465637, "learning_rate": 9.968904552308003e-05, "loss": 0.2486, "step": 5830 }, { "epoch": 0.7932357635233794, "grad_norm": 0.40943774580955505, "learning_rate": 9.968463012455682e-05, "loss": 0.2489, "step": 5840 }, { "epoch": 0.7945940439403715, "grad_norm": 0.38253241777420044, "learning_rate": 9.968018369758108e-05, "loss": 0.2482, "step": 5850 }, { "epoch": 0.7959523243573636, "grad_norm": 0.5188789367675781, "learning_rate": 9.967570624492959e-05, "loss": 0.2409, "step": 5860 }, { "epoch": 0.7973106047743557, "grad_norm": 0.3218250274658203, "learning_rate": 9.967119776939857e-05, "loss": 0.2452, "step": 5870 }, { "epoch": 0.7986688851913477, "grad_norm": 0.3254469633102417, "learning_rate": 9.966665827380362e-05, "loss": 0.2489, "step": 5880 }, { "epoch": 0.8000271656083399, "grad_norm": 0.3821564018726349, "learning_rate": 9.966208776097969e-05, "loss": 0.2482, "step": 5890 }, { "epoch": 0.8013854460253319, "grad_norm": 0.3075064420700073, "learning_rate": 9.965748623378107e-05, "loss": 0.2459, "step": 5900 }, { "epoch": 0.8027437264423241, "grad_norm": 0.42981183528900146, "learning_rate": 9.965285369508148e-05, "loss": 0.2523, "step": 5910 }, { "epoch": 0.8041020068593161, "grad_norm": 0.3291740417480469, "learning_rate": 9.964819014777398e-05, "loss": 0.2488, "step": 5920 }, { "epoch": 0.8054602872763081, "grad_norm": 0.6508928537368774, "learning_rate": 9.964349559477097e-05, "loss": 0.2499, "step": 5930 }, { "epoch": 0.8068185676933003, "grad_norm": 0.3323608636856079, "learning_rate": 9.963877003900426e-05, "loss": 0.2342, "step": 5940 }, { "epoch": 0.8081768481102923, "grad_norm": 0.30968499183654785, "learning_rate": 9.963401348342498e-05, "loss": 0.2422, "step": 5950 }, { "epoch": 0.8095351285272845, "grad_norm": 0.3979131579399109, "learning_rate": 9.962922593100363e-05, "loss": 0.2438, "step": 5960 }, { "epoch": 0.8108934089442765, "grad_norm": 0.3904065191745758, "learning_rate": 9.962440738473011e-05, "loss": 0.244, "step": 5970 }, { "epoch": 0.8122516893612687, "grad_norm": 0.2780715227127075, "learning_rate": 9.961955784761361e-05, "loss": 0.2495, "step": 5980 }, { "epoch": 0.8136099697782607, "grad_norm": 0.5840536952018738, "learning_rate": 9.961467732268272e-05, "loss": 0.2517, "step": 5990 }, { "epoch": 0.8149682501952528, "grad_norm": 0.42017292976379395, "learning_rate": 9.960976581298536e-05, "loss": 0.2431, "step": 6000 }, { "epoch": 0.8163265306122449, "grad_norm": 0.6404939293861389, "learning_rate": 9.960482332158882e-05, "loss": 0.2499, "step": 6010 }, { "epoch": 0.817684811029237, "grad_norm": 0.6290649771690369, "learning_rate": 9.959984985157971e-05, "loss": 0.2472, "step": 6020 }, { "epoch": 0.8190430914462291, "grad_norm": 0.7773398756980896, "learning_rate": 9.959484540606401e-05, "loss": 0.2507, "step": 6030 }, { "epoch": 0.8204013718632212, "grad_norm": 0.3622499704360962, "learning_rate": 9.958980998816703e-05, "loss": 0.2494, "step": 6040 }, { "epoch": 0.8217596522802133, "grad_norm": 0.3607257902622223, "learning_rate": 9.958474360103342e-05, "loss": 0.2519, "step": 6050 }, { "epoch": 0.8231179326972053, "grad_norm": 0.5308447480201721, "learning_rate": 9.957964624782721e-05, "loss": 0.2493, "step": 6060 }, { "epoch": 0.8244762131141974, "grad_norm": 0.4861830770969391, "learning_rate": 9.957451793173172e-05, "loss": 0.234, "step": 6070 }, { "epoch": 0.8258344935311895, "grad_norm": 0.31227338314056396, "learning_rate": 9.956935865594963e-05, "loss": 0.2442, "step": 6080 }, { "epoch": 0.8271927739481816, "grad_norm": 0.4732406437397003, "learning_rate": 9.956416842370295e-05, "loss": 0.2552, "step": 6090 }, { "epoch": 0.8285510543651737, "grad_norm": 0.29500365257263184, "learning_rate": 9.9558947238233e-05, "loss": 0.2512, "step": 6100 }, { "epoch": 0.8299093347821658, "grad_norm": 0.3557686507701874, "learning_rate": 9.955369510280049e-05, "loss": 0.2369, "step": 6110 }, { "epoch": 0.8312676151991578, "grad_norm": 0.37719419598579407, "learning_rate": 9.954841202068538e-05, "loss": 0.245, "step": 6120 }, { "epoch": 0.83262589561615, "grad_norm": 0.39485713839530945, "learning_rate": 9.954309799518703e-05, "loss": 0.2511, "step": 6130 }, { "epoch": 0.833984176033142, "grad_norm": 0.308869332075119, "learning_rate": 9.953775302962407e-05, "loss": 0.2477, "step": 6140 }, { "epoch": 0.8353424564501342, "grad_norm": 0.43867987394332886, "learning_rate": 9.95323771273345e-05, "loss": 0.2541, "step": 6150 }, { "epoch": 0.8367007368671262, "grad_norm": 0.3418971002101898, "learning_rate": 9.952697029167559e-05, "loss": 0.2408, "step": 6160 }, { "epoch": 0.8380590172841184, "grad_norm": 0.3127572536468506, "learning_rate": 9.952153252602397e-05, "loss": 0.2497, "step": 6170 }, { "epoch": 0.8394172977011104, "grad_norm": 0.405727356672287, "learning_rate": 9.951606383377555e-05, "loss": 0.2439, "step": 6180 }, { "epoch": 0.8407755781181024, "grad_norm": 0.4104103446006775, "learning_rate": 9.951056421834559e-05, "loss": 0.2409, "step": 6190 }, { "epoch": 0.8421338585350946, "grad_norm": 0.5788498520851135, "learning_rate": 9.950503368316863e-05, "loss": 0.2429, "step": 6200 }, { "epoch": 0.8434921389520866, "grad_norm": 0.47129592299461365, "learning_rate": 9.949947223169856e-05, "loss": 0.2343, "step": 6210 }, { "epoch": 0.8448504193690788, "grad_norm": 0.3980062007904053, "learning_rate": 9.94938798674085e-05, "loss": 0.2532, "step": 6220 }, { "epoch": 0.8462086997860708, "grad_norm": 0.42404109239578247, "learning_rate": 9.948825659379097e-05, "loss": 0.2403, "step": 6230 }, { "epoch": 0.8475669802030629, "grad_norm": 0.8009231090545654, "learning_rate": 9.948260241435774e-05, "loss": 0.2379, "step": 6240 }, { "epoch": 0.848925260620055, "grad_norm": 0.43402308225631714, "learning_rate": 9.947691733263991e-05, "loss": 0.2507, "step": 6250 }, { "epoch": 0.8502835410370471, "grad_norm": 0.3350345492362976, "learning_rate": 9.947120135218781e-05, "loss": 0.2375, "step": 6260 }, { "epoch": 0.8516418214540392, "grad_norm": 0.33181142807006836, "learning_rate": 9.946545447657116e-05, "loss": 0.2289, "step": 6270 }, { "epoch": 0.8530001018710313, "grad_norm": 0.3225500285625458, "learning_rate": 9.94596767093789e-05, "loss": 0.2505, "step": 6280 }, { "epoch": 0.8543583822880234, "grad_norm": 0.3817516565322876, "learning_rate": 9.945386805421932e-05, "loss": 0.2539, "step": 6290 }, { "epoch": 0.8557166627050155, "grad_norm": 0.35765305161476135, "learning_rate": 9.944802851471995e-05, "loss": 0.2478, "step": 6300 }, { "epoch": 0.8570749431220075, "grad_norm": 0.33067193627357483, "learning_rate": 9.944215809452765e-05, "loss": 0.2471, "step": 6310 }, { "epoch": 0.8584332235389996, "grad_norm": 0.4384279251098633, "learning_rate": 9.943625679730853e-05, "loss": 0.2581, "step": 6320 }, { "epoch": 0.8597915039559917, "grad_norm": 0.39662012457847595, "learning_rate": 9.9430324626748e-05, "loss": 0.2426, "step": 6330 }, { "epoch": 0.8611497843729838, "grad_norm": 0.47680652141571045, "learning_rate": 9.942436158655073e-05, "loss": 0.246, "step": 6340 }, { "epoch": 0.8625080647899759, "grad_norm": 0.5390910506248474, "learning_rate": 9.941836768044073e-05, "loss": 0.2424, "step": 6350 }, { "epoch": 0.863866345206968, "grad_norm": 0.47606027126312256, "learning_rate": 9.941234291216121e-05, "loss": 0.2457, "step": 6360 }, { "epoch": 0.8652246256239601, "grad_norm": 0.38781264424324036, "learning_rate": 9.94062872854747e-05, "loss": 0.2472, "step": 6370 }, { "epoch": 0.8665829060409521, "grad_norm": 0.3760601580142975, "learning_rate": 9.940020080416297e-05, "loss": 0.2437, "step": 6380 }, { "epoch": 0.8679411864579443, "grad_norm": 0.4429396986961365, "learning_rate": 9.93940834720271e-05, "loss": 0.2489, "step": 6390 }, { "epoch": 0.8692994668749363, "grad_norm": 0.524994969367981, "learning_rate": 9.93879352928874e-05, "loss": 0.2492, "step": 6400 }, { "epoch": 0.8706577472919285, "grad_norm": 0.433590292930603, "learning_rate": 9.938175627058346e-05, "loss": 0.251, "step": 6410 }, { "epoch": 0.8720160277089205, "grad_norm": 0.3745516240596771, "learning_rate": 9.937554640897413e-05, "loss": 0.2392, "step": 6420 }, { "epoch": 0.8733743081259125, "grad_norm": 1.2245509624481201, "learning_rate": 9.936930571193753e-05, "loss": 0.2398, "step": 6430 }, { "epoch": 0.8747325885429047, "grad_norm": 0.34582260251045227, "learning_rate": 9.936303418337101e-05, "loss": 0.248, "step": 6440 }, { "epoch": 0.8760908689598967, "grad_norm": 0.28406664729118347, "learning_rate": 9.935673182719119e-05, "loss": 0.2443, "step": 6450 }, { "epoch": 0.8774491493768889, "grad_norm": 0.3210006058216095, "learning_rate": 9.935039864733394e-05, "loss": 0.232, "step": 6460 }, { "epoch": 0.8788074297938809, "grad_norm": 0.4010201394557953, "learning_rate": 9.93440346477544e-05, "loss": 0.2535, "step": 6470 }, { "epoch": 0.8801657102108731, "grad_norm": 0.3190806806087494, "learning_rate": 9.933763983242691e-05, "loss": 0.2438, "step": 6480 }, { "epoch": 0.8815239906278651, "grad_norm": 0.3404321074485779, "learning_rate": 9.933121420534511e-05, "loss": 0.2434, "step": 6490 }, { "epoch": 0.8828822710448572, "grad_norm": 0.43759095668792725, "learning_rate": 9.932475777052185e-05, "loss": 0.2429, "step": 6500 }, { "epoch": 0.8842405514618493, "grad_norm": 0.3698059320449829, "learning_rate": 9.931827053198923e-05, "loss": 0.2388, "step": 6510 }, { "epoch": 0.8855988318788414, "grad_norm": 0.31481635570526123, "learning_rate": 9.931175249379855e-05, "loss": 0.2428, "step": 6520 }, { "epoch": 0.8869571122958335, "grad_norm": 0.5177509784698486, "learning_rate": 9.930520366002043e-05, "loss": 0.247, "step": 6530 }, { "epoch": 0.8883153927128256, "grad_norm": 0.42036452889442444, "learning_rate": 9.929862403474464e-05, "loss": 0.2448, "step": 6540 }, { "epoch": 0.8896736731298176, "grad_norm": 0.724575936794281, "learning_rate": 9.92920136220802e-05, "loss": 0.2407, "step": 6550 }, { "epoch": 0.8910319535468098, "grad_norm": 0.3753584027290344, "learning_rate": 9.92853724261554e-05, "loss": 0.2487, "step": 6560 }, { "epoch": 0.8923902339638018, "grad_norm": 0.35200509428977966, "learning_rate": 9.927870045111767e-05, "loss": 0.2433, "step": 6570 }, { "epoch": 0.893748514380794, "grad_norm": 0.5653567314147949, "learning_rate": 9.927199770113376e-05, "loss": 0.2378, "step": 6580 }, { "epoch": 0.895106794797786, "grad_norm": 0.29447272419929504, "learning_rate": 9.926526418038957e-05, "loss": 0.2455, "step": 6590 }, { "epoch": 0.8964650752147781, "grad_norm": 0.4254431426525116, "learning_rate": 9.925849989309024e-05, "loss": 0.2469, "step": 6600 }, { "epoch": 0.8978233556317702, "grad_norm": 0.36052531003952026, "learning_rate": 9.925170484346012e-05, "loss": 0.2505, "step": 6610 }, { "epoch": 0.8991816360487622, "grad_norm": 0.45553654432296753, "learning_rate": 9.924487903574278e-05, "loss": 0.2495, "step": 6620 }, { "epoch": 0.9005399164657544, "grad_norm": 0.4005371630191803, "learning_rate": 9.923802247420098e-05, "loss": 0.2467, "step": 6630 }, { "epoch": 0.9018981968827464, "grad_norm": 0.3477528691291809, "learning_rate": 9.92311351631167e-05, "loss": 0.2454, "step": 6640 }, { "epoch": 0.9032564772997386, "grad_norm": 0.47104912996292114, "learning_rate": 9.922421710679115e-05, "loss": 0.2412, "step": 6650 }, { "epoch": 0.9046147577167306, "grad_norm": 0.5097659230232239, "learning_rate": 9.921726830954468e-05, "loss": 0.2436, "step": 6660 }, { "epoch": 0.9059730381337227, "grad_norm": 0.367531418800354, "learning_rate": 9.921028877571687e-05, "loss": 0.25, "step": 6670 }, { "epoch": 0.9073313185507148, "grad_norm": 0.29040250182151794, "learning_rate": 9.92032785096665e-05, "loss": 0.2427, "step": 6680 }, { "epoch": 0.9086895989677068, "grad_norm": 0.41248467564582825, "learning_rate": 9.919623751577157e-05, "loss": 0.2488, "step": 6690 }, { "epoch": 0.910047879384699, "grad_norm": 0.35198503732681274, "learning_rate": 9.918916579842919e-05, "loss": 0.2442, "step": 6700 }, { "epoch": 0.911406159801691, "grad_norm": 0.5491409301757812, "learning_rate": 9.918206336205571e-05, "loss": 0.2478, "step": 6710 }, { "epoch": 0.9127644402186832, "grad_norm": 0.5477699041366577, "learning_rate": 9.917493021108669e-05, "loss": 0.2446, "step": 6720 }, { "epoch": 0.9141227206356752, "grad_norm": 0.568323016166687, "learning_rate": 9.916776634997683e-05, "loss": 0.2455, "step": 6730 }, { "epoch": 0.9154810010526673, "grad_norm": 0.2822461724281311, "learning_rate": 9.916057178319999e-05, "loss": 0.2385, "step": 6740 }, { "epoch": 0.9168392814696594, "grad_norm": 0.3304057717323303, "learning_rate": 9.915334651524928e-05, "loss": 0.2423, "step": 6750 }, { "epoch": 0.9181975618866515, "grad_norm": 0.44017091393470764, "learning_rate": 9.914609055063691e-05, "loss": 0.246, "step": 6760 }, { "epoch": 0.9195558423036436, "grad_norm": 0.8728335499763489, "learning_rate": 9.91388038938943e-05, "loss": 0.2414, "step": 6770 }, { "epoch": 0.9209141227206357, "grad_norm": 0.41547584533691406, "learning_rate": 9.9131486549572e-05, "loss": 0.2459, "step": 6780 }, { "epoch": 0.9222724031376278, "grad_norm": 0.6903845071792603, "learning_rate": 9.912413852223979e-05, "loss": 0.2474, "step": 6790 }, { "epoch": 0.9236306835546199, "grad_norm": 0.4918045103549957, "learning_rate": 9.911675981648653e-05, "loss": 0.2447, "step": 6800 }, { "epoch": 0.9249889639716119, "grad_norm": 0.564278781414032, "learning_rate": 9.910935043692033e-05, "loss": 0.245, "step": 6810 }, { "epoch": 0.926347244388604, "grad_norm": 0.3200502395629883, "learning_rate": 9.910191038816836e-05, "loss": 0.2427, "step": 6820 }, { "epoch": 0.9277055248055961, "grad_norm": 0.5327267646789551, "learning_rate": 9.909443967487704e-05, "loss": 0.2443, "step": 6830 }, { "epoch": 0.9290638052225882, "grad_norm": 0.3043926954269409, "learning_rate": 9.908693830171183e-05, "loss": 0.2547, "step": 6840 }, { "epoch": 0.9304220856395803, "grad_norm": 0.3255450427532196, "learning_rate": 9.907940627335745e-05, "loss": 0.2336, "step": 6850 }, { "epoch": 0.9317803660565723, "grad_norm": 0.6329029202461243, "learning_rate": 9.907184359451769e-05, "loss": 0.259, "step": 6860 }, { "epoch": 0.9331386464735645, "grad_norm": 0.4716109037399292, "learning_rate": 9.906425026991551e-05, "loss": 0.2392, "step": 6870 }, { "epoch": 0.9344969268905565, "grad_norm": 0.4078250527381897, "learning_rate": 9.9056626304293e-05, "loss": 0.249, "step": 6880 }, { "epoch": 0.9358552073075487, "grad_norm": 0.38135090470314026, "learning_rate": 9.904897170241138e-05, "loss": 0.2496, "step": 6890 }, { "epoch": 0.9372134877245407, "grad_norm": 0.39853060245513916, "learning_rate": 9.904128646905103e-05, "loss": 0.2473, "step": 6900 }, { "epoch": 0.9385717681415329, "grad_norm": 0.5313524007797241, "learning_rate": 9.903357060901141e-05, "loss": 0.2464, "step": 6910 }, { "epoch": 0.9399300485585249, "grad_norm": 0.8978538513183594, "learning_rate": 9.90258241271112e-05, "loss": 0.2357, "step": 6920 }, { "epoch": 0.941288328975517, "grad_norm": 0.36447224020957947, "learning_rate": 9.901804702818805e-05, "loss": 0.2517, "step": 6930 }, { "epoch": 0.9426466093925091, "grad_norm": 0.2911955714225769, "learning_rate": 9.901023931709888e-05, "loss": 0.2456, "step": 6940 }, { "epoch": 0.9440048898095011, "grad_norm": 0.48454591631889343, "learning_rate": 9.900240099871967e-05, "loss": 0.2476, "step": 6950 }, { "epoch": 0.9453631702264933, "grad_norm": 0.3345385491847992, "learning_rate": 9.899453207794547e-05, "loss": 0.2413, "step": 6960 }, { "epoch": 0.9467214506434853, "grad_norm": 2.1616320610046387, "learning_rate": 9.898663255969054e-05, "loss": 0.2442, "step": 6970 }, { "epoch": 0.9480797310604774, "grad_norm": 0.29099032282829285, "learning_rate": 9.897870244888816e-05, "loss": 0.2528, "step": 6980 }, { "epoch": 0.9494380114774695, "grad_norm": 0.3078690469264984, "learning_rate": 9.897074175049076e-05, "loss": 0.2429, "step": 6990 }, { "epoch": 0.9507962918944616, "grad_norm": 0.35784491896629333, "learning_rate": 9.896275046946984e-05, "loss": 0.2524, "step": 7000 }, { "epoch": 0.9521545723114537, "grad_norm": 0.39735087752342224, "learning_rate": 9.895472861081605e-05, "loss": 0.2427, "step": 7010 }, { "epoch": 0.9535128527284458, "grad_norm": 0.4260939955711365, "learning_rate": 9.894667617953908e-05, "loss": 0.2564, "step": 7020 }, { "epoch": 0.9548711331454379, "grad_norm": 0.30280131101608276, "learning_rate": 9.893859318066776e-05, "loss": 0.2481, "step": 7030 }, { "epoch": 0.95622941356243, "grad_norm": 0.3902614414691925, "learning_rate": 9.893047961924996e-05, "loss": 0.2451, "step": 7040 }, { "epoch": 0.957587693979422, "grad_norm": 0.2885327935218811, "learning_rate": 9.892233550035269e-05, "loss": 0.2506, "step": 7050 }, { "epoch": 0.9589459743964142, "grad_norm": 0.7783188223838806, "learning_rate": 9.8914160829062e-05, "loss": 0.235, "step": 7060 }, { "epoch": 0.9603042548134062, "grad_norm": 0.3145168125629425, "learning_rate": 9.890595561048305e-05, "loss": 0.2559, "step": 7070 }, { "epoch": 0.9616625352303984, "grad_norm": 0.3989132344722748, "learning_rate": 9.889771984974008e-05, "loss": 0.2388, "step": 7080 }, { "epoch": 0.9630208156473904, "grad_norm": 0.43889036774635315, "learning_rate": 9.888945355197637e-05, "loss": 0.2459, "step": 7090 }, { "epoch": 0.9643790960643825, "grad_norm": 0.28526127338409424, "learning_rate": 9.888115672235427e-05, "loss": 0.2503, "step": 7100 }, { "epoch": 0.9657373764813746, "grad_norm": 0.3514707684516907, "learning_rate": 9.887282936605526e-05, "loss": 0.2369, "step": 7110 }, { "epoch": 0.9670956568983666, "grad_norm": 0.42194613814353943, "learning_rate": 9.88644714882798e-05, "loss": 0.2504, "step": 7120 }, { "epoch": 0.9684539373153588, "grad_norm": 0.30878156423568726, "learning_rate": 9.885608309424748e-05, "loss": 0.2503, "step": 7130 }, { "epoch": 0.9698122177323508, "grad_norm": 0.3170742094516754, "learning_rate": 9.884766418919693e-05, "loss": 0.2482, "step": 7140 }, { "epoch": 0.971170498149343, "grad_norm": 0.3786158263683319, "learning_rate": 9.883921477838578e-05, "loss": 0.2415, "step": 7150 }, { "epoch": 0.972528778566335, "grad_norm": 1.7608978748321533, "learning_rate": 9.883073486709078e-05, "loss": 0.2588, "step": 7160 }, { "epoch": 0.9738870589833271, "grad_norm": 0.38172513246536255, "learning_rate": 9.882222446060772e-05, "loss": 0.2507, "step": 7170 }, { "epoch": 0.9752453394003192, "grad_norm": 0.4118957221508026, "learning_rate": 9.881368356425139e-05, "loss": 0.2453, "step": 7180 }, { "epoch": 0.9766036198173113, "grad_norm": 0.3225587010383606, "learning_rate": 9.880511218335566e-05, "loss": 0.246, "step": 7190 }, { "epoch": 0.9779619002343034, "grad_norm": 0.3492511510848999, "learning_rate": 9.879651032327343e-05, "loss": 0.2473, "step": 7200 }, { "epoch": 0.9793201806512954, "grad_norm": 0.2936345934867859, "learning_rate": 9.878787798937664e-05, "loss": 0.2387, "step": 7210 }, { "epoch": 0.9806784610682876, "grad_norm": 0.4257153272628784, "learning_rate": 9.877921518705623e-05, "loss": 0.245, "step": 7220 }, { "epoch": 0.9820367414852796, "grad_norm": 0.5120747685432434, "learning_rate": 9.877052192172221e-05, "loss": 0.2403, "step": 7230 }, { "epoch": 0.9833950219022717, "grad_norm": 0.4968951642513275, "learning_rate": 9.876179819880356e-05, "loss": 0.2489, "step": 7240 }, { "epoch": 0.9847533023192638, "grad_norm": 0.4866449236869812, "learning_rate": 9.875304402374836e-05, "loss": 0.2484, "step": 7250 }, { "epoch": 0.9861115827362559, "grad_norm": 0.36357176303863525, "learning_rate": 9.874425940202363e-05, "loss": 0.2491, "step": 7260 }, { "epoch": 0.987469863153248, "grad_norm": 0.3697587251663208, "learning_rate": 9.873544433911545e-05, "loss": 0.2499, "step": 7270 }, { "epoch": 0.9888281435702401, "grad_norm": 0.2926376163959503, "learning_rate": 9.872659884052891e-05, "loss": 0.2467, "step": 7280 }, { "epoch": 0.9901864239872321, "grad_norm": 0.6101000905036926, "learning_rate": 9.871772291178809e-05, "loss": 0.2473, "step": 7290 }, { "epoch": 0.9915447044042243, "grad_norm": 0.467873752117157, "learning_rate": 9.870881655843605e-05, "loss": 0.2509, "step": 7300 }, { "epoch": 0.9929029848212163, "grad_norm": 0.3085642457008362, "learning_rate": 9.86998797860349e-05, "loss": 0.2473, "step": 7310 }, { "epoch": 0.9942612652382085, "grad_norm": 0.3369590938091278, "learning_rate": 9.869091260016574e-05, "loss": 0.2414, "step": 7320 }, { "epoch": 0.9956195456552005, "grad_norm": 0.3773331046104431, "learning_rate": 9.868191500642864e-05, "loss": 0.2441, "step": 7330 }, { "epoch": 0.9969778260721927, "grad_norm": 0.3425379991531372, "learning_rate": 9.867288701044267e-05, "loss": 0.2585, "step": 7340 }, { "epoch": 0.9983361064891847, "grad_norm": 0.6044887900352478, "learning_rate": 9.866382861784589e-05, "loss": 0.2494, "step": 7350 }, { "epoch": 0.9996943869061767, "grad_norm": 0.8698163032531738, "learning_rate": 9.865473983429532e-05, "loss": 0.2525, "step": 7360 }, { "epoch": 1.0010526673231688, "grad_norm": 0.6376912593841553, "learning_rate": 9.864562066546698e-05, "loss": 0.2319, "step": 7370 }, { "epoch": 1.002410947740161, "grad_norm": 0.4927957355976105, "learning_rate": 9.863647111705591e-05, "loss": 0.2428, "step": 7380 }, { "epoch": 1.003769228157153, "grad_norm": 0.48219847679138184, "learning_rate": 9.862729119477603e-05, "loss": 0.2394, "step": 7390 }, { "epoch": 1.0051275085741451, "grad_norm": 0.3745911717414856, "learning_rate": 9.861808090436029e-05, "loss": 0.2357, "step": 7400 }, { "epoch": 1.0064857889911372, "grad_norm": 0.2563064396381378, "learning_rate": 9.860884025156058e-05, "loss": 0.2357, "step": 7410 }, { "epoch": 1.0078440694081292, "grad_norm": 0.42804840207099915, "learning_rate": 9.859956924214777e-05, "loss": 0.239, "step": 7420 }, { "epoch": 1.0092023498251215, "grad_norm": 0.37420523166656494, "learning_rate": 9.85902678819117e-05, "loss": 0.2444, "step": 7430 }, { "epoch": 1.0105606302421135, "grad_norm": 0.36284732818603516, "learning_rate": 9.858093617666112e-05, "loss": 0.24, "step": 7440 }, { "epoch": 1.0119189106591056, "grad_norm": 0.4046810269355774, "learning_rate": 9.857157413222375e-05, "loss": 0.2427, "step": 7450 }, { "epoch": 1.0132771910760976, "grad_norm": 0.36761555075645447, "learning_rate": 9.856218175444629e-05, "loss": 0.2306, "step": 7460 }, { "epoch": 1.0146354714930899, "grad_norm": 0.3915986716747284, "learning_rate": 9.855275904919435e-05, "loss": 0.2392, "step": 7470 }, { "epoch": 1.015993751910082, "grad_norm": 0.9772692918777466, "learning_rate": 9.854330602235246e-05, "loss": 0.2355, "step": 7480 }, { "epoch": 1.017352032327074, "grad_norm": 0.41798871755599976, "learning_rate": 9.853382267982413e-05, "loss": 0.3275, "step": 7490 }, { "epoch": 1.018710312744066, "grad_norm": 0.4862975776195526, "learning_rate": 9.852430902753178e-05, "loss": 0.2502, "step": 7500 }, { "epoch": 1.020068593161058, "grad_norm": 0.3766954839229584, "learning_rate": 9.851476507141677e-05, "loss": 0.245, "step": 7510 }, { "epoch": 1.0214268735780503, "grad_norm": 0.7479773759841919, "learning_rate": 9.850519081743936e-05, "loss": 0.2403, "step": 7520 }, { "epoch": 1.0227851539950423, "grad_norm": 0.47081416845321655, "learning_rate": 9.849558627157878e-05, "loss": 0.2532, "step": 7530 }, { "epoch": 1.0241434344120344, "grad_norm": 1.3125473260879517, "learning_rate": 9.848595143983312e-05, "loss": 0.2411, "step": 7540 }, { "epoch": 1.0255017148290264, "grad_norm": 0.5578804612159729, "learning_rate": 9.847628632821942e-05, "loss": 0.2808, "step": 7550 }, { "epoch": 1.0268599952460185, "grad_norm": 0.32416287064552307, "learning_rate": 9.84665909427736e-05, "loss": 0.2414, "step": 7560 }, { "epoch": 1.0282182756630107, "grad_norm": 0.7437952160835266, "learning_rate": 9.845686528955054e-05, "loss": 0.2804, "step": 7570 }, { "epoch": 1.0295765560800028, "grad_norm": 0.45193883776664734, "learning_rate": 9.844710937462396e-05, "loss": 0.2755, "step": 7580 }, { "epoch": 1.0309348364969948, "grad_norm": 0.4266040623188019, "learning_rate": 9.843732320408653e-05, "loss": 0.2363, "step": 7590 }, { "epoch": 1.0322931169139868, "grad_norm": 0.39513319730758667, "learning_rate": 9.842750678404977e-05, "loss": 0.2381, "step": 7600 }, { "epoch": 1.0336513973309789, "grad_norm": 0.41839197278022766, "learning_rate": 9.841766012064414e-05, "loss": 0.2375, "step": 7610 }, { "epoch": 1.0350096777479711, "grad_norm": 0.7193804979324341, "learning_rate": 9.840778322001896e-05, "loss": 0.2351, "step": 7620 }, { "epoch": 1.0363679581649632, "grad_norm": 0.3606083393096924, "learning_rate": 9.83978760883424e-05, "loss": 0.2398, "step": 7630 }, { "epoch": 1.0377262385819552, "grad_norm": 0.5342528820037842, "learning_rate": 9.838793873180157e-05, "loss": 0.2314, "step": 7640 }, { "epoch": 1.0390845189989473, "grad_norm": 0.4345169961452484, "learning_rate": 9.837797115660244e-05, "loss": 0.2418, "step": 7650 }, { "epoch": 1.0404427994159393, "grad_norm": 0.28131407499313354, "learning_rate": 9.83679733689698e-05, "loss": 0.2401, "step": 7660 }, { "epoch": 1.0418010798329316, "grad_norm": 0.41502001881599426, "learning_rate": 9.835794537514738e-05, "loss": 0.2355, "step": 7670 }, { "epoch": 1.0431593602499236, "grad_norm": 0.419588178396225, "learning_rate": 9.834788718139776e-05, "loss": 0.236, "step": 7680 }, { "epoch": 1.0445176406669157, "grad_norm": 0.32255053520202637, "learning_rate": 9.833779879400233e-05, "loss": 0.2412, "step": 7690 }, { "epoch": 1.0458759210839077, "grad_norm": 0.26190540194511414, "learning_rate": 9.832768021926138e-05, "loss": 0.2419, "step": 7700 }, { "epoch": 1.0472342015009, "grad_norm": 0.6941227912902832, "learning_rate": 9.831753146349402e-05, "loss": 0.2296, "step": 7710 }, { "epoch": 1.048592481917892, "grad_norm": 0.35642021894454956, "learning_rate": 9.830735253303828e-05, "loss": 0.2299, "step": 7720 }, { "epoch": 1.049950762334884, "grad_norm": 0.3313494324684143, "learning_rate": 9.829714343425094e-05, "loss": 0.2208, "step": 7730 }, { "epoch": 1.051309042751876, "grad_norm": 0.3570537567138672, "learning_rate": 9.82869041735077e-05, "loss": 0.231, "step": 7740 }, { "epoch": 1.0526673231688681, "grad_norm": 0.3344622850418091, "learning_rate": 9.827663475720302e-05, "loss": 0.2424, "step": 7750 }, { "epoch": 1.0540256035858604, "grad_norm": 0.3999479115009308, "learning_rate": 9.826633519175028e-05, "loss": 0.2317, "step": 7760 }, { "epoch": 1.0553838840028524, "grad_norm": 0.3402807116508484, "learning_rate": 9.82560054835816e-05, "loss": 0.2422, "step": 7770 }, { "epoch": 1.0567421644198445, "grad_norm": 0.2754310369491577, "learning_rate": 9.824564563914797e-05, "loss": 0.2352, "step": 7780 }, { "epoch": 1.0581004448368365, "grad_norm": 0.4087323546409607, "learning_rate": 9.823525566491922e-05, "loss": 0.2373, "step": 7790 }, { "epoch": 1.0594587252538286, "grad_norm": 0.4814484119415283, "learning_rate": 9.822483556738397e-05, "loss": 0.2331, "step": 7800 }, { "epoch": 1.0608170056708208, "grad_norm": 0.4177996814250946, "learning_rate": 9.821438535304966e-05, "loss": 0.2358, "step": 7810 }, { "epoch": 1.0621752860878129, "grad_norm": 0.31193283200263977, "learning_rate": 9.82039050284425e-05, "loss": 0.2439, "step": 7820 }, { "epoch": 1.063533566504805, "grad_norm": 0.3762202858924866, "learning_rate": 9.819339460010755e-05, "loss": 0.234, "step": 7830 }, { "epoch": 1.064891846921797, "grad_norm": 0.3268336057662964, "learning_rate": 9.818285407460867e-05, "loss": 0.2381, "step": 7840 }, { "epoch": 1.066250127338789, "grad_norm": 0.4578656554222107, "learning_rate": 9.817228345852851e-05, "loss": 0.237, "step": 7850 }, { "epoch": 1.0676084077557813, "grad_norm": 0.49291202425956726, "learning_rate": 9.81616827584685e-05, "loss": 0.2298, "step": 7860 }, { "epoch": 1.0689666881727733, "grad_norm": 0.3192181885242462, "learning_rate": 9.815105198104885e-05, "loss": 0.2295, "step": 7870 }, { "epoch": 1.0703249685897653, "grad_norm": 0.3418056070804596, "learning_rate": 9.814039113290857e-05, "loss": 0.2304, "step": 7880 }, { "epoch": 1.0716832490067574, "grad_norm": 1.175872802734375, "learning_rate": 9.812970022070544e-05, "loss": 0.2354, "step": 7890 }, { "epoch": 1.0730415294237496, "grad_norm": 0.39178434014320374, "learning_rate": 9.811897925111604e-05, "loss": 0.2344, "step": 7900 }, { "epoch": 1.0743998098407417, "grad_norm": 0.4785352051258087, "learning_rate": 9.810822823083568e-05, "loss": 0.2379, "step": 7910 }, { "epoch": 1.0757580902577337, "grad_norm": 0.3936551809310913, "learning_rate": 9.809744716657848e-05, "loss": 0.2375, "step": 7920 }, { "epoch": 1.0771163706747258, "grad_norm": 0.37845101952552795, "learning_rate": 9.808663606507729e-05, "loss": 0.2491, "step": 7930 }, { "epoch": 1.0784746510917178, "grad_norm": 0.46890538930892944, "learning_rate": 9.807579493308373e-05, "loss": 0.2354, "step": 7940 }, { "epoch": 1.07983293150871, "grad_norm": 0.3032284080982208, "learning_rate": 9.806492377736817e-05, "loss": 0.2432, "step": 7950 }, { "epoch": 1.0811912119257021, "grad_norm": 0.35670164227485657, "learning_rate": 9.805402260471977e-05, "loss": 0.2434, "step": 7960 }, { "epoch": 1.0825494923426942, "grad_norm": 0.3077481687068939, "learning_rate": 9.804309142194634e-05, "loss": 0.2308, "step": 7970 }, { "epoch": 1.0839077727596862, "grad_norm": 0.31387320160865784, "learning_rate": 9.803213023587453e-05, "loss": 0.2513, "step": 7980 }, { "epoch": 1.0852660531766782, "grad_norm": 0.44298121333122253, "learning_rate": 9.802113905334969e-05, "loss": 0.24, "step": 7990 }, { "epoch": 1.0866243335936705, "grad_norm": 0.48027119040489197, "learning_rate": 9.801011788123589e-05, "loss": 0.2376, "step": 8000 }, { "epoch": 1.0879826140106625, "grad_norm": 0.6710615158081055, "learning_rate": 9.799906672641595e-05, "loss": 0.2383, "step": 8010 }, { "epoch": 1.0893408944276546, "grad_norm": 0.33985400199890137, "learning_rate": 9.798798559579139e-05, "loss": 0.2458, "step": 8020 }, { "epoch": 1.0906991748446466, "grad_norm": 0.4286821782588959, "learning_rate": 9.797687449628247e-05, "loss": 0.2295, "step": 8030 }, { "epoch": 1.0920574552616387, "grad_norm": 0.4144464433193207, "learning_rate": 9.796573343482819e-05, "loss": 0.2371, "step": 8040 }, { "epoch": 1.093415735678631, "grad_norm": 0.36185285449028015, "learning_rate": 9.79545624183862e-05, "loss": 0.2277, "step": 8050 }, { "epoch": 1.094774016095623, "grad_norm": 0.439866840839386, "learning_rate": 9.79433614539329e-05, "loss": 0.239, "step": 8060 }, { "epoch": 1.096132296512615, "grad_norm": 0.46581074595451355, "learning_rate": 9.793213054846338e-05, "loss": 0.2257, "step": 8070 }, { "epoch": 1.097490576929607, "grad_norm": 0.3567286431789398, "learning_rate": 9.792086970899144e-05, "loss": 0.2468, "step": 8080 }, { "epoch": 1.0988488573465993, "grad_norm": 0.3623661994934082, "learning_rate": 9.790957894254957e-05, "loss": 0.2316, "step": 8090 }, { "epoch": 1.1002071377635914, "grad_norm": 0.2945813238620758, "learning_rate": 9.789825825618891e-05, "loss": 0.2383, "step": 8100 }, { "epoch": 1.1015654181805834, "grad_norm": 0.334659218788147, "learning_rate": 9.788690765697937e-05, "loss": 0.2398, "step": 8110 }, { "epoch": 1.1029236985975754, "grad_norm": 0.49678102135658264, "learning_rate": 9.787552715200944e-05, "loss": 0.2304, "step": 8120 }, { "epoch": 1.1042819790145675, "grad_norm": 0.5790162682533264, "learning_rate": 9.786411674838639e-05, "loss": 0.2391, "step": 8130 }, { "epoch": 1.1056402594315597, "grad_norm": 0.4023284912109375, "learning_rate": 9.785267645323605e-05, "loss": 0.2367, "step": 8140 }, { "epoch": 1.1069985398485518, "grad_norm": 0.35249799489974976, "learning_rate": 9.784120627370302e-05, "loss": 0.2403, "step": 8150 }, { "epoch": 1.1083568202655438, "grad_norm": 0.4567525088787079, "learning_rate": 9.782970621695051e-05, "loss": 0.2318, "step": 8160 }, { "epoch": 1.1097151006825359, "grad_norm": 0.36693963408470154, "learning_rate": 9.78181762901604e-05, "loss": 0.2357, "step": 8170 }, { "epoch": 1.111073381099528, "grad_norm": 0.33589598536491394, "learning_rate": 9.780661650053322e-05, "loss": 0.2403, "step": 8180 }, { "epoch": 1.1124316615165202, "grad_norm": 0.35659483075141907, "learning_rate": 9.779502685528814e-05, "loss": 0.237, "step": 8190 }, { "epoch": 1.1137899419335122, "grad_norm": 0.3584313988685608, "learning_rate": 9.7783407361663e-05, "loss": 0.2284, "step": 8200 }, { "epoch": 1.1151482223505043, "grad_norm": 0.5540043115615845, "learning_rate": 9.777175802691428e-05, "loss": 0.2323, "step": 8210 }, { "epoch": 1.1165065027674963, "grad_norm": 0.9529482126235962, "learning_rate": 9.776007885831705e-05, "loss": 0.2248, "step": 8220 }, { "epoch": 1.1178647831844883, "grad_norm": 0.5270287394523621, "learning_rate": 9.774836986316507e-05, "loss": 0.2349, "step": 8230 }, { "epoch": 1.1192230636014806, "grad_norm": 0.45280736684799194, "learning_rate": 9.77366310487707e-05, "loss": 0.2295, "step": 8240 }, { "epoch": 1.1205813440184726, "grad_norm": 0.4259120523929596, "learning_rate": 9.772486242246492e-05, "loss": 0.2314, "step": 8250 }, { "epoch": 1.1219396244354647, "grad_norm": 0.3228849172592163, "learning_rate": 9.771306399159732e-05, "loss": 0.2317, "step": 8260 }, { "epoch": 1.1232979048524567, "grad_norm": 0.3239177465438843, "learning_rate": 9.770123576353614e-05, "loss": 0.2364, "step": 8270 }, { "epoch": 1.124656185269449, "grad_norm": 0.801771879196167, "learning_rate": 9.768937774566818e-05, "loss": 0.2335, "step": 8280 }, { "epoch": 1.126014465686441, "grad_norm": 0.32234644889831543, "learning_rate": 9.767748994539889e-05, "loss": 0.2353, "step": 8290 }, { "epoch": 1.127372746103433, "grad_norm": 0.3816761374473572, "learning_rate": 9.766557237015229e-05, "loss": 0.2315, "step": 8300 }, { "epoch": 1.1287310265204251, "grad_norm": 0.2968044579029083, "learning_rate": 9.765362502737097e-05, "loss": 0.239, "step": 8310 }, { "epoch": 1.1300893069374172, "grad_norm": 0.3127933740615845, "learning_rate": 9.76416479245162e-05, "loss": 0.2291, "step": 8320 }, { "epoch": 1.1314475873544092, "grad_norm": 0.4167024493217468, "learning_rate": 9.762964106906774e-05, "loss": 0.229, "step": 8330 }, { "epoch": 1.1328058677714015, "grad_norm": 1.2388274669647217, "learning_rate": 9.761760446852397e-05, "loss": 0.2376, "step": 8340 }, { "epoch": 1.1341641481883935, "grad_norm": 0.4435670077800751, "learning_rate": 9.760553813040185e-05, "loss": 0.2325, "step": 8350 }, { "epoch": 1.1355224286053855, "grad_norm": 0.35425737500190735, "learning_rate": 9.759344206223691e-05, "loss": 0.2331, "step": 8360 }, { "epoch": 1.1368807090223776, "grad_norm": 0.2853676378726959, "learning_rate": 9.758131627158325e-05, "loss": 0.2281, "step": 8370 }, { "epoch": 1.1382389894393699, "grad_norm": 0.5125002861022949, "learning_rate": 9.756916076601349e-05, "loss": 0.2306, "step": 8380 }, { "epoch": 1.139597269856362, "grad_norm": 0.4584089517593384, "learning_rate": 9.755697555311886e-05, "loss": 0.2321, "step": 8390 }, { "epoch": 1.140955550273354, "grad_norm": 0.4530831277370453, "learning_rate": 9.754476064050914e-05, "loss": 0.2304, "step": 8400 }, { "epoch": 1.142313830690346, "grad_norm": 0.42023783922195435, "learning_rate": 9.753251603581263e-05, "loss": 0.2319, "step": 8410 }, { "epoch": 1.143672111107338, "grad_norm": 0.352321058511734, "learning_rate": 9.752024174667617e-05, "loss": 0.2338, "step": 8420 }, { "epoch": 1.1450303915243303, "grad_norm": 0.40681594610214233, "learning_rate": 9.750793778076519e-05, "loss": 0.2329, "step": 8430 }, { "epoch": 1.1463886719413223, "grad_norm": 0.3396472632884979, "learning_rate": 9.749560414576357e-05, "loss": 0.2449, "step": 8440 }, { "epoch": 1.1477469523583144, "grad_norm": 0.3892918825149536, "learning_rate": 9.74832408493738e-05, "loss": 0.231, "step": 8450 }, { "epoch": 1.1491052327753064, "grad_norm": 0.35944390296936035, "learning_rate": 9.747084789931684e-05, "loss": 0.2286, "step": 8460 }, { "epoch": 1.1504635131922987, "grad_norm": 0.32335415482521057, "learning_rate": 9.74584253033322e-05, "loss": 0.23, "step": 8470 }, { "epoch": 1.1518217936092907, "grad_norm": 0.6031498312950134, "learning_rate": 9.744597306917786e-05, "loss": 0.2374, "step": 8480 }, { "epoch": 1.1531800740262828, "grad_norm": 0.3470638692378998, "learning_rate": 9.743349120463038e-05, "loss": 0.2393, "step": 8490 }, { "epoch": 1.1545383544432748, "grad_norm": 0.35518866777420044, "learning_rate": 9.742097971748478e-05, "loss": 0.2321, "step": 8500 }, { "epoch": 1.1558966348602668, "grad_norm": 0.6447528600692749, "learning_rate": 9.740843861555455e-05, "loss": 0.2396, "step": 8510 }, { "epoch": 1.1572549152772589, "grad_norm": 0.33227604627609253, "learning_rate": 9.739586790667174e-05, "loss": 0.2382, "step": 8520 }, { "epoch": 1.1586131956942511, "grad_norm": 0.5345719456672668, "learning_rate": 9.738326759868687e-05, "loss": 0.2366, "step": 8530 }, { "epoch": 1.1599714761112432, "grad_norm": 0.316815584897995, "learning_rate": 9.73706376994689e-05, "loss": 0.2402, "step": 8540 }, { "epoch": 1.1613297565282352, "grad_norm": 0.27835845947265625, "learning_rate": 9.735797821690533e-05, "loss": 0.2392, "step": 8550 }, { "epoch": 1.1626880369452273, "grad_norm": 0.36102980375289917, "learning_rate": 9.73452891589021e-05, "loss": 0.2276, "step": 8560 }, { "epoch": 1.1640463173622195, "grad_norm": 0.3106635808944702, "learning_rate": 9.733257053338362e-05, "loss": 0.2333, "step": 8570 }, { "epoch": 1.1654045977792116, "grad_norm": 0.7032580971717834, "learning_rate": 9.73198223482928e-05, "loss": 0.233, "step": 8580 }, { "epoch": 1.1667628781962036, "grad_norm": 0.4163115918636322, "learning_rate": 9.730704461159095e-05, "loss": 0.233, "step": 8590 }, { "epoch": 1.1681211586131957, "grad_norm": 0.2902928590774536, "learning_rate": 9.729423733125788e-05, "loss": 0.2494, "step": 8600 }, { "epoch": 1.1694794390301877, "grad_norm": 0.6888032555580139, "learning_rate": 9.728140051529186e-05, "loss": 0.2308, "step": 8610 }, { "epoch": 1.17083771944718, "grad_norm": 0.4462730884552002, "learning_rate": 9.726853417170958e-05, "loss": 0.2329, "step": 8620 }, { "epoch": 1.172195999864172, "grad_norm": 0.362491250038147, "learning_rate": 9.725563830854616e-05, "loss": 0.2301, "step": 8630 }, { "epoch": 1.173554280281164, "grad_norm": 0.4402921199798584, "learning_rate": 9.724271293385518e-05, "loss": 0.2324, "step": 8640 }, { "epoch": 1.174912560698156, "grad_norm": 0.5608346462249756, "learning_rate": 9.722975805570865e-05, "loss": 0.2235, "step": 8650 }, { "epoch": 1.1762708411151483, "grad_norm": 0.29627472162246704, "learning_rate": 9.721677368219697e-05, "loss": 0.2304, "step": 8660 }, { "epoch": 1.1776291215321404, "grad_norm": 0.38079744577407837, "learning_rate": 9.7203759821429e-05, "loss": 0.2242, "step": 8670 }, { "epoch": 1.1789874019491324, "grad_norm": 0.40428072214126587, "learning_rate": 9.719071648153202e-05, "loss": 0.248, "step": 8680 }, { "epoch": 1.1803456823661245, "grad_norm": 1.389643907546997, "learning_rate": 9.717764367065166e-05, "loss": 0.2401, "step": 8690 }, { "epoch": 1.1817039627831165, "grad_norm": 0.4101589620113373, "learning_rate": 9.716454139695203e-05, "loss": 0.2379, "step": 8700 }, { "epoch": 1.1830622432001086, "grad_norm": 0.36372894048690796, "learning_rate": 9.71514096686156e-05, "loss": 0.2299, "step": 8710 }, { "epoch": 1.1844205236171008, "grad_norm": 0.33140572905540466, "learning_rate": 9.713824849384324e-05, "loss": 0.2369, "step": 8720 }, { "epoch": 1.1857788040340929, "grad_norm": 0.36736559867858887, "learning_rate": 9.712505788085419e-05, "loss": 0.2388, "step": 8730 }, { "epoch": 1.187137084451085, "grad_norm": 0.31755226850509644, "learning_rate": 9.711183783788612e-05, "loss": 0.2372, "step": 8740 }, { "epoch": 1.188495364868077, "grad_norm": 0.3570660650730133, "learning_rate": 9.709858837319504e-05, "loss": 0.2299, "step": 8750 }, { "epoch": 1.1898536452850692, "grad_norm": 0.7559611201286316, "learning_rate": 9.708530949505535e-05, "loss": 0.2295, "step": 8760 }, { "epoch": 1.1912119257020612, "grad_norm": 0.29632654786109924, "learning_rate": 9.707200121175983e-05, "loss": 0.2298, "step": 8770 }, { "epoch": 1.1925702061190533, "grad_norm": 0.47530606389045715, "learning_rate": 9.705866353161959e-05, "loss": 0.2363, "step": 8780 }, { "epoch": 1.1939284865360453, "grad_norm": 0.3248406946659088, "learning_rate": 9.704529646296413e-05, "loss": 0.2382, "step": 8790 }, { "epoch": 1.1952867669530374, "grad_norm": 0.47188740968704224, "learning_rate": 9.703190001414126e-05, "loss": 0.2406, "step": 8800 }, { "epoch": 1.1966450473700296, "grad_norm": 0.5387668013572693, "learning_rate": 9.701847419351723e-05, "loss": 0.2447, "step": 8810 }, { "epoch": 1.1980033277870217, "grad_norm": 0.4194428622722626, "learning_rate": 9.700501900947654e-05, "loss": 0.2389, "step": 8820 }, { "epoch": 1.1993616082040137, "grad_norm": 0.3936431109905243, "learning_rate": 9.699153447042207e-05, "loss": 0.234, "step": 8830 }, { "epoch": 1.2007198886210058, "grad_norm": 0.4762571454048157, "learning_rate": 9.697802058477499e-05, "loss": 0.2418, "step": 8840 }, { "epoch": 1.2020781690379978, "grad_norm": 1.0641995668411255, "learning_rate": 9.696447736097487e-05, "loss": 0.239, "step": 8850 }, { "epoch": 1.20343644945499, "grad_norm": 0.4293621778488159, "learning_rate": 9.695090480747956e-05, "loss": 0.2324, "step": 8860 }, { "epoch": 1.204794729871982, "grad_norm": 0.48767969012260437, "learning_rate": 9.693730293276521e-05, "loss": 0.2325, "step": 8870 }, { "epoch": 1.2061530102889741, "grad_norm": 0.5202492475509644, "learning_rate": 9.692367174532632e-05, "loss": 0.2364, "step": 8880 }, { "epoch": 1.2075112907059662, "grad_norm": 0.4196806848049164, "learning_rate": 9.691001125367566e-05, "loss": 0.2428, "step": 8890 }, { "epoch": 1.2088695711229582, "grad_norm": 0.34737488627433777, "learning_rate": 9.689632146634434e-05, "loss": 0.2276, "step": 8900 }, { "epoch": 1.2102278515399505, "grad_norm": 0.3607257604598999, "learning_rate": 9.68826023918817e-05, "loss": 0.2356, "step": 8910 }, { "epoch": 1.2115861319569425, "grad_norm": 0.4119270443916321, "learning_rate": 9.686885403885546e-05, "loss": 0.2463, "step": 8920 }, { "epoch": 1.2129444123739346, "grad_norm": 0.5941003561019897, "learning_rate": 9.685507641585155e-05, "loss": 0.2395, "step": 8930 }, { "epoch": 1.2143026927909266, "grad_norm": 0.5818313956260681, "learning_rate": 9.684126953147421e-05, "loss": 0.2369, "step": 8940 }, { "epoch": 1.2156609732079189, "grad_norm": 0.39362215995788574, "learning_rate": 9.682743339434595e-05, "loss": 0.2458, "step": 8950 }, { "epoch": 1.217019253624911, "grad_norm": 0.37926438450813293, "learning_rate": 9.681356801310757e-05, "loss": 0.2266, "step": 8960 }, { "epoch": 1.218377534041903, "grad_norm": 0.39186081290245056, "learning_rate": 9.679967339641809e-05, "loss": 0.2415, "step": 8970 }, { "epoch": 1.219735814458895, "grad_norm": 0.4335801899433136, "learning_rate": 9.678574955295481e-05, "loss": 0.2398, "step": 8980 }, { "epoch": 1.221094094875887, "grad_norm": 0.5193931460380554, "learning_rate": 9.67717964914133e-05, "loss": 0.2347, "step": 8990 }, { "epoch": 1.2224523752928793, "grad_norm": 0.41706201434135437, "learning_rate": 9.675781422050734e-05, "loss": 0.2351, "step": 9000 }, { "epoch": 1.2238106557098714, "grad_norm": 0.5409320592880249, "learning_rate": 9.674380274896899e-05, "loss": 0.2317, "step": 9010 }, { "epoch": 1.2251689361268634, "grad_norm": 0.5755084753036499, "learning_rate": 9.67297620855485e-05, "loss": 0.2429, "step": 9020 }, { "epoch": 1.2265272165438554, "grad_norm": 0.3601730465888977, "learning_rate": 9.671569223901441e-05, "loss": 0.245, "step": 9030 }, { "epoch": 1.2278854969608475, "grad_norm": 0.4144686162471771, "learning_rate": 9.670159321815343e-05, "loss": 0.2309, "step": 9040 }, { "epoch": 1.2292437773778397, "grad_norm": 0.30298352241516113, "learning_rate": 9.668746503177053e-05, "loss": 0.2376, "step": 9050 }, { "epoch": 1.2306020577948318, "grad_norm": 0.4292883574962616, "learning_rate": 9.667330768868885e-05, "loss": 0.2373, "step": 9060 }, { "epoch": 1.2319603382118238, "grad_norm": 0.5007824897766113, "learning_rate": 9.665912119774979e-05, "loss": 0.226, "step": 9070 }, { "epoch": 1.2333186186288159, "grad_norm": 0.36007213592529297, "learning_rate": 9.664490556781292e-05, "loss": 0.2319, "step": 9080 }, { "epoch": 1.234676899045808, "grad_norm": 0.33118975162506104, "learning_rate": 9.663066080775601e-05, "loss": 0.2452, "step": 9090 }, { "epoch": 1.2360351794628002, "grad_norm": 0.38804057240486145, "learning_rate": 9.661638692647503e-05, "loss": 0.2464, "step": 9100 }, { "epoch": 1.2373934598797922, "grad_norm": 0.3851204514503479, "learning_rate": 9.660208393288414e-05, "loss": 0.2353, "step": 9110 }, { "epoch": 1.2387517402967843, "grad_norm": 0.34776198863983154, "learning_rate": 9.658775183591568e-05, "loss": 0.2354, "step": 9120 }, { "epoch": 1.2401100207137763, "grad_norm": 0.3994421660900116, "learning_rate": 9.657339064452016e-05, "loss": 0.2411, "step": 9130 }, { "epoch": 1.2414683011307686, "grad_norm": 0.3774348199367523, "learning_rate": 9.655900036766623e-05, "loss": 0.2378, "step": 9140 }, { "epoch": 1.2428265815477606, "grad_norm": 1.112257480621338, "learning_rate": 9.654458101434077e-05, "loss": 0.2397, "step": 9150 }, { "epoch": 1.2441848619647526, "grad_norm": 0.3896957039833069, "learning_rate": 9.653013259354876e-05, "loss": 0.2369, "step": 9160 }, { "epoch": 1.2455431423817447, "grad_norm": 0.3395043611526489, "learning_rate": 9.651565511431336e-05, "loss": 0.2283, "step": 9170 }, { "epoch": 1.2469014227987367, "grad_norm": 0.34692245721817017, "learning_rate": 9.650114858567588e-05, "loss": 0.2389, "step": 9180 }, { "epoch": 1.2482597032157288, "grad_norm": 0.3633946478366852, "learning_rate": 9.648661301669577e-05, "loss": 0.2357, "step": 9190 }, { "epoch": 1.249617983632721, "grad_norm": 0.4311239719390869, "learning_rate": 9.647204841645058e-05, "loss": 0.2364, "step": 9200 }, { "epoch": 1.250976264049713, "grad_norm": 0.39665448665618896, "learning_rate": 9.645745479403608e-05, "loss": 0.2306, "step": 9210 }, { "epoch": 1.2523345444667051, "grad_norm": 0.7107229828834534, "learning_rate": 9.644283215856602e-05, "loss": 0.2274, "step": 9220 }, { "epoch": 1.2536928248836974, "grad_norm": 1.0035239458084106, "learning_rate": 9.642818051917242e-05, "loss": 0.2469, "step": 9230 }, { "epoch": 1.2550511053006894, "grad_norm": 0.5820460319519043, "learning_rate": 9.641349988500534e-05, "loss": 0.237, "step": 9240 }, { "epoch": 1.2564093857176815, "grad_norm": 0.4353732168674469, "learning_rate": 9.639879026523293e-05, "loss": 0.2356, "step": 9250 }, { "epoch": 1.2577676661346735, "grad_norm": 0.388810396194458, "learning_rate": 9.638405166904147e-05, "loss": 0.2371, "step": 9260 }, { "epoch": 1.2591259465516655, "grad_norm": 0.3972727358341217, "learning_rate": 9.636928410563535e-05, "loss": 0.2425, "step": 9270 }, { "epoch": 1.2604842269686576, "grad_norm": 0.5266451239585876, "learning_rate": 9.635448758423702e-05, "loss": 0.2405, "step": 9280 }, { "epoch": 1.2618425073856498, "grad_norm": 0.3865385353565216, "learning_rate": 9.633966211408704e-05, "loss": 0.2423, "step": 9290 }, { "epoch": 1.263200787802642, "grad_norm": 0.3913436233997345, "learning_rate": 9.632480770444401e-05, "loss": 0.2362, "step": 9300 }, { "epoch": 1.264559068219634, "grad_norm": 0.4063433110713959, "learning_rate": 9.630992436458466e-05, "loss": 0.2368, "step": 9310 }, { "epoch": 1.265917348636626, "grad_norm": 0.4934528172016144, "learning_rate": 9.629501210380372e-05, "loss": 0.2306, "step": 9320 }, { "epoch": 1.2672756290536182, "grad_norm": 0.30511581897735596, "learning_rate": 9.628007093141404e-05, "loss": 0.2357, "step": 9330 }, { "epoch": 1.2686339094706103, "grad_norm": 0.3336571753025055, "learning_rate": 9.626510085674651e-05, "loss": 0.246, "step": 9340 }, { "epoch": 1.2699921898876023, "grad_norm": 0.3731229305267334, "learning_rate": 9.625010188915007e-05, "loss": 0.2466, "step": 9350 }, { "epoch": 1.2713504703045944, "grad_norm": 0.36219194531440735, "learning_rate": 9.623507403799166e-05, "loss": 0.2384, "step": 9360 }, { "epoch": 1.2727087507215864, "grad_norm": 0.35683172941207886, "learning_rate": 9.622001731265631e-05, "loss": 0.2358, "step": 9370 }, { "epoch": 1.2740670311385784, "grad_norm": 0.5407777428627014, "learning_rate": 9.620493172254707e-05, "loss": 0.2356, "step": 9380 }, { "epoch": 1.2754253115555707, "grad_norm": 0.3098716139793396, "learning_rate": 9.618981727708502e-05, "loss": 0.2294, "step": 9390 }, { "epoch": 1.2767835919725627, "grad_norm": 0.4463116228580475, "learning_rate": 9.617467398570923e-05, "loss": 0.2441, "step": 9400 }, { "epoch": 1.2781418723895548, "grad_norm": 0.35336509346961975, "learning_rate": 9.615950185787684e-05, "loss": 0.2328, "step": 9410 }, { "epoch": 1.2795001528065468, "grad_norm": 0.40873751044273376, "learning_rate": 9.614430090306294e-05, "loss": 0.2397, "step": 9420 }, { "epoch": 1.280858433223539, "grad_norm": 0.3297904133796692, "learning_rate": 9.612907113076068e-05, "loss": 0.2351, "step": 9430 }, { "epoch": 1.2822167136405311, "grad_norm": 0.3736768662929535, "learning_rate": 9.611381255048113e-05, "loss": 0.2441, "step": 9440 }, { "epoch": 1.2835749940575232, "grad_norm": 0.45901933312416077, "learning_rate": 9.609852517175344e-05, "loss": 0.2401, "step": 9450 }, { "epoch": 1.2849332744745152, "grad_norm": 0.39437276124954224, "learning_rate": 9.608320900412466e-05, "loss": 0.2292, "step": 9460 }, { "epoch": 1.2862915548915073, "grad_norm": 0.3870416283607483, "learning_rate": 9.60678640571599e-05, "loss": 0.2368, "step": 9470 }, { "epoch": 1.2876498353084993, "grad_norm": 0.4920508563518524, "learning_rate": 9.60524903404422e-05, "loss": 0.238, "step": 9480 }, { "epoch": 1.2890081157254916, "grad_norm": 0.38336899876594543, "learning_rate": 9.603708786357253e-05, "loss": 0.2451, "step": 9490 }, { "epoch": 1.2903663961424836, "grad_norm": 0.3607412874698639, "learning_rate": 9.602165663616988e-05, "loss": 0.2381, "step": 9500 }, { "epoch": 1.2917246765594756, "grad_norm": 0.38367608189582825, "learning_rate": 9.600619666787121e-05, "loss": 0.2361, "step": 9510 }, { "epoch": 1.293082956976468, "grad_norm": 0.3786604702472687, "learning_rate": 9.599070796833135e-05, "loss": 0.2366, "step": 9520 }, { "epoch": 1.29444123739346, "grad_norm": 0.3285292983055115, "learning_rate": 9.597519054722314e-05, "loss": 0.2349, "step": 9530 }, { "epoch": 1.295799517810452, "grad_norm": 0.4163239598274231, "learning_rate": 9.595964441423735e-05, "loss": 0.2351, "step": 9540 }, { "epoch": 1.297157798227444, "grad_norm": 0.40293169021606445, "learning_rate": 9.594406957908262e-05, "loss": 0.2454, "step": 9550 }, { "epoch": 1.298516078644436, "grad_norm": 0.42069295048713684, "learning_rate": 9.59284660514856e-05, "loss": 0.2383, "step": 9560 }, { "epoch": 1.2998743590614281, "grad_norm": 0.42872726917266846, "learning_rate": 9.591283384119081e-05, "loss": 0.2329, "step": 9570 }, { "epoch": 1.3012326394784204, "grad_norm": 0.5812711715698242, "learning_rate": 9.589717295796068e-05, "loss": 0.2285, "step": 9580 }, { "epoch": 1.3025909198954124, "grad_norm": 0.32322773337364197, "learning_rate": 9.588148341157557e-05, "loss": 0.2312, "step": 9590 }, { "epoch": 1.3039492003124045, "grad_norm": 0.35246196389198303, "learning_rate": 9.586576521183371e-05, "loss": 0.2334, "step": 9600 }, { "epoch": 1.3053074807293965, "grad_norm": 0.6094481348991394, "learning_rate": 9.585001836855128e-05, "loss": 0.244, "step": 9610 }, { "epoch": 1.3066657611463888, "grad_norm": 1.3190054893493652, "learning_rate": 9.583424289156227e-05, "loss": 0.2453, "step": 9620 }, { "epoch": 1.3080240415633808, "grad_norm": 0.4828789234161377, "learning_rate": 9.58184387907186e-05, "loss": 0.2439, "step": 9630 }, { "epoch": 1.3093823219803729, "grad_norm": 0.3694145083427429, "learning_rate": 9.580260607589009e-05, "loss": 0.2411, "step": 9640 }, { "epoch": 1.310740602397365, "grad_norm": 0.3425014317035675, "learning_rate": 9.578674475696436e-05, "loss": 0.237, "step": 9650 }, { "epoch": 1.312098882814357, "grad_norm": 0.35514035820961, "learning_rate": 9.577085484384695e-05, "loss": 0.2459, "step": 9660 }, { "epoch": 1.313457163231349, "grad_norm": 0.44277700781822205, "learning_rate": 9.575493634646125e-05, "loss": 0.2345, "step": 9670 }, { "epoch": 1.3148154436483412, "grad_norm": 0.43597668409347534, "learning_rate": 9.573898927474847e-05, "loss": 0.246, "step": 9680 }, { "epoch": 1.3161737240653333, "grad_norm": 0.4773683547973633, "learning_rate": 9.572301363866769e-05, "loss": 0.2358, "step": 9690 }, { "epoch": 1.3175320044823253, "grad_norm": 0.4136565029621124, "learning_rate": 9.570700944819584e-05, "loss": 0.2255, "step": 9700 }, { "epoch": 1.3188902848993176, "grad_norm": 0.9324676990509033, "learning_rate": 9.569097671332763e-05, "loss": 0.2444, "step": 9710 }, { "epoch": 1.3202485653163096, "grad_norm": 1.235852837562561, "learning_rate": 9.567491544407568e-05, "loss": 0.2472, "step": 9720 }, { "epoch": 1.3216068457333017, "grad_norm": 0.3084303140640259, "learning_rate": 9.565882565047033e-05, "loss": 0.2388, "step": 9730 }, { "epoch": 1.3229651261502937, "grad_norm": 0.2960711717605591, "learning_rate": 9.564270734255983e-05, "loss": 0.2334, "step": 9740 }, { "epoch": 1.3243234065672858, "grad_norm": 0.4004145562648773, "learning_rate": 9.562656053041017e-05, "loss": 0.2325, "step": 9750 }, { "epoch": 1.3256816869842778, "grad_norm": 0.3099653422832489, "learning_rate": 9.561038522410517e-05, "loss": 0.2434, "step": 9760 }, { "epoch": 1.32703996740127, "grad_norm": 0.38605257868766785, "learning_rate": 9.559418143374644e-05, "loss": 0.2445, "step": 9770 }, { "epoch": 1.328398247818262, "grad_norm": 0.3521859347820282, "learning_rate": 9.557794916945339e-05, "loss": 0.2373, "step": 9780 }, { "epoch": 1.3297565282352541, "grad_norm": 0.37508273124694824, "learning_rate": 9.556168844136318e-05, "loss": 0.2339, "step": 9790 }, { "epoch": 1.3311148086522462, "grad_norm": 1.122193694114685, "learning_rate": 9.554539925963075e-05, "loss": 0.2466, "step": 9800 }, { "epoch": 1.3324730890692384, "grad_norm": 0.5067532062530518, "learning_rate": 9.552908163442888e-05, "loss": 0.2431, "step": 9810 }, { "epoch": 1.3338313694862305, "grad_norm": 0.4793557822704315, "learning_rate": 9.551273557594801e-05, "loss": 0.2412, "step": 9820 }, { "epoch": 1.3351896499032225, "grad_norm": 0.47758689522743225, "learning_rate": 9.549636109439642e-05, "loss": 0.2413, "step": 9830 }, { "epoch": 1.3365479303202146, "grad_norm": 0.3815268874168396, "learning_rate": 9.547995820000008e-05, "loss": 0.2345, "step": 9840 }, { "epoch": 1.3379062107372066, "grad_norm": 0.29944103956222534, "learning_rate": 9.546352690300275e-05, "loss": 0.2415, "step": 9850 }, { "epoch": 1.3392644911541987, "grad_norm": 0.326760858297348, "learning_rate": 9.544706721366588e-05, "loss": 0.2428, "step": 9860 }, { "epoch": 1.340622771571191, "grad_norm": 0.3274824917316437, "learning_rate": 9.54305791422687e-05, "loss": 0.2369, "step": 9870 }, { "epoch": 1.341981051988183, "grad_norm": 0.31547462940216064, "learning_rate": 9.541406269910816e-05, "loss": 0.2366, "step": 9880 }, { "epoch": 1.343339332405175, "grad_norm": 0.34344032406806946, "learning_rate": 9.539751789449886e-05, "loss": 0.2495, "step": 9890 }, { "epoch": 1.3446976128221673, "grad_norm": 0.46136555075645447, "learning_rate": 9.538094473877321e-05, "loss": 0.2452, "step": 9900 }, { "epoch": 1.3460558932391593, "grad_norm": 0.2972998321056366, "learning_rate": 9.536434324228127e-05, "loss": 0.2463, "step": 9910 }, { "epoch": 1.3474141736561513, "grad_norm": 0.46994298696517944, "learning_rate": 9.534771341539079e-05, "loss": 0.2331, "step": 9920 }, { "epoch": 1.3487724540731434, "grad_norm": 0.34427976608276367, "learning_rate": 9.533105526848727e-05, "loss": 0.2319, "step": 9930 }, { "epoch": 1.3501307344901354, "grad_norm": 0.632222056388855, "learning_rate": 9.53143688119738e-05, "loss": 0.2326, "step": 9940 }, { "epoch": 1.3514890149071275, "grad_norm": 0.3955300450325012, "learning_rate": 9.529765405627127e-05, "loss": 0.2358, "step": 9950 }, { "epoch": 1.3528472953241197, "grad_norm": 0.5034508109092712, "learning_rate": 9.528091101181813e-05, "loss": 0.2428, "step": 9960 }, { "epoch": 1.3542055757411118, "grad_norm": 0.35826992988586426, "learning_rate": 9.526413968907056e-05, "loss": 0.2338, "step": 9970 }, { "epoch": 1.3555638561581038, "grad_norm": 1.0628341436386108, "learning_rate": 9.52473400985024e-05, "loss": 0.2323, "step": 9980 }, { "epoch": 1.3569221365750959, "grad_norm": 0.39227163791656494, "learning_rate": 9.523051225060513e-05, "loss": 0.2518, "step": 9990 }, { "epoch": 1.3582804169920881, "grad_norm": 0.4547765254974365, "learning_rate": 9.521365615588785e-05, "loss": 0.2301, "step": 10000 }, { "epoch": 1.3596386974090802, "grad_norm": 0.32657191157341003, "learning_rate": 9.519677182487737e-05, "loss": 0.2416, "step": 10010 }, { "epoch": 1.3609969778260722, "grad_norm": 0.34798070788383484, "learning_rate": 9.517985926811803e-05, "loss": 0.2376, "step": 10020 }, { "epoch": 1.3623552582430642, "grad_norm": 0.45538845658302307, "learning_rate": 9.516291849617191e-05, "loss": 0.2339, "step": 10030 }, { "epoch": 1.3637135386600563, "grad_norm": 0.42663517594337463, "learning_rate": 9.514594951961866e-05, "loss": 0.239, "step": 10040 }, { "epoch": 1.3650718190770483, "grad_norm": 0.3157259225845337, "learning_rate": 9.512895234905552e-05, "loss": 0.2409, "step": 10050 }, { "epoch": 1.3664300994940406, "grad_norm": 0.40212956070899963, "learning_rate": 9.511192699509736e-05, "loss": 0.2389, "step": 10060 }, { "epoch": 1.3677883799110326, "grad_norm": 0.35797327756881714, "learning_rate": 9.509487346837666e-05, "loss": 0.2408, "step": 10070 }, { "epoch": 1.3691466603280247, "grad_norm": 0.34534287452697754, "learning_rate": 9.507779177954348e-05, "loss": 0.2479, "step": 10080 }, { "epoch": 1.370504940745017, "grad_norm": 0.3092736303806305, "learning_rate": 9.506068193926547e-05, "loss": 0.2475, "step": 10090 }, { "epoch": 1.371863221162009, "grad_norm": 0.3059932291507721, "learning_rate": 9.504354395822787e-05, "loss": 0.2485, "step": 10100 }, { "epoch": 1.373221501579001, "grad_norm": 0.30673494935035706, "learning_rate": 9.502637784713351e-05, "loss": 0.2332, "step": 10110 }, { "epoch": 1.374579781995993, "grad_norm": 0.4220554232597351, "learning_rate": 9.500918361670272e-05, "loss": 0.2422, "step": 10120 }, { "epoch": 1.375938062412985, "grad_norm": 0.529952347278595, "learning_rate": 9.499196127767344e-05, "loss": 0.2428, "step": 10130 }, { "epoch": 1.3772963428299771, "grad_norm": 0.36314892768859863, "learning_rate": 9.497471084080121e-05, "loss": 0.2411, "step": 10140 }, { "epoch": 1.3786546232469694, "grad_norm": 0.3234858214855194, "learning_rate": 9.495743231685901e-05, "loss": 0.2315, "step": 10150 }, { "epoch": 1.3800129036639615, "grad_norm": 0.58331298828125, "learning_rate": 9.494012571663747e-05, "loss": 0.2362, "step": 10160 }, { "epoch": 1.3813711840809535, "grad_norm": 0.3461581766605377, "learning_rate": 9.492279105094463e-05, "loss": 0.2413, "step": 10170 }, { "epoch": 1.3827294644979455, "grad_norm": 0.39425915479660034, "learning_rate": 9.49054283306062e-05, "loss": 0.2326, "step": 10180 }, { "epoch": 1.3840877449149378, "grad_norm": 0.4520118534564972, "learning_rate": 9.488803756646531e-05, "loss": 0.2225, "step": 10190 }, { "epoch": 1.3854460253319298, "grad_norm": 0.44491803646087646, "learning_rate": 9.48706187693826e-05, "loss": 0.2426, "step": 10200 }, { "epoch": 1.3868043057489219, "grad_norm": 0.4648768901824951, "learning_rate": 9.485317195023632e-05, "loss": 0.2373, "step": 10210 }, { "epoch": 1.388162586165914, "grad_norm": 0.4224698841571808, "learning_rate": 9.48356971199221e-05, "loss": 0.2343, "step": 10220 }, { "epoch": 1.389520866582906, "grad_norm": 0.3708207607269287, "learning_rate": 9.481819428935313e-05, "loss": 0.2372, "step": 10230 }, { "epoch": 1.390879146999898, "grad_norm": 0.8218110203742981, "learning_rate": 9.480066346946006e-05, "loss": 0.222, "step": 10240 }, { "epoch": 1.3922374274168903, "grad_norm": 0.4297555088996887, "learning_rate": 9.478310467119103e-05, "loss": 0.242, "step": 10250 }, { "epoch": 1.3935957078338823, "grad_norm": 0.4233069121837616, "learning_rate": 9.476551790551165e-05, "loss": 0.2304, "step": 10260 }, { "epoch": 1.3949539882508744, "grad_norm": 0.4403098225593567, "learning_rate": 9.474790318340501e-05, "loss": 0.2432, "step": 10270 }, { "epoch": 1.3963122686678666, "grad_norm": 0.3553713262081146, "learning_rate": 9.473026051587162e-05, "loss": 0.2383, "step": 10280 }, { "epoch": 1.3976705490848587, "grad_norm": 0.28435218334198, "learning_rate": 9.47125899139295e-05, "loss": 0.2472, "step": 10290 }, { "epoch": 1.3990288295018507, "grad_norm": 0.45768317580223083, "learning_rate": 9.469489138861404e-05, "loss": 0.2286, "step": 10300 }, { "epoch": 1.4003871099188427, "grad_norm": 0.3089035153388977, "learning_rate": 9.467716495097818e-05, "loss": 0.2358, "step": 10310 }, { "epoch": 1.4017453903358348, "grad_norm": 0.2847888469696045, "learning_rate": 9.465941061209215e-05, "loss": 0.2426, "step": 10320 }, { "epoch": 1.4031036707528268, "grad_norm": 0.3596046268939972, "learning_rate": 9.464162838304374e-05, "loss": 0.2328, "step": 10330 }, { "epoch": 1.404461951169819, "grad_norm": 0.32544389367103577, "learning_rate": 9.462381827493805e-05, "loss": 0.2399, "step": 10340 }, { "epoch": 1.4058202315868111, "grad_norm": 0.39457476139068604, "learning_rate": 9.460598029889765e-05, "loss": 0.2463, "step": 10350 }, { "epoch": 1.4071785120038032, "grad_norm": 0.3607209026813507, "learning_rate": 9.45881144660625e-05, "loss": 0.2402, "step": 10360 }, { "epoch": 1.4085367924207952, "grad_norm": 0.3422693610191345, "learning_rate": 9.457022078758998e-05, "loss": 0.2365, "step": 10370 }, { "epoch": 1.4098950728377875, "grad_norm": 0.3102242350578308, "learning_rate": 9.455229927465481e-05, "loss": 0.2374, "step": 10380 }, { "epoch": 1.4112533532547795, "grad_norm": 0.3538486957550049, "learning_rate": 9.453434993844912e-05, "loss": 0.2371, "step": 10390 }, { "epoch": 1.4126116336717716, "grad_norm": 0.32267794013023376, "learning_rate": 9.451637279018242e-05, "loss": 0.2382, "step": 10400 }, { "epoch": 1.4139699140887636, "grad_norm": 0.3008194863796234, "learning_rate": 9.44983678410816e-05, "loss": 0.2354, "step": 10410 }, { "epoch": 1.4153281945057556, "grad_norm": 0.41413336992263794, "learning_rate": 9.448033510239085e-05, "loss": 0.2437, "step": 10420 }, { "epoch": 1.4166864749227477, "grad_norm": 0.35520192980766296, "learning_rate": 9.446227458537182e-05, "loss": 0.2438, "step": 10430 }, { "epoch": 1.41804475533974, "grad_norm": 0.27063241600990295, "learning_rate": 9.444418630130341e-05, "loss": 0.2427, "step": 10440 }, { "epoch": 1.419403035756732, "grad_norm": 0.4812606871128082, "learning_rate": 9.442607026148192e-05, "loss": 0.2481, "step": 10450 }, { "epoch": 1.420761316173724, "grad_norm": 0.39711084961891174, "learning_rate": 9.440792647722094e-05, "loss": 0.2321, "step": 10460 }, { "epoch": 1.422119596590716, "grad_norm": 0.3186090886592865, "learning_rate": 9.438975495985142e-05, "loss": 0.2336, "step": 10470 }, { "epoch": 1.4234778770077083, "grad_norm": 0.33473339676856995, "learning_rate": 9.437155572072165e-05, "loss": 0.2503, "step": 10480 }, { "epoch": 1.4248361574247004, "grad_norm": 0.36334994435310364, "learning_rate": 9.435332877119715e-05, "loss": 0.2304, "step": 10490 }, { "epoch": 1.4261944378416924, "grad_norm": 0.3507101237773895, "learning_rate": 9.433507412266083e-05, "loss": 0.2437, "step": 10500 }, { "epoch": 1.4275527182586845, "grad_norm": 0.6983070969581604, "learning_rate": 9.431679178651287e-05, "loss": 0.242, "step": 10510 }, { "epoch": 1.4289109986756765, "grad_norm": 0.4753780663013458, "learning_rate": 9.42984817741707e-05, "loss": 0.2378, "step": 10520 }, { "epoch": 1.4302692790926685, "grad_norm": 0.38433128595352173, "learning_rate": 9.428014409706911e-05, "loss": 0.23, "step": 10530 }, { "epoch": 1.4316275595096608, "grad_norm": 0.46366724371910095, "learning_rate": 9.42617787666601e-05, "loss": 0.2438, "step": 10540 }, { "epoch": 1.4329858399266528, "grad_norm": 0.4229886531829834, "learning_rate": 9.424338579441299e-05, "loss": 0.2418, "step": 10550 }, { "epoch": 1.434344120343645, "grad_norm": 0.33727264404296875, "learning_rate": 9.422496519181432e-05, "loss": 0.2424, "step": 10560 }, { "epoch": 1.4357024007606372, "grad_norm": 0.4025544822216034, "learning_rate": 9.420651697036791e-05, "loss": 0.2371, "step": 10570 }, { "epoch": 1.4370606811776292, "grad_norm": 0.32525119185447693, "learning_rate": 9.418804114159482e-05, "loss": 0.2407, "step": 10580 }, { "epoch": 1.4384189615946212, "grad_norm": 0.3665565848350525, "learning_rate": 9.416953771703339e-05, "loss": 0.2284, "step": 10590 }, { "epoch": 1.4397772420116133, "grad_norm": 0.3372083306312561, "learning_rate": 9.415100670823911e-05, "loss": 0.2462, "step": 10600 }, { "epoch": 1.4411355224286053, "grad_norm": 0.49547871947288513, "learning_rate": 9.413244812678476e-05, "loss": 0.2503, "step": 10610 }, { "epoch": 1.4424938028455974, "grad_norm": 0.529727578163147, "learning_rate": 9.411386198426034e-05, "loss": 0.2388, "step": 10620 }, { "epoch": 1.4438520832625896, "grad_norm": 0.27697861194610596, "learning_rate": 9.409524829227301e-05, "loss": 0.2371, "step": 10630 }, { "epoch": 1.4452103636795817, "grad_norm": 0.6433039307594299, "learning_rate": 9.407660706244721e-05, "loss": 0.2381, "step": 10640 }, { "epoch": 1.4465686440965737, "grad_norm": 0.443145215511322, "learning_rate": 9.405793830642451e-05, "loss": 0.2347, "step": 10650 }, { "epoch": 1.4479269245135657, "grad_norm": 0.3168441951274872, "learning_rate": 9.403924203586372e-05, "loss": 0.2371, "step": 10660 }, { "epoch": 1.449285204930558, "grad_norm": 0.4280048608779907, "learning_rate": 9.40205182624408e-05, "loss": 0.2359, "step": 10670 }, { "epoch": 1.45064348534755, "grad_norm": 0.3294278681278229, "learning_rate": 9.400176699784891e-05, "loss": 0.2364, "step": 10680 }, { "epoch": 1.452001765764542, "grad_norm": 0.34099143743515015, "learning_rate": 9.398298825379834e-05, "loss": 0.2409, "step": 10690 }, { "epoch": 1.4533600461815341, "grad_norm": 0.3247606158256531, "learning_rate": 9.39641820420166e-05, "loss": 0.2501, "step": 10700 }, { "epoch": 1.4547183265985262, "grad_norm": 0.3181615471839905, "learning_rate": 9.39453483742483e-05, "loss": 0.2355, "step": 10710 }, { "epoch": 1.4560766070155182, "grad_norm": 0.39810457825660706, "learning_rate": 9.392648726225521e-05, "loss": 0.2432, "step": 10720 }, { "epoch": 1.4574348874325105, "grad_norm": 0.33859172463417053, "learning_rate": 9.390759871781629e-05, "loss": 0.2239, "step": 10730 }, { "epoch": 1.4587931678495025, "grad_norm": 0.37943604588508606, "learning_rate": 9.388868275272753e-05, "loss": 0.2412, "step": 10740 }, { "epoch": 1.4601514482664946, "grad_norm": 0.41325798630714417, "learning_rate": 9.386973937880214e-05, "loss": 0.2497, "step": 10750 }, { "epoch": 1.4615097286834868, "grad_norm": 0.3815285563468933, "learning_rate": 9.385076860787039e-05, "loss": 0.2494, "step": 10760 }, { "epoch": 1.4628680091004789, "grad_norm": 0.8314708471298218, "learning_rate": 9.383177045177972e-05, "loss": 0.2426, "step": 10770 }, { "epoch": 1.464226289517471, "grad_norm": 0.4035673141479492, "learning_rate": 9.381274492239456e-05, "loss": 0.2352, "step": 10780 }, { "epoch": 1.465584569934463, "grad_norm": 0.38025951385498047, "learning_rate": 9.379369203159658e-05, "loss": 0.2345, "step": 10790 }, { "epoch": 1.466942850351455, "grad_norm": 0.41772934794425964, "learning_rate": 9.377461179128442e-05, "loss": 0.2399, "step": 10800 }, { "epoch": 1.468301130768447, "grad_norm": 0.36657193303108215, "learning_rate": 9.375550421337385e-05, "loss": 0.2366, "step": 10810 }, { "epoch": 1.4696594111854393, "grad_norm": 0.2692399024963379, "learning_rate": 9.373636930979772e-05, "loss": 0.2373, "step": 10820 }, { "epoch": 1.4710176916024313, "grad_norm": 0.44436848163604736, "learning_rate": 9.371720709250593e-05, "loss": 0.2358, "step": 10830 }, { "epoch": 1.4723759720194234, "grad_norm": 0.2570059299468994, "learning_rate": 9.369801757346544e-05, "loss": 0.2415, "step": 10840 }, { "epoch": 1.4737342524364154, "grad_norm": 0.8472379446029663, "learning_rate": 9.367880076466025e-05, "loss": 0.2368, "step": 10850 }, { "epoch": 1.4750925328534077, "grad_norm": 0.46572890877723694, "learning_rate": 9.36595566780914e-05, "loss": 0.2464, "step": 10860 }, { "epoch": 1.4764508132703997, "grad_norm": 0.4237960875034332, "learning_rate": 9.364028532577702e-05, "loss": 0.2307, "step": 10870 }, { "epoch": 1.4778090936873918, "grad_norm": 0.4372323155403137, "learning_rate": 9.362098671975218e-05, "loss": 0.2529, "step": 10880 }, { "epoch": 1.4791673741043838, "grad_norm": 0.30010125041007996, "learning_rate": 9.360166087206904e-05, "loss": 0.2346, "step": 10890 }, { "epoch": 1.4805256545213759, "grad_norm": 1.0886262655258179, "learning_rate": 9.358230779479674e-05, "loss": 0.2341, "step": 10900 }, { "epoch": 1.481883934938368, "grad_norm": 0.3966710567474365, "learning_rate": 9.356292750002143e-05, "loss": 0.2406, "step": 10910 }, { "epoch": 1.4832422153553602, "grad_norm": 1.8515955209732056, "learning_rate": 9.354351999984628e-05, "loss": 0.2415, "step": 10920 }, { "epoch": 1.4846004957723522, "grad_norm": 0.3783247172832489, "learning_rate": 9.352408530639141e-05, "loss": 0.2374, "step": 10930 }, { "epoch": 1.4859587761893442, "grad_norm": 0.3919428586959839, "learning_rate": 9.350462343179393e-05, "loss": 0.2409, "step": 10940 }, { "epoch": 1.4873170566063365, "grad_norm": 0.34580954909324646, "learning_rate": 9.348513438820799e-05, "loss": 0.227, "step": 10950 }, { "epoch": 1.4886753370233285, "grad_norm": 1.0340849161148071, "learning_rate": 9.346561818780459e-05, "loss": 0.2339, "step": 10960 }, { "epoch": 1.4900336174403206, "grad_norm": 0.380354642868042, "learning_rate": 9.344607484277182e-05, "loss": 0.2406, "step": 10970 }, { "epoch": 1.4913918978573126, "grad_norm": 0.39091289043426514, "learning_rate": 9.342650436531459e-05, "loss": 0.2474, "step": 10980 }, { "epoch": 1.4927501782743047, "grad_norm": 0.45897534489631653, "learning_rate": 9.340690676765487e-05, "loss": 0.2376, "step": 10990 }, { "epoch": 1.4941084586912967, "grad_norm": 0.5231205224990845, "learning_rate": 9.338728206203149e-05, "loss": 0.2377, "step": 11000 }, { "epoch": 1.495466739108289, "grad_norm": 0.4154932498931885, "learning_rate": 9.336763026070025e-05, "loss": 0.2393, "step": 11010 }, { "epoch": 1.496825019525281, "grad_norm": 0.33903566002845764, "learning_rate": 9.334795137593388e-05, "loss": 0.2431, "step": 11020 }, { "epoch": 1.498183299942273, "grad_norm": 0.48632538318634033, "learning_rate": 9.332824542002196e-05, "loss": 0.2401, "step": 11030 }, { "epoch": 1.499541580359265, "grad_norm": 0.36208510398864746, "learning_rate": 9.330851240527105e-05, "loss": 0.2445, "step": 11040 }, { "epoch": 1.5008998607762574, "grad_norm": 0.35333532094955444, "learning_rate": 9.328875234400456e-05, "loss": 0.24, "step": 11050 }, { "epoch": 1.5022581411932494, "grad_norm": 0.36202186346054077, "learning_rate": 9.326896524856282e-05, "loss": 0.247, "step": 11060 }, { "epoch": 1.5036164216102414, "grad_norm": 0.3726303279399872, "learning_rate": 9.324915113130302e-05, "loss": 0.2412, "step": 11070 }, { "epoch": 1.5049747020272335, "grad_norm": 0.44329944252967834, "learning_rate": 9.322931000459923e-05, "loss": 0.2424, "step": 11080 }, { "epoch": 1.5063329824442255, "grad_norm": 0.5297825932502747, "learning_rate": 9.320944188084242e-05, "loss": 0.2409, "step": 11090 }, { "epoch": 1.5076912628612176, "grad_norm": 0.39525100588798523, "learning_rate": 9.318954677244036e-05, "loss": 0.2279, "step": 11100 }, { "epoch": 1.5090495432782098, "grad_norm": 0.35600605607032776, "learning_rate": 9.316962469181774e-05, "loss": 0.2393, "step": 11110 }, { "epoch": 1.5104078236952019, "grad_norm": 0.3053950071334839, "learning_rate": 9.314967565141604e-05, "loss": 0.2336, "step": 11120 }, { "epoch": 1.511766104112194, "grad_norm": 0.49638625979423523, "learning_rate": 9.31296996636936e-05, "loss": 0.2339, "step": 11130 }, { "epoch": 1.5131243845291862, "grad_norm": 0.41946613788604736, "learning_rate": 9.310969674112557e-05, "loss": 0.2263, "step": 11140 }, { "epoch": 1.5144826649461782, "grad_norm": 0.3650508522987366, "learning_rate": 9.308966689620398e-05, "loss": 0.2451, "step": 11150 }, { "epoch": 1.5158409453631703, "grad_norm": 0.35046103596687317, "learning_rate": 9.306961014143758e-05, "loss": 0.2273, "step": 11160 }, { "epoch": 1.5171992257801623, "grad_norm": 0.35636183619499207, "learning_rate": 9.304952648935202e-05, "loss": 0.2392, "step": 11170 }, { "epoch": 1.5185575061971543, "grad_norm": 0.4965170621871948, "learning_rate": 9.302941595248966e-05, "loss": 0.2374, "step": 11180 }, { "epoch": 1.5199157866141464, "grad_norm": 0.4001160264015198, "learning_rate": 9.300927854340972e-05, "loss": 0.2352, "step": 11190 }, { "epoch": 1.5212740670311384, "grad_norm": 0.5259150266647339, "learning_rate": 9.298911427468817e-05, "loss": 0.2451, "step": 11200 }, { "epoch": 1.5226323474481307, "grad_norm": 0.38911765813827515, "learning_rate": 9.296892315891775e-05, "loss": 0.2417, "step": 11210 }, { "epoch": 1.5239906278651227, "grad_norm": 0.9925249814987183, "learning_rate": 9.294870520870799e-05, "loss": 0.2315, "step": 11220 }, { "epoch": 1.525348908282115, "grad_norm": 0.34320637583732605, "learning_rate": 9.292846043668515e-05, "loss": 0.2451, "step": 11230 }, { "epoch": 1.526707188699107, "grad_norm": 0.5314062833786011, "learning_rate": 9.290818885549225e-05, "loss": 0.2384, "step": 11240 }, { "epoch": 1.528065469116099, "grad_norm": 0.4163719117641449, "learning_rate": 9.288789047778907e-05, "loss": 0.2422, "step": 11250 }, { "epoch": 1.5294237495330911, "grad_norm": 0.3653613328933716, "learning_rate": 9.286756531625212e-05, "loss": 0.2332, "step": 11260 }, { "epoch": 1.5307820299500832, "grad_norm": 0.3442307412624359, "learning_rate": 9.284721338357458e-05, "loss": 0.2467, "step": 11270 }, { "epoch": 1.5321403103670752, "grad_norm": 0.42096075415611267, "learning_rate": 9.282683469246645e-05, "loss": 0.2389, "step": 11280 }, { "epoch": 1.5334985907840673, "grad_norm": 0.34530577063560486, "learning_rate": 9.280642925565437e-05, "loss": 0.2394, "step": 11290 }, { "epoch": 1.5348568712010595, "grad_norm": 0.3823917508125305, "learning_rate": 9.278599708588169e-05, "loss": 0.2395, "step": 11300 }, { "epoch": 1.5362151516180516, "grad_norm": 0.414197713136673, "learning_rate": 9.276553819590845e-05, "loss": 0.2388, "step": 11310 }, { "epoch": 1.5375734320350436, "grad_norm": 0.4558267593383789, "learning_rate": 9.274505259851142e-05, "loss": 0.2326, "step": 11320 }, { "epoch": 1.5389317124520359, "grad_norm": 0.39993786811828613, "learning_rate": 9.272454030648403e-05, "loss": 0.2332, "step": 11330 }, { "epoch": 1.540289992869028, "grad_norm": 2.064215660095215, "learning_rate": 9.270400133263635e-05, "loss": 0.2305, "step": 11340 }, { "epoch": 1.54164827328602, "grad_norm": 0.5439797043800354, "learning_rate": 9.268343568979513e-05, "loss": 0.2418, "step": 11350 }, { "epoch": 1.543006553703012, "grad_norm": 0.3324526250362396, "learning_rate": 9.26628433908038e-05, "loss": 0.2469, "step": 11360 }, { "epoch": 1.544364834120004, "grad_norm": 0.4637681841850281, "learning_rate": 9.26422244485224e-05, "loss": 0.2433, "step": 11370 }, { "epoch": 1.545723114536996, "grad_norm": 0.2895611524581909, "learning_rate": 9.262157887582765e-05, "loss": 0.2409, "step": 11380 }, { "epoch": 1.547081394953988, "grad_norm": 0.3732016682624817, "learning_rate": 9.260090668561285e-05, "loss": 0.2432, "step": 11390 }, { "epoch": 1.5484396753709804, "grad_norm": 0.3439998924732208, "learning_rate": 9.258020789078794e-05, "loss": 0.2436, "step": 11400 }, { "epoch": 1.5497979557879724, "grad_norm": 0.331209659576416, "learning_rate": 9.255948250427952e-05, "loss": 0.2439, "step": 11410 }, { "epoch": 1.5511562362049647, "grad_norm": 0.46013331413269043, "learning_rate": 9.253873053903075e-05, "loss": 0.2442, "step": 11420 }, { "epoch": 1.5525145166219567, "grad_norm": 0.31000202894210815, "learning_rate": 9.251795200800138e-05, "loss": 0.2395, "step": 11430 }, { "epoch": 1.5538727970389488, "grad_norm": 0.3480253219604492, "learning_rate": 9.24971469241678e-05, "loss": 0.2465, "step": 11440 }, { "epoch": 1.5552310774559408, "grad_norm": 0.31846657395362854, "learning_rate": 9.247631530052291e-05, "loss": 0.2416, "step": 11450 }, { "epoch": 1.5565893578729328, "grad_norm": 0.4008362293243408, "learning_rate": 9.245545715007628e-05, "loss": 0.2376, "step": 11460 }, { "epoch": 1.5579476382899249, "grad_norm": 0.7884146571159363, "learning_rate": 9.243457248585397e-05, "loss": 0.2328, "step": 11470 }, { "epoch": 1.559305918706917, "grad_norm": 0.9333849549293518, "learning_rate": 9.241366132089862e-05, "loss": 0.2332, "step": 11480 }, { "epoch": 1.560664199123909, "grad_norm": 0.35331299901008606, "learning_rate": 9.239272366826941e-05, "loss": 0.2495, "step": 11490 }, { "epoch": 1.5620224795409012, "grad_norm": 0.34951984882354736, "learning_rate": 9.23717595410421e-05, "loss": 0.2392, "step": 11500 }, { "epoch": 1.5633807599578933, "grad_norm": 0.4054914712905884, "learning_rate": 9.235076895230891e-05, "loss": 0.2397, "step": 11510 }, { "epoch": 1.5647390403748855, "grad_norm": 0.3443402647972107, "learning_rate": 9.232975191517868e-05, "loss": 0.2422, "step": 11520 }, { "epoch": 1.5660973207918776, "grad_norm": 0.294423907995224, "learning_rate": 9.23087084427767e-05, "loss": 0.241, "step": 11530 }, { "epoch": 1.5674556012088696, "grad_norm": 0.4231998920440674, "learning_rate": 9.22876385482448e-05, "loss": 0.2441, "step": 11540 }, { "epoch": 1.5688138816258617, "grad_norm": 0.3206895589828491, "learning_rate": 9.226654224474128e-05, "loss": 0.2322, "step": 11550 }, { "epoch": 1.5701721620428537, "grad_norm": 0.39131247997283936, "learning_rate": 9.224541954544097e-05, "loss": 0.2397, "step": 11560 }, { "epoch": 1.5715304424598457, "grad_norm": 0.3464904725551605, "learning_rate": 9.222427046353514e-05, "loss": 0.2363, "step": 11570 }, { "epoch": 1.5728887228768378, "grad_norm": 0.673896074295044, "learning_rate": 9.220309501223161e-05, "loss": 0.2403, "step": 11580 }, { "epoch": 1.57424700329383, "grad_norm": 0.3317304849624634, "learning_rate": 9.218189320475456e-05, "loss": 0.2498, "step": 11590 }, { "epoch": 1.575605283710822, "grad_norm": 0.3529096245765686, "learning_rate": 9.216066505434474e-05, "loss": 0.2496, "step": 11600 }, { "epoch": 1.5769635641278141, "grad_norm": 0.3662819266319275, "learning_rate": 9.213941057425928e-05, "loss": 0.2402, "step": 11610 }, { "epoch": 1.5783218445448064, "grad_norm": 0.6167352795600891, "learning_rate": 9.211812977777177e-05, "loss": 0.2455, "step": 11620 }, { "epoch": 1.5796801249617984, "grad_norm": 0.35804393887519836, "learning_rate": 9.209682267817223e-05, "loss": 0.234, "step": 11630 }, { "epoch": 1.5810384053787905, "grad_norm": 0.3855069875717163, "learning_rate": 9.207548928876715e-05, "loss": 0.2321, "step": 11640 }, { "epoch": 1.5823966857957825, "grad_norm": 0.4773486852645874, "learning_rate": 9.205412962287938e-05, "loss": 0.2428, "step": 11650 }, { "epoch": 1.5837549662127746, "grad_norm": 0.7416009902954102, "learning_rate": 9.20327436938482e-05, "loss": 0.2386, "step": 11660 }, { "epoch": 1.5851132466297666, "grad_norm": 0.4019080400466919, "learning_rate": 9.201133151502932e-05, "loss": 0.2392, "step": 11670 }, { "epoch": 1.5864715270467586, "grad_norm": 0.30513155460357666, "learning_rate": 9.19898930997948e-05, "loss": 0.2352, "step": 11680 }, { "epoch": 1.587829807463751, "grad_norm": 0.5565023422241211, "learning_rate": 9.196842846153308e-05, "loss": 0.2465, "step": 11690 }, { "epoch": 1.589188087880743, "grad_norm": 0.2988587021827698, "learning_rate": 9.194693761364905e-05, "loss": 0.2483, "step": 11700 }, { "epoch": 1.5905463682977352, "grad_norm": 0.3865082561969757, "learning_rate": 9.19254205695639e-05, "loss": 0.2393, "step": 11710 }, { "epoch": 1.5919046487147273, "grad_norm": 0.6537158489227295, "learning_rate": 9.190387734271518e-05, "loss": 0.2374, "step": 11720 }, { "epoch": 1.5932629291317193, "grad_norm": 0.3684825599193573, "learning_rate": 9.188230794655682e-05, "loss": 0.2424, "step": 11730 }, { "epoch": 1.5946212095487113, "grad_norm": 0.3383871018886566, "learning_rate": 9.186071239455907e-05, "loss": 0.2478, "step": 11740 }, { "epoch": 1.5959794899657034, "grad_norm": 0.3614795506000519, "learning_rate": 9.183909070020855e-05, "loss": 0.2463, "step": 11750 }, { "epoch": 1.5973377703826954, "grad_norm": 0.6121826767921448, "learning_rate": 9.181744287700817e-05, "loss": 0.2387, "step": 11760 }, { "epoch": 1.5986960507996875, "grad_norm": 0.492961049079895, "learning_rate": 9.179576893847717e-05, "loss": 0.2426, "step": 11770 }, { "epoch": 1.6000543312166797, "grad_norm": 0.437020480632782, "learning_rate": 9.17740688981511e-05, "loss": 0.2473, "step": 11780 }, { "epoch": 1.6014126116336718, "grad_norm": 0.3576638400554657, "learning_rate": 9.175234276958183e-05, "loss": 0.2358, "step": 11790 }, { "epoch": 1.6027708920506638, "grad_norm": 0.40313956141471863, "learning_rate": 9.173059056633745e-05, "loss": 0.2487, "step": 11800 }, { "epoch": 1.604129172467656, "grad_norm": 1.7828867435455322, "learning_rate": 9.170881230200246e-05, "loss": 0.2398, "step": 11810 }, { "epoch": 1.6054874528846481, "grad_norm": 0.5592231154441833, "learning_rate": 9.16870079901775e-05, "loss": 0.2381, "step": 11820 }, { "epoch": 1.6068457333016402, "grad_norm": 0.3828433156013489, "learning_rate": 9.166517764447955e-05, "loss": 0.2409, "step": 11830 }, { "epoch": 1.6082040137186322, "grad_norm": 0.41618263721466064, "learning_rate": 9.164332127854187e-05, "loss": 0.2399, "step": 11840 }, { "epoch": 1.6095622941356242, "grad_norm": 0.36915260553359985, "learning_rate": 9.162143890601392e-05, "loss": 0.2547, "step": 11850 }, { "epoch": 1.6109205745526163, "grad_norm": 0.3568577468395233, "learning_rate": 9.159953054056138e-05, "loss": 0.2385, "step": 11860 }, { "epoch": 1.6122788549696083, "grad_norm": 0.4273124635219574, "learning_rate": 9.157759619586627e-05, "loss": 0.2441, "step": 11870 }, { "epoch": 1.6136371353866006, "grad_norm": 0.9208621382713318, "learning_rate": 9.15556358856267e-05, "loss": 0.2266, "step": 11880 }, { "epoch": 1.6149954158035926, "grad_norm": 0.3841458559036255, "learning_rate": 9.153364962355712e-05, "loss": 0.2406, "step": 11890 }, { "epoch": 1.616353696220585, "grad_norm": 0.40183767676353455, "learning_rate": 9.151163742338809e-05, "loss": 0.2383, "step": 11900 }, { "epoch": 1.617711976637577, "grad_norm": 0.30856603384017944, "learning_rate": 9.14895992988664e-05, "loss": 0.237, "step": 11910 }, { "epoch": 1.619070257054569, "grad_norm": 0.33121463656425476, "learning_rate": 9.146753526375507e-05, "loss": 0.2388, "step": 11920 }, { "epoch": 1.620428537471561, "grad_norm": 0.4119341969490051, "learning_rate": 9.144544533183323e-05, "loss": 0.2404, "step": 11930 }, { "epoch": 1.621786817888553, "grad_norm": 0.3397175967693329, "learning_rate": 9.142332951689627e-05, "loss": 0.2428, "step": 11940 }, { "epoch": 1.623145098305545, "grad_norm": 0.4907020032405853, "learning_rate": 9.140118783275564e-05, "loss": 0.2402, "step": 11950 }, { "epoch": 1.6245033787225371, "grad_norm": 0.5164840817451477, "learning_rate": 9.137902029323905e-05, "loss": 0.244, "step": 11960 }, { "epoch": 1.6258616591395294, "grad_norm": 0.6736775040626526, "learning_rate": 9.135682691219028e-05, "loss": 0.2512, "step": 11970 }, { "epoch": 1.6272199395565214, "grad_norm": 0.31505918502807617, "learning_rate": 9.133460770346927e-05, "loss": 0.2426, "step": 11980 }, { "epoch": 1.6285782199735135, "grad_norm": 0.5828289985656738, "learning_rate": 9.13123626809521e-05, "loss": 0.2425, "step": 11990 }, { "epoch": 1.6299365003905057, "grad_norm": 0.5748461484909058, "learning_rate": 9.129009185853097e-05, "loss": 0.2343, "step": 12000 }, { "epoch": 1.6312947808074978, "grad_norm": 1.724286437034607, "learning_rate": 9.126779525011419e-05, "loss": 0.247, "step": 12010 }, { "epoch": 1.6326530612244898, "grad_norm": 0.40451204776763916, "learning_rate": 9.12454728696262e-05, "loss": 0.2403, "step": 12020 }, { "epoch": 1.6340113416414819, "grad_norm": 0.3216111660003662, "learning_rate": 9.122312473100744e-05, "loss": 0.2353, "step": 12030 }, { "epoch": 1.635369622058474, "grad_norm": 0.47889232635498047, "learning_rate": 9.120075084821457e-05, "loss": 0.2384, "step": 12040 }, { "epoch": 1.636727902475466, "grad_norm": 0.46261516213417053, "learning_rate": 9.117835123522022e-05, "loss": 0.2443, "step": 12050 }, { "epoch": 1.638086182892458, "grad_norm": 0.2884509265422821, "learning_rate": 9.115592590601315e-05, "loss": 0.2482, "step": 12060 }, { "epoch": 1.6394444633094503, "grad_norm": 0.8469234704971313, "learning_rate": 9.113347487459812e-05, "loss": 0.2466, "step": 12070 }, { "epoch": 1.6408027437264423, "grad_norm": 0.435625821352005, "learning_rate": 9.111099815499604e-05, "loss": 0.2425, "step": 12080 }, { "epoch": 1.6421610241434346, "grad_norm": 0.32619595527648926, "learning_rate": 9.108849576124375e-05, "loss": 0.2407, "step": 12090 }, { "epoch": 1.6435193045604266, "grad_norm": 0.4726732671260834, "learning_rate": 9.106596770739422e-05, "loss": 0.2347, "step": 12100 }, { "epoch": 1.6448775849774186, "grad_norm": 0.6267814040184021, "learning_rate": 9.104341400751637e-05, "loss": 0.2298, "step": 12110 }, { "epoch": 1.6462358653944107, "grad_norm": 0.5082738399505615, "learning_rate": 9.102083467569517e-05, "loss": 0.2357, "step": 12120 }, { "epoch": 1.6475941458114027, "grad_norm": 0.5757683515548706, "learning_rate": 9.099822972603163e-05, "loss": 0.2342, "step": 12130 }, { "epoch": 1.6489524262283948, "grad_norm": 0.3208656311035156, "learning_rate": 9.097559917264267e-05, "loss": 0.2486, "step": 12140 }, { "epoch": 1.6503107066453868, "grad_norm": 0.3922220468521118, "learning_rate": 9.095294302966128e-05, "loss": 0.2428, "step": 12150 }, { "epoch": 1.651668987062379, "grad_norm": 0.9889463186264038, "learning_rate": 9.093026131123643e-05, "loss": 0.2306, "step": 12160 }, { "epoch": 1.6530272674793711, "grad_norm": 0.6700922846794128, "learning_rate": 9.090755403153299e-05, "loss": 0.2471, "step": 12170 }, { "epoch": 1.6543855478963632, "grad_norm": 0.346439391374588, "learning_rate": 9.088482120473187e-05, "loss": 0.2381, "step": 12180 }, { "epoch": 1.6557438283133554, "grad_norm": 0.8422291278839111, "learning_rate": 9.086206284502988e-05, "loss": 0.2457, "step": 12190 }, { "epoch": 1.6571021087303475, "grad_norm": 0.4650530517101288, "learning_rate": 9.083927896663982e-05, "loss": 0.2375, "step": 12200 }, { "epoch": 1.6584603891473395, "grad_norm": 0.3685927987098694, "learning_rate": 9.081646958379041e-05, "loss": 0.2479, "step": 12210 }, { "epoch": 1.6598186695643316, "grad_norm": 0.3419102430343628, "learning_rate": 9.079363471072627e-05, "loss": 0.2424, "step": 12220 }, { "epoch": 1.6611769499813236, "grad_norm": 0.2700977325439453, "learning_rate": 9.077077436170798e-05, "loss": 0.2423, "step": 12230 }, { "epoch": 1.6625352303983156, "grad_norm": 0.5653162002563477, "learning_rate": 9.074788855101201e-05, "loss": 0.2333, "step": 12240 }, { "epoch": 1.6638935108153077, "grad_norm": 0.4855403006076813, "learning_rate": 9.072497729293073e-05, "loss": 0.2567, "step": 12250 }, { "epoch": 1.6652517912323, "grad_norm": 0.43391939997673035, "learning_rate": 9.07020406017724e-05, "loss": 0.2408, "step": 12260 }, { "epoch": 1.666610071649292, "grad_norm": 0.3749004006385803, "learning_rate": 9.06790784918612e-05, "loss": 0.2492, "step": 12270 }, { "epoch": 1.6679683520662842, "grad_norm": 0.39145609736442566, "learning_rate": 9.06560909775371e-05, "loss": 0.2465, "step": 12280 }, { "epoch": 1.6693266324832763, "grad_norm": 0.36915719509124756, "learning_rate": 9.063307807315602e-05, "loss": 0.2401, "step": 12290 }, { "epoch": 1.6706849129002683, "grad_norm": 0.4086484909057617, "learning_rate": 9.061003979308971e-05, "loss": 0.2317, "step": 12300 }, { "epoch": 1.6720431933172604, "grad_norm": 0.43611446022987366, "learning_rate": 9.058697615172575e-05, "loss": 0.2432, "step": 12310 }, { "epoch": 1.6734014737342524, "grad_norm": 0.5021176338195801, "learning_rate": 9.056388716346757e-05, "loss": 0.2481, "step": 12320 }, { "epoch": 1.6747597541512445, "grad_norm": 0.3247273564338684, "learning_rate": 9.054077284273445e-05, "loss": 0.2303, "step": 12330 }, { "epoch": 1.6761180345682365, "grad_norm": 0.35414817929267883, "learning_rate": 9.051763320396145e-05, "loss": 0.2454, "step": 12340 }, { "epoch": 1.6774763149852288, "grad_norm": 0.36206984519958496, "learning_rate": 9.049446826159945e-05, "loss": 0.2357, "step": 12350 }, { "epoch": 1.6788345954022208, "grad_norm": 1.067825436592102, "learning_rate": 9.047127803011517e-05, "loss": 0.2401, "step": 12360 }, { "epoch": 1.6801928758192128, "grad_norm": 0.6506267786026001, "learning_rate": 9.044806252399107e-05, "loss": 0.234, "step": 12370 }, { "epoch": 1.681551156236205, "grad_norm": 0.49706748127937317, "learning_rate": 9.042482175772544e-05, "loss": 0.2402, "step": 12380 }, { "epoch": 1.6829094366531971, "grad_norm": 0.32818710803985596, "learning_rate": 9.040155574583233e-05, "loss": 0.2357, "step": 12390 }, { "epoch": 1.6842677170701892, "grad_norm": 0.34813210368156433, "learning_rate": 9.037826450284155e-05, "loss": 0.2387, "step": 12400 }, { "epoch": 1.6856259974871812, "grad_norm": 0.33134976029396057, "learning_rate": 9.035494804329865e-05, "loss": 0.2476, "step": 12410 }, { "epoch": 1.6869842779041733, "grad_norm": 0.2668949365615845, "learning_rate": 9.033160638176496e-05, "loss": 0.2407, "step": 12420 }, { "epoch": 1.6883425583211653, "grad_norm": 0.31002283096313477, "learning_rate": 9.030823953281753e-05, "loss": 0.2467, "step": 12430 }, { "epoch": 1.6897008387381574, "grad_norm": 0.4842119514942169, "learning_rate": 9.028484751104918e-05, "loss": 0.2387, "step": 12440 }, { "epoch": 1.6910591191551496, "grad_norm": 0.33204859495162964, "learning_rate": 9.026143033106836e-05, "loss": 0.2422, "step": 12450 }, { "epoch": 1.6924173995721417, "grad_norm": 0.30064883828163147, "learning_rate": 9.023798800749932e-05, "loss": 0.2376, "step": 12460 }, { "epoch": 1.693775679989134, "grad_norm": 0.4047108292579651, "learning_rate": 9.021452055498199e-05, "loss": 0.2337, "step": 12470 }, { "epoch": 1.695133960406126, "grad_norm": 0.28210899233818054, "learning_rate": 9.019102798817197e-05, "loss": 0.2433, "step": 12480 }, { "epoch": 1.696492240823118, "grad_norm": 0.30888643860816956, "learning_rate": 9.016751032174053e-05, "loss": 0.2461, "step": 12490 }, { "epoch": 1.69785052124011, "grad_norm": 0.4265442490577698, "learning_rate": 9.014396757037471e-05, "loss": 0.2458, "step": 12500 }, { "epoch": 1.699208801657102, "grad_norm": 0.29664146900177, "learning_rate": 9.012039974877709e-05, "loss": 0.2473, "step": 12510 }, { "epoch": 1.7005670820740941, "grad_norm": 1.3626660108566284, "learning_rate": 9.009680687166597e-05, "loss": 0.2324, "step": 12520 }, { "epoch": 1.7019253624910862, "grad_norm": 0.40633031725883484, "learning_rate": 9.007318895377532e-05, "loss": 0.2458, "step": 12530 }, { "epoch": 1.7032836429080782, "grad_norm": 0.3311006724834442, "learning_rate": 9.00495460098547e-05, "loss": 0.2364, "step": 12540 }, { "epoch": 1.7046419233250705, "grad_norm": 0.3622264564037323, "learning_rate": 9.002587805466933e-05, "loss": 0.2356, "step": 12550 }, { "epoch": 1.7060002037420625, "grad_norm": 0.3215394914150238, "learning_rate": 9.000218510300004e-05, "loss": 0.2408, "step": 12560 }, { "epoch": 1.7073584841590548, "grad_norm": 0.6830755472183228, "learning_rate": 8.997846716964326e-05, "loss": 0.2449, "step": 12570 }, { "epoch": 1.7087167645760468, "grad_norm": 0.46850576996803284, "learning_rate": 8.995472426941103e-05, "loss": 0.2385, "step": 12580 }, { "epoch": 1.7100750449930389, "grad_norm": 0.47411811351776123, "learning_rate": 8.993095641713099e-05, "loss": 0.2418, "step": 12590 }, { "epoch": 1.711433325410031, "grad_norm": 0.36984318494796753, "learning_rate": 8.990716362764635e-05, "loss": 0.2475, "step": 12600 }, { "epoch": 1.712791605827023, "grad_norm": 0.5144878029823303, "learning_rate": 8.988334591581591e-05, "loss": 0.2451, "step": 12610 }, { "epoch": 1.714149886244015, "grad_norm": 0.3425566852092743, "learning_rate": 8.9859503296514e-05, "loss": 0.2403, "step": 12620 }, { "epoch": 1.715508166661007, "grad_norm": 0.715060830116272, "learning_rate": 8.983563578463056e-05, "loss": 0.2322, "step": 12630 }, { "epoch": 1.7168664470779993, "grad_norm": 0.35686108469963074, "learning_rate": 8.981174339507102e-05, "loss": 0.2441, "step": 12640 }, { "epoch": 1.7182247274949913, "grad_norm": 0.40858104825019836, "learning_rate": 8.978782614275638e-05, "loss": 0.235, "step": 12650 }, { "epoch": 1.7195830079119834, "grad_norm": 0.5848186016082764, "learning_rate": 8.976388404262315e-05, "loss": 0.2339, "step": 12660 }, { "epoch": 1.7209412883289756, "grad_norm": 0.5057547688484192, "learning_rate": 8.973991710962337e-05, "loss": 0.24, "step": 12670 }, { "epoch": 1.7222995687459677, "grad_norm": 1.034732699394226, "learning_rate": 8.97159253587246e-05, "loss": 0.2512, "step": 12680 }, { "epoch": 1.7236578491629597, "grad_norm": 1.3138669729232788, "learning_rate": 8.969190880490985e-05, "loss": 0.2366, "step": 12690 }, { "epoch": 1.7250161295799518, "grad_norm": 0.36217761039733887, "learning_rate": 8.966786746317766e-05, "loss": 0.2426, "step": 12700 }, { "epoch": 1.7263744099969438, "grad_norm": 0.388761043548584, "learning_rate": 8.964380134854207e-05, "loss": 0.2402, "step": 12710 }, { "epoch": 1.7277326904139358, "grad_norm": 0.3967556059360504, "learning_rate": 8.961971047603251e-05, "loss": 0.2229, "step": 12720 }, { "epoch": 1.7290909708309279, "grad_norm": 0.5538721680641174, "learning_rate": 8.959559486069397e-05, "loss": 0.2349, "step": 12730 }, { "epoch": 1.7304492512479202, "grad_norm": 0.412199467420578, "learning_rate": 8.957145451758684e-05, "loss": 0.2368, "step": 12740 }, { "epoch": 1.7318075316649122, "grad_norm": 0.5580852031707764, "learning_rate": 8.954728946178694e-05, "loss": 0.2364, "step": 12750 }, { "epoch": 1.7331658120819045, "grad_norm": 0.4095515310764313, "learning_rate": 8.952309970838557e-05, "loss": 0.2405, "step": 12760 }, { "epoch": 1.7345240924988965, "grad_norm": 0.45170798897743225, "learning_rate": 8.949888527248939e-05, "loss": 0.2427, "step": 12770 }, { "epoch": 1.7358823729158885, "grad_norm": 0.45863354206085205, "learning_rate": 8.947464616922054e-05, "loss": 0.2415, "step": 12780 }, { "epoch": 1.7372406533328806, "grad_norm": 0.4179075360298157, "learning_rate": 8.945038241371654e-05, "loss": 0.2312, "step": 12790 }, { "epoch": 1.7385989337498726, "grad_norm": 0.3569994568824768, "learning_rate": 8.942609402113027e-05, "loss": 0.2374, "step": 12800 }, { "epoch": 1.7399572141668647, "grad_norm": 0.44350293278694153, "learning_rate": 8.940178100663005e-05, "loss": 0.2414, "step": 12810 }, { "epoch": 1.7413154945838567, "grad_norm": 0.4659176468849182, "learning_rate": 8.937744338539956e-05, "loss": 0.2501, "step": 12820 }, { "epoch": 1.742673775000849, "grad_norm": 0.46017318964004517, "learning_rate": 8.93530811726378e-05, "loss": 0.2377, "step": 12830 }, { "epoch": 1.744032055417841, "grad_norm": 2.1445960998535156, "learning_rate": 8.932869438355922e-05, "loss": 0.2432, "step": 12840 }, { "epoch": 1.745390335834833, "grad_norm": 1.1443910598754883, "learning_rate": 8.930428303339353e-05, "loss": 0.2323, "step": 12850 }, { "epoch": 1.7467486162518253, "grad_norm": 0.4064890444278717, "learning_rate": 8.927984713738584e-05, "loss": 0.2329, "step": 12860 }, { "epoch": 1.7481068966688174, "grad_norm": 0.6467783451080322, "learning_rate": 8.925538671079655e-05, "loss": 0.2459, "step": 12870 }, { "epoch": 1.7494651770858094, "grad_norm": 0.6551882028579712, "learning_rate": 8.923090176890141e-05, "loss": 0.2369, "step": 12880 }, { "epoch": 1.7508234575028014, "grad_norm": 0.5396952033042908, "learning_rate": 8.920639232699142e-05, "loss": 0.2337, "step": 12890 }, { "epoch": 1.7521817379197935, "grad_norm": 0.35799697041511536, "learning_rate": 8.918185840037298e-05, "loss": 0.233, "step": 12900 }, { "epoch": 1.7535400183367855, "grad_norm": 0.4546336233615875, "learning_rate": 8.91573000043677e-05, "loss": 0.2283, "step": 12910 }, { "epoch": 1.7548982987537776, "grad_norm": 0.6032233834266663, "learning_rate": 8.913271715431251e-05, "loss": 0.2442, "step": 12920 }, { "epoch": 1.7562565791707698, "grad_norm": 0.3092736005783081, "learning_rate": 8.910810986555956e-05, "loss": 0.2434, "step": 12930 }, { "epoch": 1.7576148595877619, "grad_norm": 0.3355954587459564, "learning_rate": 8.908347815347634e-05, "loss": 0.2465, "step": 12940 }, { "epoch": 1.7589731400047541, "grad_norm": 0.34593793749809265, "learning_rate": 8.905882203344553e-05, "loss": 0.2406, "step": 12950 }, { "epoch": 1.7603314204217462, "grad_norm": 0.5628035068511963, "learning_rate": 8.903414152086511e-05, "loss": 0.2386, "step": 12960 }, { "epoch": 1.7616897008387382, "grad_norm": 0.34180569648742676, "learning_rate": 8.90094366311482e-05, "loss": 0.2388, "step": 12970 }, { "epoch": 1.7630479812557303, "grad_norm": 0.3809289038181305, "learning_rate": 8.898470737972327e-05, "loss": 0.2348, "step": 12980 }, { "epoch": 1.7644062616727223, "grad_norm": 0.35012713074684143, "learning_rate": 8.895995378203389e-05, "loss": 0.249, "step": 12990 }, { "epoch": 1.7657645420897143, "grad_norm": 0.3079511225223541, "learning_rate": 8.89351758535389e-05, "loss": 0.229, "step": 13000 }, { "epoch": 1.7671228225067064, "grad_norm": 0.33352455496788025, "learning_rate": 8.89103736097123e-05, "loss": 0.2362, "step": 13010 }, { "epoch": 1.7684811029236986, "grad_norm": 0.3871811032295227, "learning_rate": 8.888554706604332e-05, "loss": 0.2414, "step": 13020 }, { "epoch": 1.7698393833406907, "grad_norm": 0.6309791207313538, "learning_rate": 8.886069623803631e-05, "loss": 0.239, "step": 13030 }, { "epoch": 1.7711976637576827, "grad_norm": 0.3470625579357147, "learning_rate": 8.883582114121083e-05, "loss": 0.2328, "step": 13040 }, { "epoch": 1.772555944174675, "grad_norm": 0.5532134771347046, "learning_rate": 8.881092179110158e-05, "loss": 0.2397, "step": 13050 }, { "epoch": 1.773914224591667, "grad_norm": 0.49158212542533875, "learning_rate": 8.87859982032584e-05, "loss": 0.2342, "step": 13060 }, { "epoch": 1.775272505008659, "grad_norm": 0.32140496373176575, "learning_rate": 8.876105039324624e-05, "loss": 0.2477, "step": 13070 }, { "epoch": 1.7766307854256511, "grad_norm": 0.408200740814209, "learning_rate": 8.873607837664526e-05, "loss": 0.2348, "step": 13080 }, { "epoch": 1.7779890658426432, "grad_norm": 0.5912440419197083, "learning_rate": 8.871108216905067e-05, "loss": 0.2364, "step": 13090 }, { "epoch": 1.7793473462596352, "grad_norm": 0.5071557760238647, "learning_rate": 8.868606178607278e-05, "loss": 0.2376, "step": 13100 }, { "epoch": 1.7807056266766272, "grad_norm": 0.3423488736152649, "learning_rate": 8.866101724333706e-05, "loss": 0.2396, "step": 13110 }, { "epoch": 1.7820639070936195, "grad_norm": 0.336873322725296, "learning_rate": 8.863594855648398e-05, "loss": 0.2398, "step": 13120 }, { "epoch": 1.7834221875106115, "grad_norm": 0.33317118883132935, "learning_rate": 8.861085574116919e-05, "loss": 0.2413, "step": 13130 }, { "epoch": 1.7847804679276038, "grad_norm": 0.4827631115913391, "learning_rate": 8.85857388130633e-05, "loss": 0.2439, "step": 13140 }, { "epoch": 1.7861387483445959, "grad_norm": 0.3797971308231354, "learning_rate": 8.85605977878521e-05, "loss": 0.2346, "step": 13150 }, { "epoch": 1.787497028761588, "grad_norm": 0.3406827449798584, "learning_rate": 8.853543268123632e-05, "loss": 0.2536, "step": 13160 }, { "epoch": 1.78885530917858, "grad_norm": 0.43208080530166626, "learning_rate": 8.851024350893177e-05, "loss": 0.236, "step": 13170 }, { "epoch": 1.790213589595572, "grad_norm": 0.25632062554359436, "learning_rate": 8.848503028666931e-05, "loss": 0.2561, "step": 13180 }, { "epoch": 1.791571870012564, "grad_norm": 0.5119814872741699, "learning_rate": 8.84597930301948e-05, "loss": 0.2375, "step": 13190 }, { "epoch": 1.792930150429556, "grad_norm": 0.4080025851726532, "learning_rate": 8.843453175526909e-05, "loss": 0.2406, "step": 13200 }, { "epoch": 1.7942884308465483, "grad_norm": 0.28889235854148865, "learning_rate": 8.840924647766806e-05, "loss": 0.2378, "step": 13210 }, { "epoch": 1.7956467112635404, "grad_norm": 0.31422796845436096, "learning_rate": 8.838393721318259e-05, "loss": 0.2496, "step": 13220 }, { "epoch": 1.7970049916805324, "grad_norm": 0.3150452971458435, "learning_rate": 8.835860397761848e-05, "loss": 0.2439, "step": 13230 }, { "epoch": 1.7983632720975247, "grad_norm": 0.30418115854263306, "learning_rate": 8.833324678679659e-05, "loss": 0.2395, "step": 13240 }, { "epoch": 1.7997215525145167, "grad_norm": 0.28686094284057617, "learning_rate": 8.830786565655265e-05, "loss": 0.2463, "step": 13250 }, { "epoch": 1.8010798329315088, "grad_norm": 0.6010620594024658, "learning_rate": 8.82824606027374e-05, "loss": 0.2398, "step": 13260 }, { "epoch": 1.8024381133485008, "grad_norm": 0.418920636177063, "learning_rate": 8.825703164121648e-05, "loss": 0.2384, "step": 13270 }, { "epoch": 1.8037963937654928, "grad_norm": 0.3237849175930023, "learning_rate": 8.823157878787051e-05, "loss": 0.2346, "step": 13280 }, { "epoch": 1.8051546741824849, "grad_norm": 0.7656481862068176, "learning_rate": 8.820610205859499e-05, "loss": 0.2361, "step": 13290 }, { "epoch": 1.806512954599477, "grad_norm": 0.39090290665626526, "learning_rate": 8.818060146930034e-05, "loss": 0.2403, "step": 13300 }, { "epoch": 1.8078712350164692, "grad_norm": 0.3946121335029602, "learning_rate": 8.815507703591188e-05, "loss": 0.2398, "step": 13310 }, { "epoch": 1.8092295154334612, "grad_norm": 0.4888049066066742, "learning_rate": 8.812952877436984e-05, "loss": 0.2351, "step": 13320 }, { "epoch": 1.8105877958504535, "grad_norm": 1.3449900150299072, "learning_rate": 8.810395670062931e-05, "loss": 0.236, "step": 13330 }, { "epoch": 1.8119460762674455, "grad_norm": 0.4427200257778168, "learning_rate": 8.807836083066024e-05, "loss": 0.233, "step": 13340 }, { "epoch": 1.8133043566844376, "grad_norm": 0.31534823775291443, "learning_rate": 8.805274118044748e-05, "loss": 0.2501, "step": 13350 }, { "epoch": 1.8146626371014296, "grad_norm": 0.4071117043495178, "learning_rate": 8.80270977659907e-05, "loss": 0.2373, "step": 13360 }, { "epoch": 1.8160209175184217, "grad_norm": 0.594148576259613, "learning_rate": 8.800143060330441e-05, "loss": 0.2448, "step": 13370 }, { "epoch": 1.8173791979354137, "grad_norm": 0.3715565502643585, "learning_rate": 8.797573970841798e-05, "loss": 0.2473, "step": 13380 }, { "epoch": 1.8187374783524057, "grad_norm": 0.43769571185112, "learning_rate": 8.795002509737556e-05, "loss": 0.2328, "step": 13390 }, { "epoch": 1.8200957587693978, "grad_norm": 0.4485446512699127, "learning_rate": 8.792428678623614e-05, "loss": 0.2395, "step": 13400 }, { "epoch": 1.82145403918639, "grad_norm": 0.47363510727882385, "learning_rate": 8.789852479107352e-05, "loss": 0.2354, "step": 13410 }, { "epoch": 1.822812319603382, "grad_norm": 0.6795073747634888, "learning_rate": 8.787273912797625e-05, "loss": 0.2373, "step": 13420 }, { "epoch": 1.8241706000203743, "grad_norm": 0.4995262026786804, "learning_rate": 8.78469298130477e-05, "loss": 0.2421, "step": 13430 }, { "epoch": 1.8255288804373664, "grad_norm": 0.38040482997894287, "learning_rate": 8.782109686240598e-05, "loss": 0.2328, "step": 13440 }, { "epoch": 1.8268871608543584, "grad_norm": 0.4416826665401459, "learning_rate": 8.779524029218399e-05, "loss": 0.2424, "step": 13450 }, { "epoch": 1.8282454412713505, "grad_norm": 0.5220518112182617, "learning_rate": 8.776936011852935e-05, "loss": 0.2435, "step": 13460 }, { "epoch": 1.8296037216883425, "grad_norm": 0.29444456100463867, "learning_rate": 8.774345635760446e-05, "loss": 0.2447, "step": 13470 }, { "epoch": 1.8309620021053346, "grad_norm": 0.3660382032394409, "learning_rate": 8.771752902558643e-05, "loss": 0.2447, "step": 13480 }, { "epoch": 1.8323202825223266, "grad_norm": 0.4636138677597046, "learning_rate": 8.769157813866705e-05, "loss": 0.2518, "step": 13490 }, { "epoch": 1.8336785629393189, "grad_norm": 0.2707512080669403, "learning_rate": 8.76656037130529e-05, "loss": 0.2462, "step": 13500 }, { "epoch": 1.835036843356311, "grad_norm": 0.35500267148017883, "learning_rate": 8.76396057649652e-05, "loss": 0.2427, "step": 13510 }, { "epoch": 1.8363951237733032, "grad_norm": 0.4943229854106903, "learning_rate": 8.761358431063989e-05, "loss": 0.238, "step": 13520 }, { "epoch": 1.8377534041902952, "grad_norm": 0.4867532551288605, "learning_rate": 8.758753936632755e-05, "loss": 0.2271, "step": 13530 }, { "epoch": 1.8391116846072872, "grad_norm": 0.5242833495140076, "learning_rate": 8.756147094829348e-05, "loss": 0.2376, "step": 13540 }, { "epoch": 1.8404699650242793, "grad_norm": 0.6108646988868713, "learning_rate": 8.753537907281763e-05, "loss": 0.2396, "step": 13550 }, { "epoch": 1.8418282454412713, "grad_norm": 0.32995206117630005, "learning_rate": 8.750926375619454e-05, "loss": 0.2417, "step": 13560 }, { "epoch": 1.8431865258582634, "grad_norm": 0.32687699794769287, "learning_rate": 8.74831250147335e-05, "loss": 0.2512, "step": 13570 }, { "epoch": 1.8445448062752554, "grad_norm": 0.5465505719184875, "learning_rate": 8.745696286475833e-05, "loss": 0.2332, "step": 13580 }, { "epoch": 1.8459030866922475, "grad_norm": 0.38558080792427063, "learning_rate": 8.74307773226075e-05, "loss": 0.2336, "step": 13590 }, { "epoch": 1.8472613671092397, "grad_norm": 0.3300583064556122, "learning_rate": 8.740456840463409e-05, "loss": 0.2401, "step": 13600 }, { "epoch": 1.8486196475262318, "grad_norm": 0.31553205847740173, "learning_rate": 8.737833612720578e-05, "loss": 0.2415, "step": 13610 }, { "epoch": 1.849977927943224, "grad_norm": 0.47764718532562256, "learning_rate": 8.735208050670487e-05, "loss": 0.2453, "step": 13620 }, { "epoch": 1.851336208360216, "grad_norm": 0.49447715282440186, "learning_rate": 8.732580155952819e-05, "loss": 0.2444, "step": 13630 }, { "epoch": 1.852694488777208, "grad_norm": 0.26222941279411316, "learning_rate": 8.729949930208714e-05, "loss": 0.242, "step": 13640 }, { "epoch": 1.8540527691942001, "grad_norm": 0.38448286056518555, "learning_rate": 8.72731737508077e-05, "loss": 0.2391, "step": 13650 }, { "epoch": 1.8554110496111922, "grad_norm": 0.36421218514442444, "learning_rate": 8.724682492213039e-05, "loss": 0.2312, "step": 13660 }, { "epoch": 1.8567693300281842, "grad_norm": 0.35362163186073303, "learning_rate": 8.722045283251025e-05, "loss": 0.241, "step": 13670 }, { "epoch": 1.8581276104451763, "grad_norm": 0.33185282349586487, "learning_rate": 8.719405749841691e-05, "loss": 0.238, "step": 13680 }, { "epoch": 1.8594858908621685, "grad_norm": 0.3736755847930908, "learning_rate": 8.716763893633444e-05, "loss": 0.2367, "step": 13690 }, { "epoch": 1.8608441712791606, "grad_norm": 0.32251736521720886, "learning_rate": 8.714119716276143e-05, "loss": 0.2394, "step": 13700 }, { "epoch": 1.8622024516961526, "grad_norm": 0.39299705624580383, "learning_rate": 8.7114732194211e-05, "loss": 0.2336, "step": 13710 }, { "epoch": 1.8635607321131449, "grad_norm": 0.3303115665912628, "learning_rate": 8.708824404721072e-05, "loss": 0.2446, "step": 13720 }, { "epoch": 1.864919012530137, "grad_norm": 0.3755153715610504, "learning_rate": 8.706173273830269e-05, "loss": 0.228, "step": 13730 }, { "epoch": 1.866277292947129, "grad_norm": 0.41286396980285645, "learning_rate": 8.703519828404338e-05, "loss": 0.2481, "step": 13740 }, { "epoch": 1.867635573364121, "grad_norm": 0.2921053171157837, "learning_rate": 8.700864070100383e-05, "loss": 0.2298, "step": 13750 }, { "epoch": 1.868993853781113, "grad_norm": 0.3696379065513611, "learning_rate": 8.698206000576943e-05, "loss": 0.2338, "step": 13760 }, { "epoch": 1.870352134198105, "grad_norm": 0.445083349943161, "learning_rate": 8.695545621494004e-05, "loss": 0.2406, "step": 13770 }, { "epoch": 1.8717104146150971, "grad_norm": 0.414634644985199, "learning_rate": 8.692882934512993e-05, "loss": 0.2354, "step": 13780 }, { "epoch": 1.8730686950320894, "grad_norm": 0.36477115750312805, "learning_rate": 8.690217941296783e-05, "loss": 0.2406, "step": 13790 }, { "epoch": 1.8744269754490814, "grad_norm": 0.32521480321884155, "learning_rate": 8.687550643509682e-05, "loss": 0.2477, "step": 13800 }, { "epoch": 1.8757852558660737, "grad_norm": 0.327962189912796, "learning_rate": 8.684881042817439e-05, "loss": 0.2402, "step": 13810 }, { "epoch": 1.8771435362830657, "grad_norm": 0.42625656723976135, "learning_rate": 8.68220914088724e-05, "loss": 0.2443, "step": 13820 }, { "epoch": 1.8785018167000578, "grad_norm": 0.3964630961418152, "learning_rate": 8.679534939387712e-05, "loss": 0.244, "step": 13830 }, { "epoch": 1.8798600971170498, "grad_norm": 0.5431462526321411, "learning_rate": 8.676858439988912e-05, "loss": 0.2289, "step": 13840 }, { "epoch": 1.8812183775340419, "grad_norm": 0.2543709874153137, "learning_rate": 8.67417964436234e-05, "loss": 0.2426, "step": 13850 }, { "epoch": 1.882576657951034, "grad_norm": 0.6753960847854614, "learning_rate": 8.671498554180921e-05, "loss": 0.2392, "step": 13860 }, { "epoch": 1.883934938368026, "grad_norm": 0.46066227555274963, "learning_rate": 8.668815171119021e-05, "loss": 0.2467, "step": 13870 }, { "epoch": 1.8852932187850182, "grad_norm": 0.3593063950538635, "learning_rate": 8.666129496852431e-05, "loss": 0.238, "step": 13880 }, { "epoch": 1.8866514992020103, "grad_norm": 0.32598215341567993, "learning_rate": 8.663441533058378e-05, "loss": 0.2345, "step": 13890 }, { "epoch": 1.8880097796190023, "grad_norm": 0.44575342535972595, "learning_rate": 8.660751281415519e-05, "loss": 0.2327, "step": 13900 }, { "epoch": 1.8893680600359946, "grad_norm": 0.34083518385887146, "learning_rate": 8.658058743603933e-05, "loss": 0.2394, "step": 13910 }, { "epoch": 1.8907263404529866, "grad_norm": 0.3874667286872864, "learning_rate": 8.655363921305137e-05, "loss": 0.246, "step": 13920 }, { "epoch": 1.8920846208699786, "grad_norm": 0.48084691166877747, "learning_rate": 8.652666816202066e-05, "loss": 0.2356, "step": 13930 }, { "epoch": 1.8934429012869707, "grad_norm": 0.4052474796772003, "learning_rate": 8.649967429979085e-05, "loss": 0.2448, "step": 13940 }, { "epoch": 1.8948011817039627, "grad_norm": 0.36070188879966736, "learning_rate": 8.647265764321981e-05, "loss": 0.2508, "step": 13950 }, { "epoch": 1.8961594621209548, "grad_norm": 1.455027461051941, "learning_rate": 8.644561820917969e-05, "loss": 0.2361, "step": 13960 }, { "epoch": 1.8975177425379468, "grad_norm": 0.5001794695854187, "learning_rate": 8.641855601455681e-05, "loss": 0.2496, "step": 13970 }, { "epoch": 1.898876022954939, "grad_norm": 0.417280375957489, "learning_rate": 8.639147107625174e-05, "loss": 0.2347, "step": 13980 }, { "epoch": 1.900234303371931, "grad_norm": 0.5158026814460754, "learning_rate": 8.636436341117923e-05, "loss": 0.2484, "step": 13990 }, { "epoch": 1.9015925837889234, "grad_norm": 0.3047603964805603, "learning_rate": 8.633723303626824e-05, "loss": 0.2353, "step": 14000 }, { "epoch": 1.9029508642059154, "grad_norm": 0.5949711799621582, "learning_rate": 8.631007996846193e-05, "loss": 0.2251, "step": 14010 }, { "epoch": 1.9043091446229075, "grad_norm": 0.5542662739753723, "learning_rate": 8.628290422471756e-05, "loss": 0.2364, "step": 14020 }, { "epoch": 1.9056674250398995, "grad_norm": 0.3102232813835144, "learning_rate": 8.625570582200666e-05, "loss": 0.2405, "step": 14030 }, { "epoch": 1.9070257054568915, "grad_norm": 0.3596298396587372, "learning_rate": 8.622848477731482e-05, "loss": 0.2455, "step": 14040 }, { "epoch": 1.9083839858738836, "grad_norm": 0.5188972353935242, "learning_rate": 8.620124110764181e-05, "loss": 0.238, "step": 14050 }, { "epoch": 1.9097422662908756, "grad_norm": 0.35516491532325745, "learning_rate": 8.617397483000151e-05, "loss": 0.2437, "step": 14060 }, { "epoch": 1.9111005467078679, "grad_norm": 0.43912559747695923, "learning_rate": 8.614668596142195e-05, "loss": 0.2394, "step": 14070 }, { "epoch": 1.91245882712486, "grad_norm": 0.5589343309402466, "learning_rate": 8.611937451894525e-05, "loss": 0.2392, "step": 14080 }, { "epoch": 1.913817107541852, "grad_norm": 0.3792573809623718, "learning_rate": 8.60920405196276e-05, "loss": 0.2389, "step": 14090 }, { "epoch": 1.9151753879588442, "grad_norm": 0.47707757353782654, "learning_rate": 8.60646839805393e-05, "loss": 0.2412, "step": 14100 }, { "epoch": 1.9165336683758363, "grad_norm": 0.33512479066848755, "learning_rate": 8.603730491876479e-05, "loss": 0.2288, "step": 14110 }, { "epoch": 1.9178919487928283, "grad_norm": 0.3264877498149872, "learning_rate": 8.600990335140243e-05, "loss": 0.2442, "step": 14120 }, { "epoch": 1.9192502292098204, "grad_norm": 0.7492736577987671, "learning_rate": 8.598247929556479e-05, "loss": 0.2403, "step": 14130 }, { "epoch": 1.9206085096268124, "grad_norm": 0.41925838589668274, "learning_rate": 8.595503276837837e-05, "loss": 0.2361, "step": 14140 }, { "epoch": 1.9219667900438044, "grad_norm": 0.3393121063709259, "learning_rate": 8.592756378698378e-05, "loss": 0.2436, "step": 14150 }, { "epoch": 1.9233250704607965, "grad_norm": 0.5019286274909973, "learning_rate": 8.590007236853559e-05, "loss": 0.2286, "step": 14160 }, { "epoch": 1.9246833508777887, "grad_norm": 0.47932615876197815, "learning_rate": 8.587255853020244e-05, "loss": 0.243, "step": 14170 }, { "epoch": 1.9260416312947808, "grad_norm": 0.35999608039855957, "learning_rate": 8.584502228916693e-05, "loss": 0.248, "step": 14180 }, { "epoch": 1.927399911711773, "grad_norm": 0.37379515171051025, "learning_rate": 8.581746366262566e-05, "loss": 0.2351, "step": 14190 }, { "epoch": 1.928758192128765, "grad_norm": 0.3385591208934784, "learning_rate": 8.578988266778923e-05, "loss": 0.2492, "step": 14200 }, { "epoch": 1.9301164725457571, "grad_norm": 0.7545274496078491, "learning_rate": 8.576227932188218e-05, "loss": 0.2463, "step": 14210 }, { "epoch": 1.9314747529627492, "grad_norm": 0.45451533794403076, "learning_rate": 8.573465364214303e-05, "loss": 0.2372, "step": 14220 }, { "epoch": 1.9328330333797412, "grad_norm": 0.3137250244617462, "learning_rate": 8.570700564582423e-05, "loss": 0.2455, "step": 14230 }, { "epoch": 1.9341913137967333, "grad_norm": 0.33765143156051636, "learning_rate": 8.567933535019219e-05, "loss": 0.244, "step": 14240 }, { "epoch": 1.9355495942137253, "grad_norm": 0.4192339777946472, "learning_rate": 8.565164277252724e-05, "loss": 0.2412, "step": 14250 }, { "epoch": 1.9369078746307176, "grad_norm": 0.2957611382007599, "learning_rate": 8.56239279301236e-05, "loss": 0.2337, "step": 14260 }, { "epoch": 1.9382661550477096, "grad_norm": 0.37260550260543823, "learning_rate": 8.55961908402894e-05, "loss": 0.2319, "step": 14270 }, { "epoch": 1.9396244354647016, "grad_norm": 0.3279188275337219, "learning_rate": 8.556843152034672e-05, "loss": 0.2443, "step": 14280 }, { "epoch": 1.940982715881694, "grad_norm": 0.28557059168815613, "learning_rate": 8.554064998763144e-05, "loss": 0.2335, "step": 14290 }, { "epoch": 1.942340996298686, "grad_norm": 0.4878370463848114, "learning_rate": 8.551284625949338e-05, "loss": 0.2312, "step": 14300 }, { "epoch": 1.943699276715678, "grad_norm": 0.6017899513244629, "learning_rate": 8.548502035329616e-05, "loss": 0.2374, "step": 14310 }, { "epoch": 1.94505755713267, "grad_norm": 0.33437126874923706, "learning_rate": 8.54571722864173e-05, "loss": 0.2374, "step": 14320 }, { "epoch": 1.946415837549662, "grad_norm": 0.828488826751709, "learning_rate": 8.542930207624817e-05, "loss": 0.2303, "step": 14330 }, { "epoch": 1.9477741179666541, "grad_norm": 0.8462080955505371, "learning_rate": 8.540140974019389e-05, "loss": 0.2365, "step": 14340 }, { "epoch": 1.9491323983836462, "grad_norm": 0.2966611087322235, "learning_rate": 8.537349529567349e-05, "loss": 0.2315, "step": 14350 }, { "epoch": 1.9504906788006384, "grad_norm": 0.3532332181930542, "learning_rate": 8.534555876011973e-05, "loss": 0.2375, "step": 14360 }, { "epoch": 1.9518489592176305, "grad_norm": 0.5266355276107788, "learning_rate": 8.531760015097924e-05, "loss": 0.2421, "step": 14370 }, { "epoch": 1.9532072396346227, "grad_norm": 0.6208711862564087, "learning_rate": 8.528961948571239e-05, "loss": 0.2453, "step": 14380 }, { "epoch": 1.9545655200516148, "grad_norm": 0.3167774975299835, "learning_rate": 8.526161678179329e-05, "loss": 0.2338, "step": 14390 }, { "epoch": 1.9559238004686068, "grad_norm": 0.47262799739837646, "learning_rate": 8.523359205670991e-05, "loss": 0.2367, "step": 14400 }, { "epoch": 1.9572820808855989, "grad_norm": 0.3526477515697479, "learning_rate": 8.520554532796389e-05, "loss": 0.2312, "step": 14410 }, { "epoch": 1.958640361302591, "grad_norm": 0.340472936630249, "learning_rate": 8.517747661307063e-05, "loss": 0.2366, "step": 14420 }, { "epoch": 1.959998641719583, "grad_norm": 0.34723782539367676, "learning_rate": 8.514938592955929e-05, "loss": 0.2429, "step": 14430 }, { "epoch": 1.961356922136575, "grad_norm": 0.3824019432067871, "learning_rate": 8.51212732949727e-05, "loss": 0.2468, "step": 14440 }, { "epoch": 1.962715202553567, "grad_norm": 0.3525373041629791, "learning_rate": 8.509313872686746e-05, "loss": 0.2477, "step": 14450 }, { "epoch": 1.9640734829705593, "grad_norm": 0.886265218257904, "learning_rate": 8.506498224281383e-05, "loss": 0.2361, "step": 14460 }, { "epoch": 1.9654317633875513, "grad_norm": 0.34379270672798157, "learning_rate": 8.503680386039575e-05, "loss": 0.2342, "step": 14470 }, { "epoch": 1.9667900438045436, "grad_norm": 0.3149538040161133, "learning_rate": 8.500860359721084e-05, "loss": 0.231, "step": 14480 }, { "epoch": 1.9681483242215356, "grad_norm": 0.6826171278953552, "learning_rate": 8.49803814708704e-05, "loss": 0.2334, "step": 14490 }, { "epoch": 1.9695066046385277, "grad_norm": 0.9640529155731201, "learning_rate": 8.495213749899938e-05, "loss": 0.2381, "step": 14500 }, { "epoch": 1.9708648850555197, "grad_norm": 0.2922406494617462, "learning_rate": 8.492387169923635e-05, "loss": 0.2274, "step": 14510 }, { "epoch": 1.9722231654725118, "grad_norm": 0.3545210063457489, "learning_rate": 8.489558408923357e-05, "loss": 0.2358, "step": 14520 }, { "epoch": 1.9735814458895038, "grad_norm": 0.33676913380622864, "learning_rate": 8.486727468665686e-05, "loss": 0.2413, "step": 14530 }, { "epoch": 1.9749397263064958, "grad_norm": 0.7061485052108765, "learning_rate": 8.483894350918565e-05, "loss": 0.2354, "step": 14540 }, { "epoch": 1.976298006723488, "grad_norm": 0.2854490280151367, "learning_rate": 8.481059057451301e-05, "loss": 0.2434, "step": 14550 }, { "epoch": 1.9776562871404801, "grad_norm": 0.3676334023475647, "learning_rate": 8.478221590034556e-05, "loss": 0.2413, "step": 14560 }, { "epoch": 1.9790145675574724, "grad_norm": 0.31399813294410706, "learning_rate": 8.475381950440354e-05, "loss": 0.2387, "step": 14570 }, { "epoch": 1.9803728479744644, "grad_norm": 0.4124453067779541, "learning_rate": 8.472540140442072e-05, "loss": 0.2455, "step": 14580 }, { "epoch": 1.9817311283914565, "grad_norm": 0.28971201181411743, "learning_rate": 8.469696161814441e-05, "loss": 0.2311, "step": 14590 }, { "epoch": 1.9830894088084485, "grad_norm": 0.31198179721832275, "learning_rate": 8.466850016333552e-05, "loss": 0.2404, "step": 14600 }, { "epoch": 1.9844476892254406, "grad_norm": 0.7683231830596924, "learning_rate": 8.464001705776842e-05, "loss": 0.2326, "step": 14610 }, { "epoch": 1.9858059696424326, "grad_norm": 0.29524800181388855, "learning_rate": 8.461151231923107e-05, "loss": 0.2378, "step": 14620 }, { "epoch": 1.9871642500594247, "grad_norm": 0.6258348226547241, "learning_rate": 8.45829859655249e-05, "loss": 0.2318, "step": 14630 }, { "epoch": 1.9885225304764167, "grad_norm": 0.5115181803703308, "learning_rate": 8.455443801446486e-05, "loss": 0.2373, "step": 14640 }, { "epoch": 1.989880810893409, "grad_norm": 0.37546342611312866, "learning_rate": 8.452586848387936e-05, "loss": 0.2374, "step": 14650 }, { "epoch": 1.991239091310401, "grad_norm": 0.962464451789856, "learning_rate": 8.449727739161028e-05, "loss": 0.2401, "step": 14660 }, { "epoch": 1.9925973717273933, "grad_norm": 0.41409891843795776, "learning_rate": 8.446866475551305e-05, "loss": 0.2498, "step": 14670 }, { "epoch": 1.9939556521443853, "grad_norm": 0.4057100713253021, "learning_rate": 8.444003059345644e-05, "loss": 0.2415, "step": 14680 }, { "epoch": 1.9953139325613773, "grad_norm": 0.2734403908252716, "learning_rate": 8.441137492332272e-05, "loss": 0.236, "step": 14690 }, { "epoch": 1.9966722129783694, "grad_norm": 0.4394242465496063, "learning_rate": 8.438269776300762e-05, "loss": 0.236, "step": 14700 }, { "epoch": 1.9980304933953614, "grad_norm": 0.319155216217041, "learning_rate": 8.435399913042023e-05, "loss": 0.2395, "step": 14710 }, { "epoch": 1.9993887738123535, "grad_norm": 0.4020504951477051, "learning_rate": 8.432527904348308e-05, "loss": 0.2401, "step": 14720 }, { "epoch": 2.0007470542293455, "grad_norm": 0.5737829208374023, "learning_rate": 8.42965375201321e-05, "loss": 0.2347, "step": 14730 }, { "epoch": 2.0021053346463376, "grad_norm": 0.40510350465774536, "learning_rate": 8.42677745783166e-05, "loss": 0.2285, "step": 14740 }, { "epoch": 2.00346361506333, "grad_norm": 0.3309233784675598, "learning_rate": 8.42389902359993e-05, "loss": 0.2103, "step": 14750 }, { "epoch": 2.004821895480322, "grad_norm": 0.7688928246498108, "learning_rate": 8.42101845111562e-05, "loss": 0.2215, "step": 14760 }, { "epoch": 2.006180175897314, "grad_norm": 0.30741849541664124, "learning_rate": 8.418135742177674e-05, "loss": 0.2265, "step": 14770 }, { "epoch": 2.007538456314306, "grad_norm": 0.33003556728363037, "learning_rate": 8.415250898586367e-05, "loss": 0.2132, "step": 14780 }, { "epoch": 2.008896736731298, "grad_norm": 0.3745386004447937, "learning_rate": 8.412363922143308e-05, "loss": 0.2159, "step": 14790 }, { "epoch": 2.0102550171482902, "grad_norm": 0.27881401777267456, "learning_rate": 8.409474814651433e-05, "loss": 0.226, "step": 14800 }, { "epoch": 2.0116132975652823, "grad_norm": 0.33334100246429443, "learning_rate": 8.406583577915017e-05, "loss": 0.2143, "step": 14810 }, { "epoch": 2.0129715779822743, "grad_norm": 0.3435106873512268, "learning_rate": 8.403690213739658e-05, "loss": 0.214, "step": 14820 }, { "epoch": 2.0143298583992664, "grad_norm": 0.4243570566177368, "learning_rate": 8.400794723932289e-05, "loss": 0.2173, "step": 14830 }, { "epoch": 2.0156881388162584, "grad_norm": 0.31727415323257446, "learning_rate": 8.397897110301163e-05, "loss": 0.227, "step": 14840 }, { "epoch": 2.017046419233251, "grad_norm": 0.3069789707660675, "learning_rate": 8.394997374655865e-05, "loss": 0.224, "step": 14850 }, { "epoch": 2.018404699650243, "grad_norm": 0.3419307768344879, "learning_rate": 8.392095518807304e-05, "loss": 0.2129, "step": 14860 }, { "epoch": 2.019762980067235, "grad_norm": 0.40169134736061096, "learning_rate": 8.38919154456771e-05, "loss": 0.2344, "step": 14870 }, { "epoch": 2.021121260484227, "grad_norm": 0.4769253432750702, "learning_rate": 8.386285453750638e-05, "loss": 0.2234, "step": 14880 }, { "epoch": 2.022479540901219, "grad_norm": 0.3165617287158966, "learning_rate": 8.38337724817097e-05, "loss": 0.2244, "step": 14890 }, { "epoch": 2.023837821318211, "grad_norm": 0.30390650033950806, "learning_rate": 8.380466929644896e-05, "loss": 0.2196, "step": 14900 }, { "epoch": 2.025196101735203, "grad_norm": 0.3633367419242859, "learning_rate": 8.377554499989941e-05, "loss": 0.2171, "step": 14910 }, { "epoch": 2.026554382152195, "grad_norm": 0.36349353194236755, "learning_rate": 8.37463996102494e-05, "loss": 0.2334, "step": 14920 }, { "epoch": 2.0279126625691872, "grad_norm": 0.3197133243083954, "learning_rate": 8.37172331457004e-05, "loss": 0.2259, "step": 14930 }, { "epoch": 2.0292709429861797, "grad_norm": 0.36091211438179016, "learning_rate": 8.368804562446717e-05, "loss": 0.229, "step": 14940 }, { "epoch": 2.0306292234031718, "grad_norm": 0.27034950256347656, "learning_rate": 8.365883706477751e-05, "loss": 0.2254, "step": 14950 }, { "epoch": 2.031987503820164, "grad_norm": 0.37883803248405457, "learning_rate": 8.36296074848724e-05, "loss": 0.2173, "step": 14960 }, { "epoch": 2.033345784237156, "grad_norm": 0.3809976875782013, "learning_rate": 8.3600356903006e-05, "loss": 0.2274, "step": 14970 }, { "epoch": 2.034704064654148, "grad_norm": 0.3609301447868347, "learning_rate": 8.357108533744547e-05, "loss": 0.2192, "step": 14980 }, { "epoch": 2.03606234507114, "grad_norm": 0.37505221366882324, "learning_rate": 8.354179280647118e-05, "loss": 0.2162, "step": 14990 }, { "epoch": 2.037420625488132, "grad_norm": 0.3272266983985901, "learning_rate": 8.351247932837655e-05, "loss": 0.2336, "step": 15000 }, { "epoch": 2.038778905905124, "grad_norm": 0.37192732095718384, "learning_rate": 8.348314492146806e-05, "loss": 0.2165, "step": 15010 }, { "epoch": 2.040137186322116, "grad_norm": 0.4183749854564667, "learning_rate": 8.34537896040653e-05, "loss": 0.2131, "step": 15020 }, { "epoch": 2.041495466739108, "grad_norm": 0.36245858669281006, "learning_rate": 8.34244133945009e-05, "loss": 0.2102, "step": 15030 }, { "epoch": 2.0428537471561006, "grad_norm": 0.37136727571487427, "learning_rate": 8.339501631112054e-05, "loss": 0.2165, "step": 15040 }, { "epoch": 2.0442120275730926, "grad_norm": 0.43313634395599365, "learning_rate": 8.336559837228294e-05, "loss": 0.219, "step": 15050 }, { "epoch": 2.0455703079900847, "grad_norm": 2.265929937362671, "learning_rate": 8.333615959635983e-05, "loss": 0.2181, "step": 15060 }, { "epoch": 2.0469285884070767, "grad_norm": 0.34179550409317017, "learning_rate": 8.330670000173597e-05, "loss": 0.2249, "step": 15070 }, { "epoch": 2.0482868688240687, "grad_norm": 0.47437140345573425, "learning_rate": 8.327721960680912e-05, "loss": 0.2193, "step": 15080 }, { "epoch": 2.049645149241061, "grad_norm": 0.32911449670791626, "learning_rate": 8.324771842999002e-05, "loss": 0.2307, "step": 15090 }, { "epoch": 2.051003429658053, "grad_norm": 0.3329635560512543, "learning_rate": 8.32181964897024e-05, "loss": 0.2203, "step": 15100 }, { "epoch": 2.052361710075045, "grad_norm": 0.33285364508628845, "learning_rate": 8.318865380438292e-05, "loss": 0.2177, "step": 15110 }, { "epoch": 2.053719990492037, "grad_norm": 0.3382975459098816, "learning_rate": 8.315909039248125e-05, "loss": 0.2226, "step": 15120 }, { "epoch": 2.0550782709090294, "grad_norm": 0.5013484954833984, "learning_rate": 8.312950627245999e-05, "loss": 0.2161, "step": 15130 }, { "epoch": 2.0564365513260214, "grad_norm": 0.3559269309043884, "learning_rate": 8.309990146279464e-05, "loss": 0.2123, "step": 15140 }, { "epoch": 2.0577948317430135, "grad_norm": 0.34989362955093384, "learning_rate": 8.307027598197366e-05, "loss": 0.2246, "step": 15150 }, { "epoch": 2.0591531121600055, "grad_norm": 0.34205755591392517, "learning_rate": 8.304062984849838e-05, "loss": 0.2052, "step": 15160 }, { "epoch": 2.0605113925769976, "grad_norm": 0.41371703147888184, "learning_rate": 8.301096308088306e-05, "loss": 0.2136, "step": 15170 }, { "epoch": 2.0618696729939896, "grad_norm": 0.4233803451061249, "learning_rate": 8.298127569765486e-05, "loss": 0.2182, "step": 15180 }, { "epoch": 2.0632279534109816, "grad_norm": 0.31219545006752014, "learning_rate": 8.295156771735374e-05, "loss": 0.2164, "step": 15190 }, { "epoch": 2.0645862338279737, "grad_norm": 0.5075352191925049, "learning_rate": 8.292183915853262e-05, "loss": 0.2215, "step": 15200 }, { "epoch": 2.0659445142449657, "grad_norm": 0.494703084230423, "learning_rate": 8.289209003975722e-05, "loss": 0.2205, "step": 15210 }, { "epoch": 2.0673027946619578, "grad_norm": 0.39021196961402893, "learning_rate": 8.286232037960606e-05, "loss": 0.2185, "step": 15220 }, { "epoch": 2.0686610750789503, "grad_norm": 0.4948576092720032, "learning_rate": 8.28325301966706e-05, "loss": 0.2314, "step": 15230 }, { "epoch": 2.0700193554959423, "grad_norm": 0.7706882953643799, "learning_rate": 8.280271950955499e-05, "loss": 0.2305, "step": 15240 }, { "epoch": 2.0713776359129343, "grad_norm": 0.3254872262477875, "learning_rate": 8.277288833687628e-05, "loss": 0.2152, "step": 15250 }, { "epoch": 2.0727359163299264, "grad_norm": 0.47733134031295776, "learning_rate": 8.274303669726426e-05, "loss": 0.2236, "step": 15260 }, { "epoch": 2.0740941967469184, "grad_norm": 0.27645909786224365, "learning_rate": 8.271316460936152e-05, "loss": 0.2223, "step": 15270 }, { "epoch": 2.0754524771639105, "grad_norm": 0.3581974506378174, "learning_rate": 8.268327209182345e-05, "loss": 0.2295, "step": 15280 }, { "epoch": 2.0768107575809025, "grad_norm": 0.31064373254776, "learning_rate": 8.265335916331813e-05, "loss": 0.2265, "step": 15290 }, { "epoch": 2.0781690379978945, "grad_norm": 0.4233199954032898, "learning_rate": 8.262342584252644e-05, "loss": 0.2325, "step": 15300 }, { "epoch": 2.0795273184148866, "grad_norm": 0.5160502195358276, "learning_rate": 8.2593472148142e-05, "loss": 0.218, "step": 15310 }, { "epoch": 2.0808855988318786, "grad_norm": 0.348518431186676, "learning_rate": 8.256349809887111e-05, "loss": 0.2238, "step": 15320 }, { "epoch": 2.082243879248871, "grad_norm": 0.3518754839897156, "learning_rate": 8.25335037134328e-05, "loss": 0.2234, "step": 15330 }, { "epoch": 2.083602159665863, "grad_norm": 0.28069326281547546, "learning_rate": 8.250348901055882e-05, "loss": 0.2197, "step": 15340 }, { "epoch": 2.084960440082855, "grad_norm": 0.36871790885925293, "learning_rate": 8.24734540089936e-05, "loss": 0.2189, "step": 15350 }, { "epoch": 2.0863187204998472, "grad_norm": 0.3161095976829529, "learning_rate": 8.244339872749423e-05, "loss": 0.2208, "step": 15360 }, { "epoch": 2.0876770009168393, "grad_norm": 0.41570502519607544, "learning_rate": 8.241332318483047e-05, "loss": 0.2197, "step": 15370 }, { "epoch": 2.0890352813338313, "grad_norm": 0.4342082142829895, "learning_rate": 8.238322739978475e-05, "loss": 0.2212, "step": 15380 }, { "epoch": 2.0903935617508234, "grad_norm": 0.7536396384239197, "learning_rate": 8.235311139115214e-05, "loss": 0.2163, "step": 15390 }, { "epoch": 2.0917518421678154, "grad_norm": 0.2658962309360504, "learning_rate": 8.23229751777403e-05, "loss": 0.2197, "step": 15400 }, { "epoch": 2.0931101225848074, "grad_norm": 0.26460951566696167, "learning_rate": 8.229281877836956e-05, "loss": 0.2225, "step": 15410 }, { "epoch": 2.0944684030018, "grad_norm": 0.3321557939052582, "learning_rate": 8.226264221187285e-05, "loss": 0.2305, "step": 15420 }, { "epoch": 2.095826683418792, "grad_norm": 0.3588963449001312, "learning_rate": 8.223244549709566e-05, "loss": 0.211, "step": 15430 }, { "epoch": 2.097184963835784, "grad_norm": 0.36711928248405457, "learning_rate": 8.220222865289607e-05, "loss": 0.2235, "step": 15440 }, { "epoch": 2.098543244252776, "grad_norm": 1.0615520477294922, "learning_rate": 8.217199169814476e-05, "loss": 0.2244, "step": 15450 }, { "epoch": 2.099901524669768, "grad_norm": 0.352327823638916, "learning_rate": 8.214173465172495e-05, "loss": 0.2321, "step": 15460 }, { "epoch": 2.10125980508676, "grad_norm": 0.3782016932964325, "learning_rate": 8.21114575325324e-05, "loss": 0.2196, "step": 15470 }, { "epoch": 2.102618085503752, "grad_norm": 0.3188062310218811, "learning_rate": 8.208116035947543e-05, "loss": 0.2223, "step": 15480 }, { "epoch": 2.103976365920744, "grad_norm": 0.3331490457057953, "learning_rate": 8.205084315147487e-05, "loss": 0.2283, "step": 15490 }, { "epoch": 2.1053346463377363, "grad_norm": 0.3380565643310547, "learning_rate": 8.202050592746404e-05, "loss": 0.216, "step": 15500 }, { "epoch": 2.1066929267547283, "grad_norm": 0.2957261800765991, "learning_rate": 8.19901487063888e-05, "loss": 0.2081, "step": 15510 }, { "epoch": 2.108051207171721, "grad_norm": 0.35792115330696106, "learning_rate": 8.195977150720748e-05, "loss": 0.2219, "step": 15520 }, { "epoch": 2.109409487588713, "grad_norm": 0.32785937190055847, "learning_rate": 8.192937434889088e-05, "loss": 0.2311, "step": 15530 }, { "epoch": 2.110767768005705, "grad_norm": 0.2785061001777649, "learning_rate": 8.189895725042229e-05, "loss": 0.2209, "step": 15540 }, { "epoch": 2.112126048422697, "grad_norm": 0.28211286664009094, "learning_rate": 8.186852023079739e-05, "loss": 0.224, "step": 15550 }, { "epoch": 2.113484328839689, "grad_norm": 0.3302210569381714, "learning_rate": 8.183806330902438e-05, "loss": 0.2309, "step": 15560 }, { "epoch": 2.114842609256681, "grad_norm": 0.3360493779182434, "learning_rate": 8.180758650412383e-05, "loss": 0.2364, "step": 15570 }, { "epoch": 2.116200889673673, "grad_norm": 0.311196893453598, "learning_rate": 8.177708983512877e-05, "loss": 0.2129, "step": 15580 }, { "epoch": 2.117559170090665, "grad_norm": 0.33928048610687256, "learning_rate": 8.174657332108462e-05, "loss": 0.2173, "step": 15590 }, { "epoch": 2.118917450507657, "grad_norm": 0.2910972833633423, "learning_rate": 8.171603698104917e-05, "loss": 0.2215, "step": 15600 }, { "epoch": 2.1202757309246496, "grad_norm": 0.4158949851989746, "learning_rate": 8.168548083409264e-05, "loss": 0.2181, "step": 15610 }, { "epoch": 2.1216340113416416, "grad_norm": 0.3833238482475281, "learning_rate": 8.165490489929754e-05, "loss": 0.2187, "step": 15620 }, { "epoch": 2.1229922917586337, "grad_norm": 0.36514586210250854, "learning_rate": 8.162430919575883e-05, "loss": 0.2399, "step": 15630 }, { "epoch": 2.1243505721756257, "grad_norm": 0.31345289945602417, "learning_rate": 8.159369374258377e-05, "loss": 0.2266, "step": 15640 }, { "epoch": 2.1257088525926178, "grad_norm": 0.6269563436508179, "learning_rate": 8.156305855889195e-05, "loss": 0.232, "step": 15650 }, { "epoch": 2.12706713300961, "grad_norm": 0.5642430782318115, "learning_rate": 8.153240366381528e-05, "loss": 0.2191, "step": 15660 }, { "epoch": 2.128425413426602, "grad_norm": 0.3308405578136444, "learning_rate": 8.1501729076498e-05, "loss": 0.2289, "step": 15670 }, { "epoch": 2.129783693843594, "grad_norm": 0.4664577543735504, "learning_rate": 8.147103481609664e-05, "loss": 0.2224, "step": 15680 }, { "epoch": 2.131141974260586, "grad_norm": 0.4744555652141571, "learning_rate": 8.144032090178002e-05, "loss": 0.2199, "step": 15690 }, { "epoch": 2.132500254677578, "grad_norm": 0.3077816069126129, "learning_rate": 8.14095873527292e-05, "loss": 0.2172, "step": 15700 }, { "epoch": 2.1338585350945705, "grad_norm": 0.4103671908378601, "learning_rate": 8.137883418813756e-05, "loss": 0.2247, "step": 15710 }, { "epoch": 2.1352168155115625, "grad_norm": 0.2833380103111267, "learning_rate": 8.134806142721067e-05, "loss": 0.2201, "step": 15720 }, { "epoch": 2.1365750959285545, "grad_norm": 0.862818717956543, "learning_rate": 8.131726908916639e-05, "loss": 0.2136, "step": 15730 }, { "epoch": 2.1379333763455466, "grad_norm": 0.38952168822288513, "learning_rate": 8.128645719323478e-05, "loss": 0.223, "step": 15740 }, { "epoch": 2.1392916567625386, "grad_norm": 0.3778298795223236, "learning_rate": 8.125562575865813e-05, "loss": 0.215, "step": 15750 }, { "epoch": 2.1406499371795307, "grad_norm": 0.39475059509277344, "learning_rate": 8.122477480469089e-05, "loss": 0.2265, "step": 15760 }, { "epoch": 2.1420082175965227, "grad_norm": 0.337980180978775, "learning_rate": 8.119390435059973e-05, "loss": 0.2173, "step": 15770 }, { "epoch": 2.1433664980135148, "grad_norm": 0.5466394424438477, "learning_rate": 8.116301441566354e-05, "loss": 0.2205, "step": 15780 }, { "epoch": 2.144724778430507, "grad_norm": 0.31856563687324524, "learning_rate": 8.113210501917327e-05, "loss": 0.2282, "step": 15790 }, { "epoch": 2.1460830588474993, "grad_norm": 0.3726279139518738, "learning_rate": 8.110117618043216e-05, "loss": 0.2234, "step": 15800 }, { "epoch": 2.1474413392644913, "grad_norm": 0.44323012232780457, "learning_rate": 8.107022791875546e-05, "loss": 0.2359, "step": 15810 }, { "epoch": 2.1487996196814834, "grad_norm": 0.36587512493133545, "learning_rate": 8.103926025347066e-05, "loss": 0.2197, "step": 15820 }, { "epoch": 2.1501579000984754, "grad_norm": 0.35130101442337036, "learning_rate": 8.100827320391726e-05, "loss": 0.2175, "step": 15830 }, { "epoch": 2.1515161805154674, "grad_norm": 0.6085388660430908, "learning_rate": 8.097726678944697e-05, "loss": 0.2276, "step": 15840 }, { "epoch": 2.1528744609324595, "grad_norm": 0.3278145492076874, "learning_rate": 8.094624102942352e-05, "loss": 0.2175, "step": 15850 }, { "epoch": 2.1542327413494515, "grad_norm": 0.3350791335105896, "learning_rate": 8.091519594322275e-05, "loss": 0.2252, "step": 15860 }, { "epoch": 2.1555910217664436, "grad_norm": 0.30996114015579224, "learning_rate": 8.088413155023259e-05, "loss": 0.2217, "step": 15870 }, { "epoch": 2.1569493021834356, "grad_norm": 0.34789976477622986, "learning_rate": 8.085304786985297e-05, "loss": 0.2275, "step": 15880 }, { "epoch": 2.1583075826004277, "grad_norm": 0.3841097950935364, "learning_rate": 8.082194492149593e-05, "loss": 0.2074, "step": 15890 }, { "epoch": 2.15966586301742, "grad_norm": 0.35532334446907043, "learning_rate": 8.079082272458548e-05, "loss": 0.2274, "step": 15900 }, { "epoch": 2.161024143434412, "grad_norm": 0.6634958982467651, "learning_rate": 8.075968129855771e-05, "loss": 0.2218, "step": 15910 }, { "epoch": 2.1623824238514042, "grad_norm": 0.47083255648612976, "learning_rate": 8.07285206628607e-05, "loss": 0.2195, "step": 15920 }, { "epoch": 2.1637407042683963, "grad_norm": 0.32020655274391174, "learning_rate": 8.069734083695449e-05, "loss": 0.2277, "step": 15930 }, { "epoch": 2.1650989846853883, "grad_norm": 0.29828450083732605, "learning_rate": 8.066614184031113e-05, "loss": 0.2262, "step": 15940 }, { "epoch": 2.1664572651023803, "grad_norm": 0.4296570420265198, "learning_rate": 8.063492369241467e-05, "loss": 0.2201, "step": 15950 }, { "epoch": 2.1678155455193724, "grad_norm": 0.36955979466438293, "learning_rate": 8.060368641276108e-05, "loss": 0.2156, "step": 15960 }, { "epoch": 2.1691738259363644, "grad_norm": 0.3432427942752838, "learning_rate": 8.05724300208583e-05, "loss": 0.2122, "step": 15970 }, { "epoch": 2.1705321063533565, "grad_norm": 0.3438816964626312, "learning_rate": 8.054115453622621e-05, "loss": 0.237, "step": 15980 }, { "epoch": 2.1718903867703485, "grad_norm": 0.3948029577732086, "learning_rate": 8.050985997839658e-05, "loss": 0.2209, "step": 15990 }, { "epoch": 2.173248667187341, "grad_norm": 0.36971116065979004, "learning_rate": 8.047854636691314e-05, "loss": 0.2273, "step": 16000 }, { "epoch": 2.174606947604333, "grad_norm": 0.34923842549324036, "learning_rate": 8.044721372133146e-05, "loss": 0.226, "step": 16010 }, { "epoch": 2.175965228021325, "grad_norm": 0.5350345373153687, "learning_rate": 8.041586206121905e-05, "loss": 0.2094, "step": 16020 }, { "epoch": 2.177323508438317, "grad_norm": 0.3235137462615967, "learning_rate": 8.038449140615528e-05, "loss": 0.2269, "step": 16030 }, { "epoch": 2.178681788855309, "grad_norm": 0.3839135766029358, "learning_rate": 8.035310177573135e-05, "loss": 0.2243, "step": 16040 }, { "epoch": 2.180040069272301, "grad_norm": 0.4759887754917145, "learning_rate": 8.032169318955039e-05, "loss": 0.2152, "step": 16050 }, { "epoch": 2.1813983496892932, "grad_norm": 0.41388922929763794, "learning_rate": 8.029026566722726e-05, "loss": 0.2271, "step": 16060 }, { "epoch": 2.1827566301062853, "grad_norm": 0.35182181000709534, "learning_rate": 8.025881922838872e-05, "loss": 0.2246, "step": 16070 }, { "epoch": 2.1841149105232773, "grad_norm": 0.3189408779144287, "learning_rate": 8.022735389267333e-05, "loss": 0.2152, "step": 16080 }, { "epoch": 2.18547319094027, "grad_norm": 0.39519914984703064, "learning_rate": 8.019586967973146e-05, "loss": 0.2274, "step": 16090 }, { "epoch": 2.186831471357262, "grad_norm": 0.4419909417629242, "learning_rate": 8.016436660922522e-05, "loss": 0.2259, "step": 16100 }, { "epoch": 2.188189751774254, "grad_norm": 0.3178653120994568, "learning_rate": 8.013284470082858e-05, "loss": 0.2125, "step": 16110 }, { "epoch": 2.189548032191246, "grad_norm": 0.39341259002685547, "learning_rate": 8.010130397422718e-05, "loss": 0.214, "step": 16120 }, { "epoch": 2.190906312608238, "grad_norm": 0.3202066719532013, "learning_rate": 8.006974444911846e-05, "loss": 0.2189, "step": 16130 }, { "epoch": 2.19226459302523, "grad_norm": 0.45892593264579773, "learning_rate": 8.003816614521165e-05, "loss": 0.2229, "step": 16140 }, { "epoch": 2.193622873442222, "grad_norm": 0.4014805853366852, "learning_rate": 8.00065690822276e-05, "loss": 0.2346, "step": 16150 }, { "epoch": 2.194981153859214, "grad_norm": 0.3596577048301697, "learning_rate": 7.997495327989895e-05, "loss": 0.2113, "step": 16160 }, { "epoch": 2.196339434276206, "grad_norm": 0.5899550318717957, "learning_rate": 7.994331875797003e-05, "loss": 0.2183, "step": 16170 }, { "epoch": 2.1976977146931986, "grad_norm": 0.41216742992401123, "learning_rate": 7.991166553619686e-05, "loss": 0.2232, "step": 16180 }, { "epoch": 2.1990559951101907, "grad_norm": 0.37528106570243835, "learning_rate": 7.987999363434711e-05, "loss": 0.2305, "step": 16190 }, { "epoch": 2.2004142755271827, "grad_norm": 0.4170815646648407, "learning_rate": 7.984830307220015e-05, "loss": 0.2143, "step": 16200 }, { "epoch": 2.2017725559441748, "grad_norm": 0.622063398361206, "learning_rate": 7.9816593869547e-05, "loss": 0.221, "step": 16210 }, { "epoch": 2.203130836361167, "grad_norm": 0.3674362301826477, "learning_rate": 7.97848660461903e-05, "loss": 0.2205, "step": 16220 }, { "epoch": 2.204489116778159, "grad_norm": 0.5298368334770203, "learning_rate": 7.975311962194434e-05, "loss": 0.2175, "step": 16230 }, { "epoch": 2.205847397195151, "grad_norm": 0.4523549973964691, "learning_rate": 7.9721354616635e-05, "loss": 0.2132, "step": 16240 }, { "epoch": 2.207205677612143, "grad_norm": 0.35151809453964233, "learning_rate": 7.968957105009981e-05, "loss": 0.2277, "step": 16250 }, { "epoch": 2.208563958029135, "grad_norm": 0.3794400990009308, "learning_rate": 7.965776894218785e-05, "loss": 0.2213, "step": 16260 }, { "epoch": 2.209922238446127, "grad_norm": 0.340616375207901, "learning_rate": 7.96259483127598e-05, "loss": 0.2205, "step": 16270 }, { "epoch": 2.2112805188631195, "grad_norm": 0.3701592683792114, "learning_rate": 7.959410918168788e-05, "loss": 0.2264, "step": 16280 }, { "epoch": 2.2126387992801115, "grad_norm": 0.4840666651725769, "learning_rate": 7.956225156885591e-05, "loss": 0.2235, "step": 16290 }, { "epoch": 2.2139970796971036, "grad_norm": 0.34238502383232117, "learning_rate": 7.953037549415921e-05, "loss": 0.2145, "step": 16300 }, { "epoch": 2.2153553601140956, "grad_norm": 0.41128548979759216, "learning_rate": 7.949848097750465e-05, "loss": 0.2138, "step": 16310 }, { "epoch": 2.2167136405310877, "grad_norm": 0.32379594445228577, "learning_rate": 7.946656803881062e-05, "loss": 0.2154, "step": 16320 }, { "epoch": 2.2180719209480797, "grad_norm": 0.6468459963798523, "learning_rate": 7.943463669800699e-05, "loss": 0.2195, "step": 16330 }, { "epoch": 2.2194302013650717, "grad_norm": 0.33094942569732666, "learning_rate": 7.940268697503516e-05, "loss": 0.2255, "step": 16340 }, { "epoch": 2.220788481782064, "grad_norm": 0.3391402065753937, "learning_rate": 7.9370718889848e-05, "loss": 0.2199, "step": 16350 }, { "epoch": 2.222146762199056, "grad_norm": 0.3538142144680023, "learning_rate": 7.93387324624098e-05, "loss": 0.2266, "step": 16360 }, { "epoch": 2.223505042616048, "grad_norm": 0.3208047151565552, "learning_rate": 7.930672771269635e-05, "loss": 0.2365, "step": 16370 }, { "epoch": 2.2248633230330404, "grad_norm": 0.4131433665752411, "learning_rate": 7.92747046606949e-05, "loss": 0.2136, "step": 16380 }, { "epoch": 2.2262216034500324, "grad_norm": 0.32839280366897583, "learning_rate": 7.924266332640408e-05, "loss": 0.2203, "step": 16390 }, { "epoch": 2.2275798838670244, "grad_norm": 0.4719604551792145, "learning_rate": 7.9210603729834e-05, "loss": 0.2308, "step": 16400 }, { "epoch": 2.2289381642840165, "grad_norm": 0.6617920398712158, "learning_rate": 7.917852589100607e-05, "loss": 0.22, "step": 16410 }, { "epoch": 2.2302964447010085, "grad_norm": 0.26253870129585266, "learning_rate": 7.914642982995322e-05, "loss": 0.2187, "step": 16420 }, { "epoch": 2.2316547251180006, "grad_norm": 0.3670192360877991, "learning_rate": 7.911431556671968e-05, "loss": 0.2135, "step": 16430 }, { "epoch": 2.2330130055349926, "grad_norm": 0.413297563791275, "learning_rate": 7.908218312136105e-05, "loss": 0.215, "step": 16440 }, { "epoch": 2.2343712859519846, "grad_norm": 0.616898238658905, "learning_rate": 7.905003251394433e-05, "loss": 0.2194, "step": 16450 }, { "epoch": 2.2357295663689767, "grad_norm": 0.4443977475166321, "learning_rate": 7.901786376454781e-05, "loss": 0.2191, "step": 16460 }, { "epoch": 2.237087846785969, "grad_norm": 0.3916460871696472, "learning_rate": 7.898567689326115e-05, "loss": 0.2169, "step": 16470 }, { "epoch": 2.238446127202961, "grad_norm": 0.4607168734073639, "learning_rate": 7.895347192018529e-05, "loss": 0.2124, "step": 16480 }, { "epoch": 2.2398044076199533, "grad_norm": 0.3639601767063141, "learning_rate": 7.892124886543252e-05, "loss": 0.2267, "step": 16490 }, { "epoch": 2.2411626880369453, "grad_norm": 0.418556272983551, "learning_rate": 7.888900774912638e-05, "loss": 0.2097, "step": 16500 }, { "epoch": 2.2425209684539373, "grad_norm": 0.3311217725276947, "learning_rate": 7.885674859140174e-05, "loss": 0.2278, "step": 16510 }, { "epoch": 2.2438792488709294, "grad_norm": 0.4088120460510254, "learning_rate": 7.882447141240465e-05, "loss": 0.2233, "step": 16520 }, { "epoch": 2.2452375292879214, "grad_norm": 0.35755929350852966, "learning_rate": 7.879217623229249e-05, "loss": 0.2167, "step": 16530 }, { "epoch": 2.2465958097049135, "grad_norm": 0.4787384569644928, "learning_rate": 7.875986307123389e-05, "loss": 0.221, "step": 16540 }, { "epoch": 2.2479540901219055, "grad_norm": 0.5022382140159607, "learning_rate": 7.872753194940866e-05, "loss": 0.2322, "step": 16550 }, { "epoch": 2.249312370538898, "grad_norm": 0.33916059136390686, "learning_rate": 7.869518288700783e-05, "loss": 0.2185, "step": 16560 }, { "epoch": 2.25067065095589, "grad_norm": 0.3196845054626465, "learning_rate": 7.866281590423367e-05, "loss": 0.2158, "step": 16570 }, { "epoch": 2.252028931372882, "grad_norm": 0.31432798504829407, "learning_rate": 7.863043102129964e-05, "loss": 0.2104, "step": 16580 }, { "epoch": 2.253387211789874, "grad_norm": 0.4067695736885071, "learning_rate": 7.859802825843031e-05, "loss": 0.2194, "step": 16590 }, { "epoch": 2.254745492206866, "grad_norm": 0.40504950284957886, "learning_rate": 7.85656076358615e-05, "loss": 0.2239, "step": 16600 }, { "epoch": 2.256103772623858, "grad_norm": 0.3138993978500366, "learning_rate": 7.853316917384015e-05, "loss": 0.2139, "step": 16610 }, { "epoch": 2.2574620530408502, "grad_norm": 0.40745672583580017, "learning_rate": 7.850071289262435e-05, "loss": 0.2215, "step": 16620 }, { "epoch": 2.2588203334578423, "grad_norm": 1.074416160583496, "learning_rate": 7.846823881248328e-05, "loss": 0.2262, "step": 16630 }, { "epoch": 2.2601786138748343, "grad_norm": 0.37666547298431396, "learning_rate": 7.843574695369729e-05, "loss": 0.2174, "step": 16640 }, { "epoch": 2.2615368942918264, "grad_norm": 0.6361355781555176, "learning_rate": 7.840323733655778e-05, "loss": 0.2224, "step": 16650 }, { "epoch": 2.2628951747088184, "grad_norm": 0.426990270614624, "learning_rate": 7.837070998136731e-05, "loss": 0.2207, "step": 16660 }, { "epoch": 2.264253455125811, "grad_norm": 0.4823324382305145, "learning_rate": 7.833816490843946e-05, "loss": 0.2207, "step": 16670 }, { "epoch": 2.265611735542803, "grad_norm": 0.33581337332725525, "learning_rate": 7.830560213809888e-05, "loss": 0.222, "step": 16680 }, { "epoch": 2.266970015959795, "grad_norm": 0.37334010004997253, "learning_rate": 7.827302169068128e-05, "loss": 0.2291, "step": 16690 }, { "epoch": 2.268328296376787, "grad_norm": 0.34610894322395325, "learning_rate": 7.824042358653341e-05, "loss": 0.217, "step": 16700 }, { "epoch": 2.269686576793779, "grad_norm": 0.4591052532196045, "learning_rate": 7.820780784601307e-05, "loss": 0.2203, "step": 16710 }, { "epoch": 2.271044857210771, "grad_norm": 0.41494959592819214, "learning_rate": 7.817517448948905e-05, "loss": 0.2124, "step": 16720 }, { "epoch": 2.272403137627763, "grad_norm": 0.3272130787372589, "learning_rate": 7.814252353734112e-05, "loss": 0.2247, "step": 16730 }, { "epoch": 2.273761418044755, "grad_norm": 0.28063976764678955, "learning_rate": 7.810985500996011e-05, "loss": 0.2204, "step": 16740 }, { "epoch": 2.275119698461747, "grad_norm": 0.6289178133010864, "learning_rate": 7.807716892774772e-05, "loss": 0.2219, "step": 16750 }, { "epoch": 2.2764779788787397, "grad_norm": 0.4202318489551544, "learning_rate": 7.80444653111167e-05, "loss": 0.2356, "step": 16760 }, { "epoch": 2.2778362592957317, "grad_norm": 0.37801799178123474, "learning_rate": 7.801174418049076e-05, "loss": 0.2322, "step": 16770 }, { "epoch": 2.279194539712724, "grad_norm": 0.38527947664260864, "learning_rate": 7.797900555630445e-05, "loss": 0.2265, "step": 16780 }, { "epoch": 2.280552820129716, "grad_norm": 0.5178115367889404, "learning_rate": 7.794624945900333e-05, "loss": 0.2247, "step": 16790 }, { "epoch": 2.281911100546708, "grad_norm": 0.3901855945587158, "learning_rate": 7.791347590904385e-05, "loss": 0.2199, "step": 16800 }, { "epoch": 2.2832693809637, "grad_norm": 0.4630627930164337, "learning_rate": 7.788068492689334e-05, "loss": 0.2216, "step": 16810 }, { "epoch": 2.284627661380692, "grad_norm": 0.35771042108535767, "learning_rate": 7.784787653303005e-05, "loss": 0.2192, "step": 16820 }, { "epoch": 2.285985941797684, "grad_norm": 0.479138046503067, "learning_rate": 7.781505074794309e-05, "loss": 0.2303, "step": 16830 }, { "epoch": 2.287344222214676, "grad_norm": 0.34415796399116516, "learning_rate": 7.778220759213243e-05, "loss": 0.2254, "step": 16840 }, { "epoch": 2.2887025026316685, "grad_norm": 0.3446962833404541, "learning_rate": 7.774934708610885e-05, "loss": 0.2236, "step": 16850 }, { "epoch": 2.2900607830486606, "grad_norm": 0.2800467908382416, "learning_rate": 7.771646925039406e-05, "loss": 0.2203, "step": 16860 }, { "epoch": 2.2914190634656526, "grad_norm": 0.3306378126144409, "learning_rate": 7.768357410552047e-05, "loss": 0.214, "step": 16870 }, { "epoch": 2.2927773438826446, "grad_norm": 0.3530402183532715, "learning_rate": 7.76506616720314e-05, "loss": 0.2182, "step": 16880 }, { "epoch": 2.2941356242996367, "grad_norm": 0.6923341155052185, "learning_rate": 7.761773197048096e-05, "loss": 0.2266, "step": 16890 }, { "epoch": 2.2954939047166287, "grad_norm": 0.3869684934616089, "learning_rate": 7.758478502143395e-05, "loss": 0.217, "step": 16900 }, { "epoch": 2.2968521851336208, "grad_norm": 0.44772422313690186, "learning_rate": 7.755182084546604e-05, "loss": 0.2172, "step": 16910 }, { "epoch": 2.298210465550613, "grad_norm": 0.5390695929527283, "learning_rate": 7.751883946316363e-05, "loss": 0.2214, "step": 16920 }, { "epoch": 2.299568745967605, "grad_norm": 0.31105825304985046, "learning_rate": 7.748584089512383e-05, "loss": 0.2191, "step": 16930 }, { "epoch": 2.3009270263845973, "grad_norm": 0.36740121245384216, "learning_rate": 7.745282516195453e-05, "loss": 0.2155, "step": 16940 }, { "epoch": 2.3022853068015894, "grad_norm": 0.35622864961624146, "learning_rate": 7.741979228427435e-05, "loss": 0.2253, "step": 16950 }, { "epoch": 2.3036435872185814, "grad_norm": 0.4868374466896057, "learning_rate": 7.738674228271254e-05, "loss": 0.2211, "step": 16960 }, { "epoch": 2.3050018676355735, "grad_norm": 0.7237669825553894, "learning_rate": 7.735367517790912e-05, "loss": 0.2319, "step": 16970 }, { "epoch": 2.3063601480525655, "grad_norm": 0.38525113463401794, "learning_rate": 7.732059099051476e-05, "loss": 0.2187, "step": 16980 }, { "epoch": 2.3077184284695575, "grad_norm": 0.537720263004303, "learning_rate": 7.728748974119079e-05, "loss": 0.2225, "step": 16990 }, { "epoch": 2.3090767088865496, "grad_norm": 0.2703965902328491, "learning_rate": 7.725437145060923e-05, "loss": 0.2163, "step": 17000 }, { "epoch": 2.3104349893035416, "grad_norm": 0.3242848813533783, "learning_rate": 7.722123613945269e-05, "loss": 0.2305, "step": 17010 }, { "epoch": 2.3117932697205337, "grad_norm": 0.34849461913108826, "learning_rate": 7.718808382841449e-05, "loss": 0.2236, "step": 17020 }, { "epoch": 2.3131515501375257, "grad_norm": 0.6753604412078857, "learning_rate": 7.715491453819845e-05, "loss": 0.2101, "step": 17030 }, { "epoch": 2.3145098305545178, "grad_norm": 0.4212934970855713, "learning_rate": 7.71217282895191e-05, "loss": 0.2237, "step": 17040 }, { "epoch": 2.3158681109715102, "grad_norm": 0.5723938345909119, "learning_rate": 7.708852510310152e-05, "loss": 0.2271, "step": 17050 }, { "epoch": 2.3172263913885023, "grad_norm": 0.3367297351360321, "learning_rate": 7.705530499968135e-05, "loss": 0.2182, "step": 17060 }, { "epoch": 2.3185846718054943, "grad_norm": 0.3916478455066681, "learning_rate": 7.702206800000485e-05, "loss": 0.223, "step": 17070 }, { "epoch": 2.3199429522224864, "grad_norm": 1.4356703758239746, "learning_rate": 7.698881412482874e-05, "loss": 0.218, "step": 17080 }, { "epoch": 2.3213012326394784, "grad_norm": 0.3191513419151306, "learning_rate": 7.695554339492037e-05, "loss": 0.2193, "step": 17090 }, { "epoch": 2.3226595130564704, "grad_norm": 0.34011518955230713, "learning_rate": 7.692225583105756e-05, "loss": 0.2256, "step": 17100 }, { "epoch": 2.3240177934734625, "grad_norm": 0.4546034336090088, "learning_rate": 7.688895145402868e-05, "loss": 0.2319, "step": 17110 }, { "epoch": 2.3253760738904545, "grad_norm": 0.509128987789154, "learning_rate": 7.685563028463257e-05, "loss": 0.231, "step": 17120 }, { "epoch": 2.3267343543074466, "grad_norm": 0.30908483266830444, "learning_rate": 7.682229234367859e-05, "loss": 0.2254, "step": 17130 }, { "epoch": 2.328092634724439, "grad_norm": 1.0931140184402466, "learning_rate": 7.678893765198653e-05, "loss": 0.2188, "step": 17140 }, { "epoch": 2.329450915141431, "grad_norm": 0.6769023537635803, "learning_rate": 7.675556623038667e-05, "loss": 0.2288, "step": 17150 }, { "epoch": 2.330809195558423, "grad_norm": 0.48528119921684265, "learning_rate": 7.672217809971971e-05, "loss": 0.2224, "step": 17160 }, { "epoch": 2.332167475975415, "grad_norm": 0.4830183684825897, "learning_rate": 7.668877328083686e-05, "loss": 0.2188, "step": 17170 }, { "epoch": 2.3335257563924072, "grad_norm": 0.34027040004730225, "learning_rate": 7.665535179459967e-05, "loss": 0.2375, "step": 17180 }, { "epoch": 2.3348840368093993, "grad_norm": 0.44149038195610046, "learning_rate": 7.662191366188012e-05, "loss": 0.2202, "step": 17190 }, { "epoch": 2.3362423172263913, "grad_norm": 0.4097312092781067, "learning_rate": 7.658845890356061e-05, "loss": 0.2223, "step": 17200 }, { "epoch": 2.3376005976433833, "grad_norm": 0.37330499291419983, "learning_rate": 7.65549875405339e-05, "loss": 0.2224, "step": 17210 }, { "epoch": 2.3389588780603754, "grad_norm": 0.3548596501350403, "learning_rate": 7.65214995937031e-05, "loss": 0.2194, "step": 17220 }, { "epoch": 2.340317158477368, "grad_norm": 0.338329941034317, "learning_rate": 7.648799508398174e-05, "loss": 0.2281, "step": 17230 }, { "epoch": 2.34167543889436, "grad_norm": 0.41045382618904114, "learning_rate": 7.645447403229364e-05, "loss": 0.2201, "step": 17240 }, { "epoch": 2.343033719311352, "grad_norm": 0.3171554505825043, "learning_rate": 7.642093645957299e-05, "loss": 0.227, "step": 17250 }, { "epoch": 2.344391999728344, "grad_norm": 0.32688799500465393, "learning_rate": 7.638738238676425e-05, "loss": 0.2203, "step": 17260 }, { "epoch": 2.345750280145336, "grad_norm": 0.34194961190223694, "learning_rate": 7.635381183482219e-05, "loss": 0.2195, "step": 17270 }, { "epoch": 2.347108560562328, "grad_norm": 1.1452054977416992, "learning_rate": 7.632022482471195e-05, "loss": 0.2223, "step": 17280 }, { "epoch": 2.34846684097932, "grad_norm": 0.38640838861465454, "learning_rate": 7.628662137740884e-05, "loss": 0.2266, "step": 17290 }, { "epoch": 2.349825121396312, "grad_norm": 0.30884313583374023, "learning_rate": 7.625300151389852e-05, "loss": 0.2183, "step": 17300 }, { "epoch": 2.351183401813304, "grad_norm": 0.3533780574798584, "learning_rate": 7.621936525517684e-05, "loss": 0.2229, "step": 17310 }, { "epoch": 2.3525416822302967, "grad_norm": 0.5537764430046082, "learning_rate": 7.61857126222499e-05, "loss": 0.2253, "step": 17320 }, { "epoch": 2.3538999626472883, "grad_norm": 0.35817527770996094, "learning_rate": 7.615204363613407e-05, "loss": 0.2256, "step": 17330 }, { "epoch": 2.3552582430642808, "grad_norm": 0.3411032557487488, "learning_rate": 7.61183583178559e-05, "loss": 0.2262, "step": 17340 }, { "epoch": 2.356616523481273, "grad_norm": 0.33438894152641296, "learning_rate": 7.608465668845212e-05, "loss": 0.2181, "step": 17350 }, { "epoch": 2.357974803898265, "grad_norm": 0.3558556139469147, "learning_rate": 7.605093876896971e-05, "loss": 0.2323, "step": 17360 }, { "epoch": 2.359333084315257, "grad_norm": 0.339269757270813, "learning_rate": 7.601720458046576e-05, "loss": 0.2201, "step": 17370 }, { "epoch": 2.360691364732249, "grad_norm": 0.3285767436027527, "learning_rate": 7.598345414400754e-05, "loss": 0.2276, "step": 17380 }, { "epoch": 2.362049645149241, "grad_norm": 0.3021979033946991, "learning_rate": 7.594968748067247e-05, "loss": 0.224, "step": 17390 }, { "epoch": 2.363407925566233, "grad_norm": 0.35113757848739624, "learning_rate": 7.591590461154811e-05, "loss": 0.219, "step": 17400 }, { "epoch": 2.364766205983225, "grad_norm": 0.40378066897392273, "learning_rate": 7.588210555773214e-05, "loss": 0.2244, "step": 17410 }, { "epoch": 2.366124486400217, "grad_norm": 0.557292640209198, "learning_rate": 7.584829034033233e-05, "loss": 0.2245, "step": 17420 }, { "epoch": 2.3674827668172096, "grad_norm": 0.5098421573638916, "learning_rate": 7.581445898046655e-05, "loss": 0.2126, "step": 17430 }, { "epoch": 2.3688410472342016, "grad_norm": 0.344594269990921, "learning_rate": 7.578061149926278e-05, "loss": 0.2204, "step": 17440 }, { "epoch": 2.3701993276511937, "grad_norm": 0.49601149559020996, "learning_rate": 7.574674791785902e-05, "loss": 0.2177, "step": 17450 }, { "epoch": 2.3715576080681857, "grad_norm": 0.42993995547294617, "learning_rate": 7.571286825740338e-05, "loss": 0.2299, "step": 17460 }, { "epoch": 2.3729158884851778, "grad_norm": 0.5972803235054016, "learning_rate": 7.567897253905396e-05, "loss": 0.2237, "step": 17470 }, { "epoch": 2.37427416890217, "grad_norm": 0.44466930627822876, "learning_rate": 7.564506078397891e-05, "loss": 0.2215, "step": 17480 }, { "epoch": 2.375632449319162, "grad_norm": 0.38318246603012085, "learning_rate": 7.561113301335641e-05, "loss": 0.2255, "step": 17490 }, { "epoch": 2.376990729736154, "grad_norm": 0.3717864453792572, "learning_rate": 7.557718924837459e-05, "loss": 0.2212, "step": 17500 }, { "epoch": 2.378349010153146, "grad_norm": 0.47269853949546814, "learning_rate": 7.554322951023165e-05, "loss": 0.2218, "step": 17510 }, { "epoch": 2.3797072905701384, "grad_norm": 0.44630852341651917, "learning_rate": 7.55092538201357e-05, "loss": 0.2239, "step": 17520 }, { "epoch": 2.3810655709871305, "grad_norm": 0.45602938532829285, "learning_rate": 7.547526219930483e-05, "loss": 0.231, "step": 17530 }, { "epoch": 2.3824238514041225, "grad_norm": 0.44227683544158936, "learning_rate": 7.544125466896708e-05, "loss": 0.2154, "step": 17540 }, { "epoch": 2.3837821318211145, "grad_norm": 0.39833444356918335, "learning_rate": 7.540723125036045e-05, "loss": 0.2179, "step": 17550 }, { "epoch": 2.3851404122381066, "grad_norm": 0.37296849489212036, "learning_rate": 7.53731919647328e-05, "loss": 0.2154, "step": 17560 }, { "epoch": 2.3864986926550986, "grad_norm": 0.34890878200531006, "learning_rate": 7.533913683334197e-05, "loss": 0.2265, "step": 17570 }, { "epoch": 2.3878569730720907, "grad_norm": 0.43531516194343567, "learning_rate": 7.530506587745566e-05, "loss": 0.2177, "step": 17580 }, { "epoch": 2.3892152534890827, "grad_norm": 0.37810420989990234, "learning_rate": 7.527097911835145e-05, "loss": 0.2176, "step": 17590 }, { "epoch": 2.3905735339060747, "grad_norm": 0.4198654890060425, "learning_rate": 7.523687657731679e-05, "loss": 0.2255, "step": 17600 }, { "epoch": 2.3919318143230672, "grad_norm": 0.4483186602592468, "learning_rate": 7.520275827564902e-05, "loss": 0.2173, "step": 17610 }, { "epoch": 2.3932900947400593, "grad_norm": 0.403157502412796, "learning_rate": 7.516862423465525e-05, "loss": 0.221, "step": 17620 }, { "epoch": 2.3946483751570513, "grad_norm": 0.30923691391944885, "learning_rate": 7.513447447565252e-05, "loss": 0.2274, "step": 17630 }, { "epoch": 2.3960066555740434, "grad_norm": 0.46626609563827515, "learning_rate": 7.510030901996757e-05, "loss": 0.2165, "step": 17640 }, { "epoch": 2.3973649359910354, "grad_norm": 0.3745487630367279, "learning_rate": 7.506612788893706e-05, "loss": 0.2143, "step": 17650 }, { "epoch": 2.3987232164080274, "grad_norm": 0.3729627728462219, "learning_rate": 7.503193110390733e-05, "loss": 0.2282, "step": 17660 }, { "epoch": 2.4000814968250195, "grad_norm": 0.38380536437034607, "learning_rate": 7.499771868623458e-05, "loss": 0.2184, "step": 17670 }, { "epoch": 2.4014397772420115, "grad_norm": 0.40276503562927246, "learning_rate": 7.496349065728475e-05, "loss": 0.2144, "step": 17680 }, { "epoch": 2.4027980576590036, "grad_norm": 0.37527552247047424, "learning_rate": 7.492924703843348e-05, "loss": 0.2231, "step": 17690 }, { "epoch": 2.4041563380759956, "grad_norm": 0.5146649479866028, "learning_rate": 7.489498785106624e-05, "loss": 0.2252, "step": 17700 }, { "epoch": 2.4055146184929876, "grad_norm": 0.715528666973114, "learning_rate": 7.486071311657812e-05, "loss": 0.2124, "step": 17710 }, { "epoch": 2.40687289890998, "grad_norm": 0.5943784117698669, "learning_rate": 7.482642285637399e-05, "loss": 0.2309, "step": 17720 }, { "epoch": 2.408231179326972, "grad_norm": 0.6369363069534302, "learning_rate": 7.479211709186837e-05, "loss": 0.2171, "step": 17730 }, { "epoch": 2.409589459743964, "grad_norm": 0.5407935380935669, "learning_rate": 7.475779584448554e-05, "loss": 0.2201, "step": 17740 }, { "epoch": 2.4109477401609563, "grad_norm": 0.5469819903373718, "learning_rate": 7.472345913565937e-05, "loss": 0.234, "step": 17750 }, { "epoch": 2.4123060205779483, "grad_norm": 0.4867139756679535, "learning_rate": 7.468910698683339e-05, "loss": 0.234, "step": 17760 }, { "epoch": 2.4136643009949403, "grad_norm": 0.4305403530597687, "learning_rate": 7.465473941946082e-05, "loss": 0.2223, "step": 17770 }, { "epoch": 2.4150225814119324, "grad_norm": 0.4273839294910431, "learning_rate": 7.462035645500447e-05, "loss": 0.2256, "step": 17780 }, { "epoch": 2.4163808618289244, "grad_norm": 0.4300057291984558, "learning_rate": 7.458595811493679e-05, "loss": 0.2206, "step": 17790 }, { "epoch": 2.4177391422459165, "grad_norm": 0.6854287385940552, "learning_rate": 7.45515444207398e-05, "loss": 0.2246, "step": 17800 }, { "epoch": 2.419097422662909, "grad_norm": 0.4548177719116211, "learning_rate": 7.451711539390516e-05, "loss": 0.224, "step": 17810 }, { "epoch": 2.420455703079901, "grad_norm": 0.5135058760643005, "learning_rate": 7.448267105593406e-05, "loss": 0.216, "step": 17820 }, { "epoch": 2.421813983496893, "grad_norm": 0.4439927935600281, "learning_rate": 7.444821142833724e-05, "loss": 0.2314, "step": 17830 }, { "epoch": 2.423172263913885, "grad_norm": 0.36408931016921997, "learning_rate": 7.441373653263506e-05, "loss": 0.2152, "step": 17840 }, { "epoch": 2.424530544330877, "grad_norm": 0.3925407826900482, "learning_rate": 7.437924639035736e-05, "loss": 0.2126, "step": 17850 }, { "epoch": 2.425888824747869, "grad_norm": 0.365328311920166, "learning_rate": 7.43447410230435e-05, "loss": 0.218, "step": 17860 }, { "epoch": 2.427247105164861, "grad_norm": 0.4770735800266266, "learning_rate": 7.431022045224236e-05, "loss": 0.2117, "step": 17870 }, { "epoch": 2.4286053855818532, "grad_norm": 0.33498409390449524, "learning_rate": 7.427568469951234e-05, "loss": 0.2336, "step": 17880 }, { "epoch": 2.4299636659988453, "grad_norm": 0.5474316477775574, "learning_rate": 7.424113378642125e-05, "loss": 0.2282, "step": 17890 }, { "epoch": 2.4313219464158378, "grad_norm": 0.46832311153411865, "learning_rate": 7.420656773454646e-05, "loss": 0.2316, "step": 17900 }, { "epoch": 2.43268022683283, "grad_norm": 0.6558818817138672, "learning_rate": 7.417198656547472e-05, "loss": 0.2254, "step": 17910 }, { "epoch": 2.434038507249822, "grad_norm": 0.2903479337692261, "learning_rate": 7.413739030080226e-05, "loss": 0.229, "step": 17920 }, { "epoch": 2.435396787666814, "grad_norm": 0.3386137783527374, "learning_rate": 7.410277896213472e-05, "loss": 0.2296, "step": 17930 }, { "epoch": 2.436755068083806, "grad_norm": 0.6938409209251404, "learning_rate": 7.406815257108718e-05, "loss": 0.2329, "step": 17940 }, { "epoch": 2.438113348500798, "grad_norm": 0.4150840938091278, "learning_rate": 7.403351114928407e-05, "loss": 0.2272, "step": 17950 }, { "epoch": 2.43947162891779, "grad_norm": 0.5522544980049133, "learning_rate": 7.399885471835926e-05, "loss": 0.2247, "step": 17960 }, { "epoch": 2.440829909334782, "grad_norm": 0.416754812002182, "learning_rate": 7.396418329995597e-05, "loss": 0.2135, "step": 17970 }, { "epoch": 2.442188189751774, "grad_norm": 0.40655186772346497, "learning_rate": 7.392949691572676e-05, "loss": 0.2292, "step": 17980 }, { "epoch": 2.4435464701687666, "grad_norm": 0.338854044675827, "learning_rate": 7.38947955873336e-05, "loss": 0.228, "step": 17990 }, { "epoch": 2.4449047505857586, "grad_norm": 0.4626075327396393, "learning_rate": 7.38600793364477e-05, "loss": 0.2239, "step": 18000 }, { "epoch": 2.4462630310027507, "grad_norm": 0.4303097426891327, "learning_rate": 7.382534818474966e-05, "loss": 0.2189, "step": 18010 }, { "epoch": 2.4476213114197427, "grad_norm": 0.38468948006629944, "learning_rate": 7.379060215392938e-05, "loss": 0.223, "step": 18020 }, { "epoch": 2.4489795918367347, "grad_norm": 0.3992916941642761, "learning_rate": 7.375584126568601e-05, "loss": 0.2135, "step": 18030 }, { "epoch": 2.450337872253727, "grad_norm": 0.3446080982685089, "learning_rate": 7.372106554172802e-05, "loss": 0.2267, "step": 18040 }, { "epoch": 2.451696152670719, "grad_norm": 0.37887194752693176, "learning_rate": 7.368627500377313e-05, "loss": 0.2264, "step": 18050 }, { "epoch": 2.453054433087711, "grad_norm": 0.5525243282318115, "learning_rate": 7.365146967354831e-05, "loss": 0.2218, "step": 18060 }, { "epoch": 2.454412713504703, "grad_norm": 0.3137887120246887, "learning_rate": 7.361664957278977e-05, "loss": 0.2192, "step": 18070 }, { "epoch": 2.455770993921695, "grad_norm": 0.5501535534858704, "learning_rate": 7.358181472324293e-05, "loss": 0.2098, "step": 18080 }, { "epoch": 2.457129274338687, "grad_norm": 0.3788621425628662, "learning_rate": 7.354696514666246e-05, "loss": 0.2243, "step": 18090 }, { "epoch": 2.4584875547556795, "grad_norm": 0.5961252450942993, "learning_rate": 7.35121008648122e-05, "loss": 0.2264, "step": 18100 }, { "epoch": 2.4598458351726715, "grad_norm": 0.3429439663887024, "learning_rate": 7.347722189946515e-05, "loss": 0.2219, "step": 18110 }, { "epoch": 2.4612041155896636, "grad_norm": 0.5862210392951965, "learning_rate": 7.344232827240353e-05, "loss": 0.2236, "step": 18120 }, { "epoch": 2.4625623960066556, "grad_norm": 0.5576637387275696, "learning_rate": 7.340742000541867e-05, "loss": 0.2177, "step": 18130 }, { "epoch": 2.4639206764236476, "grad_norm": 0.5937841534614563, "learning_rate": 7.337249712031108e-05, "loss": 0.2087, "step": 18140 }, { "epoch": 2.4652789568406397, "grad_norm": 0.3347906768321991, "learning_rate": 7.333755963889038e-05, "loss": 0.2197, "step": 18150 }, { "epoch": 2.4666372372576317, "grad_norm": 0.3759860694408417, "learning_rate": 7.330260758297532e-05, "loss": 0.225, "step": 18160 }, { "epoch": 2.4679955176746238, "grad_norm": 0.6083000898361206, "learning_rate": 7.326764097439372e-05, "loss": 0.2224, "step": 18170 }, { "epoch": 2.469353798091616, "grad_norm": 0.5077692270278931, "learning_rate": 7.323265983498253e-05, "loss": 0.2302, "step": 18180 }, { "epoch": 2.4707120785086083, "grad_norm": 1.0299490690231323, "learning_rate": 7.319766418658774e-05, "loss": 0.2235, "step": 18190 }, { "epoch": 2.4720703589256003, "grad_norm": 0.37100949883461, "learning_rate": 7.316265405106441e-05, "loss": 0.2226, "step": 18200 }, { "epoch": 2.4734286393425924, "grad_norm": 0.4211720824241638, "learning_rate": 7.312762945027667e-05, "loss": 0.2239, "step": 18210 }, { "epoch": 2.4747869197595844, "grad_norm": 0.4059642255306244, "learning_rate": 7.309259040609765e-05, "loss": 0.2246, "step": 18220 }, { "epoch": 2.4761452001765765, "grad_norm": 0.35163789987564087, "learning_rate": 7.305753694040954e-05, "loss": 0.2248, "step": 18230 }, { "epoch": 2.4775034805935685, "grad_norm": 0.3899269700050354, "learning_rate": 7.302246907510346e-05, "loss": 0.2363, "step": 18240 }, { "epoch": 2.4788617610105605, "grad_norm": 0.432137668132782, "learning_rate": 7.298738683207963e-05, "loss": 0.2266, "step": 18250 }, { "epoch": 2.4802200414275526, "grad_norm": 0.38312944769859314, "learning_rate": 7.295229023324717e-05, "loss": 0.2282, "step": 18260 }, { "epoch": 2.4815783218445446, "grad_norm": 0.3734847605228424, "learning_rate": 7.291717930052421e-05, "loss": 0.2329, "step": 18270 }, { "epoch": 2.482936602261537, "grad_norm": 0.4429275393486023, "learning_rate": 7.288205405583779e-05, "loss": 0.2288, "step": 18280 }, { "epoch": 2.484294882678529, "grad_norm": 0.489762544631958, "learning_rate": 7.284691452112391e-05, "loss": 0.225, "step": 18290 }, { "epoch": 2.485653163095521, "grad_norm": 0.5930917263031006, "learning_rate": 7.281176071832747e-05, "loss": 0.2257, "step": 18300 }, { "epoch": 2.4870114435125132, "grad_norm": 0.3834420144557953, "learning_rate": 7.277659266940236e-05, "loss": 0.2242, "step": 18310 }, { "epoch": 2.4883697239295053, "grad_norm": 0.5934807062149048, "learning_rate": 7.274141039631127e-05, "loss": 0.221, "step": 18320 }, { "epoch": 2.4897280043464973, "grad_norm": 0.3063584566116333, "learning_rate": 7.270621392102584e-05, "loss": 0.2176, "step": 18330 }, { "epoch": 2.4910862847634894, "grad_norm": 0.3830847442150116, "learning_rate": 7.267100326552652e-05, "loss": 0.217, "step": 18340 }, { "epoch": 2.4924445651804814, "grad_norm": 0.3647053837776184, "learning_rate": 7.263577845180266e-05, "loss": 0.2127, "step": 18350 }, { "epoch": 2.4938028455974734, "grad_norm": 0.5673231482505798, "learning_rate": 7.260053950185244e-05, "loss": 0.2251, "step": 18360 }, { "epoch": 2.495161126014466, "grad_norm": 0.5498285889625549, "learning_rate": 7.25652864376829e-05, "loss": 0.2311, "step": 18370 }, { "epoch": 2.4965194064314575, "grad_norm": 0.37594595551490784, "learning_rate": 7.253001928130982e-05, "loss": 0.2264, "step": 18380 }, { "epoch": 2.49787768684845, "grad_norm": 0.5517831444740295, "learning_rate": 7.249473805475785e-05, "loss": 0.2289, "step": 18390 }, { "epoch": 2.499235967265442, "grad_norm": 0.2970275580883026, "learning_rate": 7.245944278006038e-05, "loss": 0.2272, "step": 18400 }, { "epoch": 2.500594247682434, "grad_norm": 0.606865406036377, "learning_rate": 7.242413347925962e-05, "loss": 0.2254, "step": 18410 }, { "epoch": 2.501952528099426, "grad_norm": 0.6445013284683228, "learning_rate": 7.238881017440648e-05, "loss": 0.2221, "step": 18420 }, { "epoch": 2.503310808516418, "grad_norm": 0.31438612937927246, "learning_rate": 7.23534728875607e-05, "loss": 0.2228, "step": 18430 }, { "epoch": 2.5046690889334102, "grad_norm": 0.41488996148109436, "learning_rate": 7.231812164079064e-05, "loss": 0.2315, "step": 18440 }, { "epoch": 2.5060273693504023, "grad_norm": 0.4151557981967926, "learning_rate": 7.228275645617347e-05, "loss": 0.2186, "step": 18450 }, { "epoch": 2.5073856497673948, "grad_norm": 0.4799214005470276, "learning_rate": 7.224737735579504e-05, "loss": 0.2298, "step": 18460 }, { "epoch": 2.5087439301843864, "grad_norm": 0.45435062050819397, "learning_rate": 7.221198436174987e-05, "loss": 0.2281, "step": 18470 }, { "epoch": 2.510102210601379, "grad_norm": 0.3130773901939392, "learning_rate": 7.217657749614118e-05, "loss": 0.2199, "step": 18480 }, { "epoch": 2.511460491018371, "grad_norm": 0.36274731159210205, "learning_rate": 7.214115678108085e-05, "loss": 0.2182, "step": 18490 }, { "epoch": 2.512818771435363, "grad_norm": 0.46227800846099854, "learning_rate": 7.210572223868937e-05, "loss": 0.2266, "step": 18500 }, { "epoch": 2.514177051852355, "grad_norm": 0.47384390234947205, "learning_rate": 7.207027389109593e-05, "loss": 0.2219, "step": 18510 }, { "epoch": 2.515535332269347, "grad_norm": 0.44111353158950806, "learning_rate": 7.203481176043831e-05, "loss": 0.2276, "step": 18520 }, { "epoch": 2.516893612686339, "grad_norm": 0.3312663435935974, "learning_rate": 7.199933586886286e-05, "loss": 0.2317, "step": 18530 }, { "epoch": 2.518251893103331, "grad_norm": 0.43059998750686646, "learning_rate": 7.19638462385246e-05, "loss": 0.2246, "step": 18540 }, { "epoch": 2.519610173520323, "grad_norm": 0.8788889646530151, "learning_rate": 7.192834289158708e-05, "loss": 0.2299, "step": 18550 }, { "epoch": 2.520968453937315, "grad_norm": 0.3624875843524933, "learning_rate": 7.189282585022242e-05, "loss": 0.2203, "step": 18560 }, { "epoch": 2.5223267343543077, "grad_norm": 0.3935474455356598, "learning_rate": 7.18572951366113e-05, "loss": 0.2176, "step": 18570 }, { "epoch": 2.5236850147712997, "grad_norm": 0.3270396590232849, "learning_rate": 7.182175077294294e-05, "loss": 0.2369, "step": 18580 }, { "epoch": 2.5250432951882917, "grad_norm": 0.32190844416618347, "learning_rate": 7.178619278141508e-05, "loss": 0.2163, "step": 18590 }, { "epoch": 2.526401575605284, "grad_norm": 0.39595890045166016, "learning_rate": 7.175062118423397e-05, "loss": 0.2254, "step": 18600 }, { "epoch": 2.527759856022276, "grad_norm": 0.47311902046203613, "learning_rate": 7.171503600361438e-05, "loss": 0.2157, "step": 18610 }, { "epoch": 2.529118136439268, "grad_norm": 0.3375617265701294, "learning_rate": 7.167943726177951e-05, "loss": 0.2165, "step": 18620 }, { "epoch": 2.53047641685626, "grad_norm": 0.3622303009033203, "learning_rate": 7.164382498096109e-05, "loss": 0.224, "step": 18630 }, { "epoch": 2.531834697273252, "grad_norm": 0.3742624521255493, "learning_rate": 7.160819918339925e-05, "loss": 0.2127, "step": 18640 }, { "epoch": 2.533192977690244, "grad_norm": 0.41686904430389404, "learning_rate": 7.157255989134261e-05, "loss": 0.2198, "step": 18650 }, { "epoch": 2.5345512581072365, "grad_norm": 0.4114459455013275, "learning_rate": 7.153690712704819e-05, "loss": 0.2102, "step": 18660 }, { "epoch": 2.535909538524228, "grad_norm": 0.3226616382598877, "learning_rate": 7.150124091278143e-05, "loss": 0.2232, "step": 18670 }, { "epoch": 2.5372678189412206, "grad_norm": 0.3466982841491699, "learning_rate": 7.146556127081617e-05, "loss": 0.2326, "step": 18680 }, { "epoch": 2.5386260993582126, "grad_norm": 0.3452596962451935, "learning_rate": 7.142986822343463e-05, "loss": 0.2213, "step": 18690 }, { "epoch": 2.5399843797752046, "grad_norm": 0.34961768984794617, "learning_rate": 7.139416179292739e-05, "loss": 0.2257, "step": 18700 }, { "epoch": 2.5413426601921967, "grad_norm": 0.6228933930397034, "learning_rate": 7.135844200159344e-05, "loss": 0.2218, "step": 18710 }, { "epoch": 2.5427009406091887, "grad_norm": 0.4611671566963196, "learning_rate": 7.132270887174005e-05, "loss": 0.2244, "step": 18720 }, { "epoch": 2.5440592210261808, "grad_norm": 0.52619469165802, "learning_rate": 7.128696242568287e-05, "loss": 0.2224, "step": 18730 }, { "epoch": 2.545417501443173, "grad_norm": 0.32233622670173645, "learning_rate": 7.125120268574582e-05, "loss": 0.2117, "step": 18740 }, { "epoch": 2.5467757818601653, "grad_norm": 0.49539849162101746, "learning_rate": 7.121542967426115e-05, "loss": 0.2304, "step": 18750 }, { "epoch": 2.548134062277157, "grad_norm": 0.4256165623664856, "learning_rate": 7.117964341356942e-05, "loss": 0.2221, "step": 18760 }, { "epoch": 2.5494923426941494, "grad_norm": 0.4947744309902191, "learning_rate": 7.11438439260194e-05, "loss": 0.2229, "step": 18770 }, { "epoch": 2.5508506231111414, "grad_norm": 0.4364069998264313, "learning_rate": 7.11080312339682e-05, "loss": 0.2285, "step": 18780 }, { "epoch": 2.5522089035281335, "grad_norm": 0.3704259693622589, "learning_rate": 7.107220535978111e-05, "loss": 0.2226, "step": 18790 }, { "epoch": 2.5535671839451255, "grad_norm": 0.5077809691429138, "learning_rate": 7.103636632583168e-05, "loss": 0.2148, "step": 18800 }, { "epoch": 2.5549254643621175, "grad_norm": 0.4197351336479187, "learning_rate": 7.100051415450168e-05, "loss": 0.2119, "step": 18810 }, { "epoch": 2.5562837447791096, "grad_norm": 0.31832000613212585, "learning_rate": 7.096464886818106e-05, "loss": 0.2066, "step": 18820 }, { "epoch": 2.5576420251961016, "grad_norm": 0.404459148645401, "learning_rate": 7.092877048926802e-05, "loss": 0.2152, "step": 18830 }, { "epoch": 2.5590003056130937, "grad_norm": 0.5054754018783569, "learning_rate": 7.089287904016887e-05, "loss": 0.2111, "step": 18840 }, { "epoch": 2.5603585860300857, "grad_norm": 0.7069879770278931, "learning_rate": 7.085697454329811e-05, "loss": 0.2239, "step": 18850 }, { "epoch": 2.561716866447078, "grad_norm": 0.35723549127578735, "learning_rate": 7.082105702107842e-05, "loss": 0.2241, "step": 18860 }, { "epoch": 2.5630751468640702, "grad_norm": 0.6069508790969849, "learning_rate": 7.078512649594052e-05, "loss": 0.229, "step": 18870 }, { "epoch": 2.5644334272810623, "grad_norm": 0.4556558430194855, "learning_rate": 7.074918299032338e-05, "loss": 0.2168, "step": 18880 }, { "epoch": 2.5657917076980543, "grad_norm": 0.4640515446662903, "learning_rate": 7.071322652667396e-05, "loss": 0.2344, "step": 18890 }, { "epoch": 2.5671499881150464, "grad_norm": 0.4263159930706024, "learning_rate": 7.06772571274474e-05, "loss": 0.228, "step": 18900 }, { "epoch": 2.5685082685320384, "grad_norm": 0.5986847877502441, "learning_rate": 7.064127481510685e-05, "loss": 0.2268, "step": 18910 }, { "epoch": 2.5698665489490304, "grad_norm": 0.416866272687912, "learning_rate": 7.060527961212358e-05, "loss": 0.2353, "step": 18920 }, { "epoch": 2.5712248293660225, "grad_norm": 0.4247607886791229, "learning_rate": 7.056927154097683e-05, "loss": 0.2225, "step": 18930 }, { "epoch": 2.5725831097830145, "grad_norm": 0.39113467931747437, "learning_rate": 7.053325062415401e-05, "loss": 0.2257, "step": 18940 }, { "epoch": 2.573941390200007, "grad_norm": 0.38410714268684387, "learning_rate": 7.049721688415044e-05, "loss": 0.2273, "step": 18950 }, { "epoch": 2.5752996706169986, "grad_norm": 0.3734860122203827, "learning_rate": 7.046117034346945e-05, "loss": 0.22, "step": 18960 }, { "epoch": 2.576657951033991, "grad_norm": 0.4131544828414917, "learning_rate": 7.042511102462244e-05, "loss": 0.2128, "step": 18970 }, { "epoch": 2.578016231450983, "grad_norm": 0.32029616832733154, "learning_rate": 7.038903895012872e-05, "loss": 0.2256, "step": 18980 }, { "epoch": 2.579374511867975, "grad_norm": 0.6094593405723572, "learning_rate": 7.035295414251559e-05, "loss": 0.2292, "step": 18990 }, { "epoch": 2.580732792284967, "grad_norm": 0.3760390281677246, "learning_rate": 7.031685662431832e-05, "loss": 0.2271, "step": 19000 }, { "epoch": 2.5820910727019593, "grad_norm": 0.7080153822898865, "learning_rate": 7.02807464180801e-05, "loss": 0.225, "step": 19010 }, { "epoch": 2.5834493531189513, "grad_norm": 0.34191015362739563, "learning_rate": 7.024462354635204e-05, "loss": 0.2228, "step": 19020 }, { "epoch": 2.5848076335359433, "grad_norm": 0.3485642969608307, "learning_rate": 7.020848803169317e-05, "loss": 0.2224, "step": 19030 }, { "epoch": 2.586165913952936, "grad_norm": 0.37850192189216614, "learning_rate": 7.01723398966704e-05, "loss": 0.2258, "step": 19040 }, { "epoch": 2.5875241943699274, "grad_norm": 0.37759852409362793, "learning_rate": 7.013617916385853e-05, "loss": 0.2189, "step": 19050 }, { "epoch": 2.58888247478692, "grad_norm": 0.40866485238075256, "learning_rate": 7.010000585584025e-05, "loss": 0.2318, "step": 19060 }, { "epoch": 2.590240755203912, "grad_norm": 0.41963663697242737, "learning_rate": 7.006381999520606e-05, "loss": 0.216, "step": 19070 }, { "epoch": 2.591599035620904, "grad_norm": 0.3379341959953308, "learning_rate": 7.002762160455435e-05, "loss": 0.2232, "step": 19080 }, { "epoch": 2.592957316037896, "grad_norm": 0.31755051016807556, "learning_rate": 6.999141070649126e-05, "loss": 0.2255, "step": 19090 }, { "epoch": 2.594315596454888, "grad_norm": 0.4829753637313843, "learning_rate": 6.995518732363083e-05, "loss": 0.2238, "step": 19100 }, { "epoch": 2.59567387687188, "grad_norm": 0.3803703486919403, "learning_rate": 6.991895147859485e-05, "loss": 0.2286, "step": 19110 }, { "epoch": 2.597032157288872, "grad_norm": 0.39325863122940063, "learning_rate": 6.988270319401289e-05, "loss": 0.2164, "step": 19120 }, { "epoch": 2.5983904377058646, "grad_norm": 0.40794941782951355, "learning_rate": 6.984644249252229e-05, "loss": 0.2233, "step": 19130 }, { "epoch": 2.5997487181228562, "grad_norm": 0.3550748825073242, "learning_rate": 6.981016939676816e-05, "loss": 0.2209, "step": 19140 }, { "epoch": 2.6011069985398487, "grad_norm": 0.3947926461696625, "learning_rate": 6.977388392940334e-05, "loss": 0.2188, "step": 19150 }, { "epoch": 2.6024652789568408, "grad_norm": 0.4314517378807068, "learning_rate": 6.973758611308839e-05, "loss": 0.2267, "step": 19160 }, { "epoch": 2.603823559373833, "grad_norm": 0.32683461904525757, "learning_rate": 6.970127597049159e-05, "loss": 0.2126, "step": 19170 }, { "epoch": 2.605181839790825, "grad_norm": 0.31194284558296204, "learning_rate": 6.966495352428893e-05, "loss": 0.2206, "step": 19180 }, { "epoch": 2.606540120207817, "grad_norm": 0.4203945994377136, "learning_rate": 6.962861879716407e-05, "loss": 0.2229, "step": 19190 }, { "epoch": 2.607898400624809, "grad_norm": 0.4689308702945709, "learning_rate": 6.959227181180833e-05, "loss": 0.2171, "step": 19200 }, { "epoch": 2.609256681041801, "grad_norm": 0.45276597142219543, "learning_rate": 6.955591259092069e-05, "loss": 0.2278, "step": 19210 }, { "epoch": 2.610614961458793, "grad_norm": 0.3757180869579315, "learning_rate": 6.951954115720777e-05, "loss": 0.2267, "step": 19220 }, { "epoch": 2.611973241875785, "grad_norm": 0.40527060627937317, "learning_rate": 6.948315753338385e-05, "loss": 0.2196, "step": 19230 }, { "epoch": 2.6133315222927775, "grad_norm": 0.4065079689025879, "learning_rate": 6.94467617421708e-05, "loss": 0.2221, "step": 19240 }, { "epoch": 2.6146898027097696, "grad_norm": 0.39045244455337524, "learning_rate": 6.941035380629808e-05, "loss": 0.2302, "step": 19250 }, { "epoch": 2.6160480831267616, "grad_norm": 0.32205888628959656, "learning_rate": 6.937393374850271e-05, "loss": 0.2195, "step": 19260 }, { "epoch": 2.6174063635437537, "grad_norm": 0.39852288365364075, "learning_rate": 6.933750159152934e-05, "loss": 0.229, "step": 19270 }, { "epoch": 2.6187646439607457, "grad_norm": 0.4331900179386139, "learning_rate": 6.930105735813013e-05, "loss": 0.2268, "step": 19280 }, { "epoch": 2.6201229243777377, "grad_norm": 0.41724151372909546, "learning_rate": 6.926460107106482e-05, "loss": 0.2296, "step": 19290 }, { "epoch": 2.62148120479473, "grad_norm": 0.33520007133483887, "learning_rate": 6.922813275310064e-05, "loss": 0.2219, "step": 19300 }, { "epoch": 2.622839485211722, "grad_norm": 0.511389434337616, "learning_rate": 6.919165242701234e-05, "loss": 0.2162, "step": 19310 }, { "epoch": 2.624197765628714, "grad_norm": 0.41440412402153015, "learning_rate": 6.91551601155822e-05, "loss": 0.2243, "step": 19320 }, { "epoch": 2.6255560460457064, "grad_norm": 0.3579232692718506, "learning_rate": 6.911865584159994e-05, "loss": 0.2289, "step": 19330 }, { "epoch": 2.626914326462698, "grad_norm": 0.5578670501708984, "learning_rate": 6.908213962786278e-05, "loss": 0.2249, "step": 19340 }, { "epoch": 2.6282726068796904, "grad_norm": 0.2886176109313965, "learning_rate": 6.90456114971754e-05, "loss": 0.2325, "step": 19350 }, { "epoch": 2.6296308872966825, "grad_norm": 0.3462935984134674, "learning_rate": 6.900907147234993e-05, "loss": 0.2134, "step": 19360 }, { "epoch": 2.6309891677136745, "grad_norm": 0.34147918224334717, "learning_rate": 6.897251957620587e-05, "loss": 0.2212, "step": 19370 }, { "epoch": 2.6323474481306666, "grad_norm": 0.3735399842262268, "learning_rate": 6.89359558315702e-05, "loss": 0.2213, "step": 19380 }, { "epoch": 2.6337057285476586, "grad_norm": 0.591202974319458, "learning_rate": 6.889938026127725e-05, "loss": 0.2156, "step": 19390 }, { "epoch": 2.6350640089646507, "grad_norm": 0.43868014216423035, "learning_rate": 6.886279288816878e-05, "loss": 0.2157, "step": 19400 }, { "epoch": 2.6364222893816427, "grad_norm": 0.39146044850349426, "learning_rate": 6.882619373509391e-05, "loss": 0.2236, "step": 19410 }, { "epoch": 2.637780569798635, "grad_norm": 0.4587043821811676, "learning_rate": 6.878958282490907e-05, "loss": 0.2265, "step": 19420 }, { "epoch": 2.6391388502156268, "grad_norm": 0.315970778465271, "learning_rate": 6.87529601804781e-05, "loss": 0.2208, "step": 19430 }, { "epoch": 2.6404971306326193, "grad_norm": 0.406206339597702, "learning_rate": 6.871632582467212e-05, "loss": 0.2263, "step": 19440 }, { "epoch": 2.6418554110496113, "grad_norm": 0.4027423858642578, "learning_rate": 6.867967978036959e-05, "loss": 0.2222, "step": 19450 }, { "epoch": 2.6432136914666033, "grad_norm": 0.33020728826522827, "learning_rate": 6.864302207045628e-05, "loss": 0.2217, "step": 19460 }, { "epoch": 2.6445719718835954, "grad_norm": 0.42348408699035645, "learning_rate": 6.860635271782519e-05, "loss": 0.2238, "step": 19470 }, { "epoch": 2.6459302523005874, "grad_norm": 0.37534770369529724, "learning_rate": 6.856967174537664e-05, "loss": 0.2228, "step": 19480 }, { "epoch": 2.6472885327175795, "grad_norm": 0.38852137327194214, "learning_rate": 6.85329791760182e-05, "loss": 0.2252, "step": 19490 }, { "epoch": 2.6486468131345715, "grad_norm": 0.4674433171749115, "learning_rate": 6.849627503266464e-05, "loss": 0.2215, "step": 19500 }, { "epoch": 2.650005093551564, "grad_norm": 0.3877376914024353, "learning_rate": 6.845955933823807e-05, "loss": 0.2234, "step": 19510 }, { "epoch": 2.6513633739685556, "grad_norm": 0.42665305733680725, "learning_rate": 6.842283211566767e-05, "loss": 0.2184, "step": 19520 }, { "epoch": 2.652721654385548, "grad_norm": 0.3758017420768738, "learning_rate": 6.83860933878899e-05, "loss": 0.2323, "step": 19530 }, { "epoch": 2.65407993480254, "grad_norm": 0.33159399032592773, "learning_rate": 6.834934317784841e-05, "loss": 0.2198, "step": 19540 }, { "epoch": 2.655438215219532, "grad_norm": 0.3533087372779846, "learning_rate": 6.8312581508494e-05, "loss": 0.2277, "step": 19550 }, { "epoch": 2.656796495636524, "grad_norm": 0.3744170069694519, "learning_rate": 6.82758084027846e-05, "loss": 0.2281, "step": 19560 }, { "epoch": 2.6581547760535162, "grad_norm": 0.506310224533081, "learning_rate": 6.823902388368536e-05, "loss": 0.2239, "step": 19570 }, { "epoch": 2.6595130564705083, "grad_norm": 0.5487335324287415, "learning_rate": 6.820222797416847e-05, "loss": 0.2182, "step": 19580 }, { "epoch": 2.6608713368875003, "grad_norm": 0.34415534138679504, "learning_rate": 6.81654206972133e-05, "loss": 0.2288, "step": 19590 }, { "epoch": 2.6622296173044924, "grad_norm": 0.35172897577285767, "learning_rate": 6.812860207580625e-05, "loss": 0.2072, "step": 19600 }, { "epoch": 2.6635878977214844, "grad_norm": 0.4019588530063629, "learning_rate": 6.809177213294089e-05, "loss": 0.2253, "step": 19610 }, { "epoch": 2.664946178138477, "grad_norm": 0.6600854396820068, "learning_rate": 6.80549308916178e-05, "loss": 0.2273, "step": 19620 }, { "epoch": 2.666304458555469, "grad_norm": 0.3658861219882965, "learning_rate": 6.801807837484463e-05, "loss": 0.2166, "step": 19630 }, { "epoch": 2.667662738972461, "grad_norm": 0.29930350184440613, "learning_rate": 6.798121460563608e-05, "loss": 0.2196, "step": 19640 }, { "epoch": 2.669021019389453, "grad_norm": 0.3651399314403534, "learning_rate": 6.794433960701387e-05, "loss": 0.2271, "step": 19650 }, { "epoch": 2.670379299806445, "grad_norm": 0.3585663139820099, "learning_rate": 6.790745340200672e-05, "loss": 0.2279, "step": 19660 }, { "epoch": 2.671737580223437, "grad_norm": 0.35577186942100525, "learning_rate": 6.787055601365038e-05, "loss": 0.2281, "step": 19670 }, { "epoch": 2.673095860640429, "grad_norm": 0.381797730922699, "learning_rate": 6.783364746498758e-05, "loss": 0.2285, "step": 19680 }, { "epoch": 2.674454141057421, "grad_norm": 0.4486427307128906, "learning_rate": 6.779672777906795e-05, "loss": 0.222, "step": 19690 }, { "epoch": 2.6758124214744132, "grad_norm": 0.31911206245422363, "learning_rate": 6.775979697894821e-05, "loss": 0.213, "step": 19700 }, { "epoch": 2.6771707018914057, "grad_norm": 0.3482758700847626, "learning_rate": 6.77228550876919e-05, "loss": 0.222, "step": 19710 }, { "epoch": 2.6785289823083973, "grad_norm": 0.9415132403373718, "learning_rate": 6.768590212836951e-05, "loss": 0.2182, "step": 19720 }, { "epoch": 2.67988726272539, "grad_norm": 0.3410845994949341, "learning_rate": 6.764893812405848e-05, "loss": 0.2369, "step": 19730 }, { "epoch": 2.681245543142382, "grad_norm": 0.3497271239757538, "learning_rate": 6.761196309784316e-05, "loss": 0.2178, "step": 19740 }, { "epoch": 2.682603823559374, "grad_norm": 0.4950864911079407, "learning_rate": 6.75749770728147e-05, "loss": 0.2209, "step": 19750 }, { "epoch": 2.683962103976366, "grad_norm": 0.3392435610294342, "learning_rate": 6.753798007207122e-05, "loss": 0.2162, "step": 19760 }, { "epoch": 2.685320384393358, "grad_norm": 0.3872303366661072, "learning_rate": 6.750097211871761e-05, "loss": 0.2293, "step": 19770 }, { "epoch": 2.68667866481035, "grad_norm": 0.38323667645454407, "learning_rate": 6.746395323586564e-05, "loss": 0.2221, "step": 19780 }, { "epoch": 2.688036945227342, "grad_norm": 0.33826202154159546, "learning_rate": 6.742692344663393e-05, "loss": 0.2286, "step": 19790 }, { "epoch": 2.6893952256443345, "grad_norm": 0.4248747229576111, "learning_rate": 6.738988277414784e-05, "loss": 0.2213, "step": 19800 }, { "epoch": 2.690753506061326, "grad_norm": 0.4481458067893982, "learning_rate": 6.73528312415396e-05, "loss": 0.2297, "step": 19810 }, { "epoch": 2.6921117864783186, "grad_norm": 0.43812301754951477, "learning_rate": 6.731576887194818e-05, "loss": 0.2103, "step": 19820 }, { "epoch": 2.6934700668953107, "grad_norm": 0.35246214270591736, "learning_rate": 6.727869568851933e-05, "loss": 0.2169, "step": 19830 }, { "epoch": 2.6948283473123027, "grad_norm": 0.4352017641067505, "learning_rate": 6.724161171440554e-05, "loss": 0.2167, "step": 19840 }, { "epoch": 2.6961866277292947, "grad_norm": 0.44771209359169006, "learning_rate": 6.720451697276608e-05, "loss": 0.2155, "step": 19850 }, { "epoch": 2.697544908146287, "grad_norm": 0.4226098954677582, "learning_rate": 6.71674114867669e-05, "loss": 0.2307, "step": 19860 }, { "epoch": 2.698903188563279, "grad_norm": 0.47660306096076965, "learning_rate": 6.713029527958068e-05, "loss": 0.2201, "step": 19870 }, { "epoch": 2.700261468980271, "grad_norm": 0.33955150842666626, "learning_rate": 6.709316837438677e-05, "loss": 0.2104, "step": 19880 }, { "epoch": 2.701619749397263, "grad_norm": 0.34553763270378113, "learning_rate": 6.705603079437123e-05, "loss": 0.2286, "step": 19890 }, { "epoch": 2.702978029814255, "grad_norm": 0.5438740849494934, "learning_rate": 6.70188825627268e-05, "loss": 0.2259, "step": 19900 }, { "epoch": 2.7043363102312474, "grad_norm": 0.3817773759365082, "learning_rate": 6.698172370265285e-05, "loss": 0.2198, "step": 19910 }, { "epoch": 2.7056945906482395, "grad_norm": 0.3596331775188446, "learning_rate": 6.694455423735535e-05, "loss": 0.2217, "step": 19920 }, { "epoch": 2.7070528710652315, "grad_norm": 0.4651833176612854, "learning_rate": 6.690737419004698e-05, "loss": 0.2226, "step": 19930 }, { "epoch": 2.7084111514822236, "grad_norm": 0.30929696559906006, "learning_rate": 6.687018358394694e-05, "loss": 0.22, "step": 19940 }, { "epoch": 2.7097694318992156, "grad_norm": 0.37631136178970337, "learning_rate": 6.683298244228106e-05, "loss": 0.2221, "step": 19950 }, { "epoch": 2.7111277123162076, "grad_norm": 0.4453674554824829, "learning_rate": 6.67957707882818e-05, "loss": 0.2289, "step": 19960 }, { "epoch": 2.7124859927331997, "grad_norm": 0.3455471098423004, "learning_rate": 6.675854864518812e-05, "loss": 0.2144, "step": 19970 }, { "epoch": 2.7138442731501917, "grad_norm": 0.6344315409660339, "learning_rate": 6.672131603624551e-05, "loss": 0.2137, "step": 19980 }, { "epoch": 2.7152025535671838, "grad_norm": 0.3075887858867645, "learning_rate": 6.668407298470609e-05, "loss": 0.22, "step": 19990 }, { "epoch": 2.7165608339841762, "grad_norm": 0.40567317605018616, "learning_rate": 6.66468195138284e-05, "loss": 0.2256, "step": 20000 }, { "epoch": 2.717919114401168, "grad_norm": 0.4422047734260559, "learning_rate": 6.660955564687756e-05, "loss": 0.228, "step": 20010 }, { "epoch": 2.7192773948181603, "grad_norm": 0.3186933398246765, "learning_rate": 6.657228140712515e-05, "loss": 0.2292, "step": 20020 }, { "epoch": 2.7206356752351524, "grad_norm": 0.36685749888420105, "learning_rate": 6.653499681784925e-05, "loss": 0.2252, "step": 20030 }, { "epoch": 2.7219939556521444, "grad_norm": 0.3671252131462097, "learning_rate": 6.649770190233435e-05, "loss": 0.2166, "step": 20040 }, { "epoch": 2.7233522360691365, "grad_norm": 0.3518591821193695, "learning_rate": 6.646039668387146e-05, "loss": 0.2231, "step": 20050 }, { "epoch": 2.7247105164861285, "grad_norm": 0.46431753039360046, "learning_rate": 6.642308118575796e-05, "loss": 0.2282, "step": 20060 }, { "epoch": 2.7260687969031205, "grad_norm": 0.35424497723579407, "learning_rate": 6.63857554312977e-05, "loss": 0.2207, "step": 20070 }, { "epoch": 2.7274270773201126, "grad_norm": 0.3903871476650238, "learning_rate": 6.634841944380092e-05, "loss": 0.2186, "step": 20080 }, { "epoch": 2.728785357737105, "grad_norm": 0.417592853307724, "learning_rate": 6.631107324658424e-05, "loss": 0.2288, "step": 20090 }, { "epoch": 2.7301436381540967, "grad_norm": 0.40019673109054565, "learning_rate": 6.627371686297064e-05, "loss": 0.2283, "step": 20100 }, { "epoch": 2.731501918571089, "grad_norm": 0.4178541898727417, "learning_rate": 6.623635031628951e-05, "loss": 0.223, "step": 20110 }, { "epoch": 2.732860198988081, "grad_norm": 0.4637932777404785, "learning_rate": 6.619897362987654e-05, "loss": 0.221, "step": 20120 }, { "epoch": 2.7342184794050732, "grad_norm": 0.36538973450660706, "learning_rate": 6.616158682707381e-05, "loss": 0.2102, "step": 20130 }, { "epoch": 2.7355767598220653, "grad_norm": 0.4854636788368225, "learning_rate": 6.612418993122964e-05, "loss": 0.2143, "step": 20140 }, { "epoch": 2.7369350402390573, "grad_norm": 0.36849257349967957, "learning_rate": 6.60867829656987e-05, "loss": 0.2206, "step": 20150 }, { "epoch": 2.7382933206560494, "grad_norm": 0.4071289896965027, "learning_rate": 6.604936595384195e-05, "loss": 0.2264, "step": 20160 }, { "epoch": 2.7396516010730414, "grad_norm": 0.41805315017700195, "learning_rate": 6.601193891902662e-05, "loss": 0.222, "step": 20170 }, { "epoch": 2.741009881490034, "grad_norm": 0.8620611429214478, "learning_rate": 6.59745018846262e-05, "loss": 0.2194, "step": 20180 }, { "epoch": 2.7423681619070255, "grad_norm": 0.41262975335121155, "learning_rate": 6.593705487402038e-05, "loss": 0.2265, "step": 20190 }, { "epoch": 2.743726442324018, "grad_norm": 0.6644691228866577, "learning_rate": 6.589959791059518e-05, "loss": 0.2195, "step": 20200 }, { "epoch": 2.74508472274101, "grad_norm": 0.5896000266075134, "learning_rate": 6.586213101774273e-05, "loss": 0.2205, "step": 20210 }, { "epoch": 2.746443003158002, "grad_norm": 0.4981018006801605, "learning_rate": 6.582465421886143e-05, "loss": 0.2252, "step": 20220 }, { "epoch": 2.747801283574994, "grad_norm": 0.5343995690345764, "learning_rate": 6.578716753735583e-05, "loss": 0.2308, "step": 20230 }, { "epoch": 2.749159563991986, "grad_norm": 0.3990219235420227, "learning_rate": 6.574967099663668e-05, "loss": 0.2313, "step": 20240 }, { "epoch": 2.750517844408978, "grad_norm": 0.3259849548339844, "learning_rate": 6.571216462012086e-05, "loss": 0.2239, "step": 20250 }, { "epoch": 2.75187612482597, "grad_norm": 0.5746831297874451, "learning_rate": 6.567464843123142e-05, "loss": 0.2228, "step": 20260 }, { "epoch": 2.7532344052429623, "grad_norm": 0.46299394965171814, "learning_rate": 6.56371224533975e-05, "loss": 0.2316, "step": 20270 }, { "epoch": 2.7545926856599543, "grad_norm": 0.4156789481639862, "learning_rate": 6.55995867100544e-05, "loss": 0.2207, "step": 20280 }, { "epoch": 2.755950966076947, "grad_norm": 0.439792275428772, "learning_rate": 6.556204122464347e-05, "loss": 0.2185, "step": 20290 }, { "epoch": 2.757309246493939, "grad_norm": 0.5094109177589417, "learning_rate": 6.552448602061219e-05, "loss": 0.2147, "step": 20300 }, { "epoch": 2.758667526910931, "grad_norm": 0.38743048906326294, "learning_rate": 6.548692112141408e-05, "loss": 0.2211, "step": 20310 }, { "epoch": 2.760025807327923, "grad_norm": 0.5876513123512268, "learning_rate": 6.544934655050871e-05, "loss": 0.2205, "step": 20320 }, { "epoch": 2.761384087744915, "grad_norm": 0.5204431414604187, "learning_rate": 6.541176233136172e-05, "loss": 0.2129, "step": 20330 }, { "epoch": 2.762742368161907, "grad_norm": 0.4141252636909485, "learning_rate": 6.537416848744477e-05, "loss": 0.224, "step": 20340 }, { "epoch": 2.764100648578899, "grad_norm": 0.5133291482925415, "learning_rate": 6.53365650422355e-05, "loss": 0.2223, "step": 20350 }, { "epoch": 2.765458928995891, "grad_norm": 0.40256938338279724, "learning_rate": 6.529895201921755e-05, "loss": 0.2253, "step": 20360 }, { "epoch": 2.766817209412883, "grad_norm": 0.4410557746887207, "learning_rate": 6.526132944188058e-05, "loss": 0.22, "step": 20370 }, { "epoch": 2.7681754898298756, "grad_norm": 0.3975759744644165, "learning_rate": 6.52236973337202e-05, "loss": 0.2227, "step": 20380 }, { "epoch": 2.769533770246867, "grad_norm": 0.7161093950271606, "learning_rate": 6.518605571823792e-05, "loss": 0.2218, "step": 20390 }, { "epoch": 2.7708920506638597, "grad_norm": 0.9059826135635376, "learning_rate": 6.514840461894129e-05, "loss": 0.2211, "step": 20400 }, { "epoch": 2.7722503310808517, "grad_norm": 0.5957664847373962, "learning_rate": 6.511074405934371e-05, "loss": 0.2265, "step": 20410 }, { "epoch": 2.7736086114978438, "grad_norm": 0.47573816776275635, "learning_rate": 6.507307406296448e-05, "loss": 0.2229, "step": 20420 }, { "epoch": 2.774966891914836, "grad_norm": 0.4443944990634918, "learning_rate": 6.503539465332885e-05, "loss": 0.2169, "step": 20430 }, { "epoch": 2.776325172331828, "grad_norm": 0.4381183981895447, "learning_rate": 6.499770585396786e-05, "loss": 0.2162, "step": 20440 }, { "epoch": 2.77768345274882, "grad_norm": 0.4882720112800598, "learning_rate": 6.496000768841852e-05, "loss": 0.2199, "step": 20450 }, { "epoch": 2.779041733165812, "grad_norm": 0.5568972229957581, "learning_rate": 6.492230018022364e-05, "loss": 0.2244, "step": 20460 }, { "epoch": 2.7804000135828044, "grad_norm": 0.5025323629379272, "learning_rate": 6.488458335293183e-05, "loss": 0.2229, "step": 20470 }, { "epoch": 2.781758293999796, "grad_norm": 0.49633491039276123, "learning_rate": 6.484685723009758e-05, "loss": 0.2249, "step": 20480 }, { "epoch": 2.7831165744167885, "grad_norm": 0.5024377703666687, "learning_rate": 6.480912183528116e-05, "loss": 0.2181, "step": 20490 }, { "epoch": 2.7844748548337805, "grad_norm": 0.6363875865936279, "learning_rate": 6.47713771920486e-05, "loss": 0.2238, "step": 20500 }, { "epoch": 2.7858331352507726, "grad_norm": 0.36526432633399963, "learning_rate": 6.473362332397175e-05, "loss": 0.2148, "step": 20510 }, { "epoch": 2.7871914156677646, "grad_norm": 0.3785194754600525, "learning_rate": 6.469586025462821e-05, "loss": 0.2239, "step": 20520 }, { "epoch": 2.7885496960847567, "grad_norm": 0.5447390675544739, "learning_rate": 6.465808800760133e-05, "loss": 0.2252, "step": 20530 }, { "epoch": 2.7899079765017487, "grad_norm": 0.8131313920021057, "learning_rate": 6.462030660648017e-05, "loss": 0.2301, "step": 20540 }, { "epoch": 2.7912662569187408, "grad_norm": 0.6130920648574829, "learning_rate": 6.458251607485952e-05, "loss": 0.2239, "step": 20550 }, { "epoch": 2.7926245373357332, "grad_norm": 0.46628615260124207, "learning_rate": 6.454471643633987e-05, "loss": 0.2251, "step": 20560 }, { "epoch": 2.793982817752725, "grad_norm": 0.48288917541503906, "learning_rate": 6.450690771452741e-05, "loss": 0.2281, "step": 20570 }, { "epoch": 2.7953410981697173, "grad_norm": 0.7290740609169006, "learning_rate": 6.446908993303398e-05, "loss": 0.2191, "step": 20580 }, { "epoch": 2.7966993785867094, "grad_norm": 0.41732752323150635, "learning_rate": 6.443126311547709e-05, "loss": 0.2305, "step": 20590 }, { "epoch": 2.7980576590037014, "grad_norm": 0.322334885597229, "learning_rate": 6.439342728547991e-05, "loss": 0.2247, "step": 20600 }, { "epoch": 2.7994159394206934, "grad_norm": 0.3089941143989563, "learning_rate": 6.43555824666712e-05, "loss": 0.2245, "step": 20610 }, { "epoch": 2.8007742198376855, "grad_norm": 0.43546292185783386, "learning_rate": 6.431772868268537e-05, "loss": 0.2223, "step": 20620 }, { "epoch": 2.8021325002546775, "grad_norm": 0.40323957800865173, "learning_rate": 6.427986595716241e-05, "loss": 0.2232, "step": 20630 }, { "epoch": 2.8034907806716696, "grad_norm": 0.8291993141174316, "learning_rate": 6.424199431374788e-05, "loss": 0.217, "step": 20640 }, { "epoch": 2.8048490610886616, "grad_norm": 0.39431655406951904, "learning_rate": 6.420411377609297e-05, "loss": 0.2393, "step": 20650 }, { "epoch": 2.8062073415056537, "grad_norm": 0.40056371688842773, "learning_rate": 6.416622436785434e-05, "loss": 0.224, "step": 20660 }, { "epoch": 2.807565621922646, "grad_norm": 0.30844759941101074, "learning_rate": 6.412832611269424e-05, "loss": 0.2173, "step": 20670 }, { "epoch": 2.808923902339638, "grad_norm": 0.36609190702438354, "learning_rate": 6.409041903428045e-05, "loss": 0.2208, "step": 20680 }, { "epoch": 2.81028218275663, "grad_norm": 0.9679686427116394, "learning_rate": 6.405250315628624e-05, "loss": 0.2201, "step": 20690 }, { "epoch": 2.8116404631736223, "grad_norm": 0.31677964329719543, "learning_rate": 6.401457850239036e-05, "loss": 0.2153, "step": 20700 }, { "epoch": 2.8129987435906143, "grad_norm": 0.4036737084388733, "learning_rate": 6.397664509627711e-05, "loss": 0.2122, "step": 20710 }, { "epoch": 2.8143570240076063, "grad_norm": 0.38754427433013916, "learning_rate": 6.393870296163615e-05, "loss": 0.2211, "step": 20720 }, { "epoch": 2.8157153044245984, "grad_norm": 0.5116146206855774, "learning_rate": 6.390075212216271e-05, "loss": 0.217, "step": 20730 }, { "epoch": 2.8170735848415904, "grad_norm": 0.3810253441333771, "learning_rate": 6.386279260155735e-05, "loss": 0.2277, "step": 20740 }, { "epoch": 2.8184318652585825, "grad_norm": 0.6304855942726135, "learning_rate": 6.382482442352611e-05, "loss": 0.2318, "step": 20750 }, { "epoch": 2.819790145675575, "grad_norm": 0.3044786751270294, "learning_rate": 6.378684761178045e-05, "loss": 0.2214, "step": 20760 }, { "epoch": 2.8211484260925666, "grad_norm": 0.3380535840988159, "learning_rate": 6.374886219003715e-05, "loss": 0.2158, "step": 20770 }, { "epoch": 2.822506706509559, "grad_norm": 0.4985421299934387, "learning_rate": 6.371086818201843e-05, "loss": 0.2256, "step": 20780 }, { "epoch": 2.823864986926551, "grad_norm": 0.33762213587760925, "learning_rate": 6.367286561145187e-05, "loss": 0.2263, "step": 20790 }, { "epoch": 2.825223267343543, "grad_norm": 0.3583674430847168, "learning_rate": 6.363485450207037e-05, "loss": 0.2087, "step": 20800 }, { "epoch": 2.826581547760535, "grad_norm": 0.41844236850738525, "learning_rate": 6.359683487761219e-05, "loss": 0.2198, "step": 20810 }, { "epoch": 2.827939828177527, "grad_norm": 0.4017079770565033, "learning_rate": 6.355880676182086e-05, "loss": 0.2181, "step": 20820 }, { "epoch": 2.8292981085945192, "grad_norm": 0.4796137809753418, "learning_rate": 6.352077017844526e-05, "loss": 0.2217, "step": 20830 }, { "epoch": 2.8306563890115113, "grad_norm": 0.5771798491477966, "learning_rate": 6.348272515123955e-05, "loss": 0.2284, "step": 20840 }, { "epoch": 2.8320146694285038, "grad_norm": 1.610284447669983, "learning_rate": 6.344467170396316e-05, "loss": 0.2305, "step": 20850 }, { "epoch": 2.8333729498454954, "grad_norm": 0.38275888562202454, "learning_rate": 6.340660986038077e-05, "loss": 0.2203, "step": 20860 }, { "epoch": 2.834731230262488, "grad_norm": 0.4177561104297638, "learning_rate": 6.336853964426231e-05, "loss": 0.2195, "step": 20870 }, { "epoch": 2.83608951067948, "grad_norm": 0.5642317533493042, "learning_rate": 6.333046107938292e-05, "loss": 0.2277, "step": 20880 }, { "epoch": 2.837447791096472, "grad_norm": 0.38792186975479126, "learning_rate": 6.329237418952298e-05, "loss": 0.2166, "step": 20890 }, { "epoch": 2.838806071513464, "grad_norm": 0.3562542200088501, "learning_rate": 6.325427899846807e-05, "loss": 0.2261, "step": 20900 }, { "epoch": 2.840164351930456, "grad_norm": 0.3323986828327179, "learning_rate": 6.321617553000896e-05, "loss": 0.2262, "step": 20910 }, { "epoch": 2.841522632347448, "grad_norm": 0.5957162380218506, "learning_rate": 6.317806380794154e-05, "loss": 0.2141, "step": 20920 }, { "epoch": 2.84288091276444, "grad_norm": 0.3997288644313812, "learning_rate": 6.313994385606689e-05, "loss": 0.2274, "step": 20930 }, { "epoch": 2.844239193181432, "grad_norm": 0.37631654739379883, "learning_rate": 6.310181569819125e-05, "loss": 0.2202, "step": 20940 }, { "epoch": 2.845597473598424, "grad_norm": 0.35834816098213196, "learning_rate": 6.306367935812595e-05, "loss": 0.2188, "step": 20950 }, { "epoch": 2.8469557540154167, "grad_norm": 0.47011682391166687, "learning_rate": 6.302553485968745e-05, "loss": 0.223, "step": 20960 }, { "epoch": 2.8483140344324087, "grad_norm": 0.40326008200645447, "learning_rate": 6.298738222669729e-05, "loss": 0.2149, "step": 20970 }, { "epoch": 2.8496723148494008, "grad_norm": 0.4038887321949005, "learning_rate": 6.294922148298208e-05, "loss": 0.2091, "step": 20980 }, { "epoch": 2.851030595266393, "grad_norm": 0.4332898259162903, "learning_rate": 6.291105265237356e-05, "loss": 0.2204, "step": 20990 }, { "epoch": 2.852388875683385, "grad_norm": 0.3278917670249939, "learning_rate": 6.287287575870842e-05, "loss": 0.2203, "step": 21000 }, { "epoch": 2.853747156100377, "grad_norm": 0.3090645372867584, "learning_rate": 6.283469082582847e-05, "loss": 0.2181, "step": 21010 }, { "epoch": 2.855105436517369, "grad_norm": 0.3650490939617157, "learning_rate": 6.27964978775805e-05, "loss": 0.2126, "step": 21020 }, { "epoch": 2.856463716934361, "grad_norm": 0.3941207230091095, "learning_rate": 6.27582969378163e-05, "loss": 0.2294, "step": 21030 }, { "epoch": 2.857821997351353, "grad_norm": 0.4040345251560211, "learning_rate": 6.27200880303927e-05, "loss": 0.2257, "step": 21040 }, { "epoch": 2.8591802777683455, "grad_norm": 0.42537635564804077, "learning_rate": 6.268187117917144e-05, "loss": 0.2244, "step": 21050 }, { "epoch": 2.860538558185337, "grad_norm": 0.4814678132534027, "learning_rate": 6.264364640801928e-05, "loss": 0.2232, "step": 21060 }, { "epoch": 2.8618968386023296, "grad_norm": 0.6218770146369934, "learning_rate": 6.260541374080786e-05, "loss": 0.2183, "step": 21070 }, { "epoch": 2.8632551190193216, "grad_norm": 0.3691994249820709, "learning_rate": 6.256717320141384e-05, "loss": 0.2303, "step": 21080 }, { "epoch": 2.8646133994363137, "grad_norm": 0.4578982889652252, "learning_rate": 6.252892481371868e-05, "loss": 0.2288, "step": 21090 }, { "epoch": 2.8659716798533057, "grad_norm": 0.34923189878463745, "learning_rate": 6.249066860160886e-05, "loss": 0.2254, "step": 21100 }, { "epoch": 2.8673299602702977, "grad_norm": 0.4254482388496399, "learning_rate": 6.245240458897566e-05, "loss": 0.223, "step": 21110 }, { "epoch": 2.86868824068729, "grad_norm": 0.31372958421707153, "learning_rate": 6.24141327997153e-05, "loss": 0.2233, "step": 21120 }, { "epoch": 2.870046521104282, "grad_norm": 0.3327460289001465, "learning_rate": 6.237585325772881e-05, "loss": 0.2165, "step": 21130 }, { "epoch": 2.8714048015212743, "grad_norm": 0.3533669710159302, "learning_rate": 6.233756598692205e-05, "loss": 0.2157, "step": 21140 }, { "epoch": 2.872763081938266, "grad_norm": 0.35265395045280457, "learning_rate": 6.229927101120575e-05, "loss": 0.215, "step": 21150 }, { "epoch": 2.8741213623552584, "grad_norm": 0.4106992483139038, "learning_rate": 6.226096835449542e-05, "loss": 0.2208, "step": 21160 }, { "epoch": 2.8754796427722504, "grad_norm": 0.39340105652809143, "learning_rate": 6.222265804071139e-05, "loss": 0.2256, "step": 21170 }, { "epoch": 2.8768379231892425, "grad_norm": 0.3429628610610962, "learning_rate": 6.218434009377875e-05, "loss": 0.2178, "step": 21180 }, { "epoch": 2.8781962036062345, "grad_norm": 0.6223356127738953, "learning_rate": 6.214601453762738e-05, "loss": 0.2193, "step": 21190 }, { "epoch": 2.8795544840232266, "grad_norm": 0.31285038590431213, "learning_rate": 6.210768139619189e-05, "loss": 0.2168, "step": 21200 }, { "epoch": 2.8809127644402186, "grad_norm": 0.4172757863998413, "learning_rate": 6.206934069341164e-05, "loss": 0.2179, "step": 21210 }, { "epoch": 2.8822710448572106, "grad_norm": 0.28440403938293457, "learning_rate": 6.20309924532307e-05, "loss": 0.2262, "step": 21220 }, { "epoch": 2.883629325274203, "grad_norm": 0.4297007918357849, "learning_rate": 6.199263669959786e-05, "loss": 0.2203, "step": 21230 }, { "epoch": 2.8849876056911947, "grad_norm": 0.4779892563819885, "learning_rate": 6.195427345646659e-05, "loss": 0.2197, "step": 21240 }, { "epoch": 2.886345886108187, "grad_norm": 0.36403754353523254, "learning_rate": 6.191590274779508e-05, "loss": 0.2206, "step": 21250 }, { "epoch": 2.8877041665251793, "grad_norm": 0.427384614944458, "learning_rate": 6.18775245975461e-05, "loss": 0.2202, "step": 21260 }, { "epoch": 2.8890624469421713, "grad_norm": 0.2878035604953766, "learning_rate": 6.183913902968714e-05, "loss": 0.2162, "step": 21270 }, { "epoch": 2.8904207273591633, "grad_norm": 0.4710155129432678, "learning_rate": 6.18007460681903e-05, "loss": 0.2205, "step": 21280 }, { "epoch": 2.8917790077761554, "grad_norm": 0.3774133324623108, "learning_rate": 6.176234573703227e-05, "loss": 0.2207, "step": 21290 }, { "epoch": 2.8931372881931474, "grad_norm": 0.3471783399581909, "learning_rate": 6.172393806019441e-05, "loss": 0.2164, "step": 21300 }, { "epoch": 2.8944955686101395, "grad_norm": 0.38091152906417847, "learning_rate": 6.168552306166258e-05, "loss": 0.2223, "step": 21310 }, { "epoch": 2.8958538490271315, "grad_norm": 0.4818286895751953, "learning_rate": 6.164710076542728e-05, "loss": 0.2245, "step": 21320 }, { "epoch": 2.8972121294441235, "grad_norm": 0.35584893822669983, "learning_rate": 6.160867119548352e-05, "loss": 0.223, "step": 21330 }, { "epoch": 2.898570409861116, "grad_norm": 0.3149985373020172, "learning_rate": 6.157023437583093e-05, "loss": 0.2217, "step": 21340 }, { "epoch": 2.899928690278108, "grad_norm": 0.4001374840736389, "learning_rate": 6.153179033047356e-05, "loss": 0.2164, "step": 21350 }, { "epoch": 2.9012869706951, "grad_norm": 0.43292590975761414, "learning_rate": 6.149333908342005e-05, "loss": 0.2184, "step": 21360 }, { "epoch": 2.902645251112092, "grad_norm": 0.4383385181427002, "learning_rate": 6.145488065868352e-05, "loss": 0.2184, "step": 21370 }, { "epoch": 2.904003531529084, "grad_norm": 0.36491161584854126, "learning_rate": 6.141641508028154e-05, "loss": 0.2089, "step": 21380 }, { "epoch": 2.9053618119460762, "grad_norm": 0.3600454330444336, "learning_rate": 6.13779423722362e-05, "loss": 0.2199, "step": 21390 }, { "epoch": 2.9067200923630683, "grad_norm": 0.45122113823890686, "learning_rate": 6.133946255857401e-05, "loss": 0.2306, "step": 21400 }, { "epoch": 2.9080783727800603, "grad_norm": 0.49834099411964417, "learning_rate": 6.130097566332593e-05, "loss": 0.226, "step": 21410 }, { "epoch": 2.9094366531970524, "grad_norm": 0.5697745084762573, "learning_rate": 6.126248171052731e-05, "loss": 0.2175, "step": 21420 }, { "epoch": 2.910794933614045, "grad_norm": 0.3719008266925812, "learning_rate": 6.122398072421796e-05, "loss": 0.2166, "step": 21430 }, { "epoch": 2.9121532140310364, "grad_norm": 0.37002283334732056, "learning_rate": 6.118547272844205e-05, "loss": 0.2206, "step": 21440 }, { "epoch": 2.913511494448029, "grad_norm": 0.6166349053382874, "learning_rate": 6.114695774724814e-05, "loss": 0.2171, "step": 21450 }, { "epoch": 2.914869774865021, "grad_norm": 0.3591141998767853, "learning_rate": 6.110843580468913e-05, "loss": 0.2233, "step": 21460 }, { "epoch": 2.916228055282013, "grad_norm": 0.45251932740211487, "learning_rate": 6.10699069248223e-05, "loss": 0.21, "step": 21470 }, { "epoch": 2.917586335699005, "grad_norm": 0.4817533493041992, "learning_rate": 6.103137113170926e-05, "loss": 0.2207, "step": 21480 }, { "epoch": 2.918944616115997, "grad_norm": 0.4513356387615204, "learning_rate": 6.099282844941588e-05, "loss": 0.2171, "step": 21490 }, { "epoch": 2.920302896532989, "grad_norm": 0.4395926594734192, "learning_rate": 6.0954278902012406e-05, "loss": 0.2222, "step": 21500 }, { "epoch": 2.921661176949981, "grad_norm": 0.3441222310066223, "learning_rate": 6.0915722513573346e-05, "loss": 0.2254, "step": 21510 }, { "epoch": 2.9230194573669737, "grad_norm": 0.4153713881969452, "learning_rate": 6.087715930817747e-05, "loss": 0.2222, "step": 21520 }, { "epoch": 2.9243777377839653, "grad_norm": 0.5554759502410889, "learning_rate": 6.08385893099078e-05, "loss": 0.2201, "step": 21530 }, { "epoch": 2.9257360182009577, "grad_norm": 0.3858485221862793, "learning_rate": 6.080001254285161e-05, "loss": 0.2243, "step": 21540 }, { "epoch": 2.92709429861795, "grad_norm": 0.38889816403388977, "learning_rate": 6.076142903110044e-05, "loss": 0.2314, "step": 21550 }, { "epoch": 2.928452579034942, "grad_norm": 0.3956488072872162, "learning_rate": 6.072283879874994e-05, "loss": 0.2191, "step": 21560 }, { "epoch": 2.929810859451934, "grad_norm": 0.4923965036869049, "learning_rate": 6.068424186990007e-05, "loss": 0.2234, "step": 21570 }, { "epoch": 2.931169139868926, "grad_norm": 0.4151303470134735, "learning_rate": 6.064563826865492e-05, "loss": 0.2151, "step": 21580 }, { "epoch": 2.932527420285918, "grad_norm": 0.5072651505470276, "learning_rate": 6.0607028019122745e-05, "loss": 0.2267, "step": 21590 }, { "epoch": 2.93388570070291, "grad_norm": 0.4687449634075165, "learning_rate": 6.056841114541595e-05, "loss": 0.2236, "step": 21600 }, { "epoch": 2.9352439811199025, "grad_norm": 0.5488258004188538, "learning_rate": 6.052978767165107e-05, "loss": 0.2264, "step": 21610 }, { "epoch": 2.936602261536894, "grad_norm": 0.3529285788536072, "learning_rate": 6.049115762194881e-05, "loss": 0.2227, "step": 21620 }, { "epoch": 2.9379605419538866, "grad_norm": 0.3785221576690674, "learning_rate": 6.0452521020433947e-05, "loss": 0.2154, "step": 21630 }, { "epoch": 2.9393188223708786, "grad_norm": 0.5670950412750244, "learning_rate": 6.0413877891235326e-05, "loss": 0.2166, "step": 21640 }, { "epoch": 2.9406771027878706, "grad_norm": 1.1755040884017944, "learning_rate": 6.0375228258485894e-05, "loss": 0.2221, "step": 21650 }, { "epoch": 2.9420353832048627, "grad_norm": 0.4152349531650543, "learning_rate": 6.0336572146322676e-05, "loss": 0.2217, "step": 21660 }, { "epoch": 2.9433936636218547, "grad_norm": 0.6543741226196289, "learning_rate": 6.029790957888671e-05, "loss": 0.225, "step": 21670 }, { "epoch": 2.9447519440388468, "grad_norm": 0.5600965619087219, "learning_rate": 6.025924058032309e-05, "loss": 0.2293, "step": 21680 }, { "epoch": 2.946110224455839, "grad_norm": 0.3869888186454773, "learning_rate": 6.022056517478093e-05, "loss": 0.2245, "step": 21690 }, { "epoch": 2.947468504872831, "grad_norm": 0.3750194013118744, "learning_rate": 6.018188338641332e-05, "loss": 0.2324, "step": 21700 }, { "epoch": 2.948826785289823, "grad_norm": 1.340261697769165, "learning_rate": 6.0143195239377335e-05, "loss": 0.2228, "step": 21710 }, { "epoch": 2.9501850657068154, "grad_norm": 0.3500477969646454, "learning_rate": 6.0104500757834056e-05, "loss": 0.2227, "step": 21720 }, { "epoch": 2.9515433461238074, "grad_norm": 0.3757529556751251, "learning_rate": 6.0065799965948486e-05, "loss": 0.2222, "step": 21730 }, { "epoch": 2.9529016265407995, "grad_norm": 0.34145471453666687, "learning_rate": 6.00270928878896e-05, "loss": 0.2203, "step": 21740 }, { "epoch": 2.9542599069577915, "grad_norm": 0.289265900850296, "learning_rate": 5.998837954783027e-05, "loss": 0.2308, "step": 21750 }, { "epoch": 2.9556181873747835, "grad_norm": 0.37081602215766907, "learning_rate": 5.994965996994728e-05, "loss": 0.226, "step": 21760 }, { "epoch": 2.9569764677917756, "grad_norm": 0.44045135378837585, "learning_rate": 5.9910934178421326e-05, "loss": 0.2277, "step": 21770 }, { "epoch": 2.9583347482087676, "grad_norm": 0.36819690465927124, "learning_rate": 5.987220219743699e-05, "loss": 0.2328, "step": 21780 }, { "epoch": 2.9596930286257597, "grad_norm": 0.32799577713012695, "learning_rate": 5.9833464051182674e-05, "loss": 0.2229, "step": 21790 }, { "epoch": 2.9610513090427517, "grad_norm": 0.4577489495277405, "learning_rate": 5.979471976385069e-05, "loss": 0.2281, "step": 21800 }, { "epoch": 2.962409589459744, "grad_norm": 0.381186306476593, "learning_rate": 5.975596935963716e-05, "loss": 0.2165, "step": 21810 }, { "epoch": 2.963767869876736, "grad_norm": 0.46059027314186096, "learning_rate": 5.9717212862742025e-05, "loss": 0.2237, "step": 21820 }, { "epoch": 2.9651261502937283, "grad_norm": 0.40138885378837585, "learning_rate": 5.967845029736901e-05, "loss": 0.2288, "step": 21830 }, { "epoch": 2.9664844307107203, "grad_norm": 0.34390026330947876, "learning_rate": 5.9639681687725666e-05, "loss": 0.23, "step": 21840 }, { "epoch": 2.9678427111277124, "grad_norm": 0.4854002892971039, "learning_rate": 5.9600907058023295e-05, "loss": 0.2233, "step": 21850 }, { "epoch": 2.9692009915447044, "grad_norm": 0.5247782468795776, "learning_rate": 5.956212643247697e-05, "loss": 0.2199, "step": 21860 }, { "epoch": 2.9705592719616964, "grad_norm": 0.32356032729148865, "learning_rate": 5.952333983530551e-05, "loss": 0.2138, "step": 21870 }, { "epoch": 2.9719175523786885, "grad_norm": 0.3924809396266937, "learning_rate": 5.948454729073145e-05, "loss": 0.219, "step": 21880 }, { "epoch": 2.9732758327956805, "grad_norm": 0.4289812445640564, "learning_rate": 5.944574882298105e-05, "loss": 0.2294, "step": 21890 }, { "epoch": 2.974634113212673, "grad_norm": 0.5584338307380676, "learning_rate": 5.9406944456284264e-05, "loss": 0.2204, "step": 21900 }, { "epoch": 2.9759923936296646, "grad_norm": 0.36858996748924255, "learning_rate": 5.936813421487474e-05, "loss": 0.232, "step": 21910 }, { "epoch": 2.977350674046657, "grad_norm": 0.8937425017356873, "learning_rate": 5.932931812298978e-05, "loss": 0.2116, "step": 21920 }, { "epoch": 2.978708954463649, "grad_norm": 0.3569163382053375, "learning_rate": 5.929049620487034e-05, "loss": 0.2258, "step": 21930 }, { "epoch": 2.980067234880641, "grad_norm": 0.4328438341617584, "learning_rate": 5.9251668484761035e-05, "loss": 0.2085, "step": 21940 }, { "epoch": 2.981425515297633, "grad_norm": 0.5336624383926392, "learning_rate": 5.921283498691008e-05, "loss": 0.2268, "step": 21950 }, { "epoch": 2.9827837957146253, "grad_norm": 0.4557378888130188, "learning_rate": 5.91739957355693e-05, "loss": 0.2189, "step": 21960 }, { "epoch": 2.9841420761316173, "grad_norm": 0.4392958879470825, "learning_rate": 5.913515075499415e-05, "loss": 0.222, "step": 21970 }, { "epoch": 2.9855003565486093, "grad_norm": 0.39971327781677246, "learning_rate": 5.909630006944362e-05, "loss": 0.225, "step": 21980 }, { "epoch": 2.9868586369656014, "grad_norm": 0.7787989377975464, "learning_rate": 5.905744370318026e-05, "loss": 0.2273, "step": 21990 }, { "epoch": 2.9882169173825934, "grad_norm": 0.8093920350074768, "learning_rate": 5.9018581680470206e-05, "loss": 0.2232, "step": 22000 }, { "epoch": 2.989575197799586, "grad_norm": 0.3496170938014984, "learning_rate": 5.897971402558309e-05, "loss": 0.2147, "step": 22010 }, { "epoch": 2.990933478216578, "grad_norm": 0.49405068159103394, "learning_rate": 5.8940840762792084e-05, "loss": 0.2212, "step": 22020 }, { "epoch": 2.99229175863357, "grad_norm": 0.8607224822044373, "learning_rate": 5.890196191637386e-05, "loss": 0.2197, "step": 22030 }, { "epoch": 2.993650039050562, "grad_norm": 0.6923351287841797, "learning_rate": 5.8863077510608556e-05, "loss": 0.2214, "step": 22040 }, { "epoch": 2.995008319467554, "grad_norm": 0.339284211397171, "learning_rate": 5.882418756977981e-05, "loss": 0.2297, "step": 22050 }, { "epoch": 2.996366599884546, "grad_norm": 0.7265267372131348, "learning_rate": 5.878529211817471e-05, "loss": 0.2123, "step": 22060 }, { "epoch": 2.997724880301538, "grad_norm": 0.35381853580474854, "learning_rate": 5.874639118008374e-05, "loss": 0.2245, "step": 22070 }, { "epoch": 2.99908316071853, "grad_norm": 0.3038233816623688, "learning_rate": 5.87074847798009e-05, "loss": 0.2162, "step": 22080 }, { "epoch": 3.0004414411355222, "grad_norm": 0.43029147386550903, "learning_rate": 5.8668572941623535e-05, "loss": 0.2272, "step": 22090 }, { "epoch": 3.0017997215525147, "grad_norm": 0.3131086826324463, "learning_rate": 5.862965568985239e-05, "loss": 0.1878, "step": 22100 }, { "epoch": 3.0031580019695068, "grad_norm": 0.45259684324264526, "learning_rate": 5.859073304879161e-05, "loss": 0.2091, "step": 22110 }, { "epoch": 3.004516282386499, "grad_norm": 0.3443499505519867, "learning_rate": 5.85518050427487e-05, "loss": 0.1968, "step": 22120 }, { "epoch": 3.005874562803491, "grad_norm": 0.4133569896221161, "learning_rate": 5.85128716960345e-05, "loss": 0.1985, "step": 22130 }, { "epoch": 3.007232843220483, "grad_norm": 0.3862740695476532, "learning_rate": 5.847393303296322e-05, "loss": 0.2032, "step": 22140 }, { "epoch": 3.008591123637475, "grad_norm": 0.5262600779533386, "learning_rate": 5.8434989077852366e-05, "loss": 0.1978, "step": 22150 }, { "epoch": 3.009949404054467, "grad_norm": 0.46052494645118713, "learning_rate": 5.8396039855022746e-05, "loss": 0.1965, "step": 22160 }, { "epoch": 3.011307684471459, "grad_norm": 0.4434579610824585, "learning_rate": 5.835708538879846e-05, "loss": 0.1951, "step": 22170 }, { "epoch": 3.012665964888451, "grad_norm": 0.41567879915237427, "learning_rate": 5.8318125703506895e-05, "loss": 0.1896, "step": 22180 }, { "epoch": 3.014024245305443, "grad_norm": 0.34274792671203613, "learning_rate": 5.827916082347868e-05, "loss": 0.1908, "step": 22190 }, { "epoch": 3.0153825257224356, "grad_norm": 0.542848527431488, "learning_rate": 5.824019077304772e-05, "loss": 0.2008, "step": 22200 }, { "epoch": 3.0167408061394276, "grad_norm": 0.41481274366378784, "learning_rate": 5.820121557655109e-05, "loss": 0.193, "step": 22210 }, { "epoch": 3.0180990865564197, "grad_norm": 0.33878418803215027, "learning_rate": 5.816223525832916e-05, "loss": 0.1978, "step": 22220 }, { "epoch": 3.0194573669734117, "grad_norm": 0.45369836688041687, "learning_rate": 5.8123249842725405e-05, "loss": 0.1908, "step": 22230 }, { "epoch": 3.0208156473904038, "grad_norm": 0.33093494176864624, "learning_rate": 5.808425935408655e-05, "loss": 0.1947, "step": 22240 }, { "epoch": 3.022173927807396, "grad_norm": 0.36740002036094666, "learning_rate": 5.804526381676249e-05, "loss": 0.1887, "step": 22250 }, { "epoch": 3.023532208224388, "grad_norm": 0.4273303151130676, "learning_rate": 5.800626325510622e-05, "loss": 0.2017, "step": 22260 }, { "epoch": 3.02489048864138, "grad_norm": 0.3919294476509094, "learning_rate": 5.796725769347394e-05, "loss": 0.1935, "step": 22270 }, { "epoch": 3.026248769058372, "grad_norm": 0.38185834884643555, "learning_rate": 5.7928247156224914e-05, "loss": 0.2015, "step": 22280 }, { "epoch": 3.0276070494753644, "grad_norm": 0.3406679630279541, "learning_rate": 5.788923166772155e-05, "loss": 0.1891, "step": 22290 }, { "epoch": 3.0289653298923565, "grad_norm": 0.5562617182731628, "learning_rate": 5.78502112523293e-05, "loss": 0.1925, "step": 22300 }, { "epoch": 3.0303236103093485, "grad_norm": 0.7805081605911255, "learning_rate": 5.781118593441679e-05, "loss": 0.1957, "step": 22310 }, { "epoch": 3.0316818907263405, "grad_norm": 0.48405614495277405, "learning_rate": 5.777215573835559e-05, "loss": 0.1947, "step": 22320 }, { "epoch": 3.0330401711433326, "grad_norm": 0.5455044507980347, "learning_rate": 5.7733120688520414e-05, "loss": 0.1943, "step": 22330 }, { "epoch": 3.0343984515603246, "grad_norm": 0.709382951259613, "learning_rate": 5.7694080809288955e-05, "loss": 0.1966, "step": 22340 }, { "epoch": 3.0357567319773167, "grad_norm": 1.1738965511322021, "learning_rate": 5.7655036125041905e-05, "loss": 0.1956, "step": 22350 }, { "epoch": 3.0371150123943087, "grad_norm": 0.39698681235313416, "learning_rate": 5.761598666016301e-05, "loss": 0.204, "step": 22360 }, { "epoch": 3.0384732928113007, "grad_norm": 0.43283453583717346, "learning_rate": 5.757693243903898e-05, "loss": 0.1968, "step": 22370 }, { "epoch": 3.039831573228293, "grad_norm": 0.5176641345024109, "learning_rate": 5.753787348605948e-05, "loss": 0.195, "step": 22380 }, { "epoch": 3.0411898536452853, "grad_norm": 0.5673525333404541, "learning_rate": 5.749880982561714e-05, "loss": 0.2029, "step": 22390 }, { "epoch": 3.0425481340622773, "grad_norm": 0.6657816171646118, "learning_rate": 5.745974148210753e-05, "loss": 0.1997, "step": 22400 }, { "epoch": 3.0439064144792694, "grad_norm": 0.4057551622390747, "learning_rate": 5.742066847992915e-05, "loss": 0.1977, "step": 22410 }, { "epoch": 3.0452646948962614, "grad_norm": 0.5428000092506409, "learning_rate": 5.738159084348341e-05, "loss": 0.1974, "step": 22420 }, { "epoch": 3.0466229753132534, "grad_norm": 0.8811984062194824, "learning_rate": 5.734250859717462e-05, "loss": 0.1992, "step": 22430 }, { "epoch": 3.0479812557302455, "grad_norm": 0.4974076747894287, "learning_rate": 5.730342176540993e-05, "loss": 0.2084, "step": 22440 }, { "epoch": 3.0493395361472375, "grad_norm": 0.5310653448104858, "learning_rate": 5.7264330372599394e-05, "loss": 0.1869, "step": 22450 }, { "epoch": 3.0506978165642296, "grad_norm": 0.5155213475227356, "learning_rate": 5.722523444315592e-05, "loss": 0.1962, "step": 22460 }, { "epoch": 3.0520560969812216, "grad_norm": 0.5410409569740295, "learning_rate": 5.7186134001495194e-05, "loss": 0.1924, "step": 22470 }, { "epoch": 3.053414377398214, "grad_norm": 0.3572130799293518, "learning_rate": 5.714702907203579e-05, "loss": 0.1885, "step": 22480 }, { "epoch": 3.054772657815206, "grad_norm": 0.3837507665157318, "learning_rate": 5.710791967919904e-05, "loss": 0.1938, "step": 22490 }, { "epoch": 3.056130938232198, "grad_norm": 0.3550776243209839, "learning_rate": 5.706880584740908e-05, "loss": 0.1877, "step": 22500 }, { "epoch": 3.05748921864919, "grad_norm": 0.39525020122528076, "learning_rate": 5.702968760109281e-05, "loss": 0.2071, "step": 22510 }, { "epoch": 3.0588474990661823, "grad_norm": 0.4249646067619324, "learning_rate": 5.6990564964679885e-05, "loss": 0.2002, "step": 22520 }, { "epoch": 3.0602057794831743, "grad_norm": 0.47318965196609497, "learning_rate": 5.6951437962602705e-05, "loss": 0.2067, "step": 22530 }, { "epoch": 3.0615640599001663, "grad_norm": 0.33254480361938477, "learning_rate": 5.691230661929643e-05, "loss": 0.1972, "step": 22540 }, { "epoch": 3.0629223403171584, "grad_norm": 0.41011202335357666, "learning_rate": 5.687317095919886e-05, "loss": 0.1986, "step": 22550 }, { "epoch": 3.0642806207341504, "grad_norm": 0.3641238212585449, "learning_rate": 5.683403100675056e-05, "loss": 0.1843, "step": 22560 }, { "epoch": 3.0656389011511425, "grad_norm": 0.42180219292640686, "learning_rate": 5.679488678639472e-05, "loss": 0.1876, "step": 22570 }, { "epoch": 3.066997181568135, "grad_norm": 0.43326082825660706, "learning_rate": 5.6755738322577245e-05, "loss": 0.1963, "step": 22580 }, { "epoch": 3.068355461985127, "grad_norm": 0.5182169675827026, "learning_rate": 5.671658563974664e-05, "loss": 0.1904, "step": 22590 }, { "epoch": 3.069713742402119, "grad_norm": 0.5532153844833374, "learning_rate": 5.66774287623541e-05, "loss": 0.1986, "step": 22600 }, { "epoch": 3.071072022819111, "grad_norm": 0.5032017827033997, "learning_rate": 5.66382677148534e-05, "loss": 0.2001, "step": 22610 }, { "epoch": 3.072430303236103, "grad_norm": 0.5071492195129395, "learning_rate": 5.659910252170093e-05, "loss": 0.208, "step": 22620 }, { "epoch": 3.073788583653095, "grad_norm": 0.6972277164459229, "learning_rate": 5.6559933207355675e-05, "loss": 0.1872, "step": 22630 }, { "epoch": 3.075146864070087, "grad_norm": 0.46211233735084534, "learning_rate": 5.6520759796279164e-05, "loss": 0.2004, "step": 22640 }, { "epoch": 3.0765051444870792, "grad_norm": 2.174985885620117, "learning_rate": 5.648158231293553e-05, "loss": 0.1989, "step": 22650 }, { "epoch": 3.0778634249040713, "grad_norm": 0.4240027964115143, "learning_rate": 5.644240078179144e-05, "loss": 0.1962, "step": 22660 }, { "epoch": 3.0792217053210633, "grad_norm": 0.3783434331417084, "learning_rate": 5.640321522731604e-05, "loss": 0.1993, "step": 22670 }, { "epoch": 3.080579985738056, "grad_norm": 0.38484007120132446, "learning_rate": 5.6364025673981066e-05, "loss": 0.1972, "step": 22680 }, { "epoch": 3.081938266155048, "grad_norm": 0.42936986684799194, "learning_rate": 5.6324832146260676e-05, "loss": 0.2086, "step": 22690 }, { "epoch": 3.08329654657204, "grad_norm": 0.39799636602401733, "learning_rate": 5.628563466863155e-05, "loss": 0.1942, "step": 22700 }, { "epoch": 3.084654826989032, "grad_norm": 0.42647063732147217, "learning_rate": 5.624643326557285e-05, "loss": 0.2054, "step": 22710 }, { "epoch": 3.086013107406024, "grad_norm": 0.4107569456100464, "learning_rate": 5.6207227961566154e-05, "loss": 0.2026, "step": 22720 }, { "epoch": 3.087371387823016, "grad_norm": 0.5849148035049438, "learning_rate": 5.616801878109549e-05, "loss": 0.1948, "step": 22730 }, { "epoch": 3.088729668240008, "grad_norm": 0.5138232707977295, "learning_rate": 5.6128805748647317e-05, "loss": 0.1962, "step": 22740 }, { "epoch": 3.090087948657, "grad_norm": 0.6439228653907776, "learning_rate": 5.608958888871048e-05, "loss": 0.1893, "step": 22750 }, { "epoch": 3.091446229073992, "grad_norm": 0.46972277760505676, "learning_rate": 5.605036822577622e-05, "loss": 0.1911, "step": 22760 }, { "epoch": 3.0928045094909846, "grad_norm": 1.8087300062179565, "learning_rate": 5.601114378433817e-05, "loss": 0.2052, "step": 22770 }, { "epoch": 3.0941627899079767, "grad_norm": 0.3180915117263794, "learning_rate": 5.597191558889231e-05, "loss": 0.1844, "step": 22780 }, { "epoch": 3.0955210703249687, "grad_norm": 0.5545761585235596, "learning_rate": 5.593268366393696e-05, "loss": 0.1992, "step": 22790 }, { "epoch": 3.0968793507419607, "grad_norm": 0.5697637796401978, "learning_rate": 5.589344803397277e-05, "loss": 0.1944, "step": 22800 }, { "epoch": 3.098237631158953, "grad_norm": 0.33732491731643677, "learning_rate": 5.585420872350271e-05, "loss": 0.1907, "step": 22810 }, { "epoch": 3.099595911575945, "grad_norm": 0.465926855802536, "learning_rate": 5.5814965757032055e-05, "loss": 0.1945, "step": 22820 }, { "epoch": 3.100954191992937, "grad_norm": 0.43737974762916565, "learning_rate": 5.577571915906835e-05, "loss": 0.1996, "step": 22830 }, { "epoch": 3.102312472409929, "grad_norm": 0.413466215133667, "learning_rate": 5.57364689541214e-05, "loss": 0.1958, "step": 22840 }, { "epoch": 3.103670752826921, "grad_norm": 0.6358380317687988, "learning_rate": 5.56972151667033e-05, "loss": 0.1934, "step": 22850 }, { "epoch": 3.105029033243913, "grad_norm": 0.6689414978027344, "learning_rate": 5.5657957821328334e-05, "loss": 0.2088, "step": 22860 }, { "epoch": 3.1063873136609055, "grad_norm": 0.41022130846977234, "learning_rate": 5.561869694251303e-05, "loss": 0.203, "step": 22870 }, { "epoch": 3.1077455940778975, "grad_norm": 0.4046862721443176, "learning_rate": 5.5579432554776134e-05, "loss": 0.1909, "step": 22880 }, { "epoch": 3.1091038744948896, "grad_norm": 0.42942601442337036, "learning_rate": 5.554016468263857e-05, "loss": 0.1991, "step": 22890 }, { "epoch": 3.1104621549118816, "grad_norm": 0.5205926299095154, "learning_rate": 5.5500893350623464e-05, "loss": 0.2046, "step": 22900 }, { "epoch": 3.1118204353288736, "grad_norm": 0.5594541430473328, "learning_rate": 5.546161858325604e-05, "loss": 0.2031, "step": 22910 }, { "epoch": 3.1131787157458657, "grad_norm": 0.646840512752533, "learning_rate": 5.5422340405063735e-05, "loss": 0.1995, "step": 22920 }, { "epoch": 3.1145369961628577, "grad_norm": 0.3771526515483856, "learning_rate": 5.538305884057606e-05, "loss": 0.2028, "step": 22930 }, { "epoch": 3.1158952765798498, "grad_norm": 0.4641600251197815, "learning_rate": 5.5343773914324704e-05, "loss": 0.2057, "step": 22940 }, { "epoch": 3.117253556996842, "grad_norm": 0.4313160181045532, "learning_rate": 5.5304485650843404e-05, "loss": 0.2006, "step": 22950 }, { "epoch": 3.1186118374138343, "grad_norm": 0.6138641238212585, "learning_rate": 5.526519407466799e-05, "loss": 0.2007, "step": 22960 }, { "epoch": 3.1199701178308263, "grad_norm": 0.3870111405849457, "learning_rate": 5.5225899210336394e-05, "loss": 0.1919, "step": 22970 }, { "epoch": 3.1213283982478184, "grad_norm": 0.5468588471412659, "learning_rate": 5.5186601082388544e-05, "loss": 0.2036, "step": 22980 }, { "epoch": 3.1226866786648104, "grad_norm": 0.5523982644081116, "learning_rate": 5.514729971536645e-05, "loss": 0.1864, "step": 22990 }, { "epoch": 3.1240449590818025, "grad_norm": 0.5056903958320618, "learning_rate": 5.510799513381415e-05, "loss": 0.1918, "step": 23000 }, { "epoch": 3.1254032394987945, "grad_norm": 0.6762421131134033, "learning_rate": 5.5068687362277646e-05, "loss": 0.1843, "step": 23010 }, { "epoch": 3.1267615199157865, "grad_norm": 0.470162034034729, "learning_rate": 5.502937642530498e-05, "loss": 0.1985, "step": 23020 }, { "epoch": 3.1281198003327786, "grad_norm": 0.4683346748352051, "learning_rate": 5.4990062347446135e-05, "loss": 0.1862, "step": 23030 }, { "epoch": 3.1294780807497706, "grad_norm": 0.6808971166610718, "learning_rate": 5.495074515325307e-05, "loss": 0.1977, "step": 23040 }, { "epoch": 3.1308363611667627, "grad_norm": 0.3941827416419983, "learning_rate": 5.49114248672797e-05, "loss": 0.1945, "step": 23050 }, { "epoch": 3.132194641583755, "grad_norm": 0.5241951942443848, "learning_rate": 5.487210151408186e-05, "loss": 0.2014, "step": 23060 }, { "epoch": 3.133552922000747, "grad_norm": 0.5281507968902588, "learning_rate": 5.483277511821729e-05, "loss": 0.2018, "step": 23070 }, { "epoch": 3.1349112024177392, "grad_norm": 0.4658227860927582, "learning_rate": 5.479344570424567e-05, "loss": 0.1923, "step": 23080 }, { "epoch": 3.1362694828347313, "grad_norm": 0.6539624929428101, "learning_rate": 5.47541132967285e-05, "loss": 0.1869, "step": 23090 }, { "epoch": 3.1376277632517233, "grad_norm": 0.557059109210968, "learning_rate": 5.47147779202292e-05, "loss": 0.1923, "step": 23100 }, { "epoch": 3.1389860436687154, "grad_norm": 0.3933965563774109, "learning_rate": 5.467543959931303e-05, "loss": 0.1959, "step": 23110 }, { "epoch": 3.1403443240857074, "grad_norm": 0.41743648052215576, "learning_rate": 5.463609835854711e-05, "loss": 0.1949, "step": 23120 }, { "epoch": 3.1417026045026994, "grad_norm": 0.9121444225311279, "learning_rate": 5.459675422250032e-05, "loss": 0.1994, "step": 23130 }, { "epoch": 3.1430608849196915, "grad_norm": 0.5243625044822693, "learning_rate": 5.4557407215743425e-05, "loss": 0.1958, "step": 23140 }, { "epoch": 3.144419165336684, "grad_norm": 0.4121338427066803, "learning_rate": 5.451805736284894e-05, "loss": 0.19, "step": 23150 }, { "epoch": 3.145777445753676, "grad_norm": 0.4734656512737274, "learning_rate": 5.447870468839116e-05, "loss": 0.1939, "step": 23160 }, { "epoch": 3.147135726170668, "grad_norm": 0.4724094867706299, "learning_rate": 5.4439349216946144e-05, "loss": 0.1985, "step": 23170 }, { "epoch": 3.14849400658766, "grad_norm": 0.4070218801498413, "learning_rate": 5.439999097309172e-05, "loss": 0.1902, "step": 23180 }, { "epoch": 3.149852287004652, "grad_norm": 0.72185879945755, "learning_rate": 5.436062998140741e-05, "loss": 0.2043, "step": 23190 }, { "epoch": 3.151210567421644, "grad_norm": 0.3958280682563782, "learning_rate": 5.432126626647448e-05, "loss": 0.1916, "step": 23200 }, { "epoch": 3.1525688478386362, "grad_norm": 0.797694206237793, "learning_rate": 5.4281899852875886e-05, "loss": 0.1979, "step": 23210 }, { "epoch": 3.1539271282556283, "grad_norm": 0.37975621223449707, "learning_rate": 5.424253076519627e-05, "loss": 0.1977, "step": 23220 }, { "epoch": 3.1552854086726203, "grad_norm": 0.5891029238700867, "learning_rate": 5.4203159028021955e-05, "loss": 0.1995, "step": 23230 }, { "epoch": 3.1566436890896123, "grad_norm": 0.3409916162490845, "learning_rate": 5.416378466594089e-05, "loss": 0.1909, "step": 23240 }, { "epoch": 3.158001969506605, "grad_norm": 0.4415968656539917, "learning_rate": 5.41244077035427e-05, "loss": 0.2058, "step": 23250 }, { "epoch": 3.159360249923597, "grad_norm": 0.4663088023662567, "learning_rate": 5.408502816541861e-05, "loss": 0.2027, "step": 23260 }, { "epoch": 3.160718530340589, "grad_norm": 0.7359892725944519, "learning_rate": 5.404564607616144e-05, "loss": 0.1864, "step": 23270 }, { "epoch": 3.162076810757581, "grad_norm": 0.3518093228340149, "learning_rate": 5.400626146036565e-05, "loss": 0.1893, "step": 23280 }, { "epoch": 3.163435091174573, "grad_norm": 0.5675481557846069, "learning_rate": 5.3966874342627235e-05, "loss": 0.2042, "step": 23290 }, { "epoch": 3.164793371591565, "grad_norm": 0.4408515989780426, "learning_rate": 5.392748474754379e-05, "loss": 0.195, "step": 23300 }, { "epoch": 3.166151652008557, "grad_norm": 0.45783570408821106, "learning_rate": 5.388809269971441e-05, "loss": 0.1882, "step": 23310 }, { "epoch": 3.167509932425549, "grad_norm": 0.48831093311309814, "learning_rate": 5.384869822373975e-05, "loss": 0.1849, "step": 23320 }, { "epoch": 3.168868212842541, "grad_norm": 0.4313524663448334, "learning_rate": 5.3809301344221975e-05, "loss": 0.1979, "step": 23330 }, { "epoch": 3.170226493259533, "grad_norm": 0.48908308148384094, "learning_rate": 5.3769902085764777e-05, "loss": 0.1918, "step": 23340 }, { "epoch": 3.1715847736765257, "grad_norm": 0.41587868332862854, "learning_rate": 5.37305004729733e-05, "loss": 0.2051, "step": 23350 }, { "epoch": 3.1729430540935177, "grad_norm": 0.41939619183540344, "learning_rate": 5.3691096530454154e-05, "loss": 0.1991, "step": 23360 }, { "epoch": 3.1743013345105098, "grad_norm": 0.4324507713317871, "learning_rate": 5.365169028281544e-05, "loss": 0.1997, "step": 23370 }, { "epoch": 3.175659614927502, "grad_norm": 0.47118595242500305, "learning_rate": 5.361228175466666e-05, "loss": 0.1961, "step": 23380 }, { "epoch": 3.177017895344494, "grad_norm": 0.41696882247924805, "learning_rate": 5.357287097061877e-05, "loss": 0.1959, "step": 23390 }, { "epoch": 3.178376175761486, "grad_norm": 0.849330484867096, "learning_rate": 5.353345795528412e-05, "loss": 0.2108, "step": 23400 }, { "epoch": 3.179734456178478, "grad_norm": 0.4518722891807556, "learning_rate": 5.3494042733276464e-05, "loss": 0.1864, "step": 23410 }, { "epoch": 3.18109273659547, "grad_norm": 0.3876972198486328, "learning_rate": 5.3454625329210905e-05, "loss": 0.1809, "step": 23420 }, { "epoch": 3.182451017012462, "grad_norm": 0.470084547996521, "learning_rate": 5.3415205767703945e-05, "loss": 0.191, "step": 23430 }, { "epoch": 3.1838092974294545, "grad_norm": 0.472075492143631, "learning_rate": 5.337578407337342e-05, "loss": 0.1994, "step": 23440 }, { "epoch": 3.1851675778464466, "grad_norm": 0.5384938716888428, "learning_rate": 5.333636027083849e-05, "loss": 0.1931, "step": 23450 }, { "epoch": 3.1865258582634386, "grad_norm": 0.44069352746009827, "learning_rate": 5.329693438471964e-05, "loss": 0.2023, "step": 23460 }, { "epoch": 3.1878841386804306, "grad_norm": 1.7066231966018677, "learning_rate": 5.325750643963866e-05, "loss": 0.1895, "step": 23470 }, { "epoch": 3.1892424190974227, "grad_norm": 0.3669210374355316, "learning_rate": 5.3218076460218613e-05, "loss": 0.2003, "step": 23480 }, { "epoch": 3.1906006995144147, "grad_norm": 0.6147090792655945, "learning_rate": 5.317864447108384e-05, "loss": 0.2025, "step": 23490 }, { "epoch": 3.1919589799314068, "grad_norm": 0.5725212693214417, "learning_rate": 5.313921049685993e-05, "loss": 0.1957, "step": 23500 }, { "epoch": 3.193317260348399, "grad_norm": 0.3860167860984802, "learning_rate": 5.309977456217373e-05, "loss": 0.1969, "step": 23510 }, { "epoch": 3.194675540765391, "grad_norm": 0.7600659728050232, "learning_rate": 5.30603366916533e-05, "loss": 0.1888, "step": 23520 }, { "epoch": 3.1960338211823833, "grad_norm": 0.4015670716762543, "learning_rate": 5.302089690992791e-05, "loss": 0.197, "step": 23530 }, { "epoch": 3.1973921015993754, "grad_norm": 0.8637807965278625, "learning_rate": 5.298145524162801e-05, "loss": 0.1849, "step": 23540 }, { "epoch": 3.1987503820163674, "grad_norm": 0.37728622555732727, "learning_rate": 5.294201171138524e-05, "loss": 0.2151, "step": 23550 }, { "epoch": 3.2001086624333595, "grad_norm": 0.5689631104469299, "learning_rate": 5.29025663438324e-05, "loss": 0.2041, "step": 23560 }, { "epoch": 3.2014669428503515, "grad_norm": 0.44920605421066284, "learning_rate": 5.2863119163603456e-05, "loss": 0.1972, "step": 23570 }, { "epoch": 3.2028252232673435, "grad_norm": 0.4204196631908417, "learning_rate": 5.2823670195333494e-05, "loss": 0.1934, "step": 23580 }, { "epoch": 3.2041835036843356, "grad_norm": 0.4005328118801117, "learning_rate": 5.27842194636587e-05, "loss": 0.1938, "step": 23590 }, { "epoch": 3.2055417841013276, "grad_norm": 0.5842621326446533, "learning_rate": 5.274476699321638e-05, "loss": 0.2035, "step": 23600 }, { "epoch": 3.2069000645183197, "grad_norm": 1.5148141384124756, "learning_rate": 5.2705312808644904e-05, "loss": 0.1972, "step": 23610 }, { "epoch": 3.2082583449353117, "grad_norm": 0.4919832944869995, "learning_rate": 5.2665856934583756e-05, "loss": 0.1948, "step": 23620 }, { "epoch": 3.209616625352304, "grad_norm": 0.5148344039916992, "learning_rate": 5.262639939567343e-05, "loss": 0.2056, "step": 23630 }, { "epoch": 3.2109749057692962, "grad_norm": 0.454988956451416, "learning_rate": 5.258694021655548e-05, "loss": 0.1899, "step": 23640 }, { "epoch": 3.2123331861862883, "grad_norm": 0.49169981479644775, "learning_rate": 5.254747942187248e-05, "loss": 0.1927, "step": 23650 }, { "epoch": 3.2136914666032803, "grad_norm": 0.5601779818534851, "learning_rate": 5.250801703626803e-05, "loss": 0.1964, "step": 23660 }, { "epoch": 3.2150497470202724, "grad_norm": 0.4128323793411255, "learning_rate": 5.2468553084386687e-05, "loss": 0.1916, "step": 23670 }, { "epoch": 3.2164080274372644, "grad_norm": 0.4643900692462921, "learning_rate": 5.2429087590874016e-05, "loss": 0.1945, "step": 23680 }, { "epoch": 3.2177663078542564, "grad_norm": 0.4682321548461914, "learning_rate": 5.238962058037654e-05, "loss": 0.192, "step": 23690 }, { "epoch": 3.2191245882712485, "grad_norm": 0.4738646447658539, "learning_rate": 5.235015207754173e-05, "loss": 0.1881, "step": 23700 }, { "epoch": 3.2204828686882405, "grad_norm": 0.7280223369598389, "learning_rate": 5.231068210701797e-05, "loss": 0.1983, "step": 23710 }, { "epoch": 3.2218411491052326, "grad_norm": 0.3701850473880768, "learning_rate": 5.227121069345459e-05, "loss": 0.1973, "step": 23720 }, { "epoch": 3.223199429522225, "grad_norm": 0.3943403661251068, "learning_rate": 5.2231737861501786e-05, "loss": 0.2066, "step": 23730 }, { "epoch": 3.224557709939217, "grad_norm": 0.35450661182403564, "learning_rate": 5.2192263635810676e-05, "loss": 0.1969, "step": 23740 }, { "epoch": 3.225915990356209, "grad_norm": 1.0155965089797974, "learning_rate": 5.215278804103324e-05, "loss": 0.2026, "step": 23750 }, { "epoch": 3.227274270773201, "grad_norm": 0.38062742352485657, "learning_rate": 5.2113311101822296e-05, "loss": 0.1987, "step": 23760 }, { "epoch": 3.228632551190193, "grad_norm": 0.35923290252685547, "learning_rate": 5.2073832842831505e-05, "loss": 0.2034, "step": 23770 }, { "epoch": 3.2299908316071853, "grad_norm": 0.6462444067001343, "learning_rate": 5.2034353288715364e-05, "loss": 0.1893, "step": 23780 }, { "epoch": 3.2313491120241773, "grad_norm": 0.4878067374229431, "learning_rate": 5.1994872464129165e-05, "loss": 0.1962, "step": 23790 }, { "epoch": 3.2327073924411693, "grad_norm": 0.3804328739643097, "learning_rate": 5.195539039372902e-05, "loss": 0.1996, "step": 23800 }, { "epoch": 3.2340656728581614, "grad_norm": 0.47382599115371704, "learning_rate": 5.191590710217179e-05, "loss": 0.1912, "step": 23810 }, { "epoch": 3.235423953275154, "grad_norm": 0.42456939816474915, "learning_rate": 5.1876422614115106e-05, "loss": 0.1969, "step": 23820 }, { "epoch": 3.236782233692146, "grad_norm": 0.37761470675468445, "learning_rate": 5.183693695421736e-05, "loss": 0.2024, "step": 23830 }, { "epoch": 3.238140514109138, "grad_norm": 0.7931646704673767, "learning_rate": 5.179745014713765e-05, "loss": 0.1946, "step": 23840 }, { "epoch": 3.23949879452613, "grad_norm": 0.4521865248680115, "learning_rate": 5.175796221753582e-05, "loss": 0.1968, "step": 23850 }, { "epoch": 3.240857074943122, "grad_norm": 0.8052563667297363, "learning_rate": 5.171847319007239e-05, "loss": 0.1914, "step": 23860 }, { "epoch": 3.242215355360114, "grad_norm": 0.345735639333725, "learning_rate": 5.1678983089408594e-05, "loss": 0.2009, "step": 23870 }, { "epoch": 3.243573635777106, "grad_norm": 0.4834896922111511, "learning_rate": 5.1639491940206295e-05, "loss": 0.2002, "step": 23880 }, { "epoch": 3.244931916194098, "grad_norm": 0.4099092483520508, "learning_rate": 5.159999976712805e-05, "loss": 0.2002, "step": 23890 }, { "epoch": 3.24629019661109, "grad_norm": 0.5575513243675232, "learning_rate": 5.156050659483703e-05, "loss": 0.1991, "step": 23900 }, { "epoch": 3.2476484770280827, "grad_norm": 0.39751994609832764, "learning_rate": 5.152101244799705e-05, "loss": 0.2034, "step": 23910 }, { "epoch": 3.2490067574450747, "grad_norm": 0.6988863348960876, "learning_rate": 5.1481517351272523e-05, "loss": 0.1959, "step": 23920 }, { "epoch": 3.2503650378620668, "grad_norm": 0.40075069665908813, "learning_rate": 5.144202132932845e-05, "loss": 0.1954, "step": 23930 }, { "epoch": 3.251723318279059, "grad_norm": 0.7470985054969788, "learning_rate": 5.140252440683041e-05, "loss": 0.1994, "step": 23940 }, { "epoch": 3.253081598696051, "grad_norm": 0.4101639688014984, "learning_rate": 5.136302660844456e-05, "loss": 0.2048, "step": 23950 }, { "epoch": 3.254439879113043, "grad_norm": 0.38215816020965576, "learning_rate": 5.1323527958837594e-05, "loss": 0.1898, "step": 23960 }, { "epoch": 3.255798159530035, "grad_norm": 0.4465661346912384, "learning_rate": 5.128402848267674e-05, "loss": 0.1909, "step": 23970 }, { "epoch": 3.257156439947027, "grad_norm": 0.6182205677032471, "learning_rate": 5.1244528204629726e-05, "loss": 0.1978, "step": 23980 }, { "epoch": 3.258514720364019, "grad_norm": 0.47666168212890625, "learning_rate": 5.12050271493648e-05, "loss": 0.2043, "step": 23990 }, { "epoch": 3.259873000781011, "grad_norm": 0.44573158025741577, "learning_rate": 5.11655253415507e-05, "loss": 0.2028, "step": 24000 }, { "epoch": 3.261231281198003, "grad_norm": 0.8984714150428772, "learning_rate": 5.112602280585661e-05, "loss": 0.2052, "step": 24010 }, { "epoch": 3.2625895616149956, "grad_norm": 0.4316316246986389, "learning_rate": 5.1086519566952184e-05, "loss": 0.2031, "step": 24020 }, { "epoch": 3.2639478420319876, "grad_norm": 0.8280765414237976, "learning_rate": 5.1047015649507536e-05, "loss": 0.1901, "step": 24030 }, { "epoch": 3.2653061224489797, "grad_norm": 0.502284586429596, "learning_rate": 5.100751107819315e-05, "loss": 0.2003, "step": 24040 }, { "epoch": 3.2666644028659717, "grad_norm": 0.6302350163459778, "learning_rate": 5.096800587767997e-05, "loss": 0.1983, "step": 24050 }, { "epoch": 3.2680226832829637, "grad_norm": 1.6426030397415161, "learning_rate": 5.092850007263931e-05, "loss": 0.1923, "step": 24060 }, { "epoch": 3.269380963699956, "grad_norm": 0.8085106015205383, "learning_rate": 5.088899368774284e-05, "loss": 0.2015, "step": 24070 }, { "epoch": 3.270739244116948, "grad_norm": 0.4403649866580963, "learning_rate": 5.0849486747662643e-05, "loss": 0.1867, "step": 24080 }, { "epoch": 3.27209752453394, "grad_norm": 0.515762448310852, "learning_rate": 5.080997927707113e-05, "loss": 0.1893, "step": 24090 }, { "epoch": 3.273455804950932, "grad_norm": 0.7697920799255371, "learning_rate": 5.077047130064101e-05, "loss": 0.1939, "step": 24100 }, { "epoch": 3.2748140853679244, "grad_norm": 0.5030130743980408, "learning_rate": 5.073096284304535e-05, "loss": 0.2024, "step": 24110 }, { "epoch": 3.2761723657849164, "grad_norm": 0.37551745772361755, "learning_rate": 5.069145392895749e-05, "loss": 0.1964, "step": 24120 }, { "epoch": 3.2775306462019085, "grad_norm": 0.39565199613571167, "learning_rate": 5.065194458305106e-05, "loss": 0.1854, "step": 24130 }, { "epoch": 3.2788889266189005, "grad_norm": 0.5150911808013916, "learning_rate": 5.0612434829999985e-05, "loss": 0.2026, "step": 24140 }, { "epoch": 3.2802472070358926, "grad_norm": 0.508979856967926, "learning_rate": 5.057292469447842e-05, "loss": 0.2018, "step": 24150 }, { "epoch": 3.2816054874528846, "grad_norm": 0.4225066006183624, "learning_rate": 5.0533414201160755e-05, "loss": 0.1832, "step": 24160 }, { "epoch": 3.2829637678698766, "grad_norm": 0.4393880367279053, "learning_rate": 5.049390337472162e-05, "loss": 0.1872, "step": 24170 }, { "epoch": 3.2843220482868687, "grad_norm": 0.35565125942230225, "learning_rate": 5.0454392239835835e-05, "loss": 0.1955, "step": 24180 }, { "epoch": 3.2856803287038607, "grad_norm": 0.501788318157196, "learning_rate": 5.041488082117842e-05, "loss": 0.1931, "step": 24190 }, { "epoch": 3.287038609120853, "grad_norm": 0.4963611662387848, "learning_rate": 5.03753691434246e-05, "loss": 0.1945, "step": 24200 }, { "epoch": 3.2883968895378453, "grad_norm": 0.5768622159957886, "learning_rate": 5.033585723124971e-05, "loss": 0.1858, "step": 24210 }, { "epoch": 3.2897551699548373, "grad_norm": 0.5229480266571045, "learning_rate": 5.029634510932927e-05, "loss": 0.1911, "step": 24220 }, { "epoch": 3.2911134503718293, "grad_norm": 0.4229726195335388, "learning_rate": 5.0256832802338926e-05, "loss": 0.1934, "step": 24230 }, { "epoch": 3.2924717307888214, "grad_norm": 0.3817008435726166, "learning_rate": 5.02173203349544e-05, "loss": 0.2041, "step": 24240 }, { "epoch": 3.2938300112058134, "grad_norm": 0.4805896580219269, "learning_rate": 5.017780773185159e-05, "loss": 0.1945, "step": 24250 }, { "epoch": 3.2951882916228055, "grad_norm": 0.45569124817848206, "learning_rate": 5.013829501770642e-05, "loss": 0.1942, "step": 24260 }, { "epoch": 3.2965465720397975, "grad_norm": 0.4751398265361786, "learning_rate": 5.009878221719489e-05, "loss": 0.1875, "step": 24270 }, { "epoch": 3.2979048524567895, "grad_norm": 0.6274484992027283, "learning_rate": 5.0059269354993066e-05, "loss": 0.1999, "step": 24280 }, { "epoch": 3.299263132873782, "grad_norm": 1.2875845432281494, "learning_rate": 5.0019756455777056e-05, "loss": 0.1973, "step": 24290 }, { "epoch": 3.3006214132907736, "grad_norm": 0.6598098278045654, "learning_rate": 4.998024354422297e-05, "loss": 0.2059, "step": 24300 }, { "epoch": 3.301979693707766, "grad_norm": 0.3803810775279999, "learning_rate": 4.994073064500695e-05, "loss": 0.2052, "step": 24310 }, { "epoch": 3.303337974124758, "grad_norm": 0.41552597284317017, "learning_rate": 4.990121778280514e-05, "loss": 0.1908, "step": 24320 }, { "epoch": 3.30469625454175, "grad_norm": 0.4312376379966736, "learning_rate": 4.9861704982293594e-05, "loss": 0.2007, "step": 24330 }, { "epoch": 3.3060545349587422, "grad_norm": 0.4603520929813385, "learning_rate": 4.982219226814841e-05, "loss": 0.2017, "step": 24340 }, { "epoch": 3.3074128153757343, "grad_norm": 0.46117183566093445, "learning_rate": 4.9782679665045604e-05, "loss": 0.2071, "step": 24350 }, { "epoch": 3.3087710957927263, "grad_norm": 0.35582295060157776, "learning_rate": 4.9743167197661086e-05, "loss": 0.1973, "step": 24360 }, { "epoch": 3.3101293762097184, "grad_norm": 0.4300174415111542, "learning_rate": 4.970365489067075e-05, "loss": 0.1965, "step": 24370 }, { "epoch": 3.3114876566267104, "grad_norm": 0.5572532415390015, "learning_rate": 4.96641427687503e-05, "loss": 0.1887, "step": 24380 }, { "epoch": 3.3128459370437024, "grad_norm": 0.49323543906211853, "learning_rate": 4.9624630856575405e-05, "loss": 0.2093, "step": 24390 }, { "epoch": 3.314204217460695, "grad_norm": 0.4499657452106476, "learning_rate": 4.958511917882159e-05, "loss": 0.1927, "step": 24400 }, { "epoch": 3.315562497877687, "grad_norm": 0.4166729748249054, "learning_rate": 4.954560776016418e-05, "loss": 0.1961, "step": 24410 }, { "epoch": 3.316920778294679, "grad_norm": 1.0527606010437012, "learning_rate": 4.9506096625278406e-05, "loss": 0.1976, "step": 24420 }, { "epoch": 3.318279058711671, "grad_norm": 0.435272753238678, "learning_rate": 4.946658579883926e-05, "loss": 0.1947, "step": 24430 }, { "epoch": 3.319637339128663, "grad_norm": 0.36819642782211304, "learning_rate": 4.9427075305521584e-05, "loss": 0.1923, "step": 24440 }, { "epoch": 3.320995619545655, "grad_norm": 0.4334109425544739, "learning_rate": 4.938756517000002e-05, "loss": 0.2064, "step": 24450 }, { "epoch": 3.322353899962647, "grad_norm": 0.4736718535423279, "learning_rate": 4.934805541694893e-05, "loss": 0.1948, "step": 24460 }, { "epoch": 3.3237121803796392, "grad_norm": 0.4472101628780365, "learning_rate": 4.9308546071042533e-05, "loss": 0.1849, "step": 24470 }, { "epoch": 3.3250704607966313, "grad_norm": 0.4605238139629364, "learning_rate": 4.926903715695466e-05, "loss": 0.1905, "step": 24480 }, { "epoch": 3.3264287412136238, "grad_norm": 0.5964456796646118, "learning_rate": 4.9229528699359004e-05, "loss": 0.1913, "step": 24490 }, { "epoch": 3.327787021630616, "grad_norm": 0.6751935482025146, "learning_rate": 4.919002072292888e-05, "loss": 0.2, "step": 24500 }, { "epoch": 3.329145302047608, "grad_norm": 0.8427892327308655, "learning_rate": 4.915051325233735e-05, "loss": 0.1953, "step": 24510 }, { "epoch": 3.3305035824646, "grad_norm": 0.4177929759025574, "learning_rate": 4.911100631225717e-05, "loss": 0.1954, "step": 24520 }, { "epoch": 3.331861862881592, "grad_norm": 0.4651828706264496, "learning_rate": 4.9071499927360704e-05, "loss": 0.2023, "step": 24530 }, { "epoch": 3.333220143298584, "grad_norm": 0.44755879044532776, "learning_rate": 4.903199412232005e-05, "loss": 0.1947, "step": 24540 }, { "epoch": 3.334578423715576, "grad_norm": 0.44846901297569275, "learning_rate": 4.899248892180686e-05, "loss": 0.1904, "step": 24550 }, { "epoch": 3.335936704132568, "grad_norm": 0.5566550493240356, "learning_rate": 4.895298435049246e-05, "loss": 0.2074, "step": 24560 }, { "epoch": 3.33729498454956, "grad_norm": 0.5064932703971863, "learning_rate": 4.891348043304782e-05, "loss": 0.1893, "step": 24570 }, { "epoch": 3.3386532649665526, "grad_norm": 0.648489773273468, "learning_rate": 4.88739771941434e-05, "loss": 0.1896, "step": 24580 }, { "epoch": 3.3400115453835446, "grad_norm": 0.5402374267578125, "learning_rate": 4.883447465844932e-05, "loss": 0.1914, "step": 24590 }, { "epoch": 3.3413698258005367, "grad_norm": 0.4293290674686432, "learning_rate": 4.879497285063521e-05, "loss": 0.1973, "step": 24600 }, { "epoch": 3.3427281062175287, "grad_norm": 0.48524659872055054, "learning_rate": 4.875547179537028e-05, "loss": 0.1995, "step": 24610 }, { "epoch": 3.3440863866345207, "grad_norm": 0.5362362861633301, "learning_rate": 4.871597151732328e-05, "loss": 0.2039, "step": 24620 }, { "epoch": 3.3454446670515128, "grad_norm": 0.45497360825538635, "learning_rate": 4.867647204116241e-05, "loss": 0.2029, "step": 24630 }, { "epoch": 3.346802947468505, "grad_norm": 0.37105777859687805, "learning_rate": 4.863697339155545e-05, "loss": 0.1898, "step": 24640 }, { "epoch": 3.348161227885497, "grad_norm": 0.39854225516319275, "learning_rate": 4.859747559316959e-05, "loss": 0.1965, "step": 24650 }, { "epoch": 3.349519508302489, "grad_norm": 0.5250695943832397, "learning_rate": 4.855797867067157e-05, "loss": 0.2003, "step": 24660 }, { "epoch": 3.350877788719481, "grad_norm": 0.42131307721138, "learning_rate": 4.851848264872749e-05, "loss": 0.1906, "step": 24670 }, { "epoch": 3.352236069136473, "grad_norm": 0.5562297105789185, "learning_rate": 4.847898755200295e-05, "loss": 0.2075, "step": 24680 }, { "epoch": 3.3535943495534655, "grad_norm": 0.7240763306617737, "learning_rate": 4.843949340516298e-05, "loss": 0.1999, "step": 24690 }, { "epoch": 3.3549526299704575, "grad_norm": 0.5242428183555603, "learning_rate": 4.8400000232871953e-05, "loss": 0.1974, "step": 24700 }, { "epoch": 3.3563109103874496, "grad_norm": 0.7951975464820862, "learning_rate": 4.8360508059793723e-05, "loss": 0.1967, "step": 24710 }, { "epoch": 3.3576691908044416, "grad_norm": 0.46050840616226196, "learning_rate": 4.8321016910591424e-05, "loss": 0.2018, "step": 24720 }, { "epoch": 3.3590274712214336, "grad_norm": 0.4191540777683258, "learning_rate": 4.8281526809927605e-05, "loss": 0.1968, "step": 24730 }, { "epoch": 3.3603857516384257, "grad_norm": 0.5767720937728882, "learning_rate": 4.824203778246419e-05, "loss": 0.2107, "step": 24740 }, { "epoch": 3.3617440320554177, "grad_norm": 0.6248223781585693, "learning_rate": 4.8202549852862355e-05, "loss": 0.1989, "step": 24750 }, { "epoch": 3.3631023124724098, "grad_norm": 0.3626697063446045, "learning_rate": 4.8163063045782665e-05, "loss": 0.1961, "step": 24760 }, { "epoch": 3.364460592889402, "grad_norm": 0.47375214099884033, "learning_rate": 4.8123577385884905e-05, "loss": 0.1938, "step": 24770 }, { "epoch": 3.3658188733063943, "grad_norm": 0.4869895875453949, "learning_rate": 4.808409289782824e-05, "loss": 0.201, "step": 24780 }, { "epoch": 3.3671771537233863, "grad_norm": 0.6187500357627869, "learning_rate": 4.8044609606270997e-05, "loss": 0.2014, "step": 24790 }, { "epoch": 3.3685354341403784, "grad_norm": 0.7533590793609619, "learning_rate": 4.800512753587084e-05, "loss": 0.2022, "step": 24800 }, { "epoch": 3.3698937145573704, "grad_norm": 0.4372308850288391, "learning_rate": 4.796564671128466e-05, "loss": 0.1979, "step": 24810 }, { "epoch": 3.3712519949743625, "grad_norm": 0.4863978922367096, "learning_rate": 4.7926167157168513e-05, "loss": 0.1983, "step": 24820 }, { "epoch": 3.3726102753913545, "grad_norm": 0.4470824599266052, "learning_rate": 4.7886688898177736e-05, "loss": 0.1996, "step": 24830 }, { "epoch": 3.3739685558083465, "grad_norm": 1.0712095499038696, "learning_rate": 4.7847211958966774e-05, "loss": 0.2081, "step": 24840 }, { "epoch": 3.3753268362253386, "grad_norm": 0.44084376096725464, "learning_rate": 4.780773636418933e-05, "loss": 0.1967, "step": 24850 }, { "epoch": 3.3766851166423306, "grad_norm": 0.6834970116615295, "learning_rate": 4.776826213849823e-05, "loss": 0.1932, "step": 24860 }, { "epoch": 3.378043397059323, "grad_norm": 1.43789803981781, "learning_rate": 4.7728789306545424e-05, "loss": 0.1916, "step": 24870 }, { "epoch": 3.379401677476315, "grad_norm": 0.42063403129577637, "learning_rate": 4.7689317892982056e-05, "loss": 0.1955, "step": 24880 }, { "epoch": 3.380759957893307, "grad_norm": 0.3985574245452881, "learning_rate": 4.7649847922458285e-05, "loss": 0.2004, "step": 24890 }, { "epoch": 3.3821182383102992, "grad_norm": 0.8507152199745178, "learning_rate": 4.761037941962346e-05, "loss": 0.1984, "step": 24900 }, { "epoch": 3.3834765187272913, "grad_norm": 0.3569352328777313, "learning_rate": 4.7570912409125995e-05, "loss": 0.1846, "step": 24910 }, { "epoch": 3.3848347991442833, "grad_norm": 0.5592585206031799, "learning_rate": 4.753144691561332e-05, "loss": 0.1993, "step": 24920 }, { "epoch": 3.3861930795612754, "grad_norm": 0.5355365872383118, "learning_rate": 4.749198296373199e-05, "loss": 0.2013, "step": 24930 }, { "epoch": 3.3875513599782674, "grad_norm": 0.5264211893081665, "learning_rate": 4.745252057812753e-05, "loss": 0.188, "step": 24940 }, { "epoch": 3.3889096403952594, "grad_norm": 0.47882217168807983, "learning_rate": 4.741305978344454e-05, "loss": 0.1808, "step": 24950 }, { "epoch": 3.390267920812252, "grad_norm": 0.5803291201591492, "learning_rate": 4.737360060432658e-05, "loss": 0.2008, "step": 24960 }, { "epoch": 3.391626201229244, "grad_norm": 0.5298609137535095, "learning_rate": 4.733414306541625e-05, "loss": 0.1966, "step": 24970 }, { "epoch": 3.392984481646236, "grad_norm": 0.5263457298278809, "learning_rate": 4.729468719135511e-05, "loss": 0.1865, "step": 24980 }, { "epoch": 3.394342762063228, "grad_norm": 0.5308341383934021, "learning_rate": 4.725523300678363e-05, "loss": 0.2008, "step": 24990 }, { "epoch": 3.39570104248022, "grad_norm": 0.39443686604499817, "learning_rate": 4.721578053634132e-05, "loss": 0.1874, "step": 25000 }, { "epoch": 3.397059322897212, "grad_norm": 0.4585479199886322, "learning_rate": 4.717632980466651e-05, "loss": 0.1964, "step": 25010 }, { "epoch": 3.398417603314204, "grad_norm": 0.4118663966655731, "learning_rate": 4.7136880836396536e-05, "loss": 0.1938, "step": 25020 }, { "epoch": 3.399775883731196, "grad_norm": 0.7136027216911316, "learning_rate": 4.709743365616761e-05, "loss": 0.1997, "step": 25030 }, { "epoch": 3.4011341641481883, "grad_norm": 0.7569804191589355, "learning_rate": 4.705798828861478e-05, "loss": 0.1945, "step": 25040 }, { "epoch": 3.4024924445651803, "grad_norm": 0.6106255054473877, "learning_rate": 4.701854475837202e-05, "loss": 0.1947, "step": 25050 }, { "epoch": 3.4038507249821723, "grad_norm": 0.4539012908935547, "learning_rate": 4.697910309007211e-05, "loss": 0.19, "step": 25060 }, { "epoch": 3.405209005399165, "grad_norm": 0.41092780232429504, "learning_rate": 4.69396633083467e-05, "loss": 0.1897, "step": 25070 }, { "epoch": 3.406567285816157, "grad_norm": 0.48333054780960083, "learning_rate": 4.690022543782628e-05, "loss": 0.1967, "step": 25080 }, { "epoch": 3.407925566233149, "grad_norm": 1.4922982454299927, "learning_rate": 4.686078950314007e-05, "loss": 0.1967, "step": 25090 }, { "epoch": 3.409283846650141, "grad_norm": 0.43472543358802795, "learning_rate": 4.682135552891618e-05, "loss": 0.1932, "step": 25100 }, { "epoch": 3.410642127067133, "grad_norm": 0.36925268173217773, "learning_rate": 4.67819235397814e-05, "loss": 0.2054, "step": 25110 }, { "epoch": 3.412000407484125, "grad_norm": 0.3776818513870239, "learning_rate": 4.6742493560361364e-05, "loss": 0.2033, "step": 25120 }, { "epoch": 3.413358687901117, "grad_norm": 0.41551700234413147, "learning_rate": 4.670306561528037e-05, "loss": 0.2036, "step": 25130 }, { "epoch": 3.414716968318109, "grad_norm": 0.40841737389564514, "learning_rate": 4.6663639729161514e-05, "loss": 0.1898, "step": 25140 }, { "epoch": 3.416075248735101, "grad_norm": 0.5143362283706665, "learning_rate": 4.6624215926626595e-05, "loss": 0.2037, "step": 25150 }, { "epoch": 3.4174335291520936, "grad_norm": 0.4876977503299713, "learning_rate": 4.658479423229607e-05, "loss": 0.1938, "step": 25160 }, { "epoch": 3.4187918095690857, "grad_norm": 0.4590885639190674, "learning_rate": 4.6545374670789113e-05, "loss": 0.193, "step": 25170 }, { "epoch": 3.4201500899860777, "grad_norm": 0.5086439251899719, "learning_rate": 4.6505957266723554e-05, "loss": 0.1985, "step": 25180 }, { "epoch": 3.4215083704030698, "grad_norm": 2.305898427963257, "learning_rate": 4.646654204471588e-05, "loss": 0.1909, "step": 25190 }, { "epoch": 3.422866650820062, "grad_norm": 0.5479399561882019, "learning_rate": 4.642712902938124e-05, "loss": 0.2037, "step": 25200 }, { "epoch": 3.424224931237054, "grad_norm": 0.3902255892753601, "learning_rate": 4.638771824533335e-05, "loss": 0.1942, "step": 25210 }, { "epoch": 3.425583211654046, "grad_norm": 0.5885148048400879, "learning_rate": 4.634830971718459e-05, "loss": 0.1893, "step": 25220 }, { "epoch": 3.426941492071038, "grad_norm": 0.7191256284713745, "learning_rate": 4.630890346954586e-05, "loss": 0.1992, "step": 25230 }, { "epoch": 3.42829977248803, "grad_norm": 0.47798648476600647, "learning_rate": 4.6269499527026704e-05, "loss": 0.2036, "step": 25240 }, { "epoch": 3.4296580529050225, "grad_norm": 0.4788721799850464, "learning_rate": 4.6230097914235235e-05, "loss": 0.1953, "step": 25250 }, { "epoch": 3.4310163333220145, "grad_norm": 0.3851640820503235, "learning_rate": 4.619069865577802e-05, "loss": 0.1934, "step": 25260 }, { "epoch": 3.4323746137390065, "grad_norm": 0.5268415808677673, "learning_rate": 4.615130177626027e-05, "loss": 0.1968, "step": 25270 }, { "epoch": 3.4337328941559986, "grad_norm": 0.44216448068618774, "learning_rate": 4.611190730028561e-05, "loss": 0.1869, "step": 25280 }, { "epoch": 3.4350911745729906, "grad_norm": 0.5560691356658936, "learning_rate": 4.6072515252456236e-05, "loss": 0.1912, "step": 25290 }, { "epoch": 3.4364494549899827, "grad_norm": 0.5576750040054321, "learning_rate": 4.603312565737277e-05, "loss": 0.195, "step": 25300 }, { "epoch": 3.4378077354069747, "grad_norm": 0.8031753897666931, "learning_rate": 4.5993738539634354e-05, "loss": 0.1968, "step": 25310 }, { "epoch": 3.4391660158239667, "grad_norm": 0.6398729681968689, "learning_rate": 4.595435392383858e-05, "loss": 0.2016, "step": 25320 }, { "epoch": 3.440524296240959, "grad_norm": 0.4761310815811157, "learning_rate": 4.5914971834581404e-05, "loss": 0.193, "step": 25330 }, { "epoch": 3.4418825766579513, "grad_norm": 0.5108974575996399, "learning_rate": 4.587559229645733e-05, "loss": 0.2025, "step": 25340 }, { "epoch": 3.443240857074943, "grad_norm": 0.9945142865180969, "learning_rate": 4.5836215334059116e-05, "loss": 0.1983, "step": 25350 }, { "epoch": 3.4445991374919354, "grad_norm": 0.4224299192428589, "learning_rate": 4.579684097197805e-05, "loss": 0.2, "step": 25360 }, { "epoch": 3.4459574179089274, "grad_norm": 0.7272350192070007, "learning_rate": 4.575746923480374e-05, "loss": 0.1975, "step": 25370 }, { "epoch": 3.4473156983259194, "grad_norm": 0.4278114140033722, "learning_rate": 4.571810014712411e-05, "loss": 0.2022, "step": 25380 }, { "epoch": 3.4486739787429115, "grad_norm": 0.5417473316192627, "learning_rate": 4.5678733733525535e-05, "loss": 0.1967, "step": 25390 }, { "epoch": 3.4500322591599035, "grad_norm": 0.3946176767349243, "learning_rate": 4.56393700185926e-05, "loss": 0.1908, "step": 25400 }, { "epoch": 3.4513905395768956, "grad_norm": 0.45926663279533386, "learning_rate": 4.560000902690829e-05, "loss": 0.1925, "step": 25410 }, { "epoch": 3.4527488199938876, "grad_norm": 0.49743932485580444, "learning_rate": 4.556065078305387e-05, "loss": 0.1995, "step": 25420 }, { "epoch": 3.4541071004108796, "grad_norm": 0.38151416182518005, "learning_rate": 4.5521295311608846e-05, "loss": 0.1971, "step": 25430 }, { "epoch": 3.4554653808278717, "grad_norm": 0.3892293870449066, "learning_rate": 4.548194263715108e-05, "loss": 0.1992, "step": 25440 }, { "epoch": 3.456823661244864, "grad_norm": 0.46314501762390137, "learning_rate": 4.544259278425658e-05, "loss": 0.1916, "step": 25450 }, { "epoch": 3.458181941661856, "grad_norm": 0.5697717666625977, "learning_rate": 4.5403245777499697e-05, "loss": 0.1936, "step": 25460 }, { "epoch": 3.4595402220788483, "grad_norm": 0.36197546124458313, "learning_rate": 4.536390164145291e-05, "loss": 0.2019, "step": 25470 }, { "epoch": 3.4608985024958403, "grad_norm": 0.6520623564720154, "learning_rate": 4.5324560400686964e-05, "loss": 0.1952, "step": 25480 }, { "epoch": 3.4622567829128323, "grad_norm": 0.46439626812934875, "learning_rate": 4.528522207977082e-05, "loss": 0.199, "step": 25490 }, { "epoch": 3.4636150633298244, "grad_norm": 0.5746205449104309, "learning_rate": 4.524588670327151e-05, "loss": 0.1836, "step": 25500 }, { "epoch": 3.4649733437468164, "grad_norm": 0.4195530116558075, "learning_rate": 4.5206554295754357e-05, "loss": 0.2167, "step": 25510 }, { "epoch": 3.4663316241638085, "grad_norm": 0.44876354932785034, "learning_rate": 4.5167224881782714e-05, "loss": 0.1857, "step": 25520 }, { "epoch": 3.4676899045808005, "grad_norm": 0.40572425723075867, "learning_rate": 4.5127898485918136e-05, "loss": 0.1955, "step": 25530 }, { "epoch": 3.469048184997793, "grad_norm": 0.5131312012672424, "learning_rate": 4.5088575132720304e-05, "loss": 0.2036, "step": 25540 }, { "epoch": 3.470406465414785, "grad_norm": 0.5062470436096191, "learning_rate": 4.504925484674693e-05, "loss": 0.188, "step": 25550 }, { "epoch": 3.471764745831777, "grad_norm": 0.5084410309791565, "learning_rate": 4.500993765255388e-05, "loss": 0.2031, "step": 25560 }, { "epoch": 3.473123026248769, "grad_norm": 0.44772180914878845, "learning_rate": 4.497062357469503e-05, "loss": 0.1973, "step": 25570 }, { "epoch": 3.474481306665761, "grad_norm": 0.4758513271808624, "learning_rate": 4.493131263772237e-05, "loss": 0.2031, "step": 25580 }, { "epoch": 3.475839587082753, "grad_norm": 0.5850323438644409, "learning_rate": 4.489200486618587e-05, "loss": 0.1988, "step": 25590 }, { "epoch": 3.4771978674997452, "grad_norm": 0.35261064767837524, "learning_rate": 4.485270028463355e-05, "loss": 0.1909, "step": 25600 }, { "epoch": 3.4785561479167373, "grad_norm": 0.6332876086235046, "learning_rate": 4.4813398917611474e-05, "loss": 0.1987, "step": 25610 }, { "epoch": 3.4799144283337293, "grad_norm": 0.770811915397644, "learning_rate": 4.477410078966362e-05, "loss": 0.1915, "step": 25620 }, { "epoch": 3.481272708750722, "grad_norm": 0.6020998358726501, "learning_rate": 4.473480592533203e-05, "loss": 0.1968, "step": 25630 }, { "epoch": 3.482630989167714, "grad_norm": 0.5758033990859985, "learning_rate": 4.469551434915661e-05, "loss": 0.1908, "step": 25640 }, { "epoch": 3.483989269584706, "grad_norm": 0.5746198892593384, "learning_rate": 4.4656226085675294e-05, "loss": 0.2007, "step": 25650 }, { "epoch": 3.485347550001698, "grad_norm": 0.5072306394577026, "learning_rate": 4.4616941159423956e-05, "loss": 0.1905, "step": 25660 }, { "epoch": 3.48670583041869, "grad_norm": 0.45598042011260986, "learning_rate": 4.4577659594936284e-05, "loss": 0.1957, "step": 25670 }, { "epoch": 3.488064110835682, "grad_norm": 0.5555516481399536, "learning_rate": 4.4538381416743986e-05, "loss": 0.2091, "step": 25680 }, { "epoch": 3.489422391252674, "grad_norm": 0.41476359963417053, "learning_rate": 4.4499106649376554e-05, "loss": 0.2011, "step": 25690 }, { "epoch": 3.490780671669666, "grad_norm": 0.3816494941711426, "learning_rate": 4.445983531736142e-05, "loss": 0.1817, "step": 25700 }, { "epoch": 3.492138952086658, "grad_norm": 1.4906617403030396, "learning_rate": 4.442056744522388e-05, "loss": 0.203, "step": 25710 }, { "epoch": 3.49349723250365, "grad_norm": 0.4812445342540741, "learning_rate": 4.438130305748698e-05, "loss": 0.2091, "step": 25720 }, { "epoch": 3.4948555129206422, "grad_norm": 0.9287388324737549, "learning_rate": 4.434204217867169e-05, "loss": 0.1944, "step": 25730 }, { "epoch": 3.4962137933376347, "grad_norm": 0.4472825229167938, "learning_rate": 4.430278483329672e-05, "loss": 0.2059, "step": 25740 }, { "epoch": 3.4975720737546268, "grad_norm": 0.5211368203163147, "learning_rate": 4.426353104587862e-05, "loss": 0.182, "step": 25750 }, { "epoch": 3.498930354171619, "grad_norm": 0.429802805185318, "learning_rate": 4.422428084093166e-05, "loss": 0.1939, "step": 25760 }, { "epoch": 3.500288634588611, "grad_norm": 0.468260794878006, "learning_rate": 4.418503424296795e-05, "loss": 0.1859, "step": 25770 }, { "epoch": 3.501646915005603, "grad_norm": 0.5630806684494019, "learning_rate": 4.41457912764973e-05, "loss": 0.1954, "step": 25780 }, { "epoch": 3.503005195422595, "grad_norm": 0.5001989006996155, "learning_rate": 4.410655196602724e-05, "loss": 0.1999, "step": 25790 }, { "epoch": 3.504363475839587, "grad_norm": 0.4892875850200653, "learning_rate": 4.406731633606306e-05, "loss": 0.2043, "step": 25800 }, { "epoch": 3.5057217562565794, "grad_norm": 0.5210531949996948, "learning_rate": 4.40280844111077e-05, "loss": 0.2006, "step": 25810 }, { "epoch": 3.507080036673571, "grad_norm": 0.36557772755622864, "learning_rate": 4.3988856215661833e-05, "loss": 0.1971, "step": 25820 }, { "epoch": 3.5084383170905635, "grad_norm": 0.4441193640232086, "learning_rate": 4.394963177422379e-05, "loss": 0.1986, "step": 25830 }, { "epoch": 3.5097965975075556, "grad_norm": 0.48608940839767456, "learning_rate": 4.391041111128953e-05, "loss": 0.2112, "step": 25840 }, { "epoch": 3.5111548779245476, "grad_norm": 0.5865601301193237, "learning_rate": 4.38711942513527e-05, "loss": 0.1936, "step": 25850 }, { "epoch": 3.5125131583415397, "grad_norm": 0.38082730770111084, "learning_rate": 4.3831981218904516e-05, "loss": 0.1938, "step": 25860 }, { "epoch": 3.5138714387585317, "grad_norm": 0.4054211378097534, "learning_rate": 4.3792772038433844e-05, "loss": 0.1966, "step": 25870 }, { "epoch": 3.5152297191755237, "grad_norm": 0.4694993197917938, "learning_rate": 4.375356673442716e-05, "loss": 0.2091, "step": 25880 }, { "epoch": 3.516587999592516, "grad_norm": 0.3229004442691803, "learning_rate": 4.3714365331368444e-05, "loss": 0.1922, "step": 25890 }, { "epoch": 3.517946280009508, "grad_norm": 0.45641300082206726, "learning_rate": 4.367516785373935e-05, "loss": 0.1956, "step": 25900 }, { "epoch": 3.5193045604265, "grad_norm": 0.38634705543518066, "learning_rate": 4.3635974326018945e-05, "loss": 0.1863, "step": 25910 }, { "epoch": 3.5206628408434923, "grad_norm": 0.33556005358695984, "learning_rate": 4.359678477268397e-05, "loss": 0.1992, "step": 25920 }, { "epoch": 3.5220211212604844, "grad_norm": 0.750998854637146, "learning_rate": 4.355759921820858e-05, "loss": 0.1992, "step": 25930 }, { "epoch": 3.5233794016774764, "grad_norm": 0.6848328709602356, "learning_rate": 4.3518417687064466e-05, "loss": 0.195, "step": 25940 }, { "epoch": 3.5247376820944685, "grad_norm": 0.365925133228302, "learning_rate": 4.347924020372085e-05, "loss": 0.1981, "step": 25950 }, { "epoch": 3.5260959625114605, "grad_norm": 0.5224839448928833, "learning_rate": 4.344006679264434e-05, "loss": 0.1901, "step": 25960 }, { "epoch": 3.5274542429284526, "grad_norm": 0.5524365305900574, "learning_rate": 4.340089747829909e-05, "loss": 0.1979, "step": 25970 }, { "epoch": 3.5288125233454446, "grad_norm": 0.4588007926940918, "learning_rate": 4.3361732285146604e-05, "loss": 0.1906, "step": 25980 }, { "epoch": 3.5301708037624366, "grad_norm": 0.5297346711158752, "learning_rate": 4.33225712376459e-05, "loss": 0.1957, "step": 25990 }, { "epoch": 3.5315290841794287, "grad_norm": 0.5760900974273682, "learning_rate": 4.328341436025337e-05, "loss": 0.1895, "step": 26000 }, { "epoch": 3.532887364596421, "grad_norm": 0.3822748363018036, "learning_rate": 4.3244261677422774e-05, "loss": 0.1945, "step": 26010 }, { "epoch": 3.5342456450134128, "grad_norm": 0.3623114824295044, "learning_rate": 4.3205113213605304e-05, "loss": 0.1993, "step": 26020 }, { "epoch": 3.5356039254304052, "grad_norm": 0.4182954430580139, "learning_rate": 4.3165968993249454e-05, "loss": 0.1927, "step": 26030 }, { "epoch": 3.5369622058473973, "grad_norm": 0.5098933577537537, "learning_rate": 4.3126829040801145e-05, "loss": 0.1958, "step": 26040 }, { "epoch": 3.5383204862643893, "grad_norm": 0.36436909437179565, "learning_rate": 4.3087693380703585e-05, "loss": 0.1934, "step": 26050 }, { "epoch": 3.5396787666813814, "grad_norm": 0.40847882628440857, "learning_rate": 4.3048562037397286e-05, "loss": 0.2013, "step": 26060 }, { "epoch": 3.5410370470983734, "grad_norm": 0.5046661496162415, "learning_rate": 4.300943503532013e-05, "loss": 0.1967, "step": 26070 }, { "epoch": 3.5423953275153655, "grad_norm": 0.6724080443382263, "learning_rate": 4.29703123989072e-05, "loss": 0.192, "step": 26080 }, { "epoch": 3.5437536079323575, "grad_norm": 0.4096417725086212, "learning_rate": 4.293119415259094e-05, "loss": 0.1992, "step": 26090 }, { "epoch": 3.54511188834935, "grad_norm": 0.5939971208572388, "learning_rate": 4.289208032080097e-05, "loss": 0.1919, "step": 26100 }, { "epoch": 3.5464701687663416, "grad_norm": 0.45245373249053955, "learning_rate": 4.285297092796421e-05, "loss": 0.2071, "step": 26110 }, { "epoch": 3.547828449183334, "grad_norm": 1.230437159538269, "learning_rate": 4.281386599850482e-05, "loss": 0.1973, "step": 26120 }, { "epoch": 3.549186729600326, "grad_norm": 0.42998626828193665, "learning_rate": 4.2774765556844095e-05, "loss": 0.1961, "step": 26130 }, { "epoch": 3.550545010017318, "grad_norm": 0.3462628722190857, "learning_rate": 4.273566962740062e-05, "loss": 0.192, "step": 26140 }, { "epoch": 3.55190329043431, "grad_norm": 0.3923579454421997, "learning_rate": 4.269657823459008e-05, "loss": 0.1833, "step": 26150 }, { "epoch": 3.5532615708513022, "grad_norm": 0.3863465189933777, "learning_rate": 4.265749140282538e-05, "loss": 0.1923, "step": 26160 }, { "epoch": 3.5546198512682943, "grad_norm": 0.7046294212341309, "learning_rate": 4.2618409156516594e-05, "loss": 0.2001, "step": 26170 }, { "epoch": 3.5559781316852863, "grad_norm": 0.5144555568695068, "learning_rate": 4.257933152007085e-05, "loss": 0.1874, "step": 26180 }, { "epoch": 3.5573364121022784, "grad_norm": 1.7406377792358398, "learning_rate": 4.2540258517892487e-05, "loss": 0.1971, "step": 26190 }, { "epoch": 3.5586946925192704, "grad_norm": 2.373384952545166, "learning_rate": 4.250119017438288e-05, "loss": 0.1958, "step": 26200 }, { "epoch": 3.560052972936263, "grad_norm": 0.3596789240837097, "learning_rate": 4.2462126513940526e-05, "loss": 0.1933, "step": 26210 }, { "epoch": 3.561411253353255, "grad_norm": 0.38115379214286804, "learning_rate": 4.242306756096104e-05, "loss": 0.1799, "step": 26220 }, { "epoch": 3.562769533770247, "grad_norm": 0.5424114465713501, "learning_rate": 4.238401333983699e-05, "loss": 0.1837, "step": 26230 }, { "epoch": 3.564127814187239, "grad_norm": 0.4205479025840759, "learning_rate": 4.234496387495811e-05, "loss": 0.2029, "step": 26240 }, { "epoch": 3.565486094604231, "grad_norm": 0.9383400082588196, "learning_rate": 4.2305919190711063e-05, "loss": 0.1954, "step": 26250 }, { "epoch": 3.566844375021223, "grad_norm": 0.5052439570426941, "learning_rate": 4.2266879311479605e-05, "loss": 0.1984, "step": 26260 }, { "epoch": 3.568202655438215, "grad_norm": 0.42032065987586975, "learning_rate": 4.2227844261644415e-05, "loss": 0.1968, "step": 26270 }, { "epoch": 3.569560935855207, "grad_norm": 0.3882676661014557, "learning_rate": 4.2188814065583217e-05, "loss": 0.2019, "step": 26280 }, { "epoch": 3.570919216272199, "grad_norm": 0.7270762920379639, "learning_rate": 4.2149788747670705e-05, "loss": 0.195, "step": 26290 }, { "epoch": 3.5722774966891917, "grad_norm": 0.46117204427719116, "learning_rate": 4.211076833227847e-05, "loss": 0.1941, "step": 26300 }, { "epoch": 3.5736357771061833, "grad_norm": 0.44317176938056946, "learning_rate": 4.207175284377511e-05, "loss": 0.1892, "step": 26310 }, { "epoch": 3.574994057523176, "grad_norm": 0.4705517888069153, "learning_rate": 4.2032742306526075e-05, "loss": 0.1846, "step": 26320 }, { "epoch": 3.576352337940168, "grad_norm": 0.4972946047782898, "learning_rate": 4.199373674489377e-05, "loss": 0.1988, "step": 26330 }, { "epoch": 3.57771061835716, "grad_norm": 0.5764078497886658, "learning_rate": 4.195473618323753e-05, "loss": 0.1903, "step": 26340 }, { "epoch": 3.579068898774152, "grad_norm": 0.5813182592391968, "learning_rate": 4.1915740645913456e-05, "loss": 0.1878, "step": 26350 }, { "epoch": 3.580427179191144, "grad_norm": 0.6853075623512268, "learning_rate": 4.187675015727462e-05, "loss": 0.1995, "step": 26360 }, { "epoch": 3.581785459608136, "grad_norm": 0.3317774534225464, "learning_rate": 4.183776474167086e-05, "loss": 0.2009, "step": 26370 }, { "epoch": 3.583143740025128, "grad_norm": 0.4082251489162445, "learning_rate": 4.179878442344892e-05, "loss": 0.1926, "step": 26380 }, { "epoch": 3.5845020204421205, "grad_norm": 0.41543978452682495, "learning_rate": 4.17598092269523e-05, "loss": 0.1946, "step": 26390 }, { "epoch": 3.585860300859112, "grad_norm": 0.6097654700279236, "learning_rate": 4.1720839176521316e-05, "loss": 0.1894, "step": 26400 }, { "epoch": 3.5872185812761046, "grad_norm": 0.48445582389831543, "learning_rate": 4.1681874296493116e-05, "loss": 0.1979, "step": 26410 }, { "epoch": 3.5885768616930966, "grad_norm": 0.4952711760997772, "learning_rate": 4.1642914611201545e-05, "loss": 0.1998, "step": 26420 }, { "epoch": 3.5899351421100887, "grad_norm": 0.3786572813987732, "learning_rate": 4.160396014497728e-05, "loss": 0.1946, "step": 26430 }, { "epoch": 3.5912934225270807, "grad_norm": 0.48531797528266907, "learning_rate": 4.1565010922147646e-05, "loss": 0.1945, "step": 26440 }, { "epoch": 3.5926517029440728, "grad_norm": 0.4590783417224884, "learning_rate": 4.152606696703678e-05, "loss": 0.1954, "step": 26450 }, { "epoch": 3.594009983361065, "grad_norm": 0.5117254853248596, "learning_rate": 4.148712830396551e-05, "loss": 0.192, "step": 26460 }, { "epoch": 3.595368263778057, "grad_norm": 0.9933412671089172, "learning_rate": 4.144819495725131e-05, "loss": 0.1997, "step": 26470 }, { "epoch": 3.5967265441950493, "grad_norm": 0.35638225078582764, "learning_rate": 4.1409266951208415e-05, "loss": 0.1973, "step": 26480 }, { "epoch": 3.598084824612041, "grad_norm": 0.7997497320175171, "learning_rate": 4.1370344310147625e-05, "loss": 0.2034, "step": 26490 }, { "epoch": 3.5994431050290334, "grad_norm": 0.6580579876899719, "learning_rate": 4.133142705837647e-05, "loss": 0.1974, "step": 26500 }, { "epoch": 3.6008013854460255, "grad_norm": 0.4332260191440582, "learning_rate": 4.129251522019911e-05, "loss": 0.1949, "step": 26510 }, { "epoch": 3.6021596658630175, "grad_norm": 0.413887619972229, "learning_rate": 4.1253608819916254e-05, "loss": 0.1894, "step": 26520 }, { "epoch": 3.6035179462800095, "grad_norm": 0.4621548056602478, "learning_rate": 4.1214707881825316e-05, "loss": 0.1927, "step": 26530 }, { "epoch": 3.6048762266970016, "grad_norm": 0.3288225531578064, "learning_rate": 4.1175812430220196e-05, "loss": 0.193, "step": 26540 }, { "epoch": 3.6062345071139936, "grad_norm": 0.85454922914505, "learning_rate": 4.113692248939145e-05, "loss": 0.1916, "step": 26550 }, { "epoch": 3.6075927875309857, "grad_norm": 0.4270640015602112, "learning_rate": 4.1098038083626156e-05, "loss": 0.1923, "step": 26560 }, { "epoch": 3.6089510679479777, "grad_norm": 0.4922953248023987, "learning_rate": 4.1059159237207914e-05, "loss": 0.1884, "step": 26570 }, { "epoch": 3.6103093483649698, "grad_norm": 0.5857746005058289, "learning_rate": 4.102028597441693e-05, "loss": 0.2066, "step": 26580 }, { "epoch": 3.6116676287819622, "grad_norm": 0.510541558265686, "learning_rate": 4.09814183195298e-05, "loss": 0.1942, "step": 26590 }, { "epoch": 3.6130259091989543, "grad_norm": 0.6733138561248779, "learning_rate": 4.0942556296819755e-05, "loss": 0.2117, "step": 26600 }, { "epoch": 3.6143841896159463, "grad_norm": 0.5752008557319641, "learning_rate": 4.0903699930556396e-05, "loss": 0.2003, "step": 26610 }, { "epoch": 3.6157424700329384, "grad_norm": 0.4305654466152191, "learning_rate": 4.086484924500585e-05, "loss": 0.1932, "step": 26620 }, { "epoch": 3.6171007504499304, "grad_norm": 0.39875492453575134, "learning_rate": 4.0826004264430706e-05, "loss": 0.1917, "step": 26630 }, { "epoch": 3.6184590308669224, "grad_norm": 0.3978516757488251, "learning_rate": 4.078716501308993e-05, "loss": 0.1983, "step": 26640 }, { "epoch": 3.6198173112839145, "grad_norm": 0.5497390627861023, "learning_rate": 4.074833151523899e-05, "loss": 0.2073, "step": 26650 }, { "epoch": 3.6211755917009065, "grad_norm": 0.3507409393787384, "learning_rate": 4.070950379512967e-05, "loss": 0.1888, "step": 26660 }, { "epoch": 3.6225338721178986, "grad_norm": 0.37882164120674133, "learning_rate": 4.067068187701023e-05, "loss": 0.1899, "step": 26670 }, { "epoch": 3.623892152534891, "grad_norm": 0.3879409432411194, "learning_rate": 4.063186578512528e-05, "loss": 0.1867, "step": 26680 }, { "epoch": 3.6252504329518827, "grad_norm": 0.4401319921016693, "learning_rate": 4.059305554371574e-05, "loss": 0.1936, "step": 26690 }, { "epoch": 3.626608713368875, "grad_norm": 0.4354172945022583, "learning_rate": 4.055425117701897e-05, "loss": 0.1939, "step": 26700 }, { "epoch": 3.627966993785867, "grad_norm": 0.5392180681228638, "learning_rate": 4.051545270926855e-05, "loss": 0.1896, "step": 26710 }, { "epoch": 3.629325274202859, "grad_norm": 0.48733121156692505, "learning_rate": 4.047666016469449e-05, "loss": 0.1948, "step": 26720 }, { "epoch": 3.6306835546198513, "grad_norm": 0.4953230917453766, "learning_rate": 4.043787356752303e-05, "loss": 0.2004, "step": 26730 }, { "epoch": 3.6320418350368433, "grad_norm": 0.4953707158565521, "learning_rate": 4.03990929419767e-05, "loss": 0.2091, "step": 26740 }, { "epoch": 3.6334001154538353, "grad_norm": 0.5761758685112, "learning_rate": 4.0360318312274345e-05, "loss": 0.2014, "step": 26750 }, { "epoch": 3.6347583958708274, "grad_norm": 0.44004687666893005, "learning_rate": 4.0321549702631e-05, "loss": 0.2004, "step": 26760 }, { "epoch": 3.63611667628782, "grad_norm": 0.39203307032585144, "learning_rate": 4.0282787137257994e-05, "loss": 0.1907, "step": 26770 }, { "epoch": 3.6374749567048115, "grad_norm": 0.3734578788280487, "learning_rate": 4.024403064036284e-05, "loss": 0.1963, "step": 26780 }, { "epoch": 3.638833237121804, "grad_norm": 0.7305260300636292, "learning_rate": 4.02052802361493e-05, "loss": 0.1895, "step": 26790 }, { "epoch": 3.640191517538796, "grad_norm": 0.45858633518218994, "learning_rate": 4.0166535948817344e-05, "loss": 0.1917, "step": 26800 }, { "epoch": 3.641549797955788, "grad_norm": 0.589333176612854, "learning_rate": 4.012779780256303e-05, "loss": 0.2026, "step": 26810 }, { "epoch": 3.64290807837278, "grad_norm": 0.3884367346763611, "learning_rate": 4.008906582157869e-05, "loss": 0.1872, "step": 26820 }, { "epoch": 3.644266358789772, "grad_norm": 0.4503089487552643, "learning_rate": 4.005034003005274e-05, "loss": 0.1993, "step": 26830 }, { "epoch": 3.645624639206764, "grad_norm": 0.40541285276412964, "learning_rate": 4.001162045216974e-05, "loss": 0.2082, "step": 26840 }, { "epoch": 3.646982919623756, "grad_norm": 1.7212721109390259, "learning_rate": 3.997290711211041e-05, "loss": 0.195, "step": 26850 }, { "epoch": 3.6483412000407487, "grad_norm": 0.39078232645988464, "learning_rate": 3.993420003405151e-05, "loss": 0.1922, "step": 26860 }, { "epoch": 3.6496994804577403, "grad_norm": 0.4268849492073059, "learning_rate": 3.9895499242165956e-05, "loss": 0.1985, "step": 26870 }, { "epoch": 3.6510577608747328, "grad_norm": 0.4303213357925415, "learning_rate": 3.9856804760622676e-05, "loss": 0.2023, "step": 26880 }, { "epoch": 3.652416041291725, "grad_norm": 0.5146116614341736, "learning_rate": 3.98181166135867e-05, "loss": 0.1966, "step": 26890 }, { "epoch": 3.653774321708717, "grad_norm": 1.4449865818023682, "learning_rate": 3.977943482521908e-05, "loss": 0.1911, "step": 26900 }, { "epoch": 3.655132602125709, "grad_norm": 0.6222800612449646, "learning_rate": 3.9740759419676906e-05, "loss": 0.1924, "step": 26910 }, { "epoch": 3.656490882542701, "grad_norm": 0.4491010010242462, "learning_rate": 3.9702090421113306e-05, "loss": 0.1994, "step": 26920 }, { "epoch": 3.657849162959693, "grad_norm": 0.4451349079608917, "learning_rate": 3.966342785367733e-05, "loss": 0.2097, "step": 26930 }, { "epoch": 3.659207443376685, "grad_norm": 0.46658650040626526, "learning_rate": 3.962477174151411e-05, "loss": 0.1912, "step": 26940 }, { "epoch": 3.660565723793677, "grad_norm": 0.48521873354911804, "learning_rate": 3.958612210876469e-05, "loss": 0.2055, "step": 26950 }, { "epoch": 3.661924004210669, "grad_norm": 0.7977859377861023, "learning_rate": 3.954747897956606e-05, "loss": 0.194, "step": 26960 }, { "epoch": 3.6632822846276616, "grad_norm": 0.4986419081687927, "learning_rate": 3.950884237805119e-05, "loss": 0.1984, "step": 26970 }, { "epoch": 3.6646405650446536, "grad_norm": 0.6793397665023804, "learning_rate": 3.947021232834892e-05, "loss": 0.1991, "step": 26980 }, { "epoch": 3.6659988454616457, "grad_norm": 0.396443247795105, "learning_rate": 3.9431588854584074e-05, "loss": 0.191, "step": 26990 }, { "epoch": 3.6673571258786377, "grad_norm": 0.6113031506538391, "learning_rate": 3.9392971980877266e-05, "loss": 0.2012, "step": 27000 }, { "epoch": 3.6687154062956298, "grad_norm": 0.566853404045105, "learning_rate": 3.9354361731345076e-05, "loss": 0.2009, "step": 27010 }, { "epoch": 3.670073686712622, "grad_norm": 0.6991735100746155, "learning_rate": 3.9315758130099936e-05, "loss": 0.1935, "step": 27020 }, { "epoch": 3.671431967129614, "grad_norm": 0.6013372540473938, "learning_rate": 3.927716120125006e-05, "loss": 0.1992, "step": 27030 }, { "epoch": 3.672790247546606, "grad_norm": 0.5538707375526428, "learning_rate": 3.923857096889959e-05, "loss": 0.1925, "step": 27040 }, { "epoch": 3.674148527963598, "grad_norm": 0.43267303705215454, "learning_rate": 3.919998745714839e-05, "loss": 0.1885, "step": 27050 }, { "epoch": 3.6755068083805904, "grad_norm": 0.6607810854911804, "learning_rate": 3.916141069009222e-05, "loss": 0.188, "step": 27060 }, { "epoch": 3.676865088797582, "grad_norm": 0.5193665027618408, "learning_rate": 3.912284069182255e-05, "loss": 0.1996, "step": 27070 }, { "epoch": 3.6782233692145745, "grad_norm": 0.5077078342437744, "learning_rate": 3.908427748642666e-05, "loss": 0.1999, "step": 27080 }, { "epoch": 3.6795816496315665, "grad_norm": 0.44458863139152527, "learning_rate": 3.9045721097987606e-05, "loss": 0.1982, "step": 27090 }, { "epoch": 3.6809399300485586, "grad_norm": 0.9144501686096191, "learning_rate": 3.9007171550584123e-05, "loss": 0.196, "step": 27100 }, { "epoch": 3.6822982104655506, "grad_norm": 3.3904082775115967, "learning_rate": 3.896862886829076e-05, "loss": 0.1915, "step": 27110 }, { "epoch": 3.6836564908825427, "grad_norm": 0.483428418636322, "learning_rate": 3.89300930751777e-05, "loss": 0.1948, "step": 27120 }, { "epoch": 3.6850147712995347, "grad_norm": 0.37810570001602173, "learning_rate": 3.889156419531086e-05, "loss": 0.2031, "step": 27130 }, { "epoch": 3.6863730517165267, "grad_norm": 0.44553709030151367, "learning_rate": 3.885304225275188e-05, "loss": 0.1909, "step": 27140 }, { "epoch": 3.6877313321335192, "grad_norm": 0.45160728693008423, "learning_rate": 3.8814527271557956e-05, "loss": 0.1922, "step": 27150 }, { "epoch": 3.689089612550511, "grad_norm": 0.6436792612075806, "learning_rate": 3.877601927578205e-05, "loss": 0.1862, "step": 27160 }, { "epoch": 3.6904478929675033, "grad_norm": 0.476819783449173, "learning_rate": 3.87375182894727e-05, "loss": 0.1862, "step": 27170 }, { "epoch": 3.6918061733844953, "grad_norm": 0.5363958477973938, "learning_rate": 3.86990243366741e-05, "loss": 0.1974, "step": 27180 }, { "epoch": 3.6931644538014874, "grad_norm": 0.6205507516860962, "learning_rate": 3.8660537441426004e-05, "loss": 0.2025, "step": 27190 }, { "epoch": 3.6945227342184794, "grad_norm": 0.48431262373924255, "learning_rate": 3.8622057627763805e-05, "loss": 0.1977, "step": 27200 }, { "epoch": 3.6958810146354715, "grad_norm": 0.46685782074928284, "learning_rate": 3.8583584919718475e-05, "loss": 0.1931, "step": 27210 }, { "epoch": 3.6972392950524635, "grad_norm": 0.43339046835899353, "learning_rate": 3.85451193413165e-05, "loss": 0.1887, "step": 27220 }, { "epoch": 3.6985975754694556, "grad_norm": 0.49697989225387573, "learning_rate": 3.8506660916579955e-05, "loss": 0.1991, "step": 27230 }, { "epoch": 3.6999558558864476, "grad_norm": 0.6055324077606201, "learning_rate": 3.846820966952645e-05, "loss": 0.1993, "step": 27240 }, { "epoch": 3.7013141363034396, "grad_norm": 0.5207672119140625, "learning_rate": 3.842976562416907e-05, "loss": 0.193, "step": 27250 }, { "epoch": 3.702672416720432, "grad_norm": 0.5228583216667175, "learning_rate": 3.839132880451648e-05, "loss": 0.1864, "step": 27260 }, { "epoch": 3.704030697137424, "grad_norm": 0.6569842100143433, "learning_rate": 3.8352899234572724e-05, "loss": 0.2017, "step": 27270 }, { "epoch": 3.705388977554416, "grad_norm": 0.665297269821167, "learning_rate": 3.831447693833743e-05, "loss": 0.1957, "step": 27280 }, { "epoch": 3.7067472579714082, "grad_norm": 0.8790073990821838, "learning_rate": 3.827606193980561e-05, "loss": 0.1861, "step": 27290 }, { "epoch": 3.7081055383884003, "grad_norm": 0.6696270704269409, "learning_rate": 3.8237654262967723e-05, "loss": 0.2035, "step": 27300 }, { "epoch": 3.7094638188053923, "grad_norm": 0.41551563143730164, "learning_rate": 3.819925393180972e-05, "loss": 0.1922, "step": 27310 }, { "epoch": 3.7108220992223844, "grad_norm": 0.4485434591770172, "learning_rate": 3.816086097031286e-05, "loss": 0.2033, "step": 27320 }, { "epoch": 3.7121803796393764, "grad_norm": 0.5025439858436584, "learning_rate": 3.8122475402453906e-05, "loss": 0.1961, "step": 27330 }, { "epoch": 3.7135386600563685, "grad_norm": 0.41164860129356384, "learning_rate": 3.8084097252204936e-05, "loss": 0.1965, "step": 27340 }, { "epoch": 3.714896940473361, "grad_norm": 0.46350565552711487, "learning_rate": 3.804572654353341e-05, "loss": 0.198, "step": 27350 }, { "epoch": 3.7162552208903525, "grad_norm": 0.5567351579666138, "learning_rate": 3.8007363300402156e-05, "loss": 0.1972, "step": 27360 }, { "epoch": 3.717613501307345, "grad_norm": 0.5155938863754272, "learning_rate": 3.796900754676931e-05, "loss": 0.2005, "step": 27370 }, { "epoch": 3.718971781724337, "grad_norm": 0.4998150169849396, "learning_rate": 3.7930659306588366e-05, "loss": 0.1862, "step": 27380 }, { "epoch": 3.720330062141329, "grad_norm": 0.47984400391578674, "learning_rate": 3.789231860380812e-05, "loss": 0.1915, "step": 27390 }, { "epoch": 3.721688342558321, "grad_norm": 0.47899726033210754, "learning_rate": 3.7853985462372625e-05, "loss": 0.2007, "step": 27400 }, { "epoch": 3.723046622975313, "grad_norm": 0.36974117159843445, "learning_rate": 3.781565990622126e-05, "loss": 0.1855, "step": 27410 }, { "epoch": 3.7244049033923052, "grad_norm": 2.026336431503296, "learning_rate": 3.777734195928861e-05, "loss": 0.1843, "step": 27420 }, { "epoch": 3.7257631838092973, "grad_norm": 0.4026225805282593, "learning_rate": 3.773903164550459e-05, "loss": 0.1946, "step": 27430 }, { "epoch": 3.7271214642262898, "grad_norm": 0.6560866832733154, "learning_rate": 3.770072898879426e-05, "loss": 0.1923, "step": 27440 }, { "epoch": 3.7284797446432814, "grad_norm": 0.6025427579879761, "learning_rate": 3.766243401307796e-05, "loss": 0.1918, "step": 27450 }, { "epoch": 3.729838025060274, "grad_norm": 0.9850094318389893, "learning_rate": 3.7624146742271205e-05, "loss": 0.1992, "step": 27460 }, { "epoch": 3.731196305477266, "grad_norm": 0.4461987018585205, "learning_rate": 3.758586720028469e-05, "loss": 0.1998, "step": 27470 }, { "epoch": 3.732554585894258, "grad_norm": 0.9402459263801575, "learning_rate": 3.7547595411024336e-05, "loss": 0.2056, "step": 27480 }, { "epoch": 3.73391286631125, "grad_norm": 0.46155309677124023, "learning_rate": 3.750933139839115e-05, "loss": 0.1981, "step": 27490 }, { "epoch": 3.735271146728242, "grad_norm": 0.5257353782653809, "learning_rate": 3.747107518628133e-05, "loss": 0.202, "step": 27500 }, { "epoch": 3.736629427145234, "grad_norm": 0.4278040826320648, "learning_rate": 3.743282679858618e-05, "loss": 0.1971, "step": 27510 }, { "epoch": 3.737987707562226, "grad_norm": 0.4987119734287262, "learning_rate": 3.739458625919214e-05, "loss": 0.2119, "step": 27520 }, { "epoch": 3.7393459879792186, "grad_norm": 0.4684533476829529, "learning_rate": 3.735635359198074e-05, "loss": 0.1859, "step": 27530 }, { "epoch": 3.74070426839621, "grad_norm": 0.7278027534484863, "learning_rate": 3.7318128820828554e-05, "loss": 0.1905, "step": 27540 }, { "epoch": 3.7420625488132027, "grad_norm": 0.4004683196544647, "learning_rate": 3.727991196960731e-05, "loss": 0.1972, "step": 27550 }, { "epoch": 3.7434208292301947, "grad_norm": 0.44931432604789734, "learning_rate": 3.72417030621837e-05, "loss": 0.1902, "step": 27560 }, { "epoch": 3.7447791096471867, "grad_norm": 0.8967168927192688, "learning_rate": 3.720350212241951e-05, "loss": 0.2065, "step": 27570 }, { "epoch": 3.746137390064179, "grad_norm": 0.6153474450111389, "learning_rate": 3.716530917417155e-05, "loss": 0.199, "step": 27580 }, { "epoch": 3.747495670481171, "grad_norm": 0.5569081902503967, "learning_rate": 3.712712424129159e-05, "loss": 0.1949, "step": 27590 }, { "epoch": 3.748853950898163, "grad_norm": 0.46786242723464966, "learning_rate": 3.708894734762646e-05, "loss": 0.1951, "step": 27600 }, { "epoch": 3.750212231315155, "grad_norm": 0.8529894948005676, "learning_rate": 3.705077851701792e-05, "loss": 0.1974, "step": 27610 }, { "epoch": 3.751570511732147, "grad_norm": 0.579586386680603, "learning_rate": 3.7012617773302726e-05, "loss": 0.1912, "step": 27620 }, { "epoch": 3.752928792149139, "grad_norm": 0.5262661576271057, "learning_rate": 3.697446514031257e-05, "loss": 0.2002, "step": 27630 }, { "epoch": 3.7542870725661315, "grad_norm": 0.4758906960487366, "learning_rate": 3.693632064187405e-05, "loss": 0.1887, "step": 27640 }, { "epoch": 3.7556453529831235, "grad_norm": 0.42770326137542725, "learning_rate": 3.689818430180876e-05, "loss": 0.186, "step": 27650 }, { "epoch": 3.7570036334001156, "grad_norm": 0.4627552330493927, "learning_rate": 3.6860056143933106e-05, "loss": 0.1823, "step": 27660 }, { "epoch": 3.7583619138171076, "grad_norm": 0.38884228467941284, "learning_rate": 3.682193619205847e-05, "loss": 0.1947, "step": 27670 }, { "epoch": 3.7597201942340996, "grad_norm": 0.5055716633796692, "learning_rate": 3.6783824469991055e-05, "loss": 0.1907, "step": 27680 }, { "epoch": 3.7610784746510917, "grad_norm": 0.4437083303928375, "learning_rate": 3.674572100153193e-05, "loss": 0.2023, "step": 27690 }, { "epoch": 3.7624367550680837, "grad_norm": 0.4741124212741852, "learning_rate": 3.6707625810477034e-05, "loss": 0.1874, "step": 27700 }, { "epoch": 3.7637950354850758, "grad_norm": 0.4470958709716797, "learning_rate": 3.6669538920617094e-05, "loss": 0.205, "step": 27710 }, { "epoch": 3.765153315902068, "grad_norm": 0.5795932412147522, "learning_rate": 3.663146035573771e-05, "loss": 0.1908, "step": 27720 }, { "epoch": 3.7665115963190603, "grad_norm": 0.4729503393173218, "learning_rate": 3.6593390139619246e-05, "loss": 0.1882, "step": 27730 }, { "epoch": 3.767869876736052, "grad_norm": 1.3188419342041016, "learning_rate": 3.655532829603685e-05, "loss": 0.1898, "step": 27740 }, { "epoch": 3.7692281571530444, "grad_norm": 0.5355834364891052, "learning_rate": 3.6517274848760467e-05, "loss": 0.2005, "step": 27750 }, { "epoch": 3.7705864375700364, "grad_norm": 1.3015527725219727, "learning_rate": 3.6479229821554746e-05, "loss": 0.1991, "step": 27760 }, { "epoch": 3.7719447179870285, "grad_norm": 0.7440180778503418, "learning_rate": 3.644119323817915e-05, "loss": 0.2063, "step": 27770 }, { "epoch": 3.7733029984040205, "grad_norm": 0.45184051990509033, "learning_rate": 3.6403165122387826e-05, "loss": 0.1991, "step": 27780 }, { "epoch": 3.7746612788210125, "grad_norm": 0.4858826696872711, "learning_rate": 3.636514549792963e-05, "loss": 0.1992, "step": 27790 }, { "epoch": 3.7760195592380046, "grad_norm": 0.4439658522605896, "learning_rate": 3.6327134388548134e-05, "loss": 0.195, "step": 27800 }, { "epoch": 3.7773778396549966, "grad_norm": 0.4236868917942047, "learning_rate": 3.628913181798157e-05, "loss": 0.1971, "step": 27810 }, { "epoch": 3.778736120071989, "grad_norm": 1.0844213962554932, "learning_rate": 3.625113780996285e-05, "loss": 0.1974, "step": 27820 }, { "epoch": 3.7800944004889807, "grad_norm": 0.4896681010723114, "learning_rate": 3.621315238821956e-05, "loss": 0.1981, "step": 27830 }, { "epoch": 3.781452680905973, "grad_norm": 0.5574180483818054, "learning_rate": 3.6175175576473884e-05, "loss": 0.1918, "step": 27840 }, { "epoch": 3.7828109613229652, "grad_norm": 0.65180504322052, "learning_rate": 3.6137207398442654e-05, "loss": 0.1958, "step": 27850 }, { "epoch": 3.7841692417399573, "grad_norm": 0.446737140417099, "learning_rate": 3.60992478778373e-05, "loss": 0.1915, "step": 27860 }, { "epoch": 3.7855275221569493, "grad_norm": 0.40561971068382263, "learning_rate": 3.606129703836385e-05, "loss": 0.1927, "step": 27870 }, { "epoch": 3.7868858025739414, "grad_norm": 0.4269028306007385, "learning_rate": 3.60233549037229e-05, "loss": 0.2024, "step": 27880 }, { "epoch": 3.7882440829909334, "grad_norm": 0.7555444836616516, "learning_rate": 3.598542149760964e-05, "loss": 0.1983, "step": 27890 }, { "epoch": 3.7896023634079254, "grad_norm": 0.41909825801849365, "learning_rate": 3.594749684371378e-05, "loss": 0.2006, "step": 27900 }, { "epoch": 3.790960643824918, "grad_norm": 0.6533623933792114, "learning_rate": 3.590958096571956e-05, "loss": 0.1977, "step": 27910 }, { "epoch": 3.7923189242419095, "grad_norm": 0.49785757064819336, "learning_rate": 3.5871673887305776e-05, "loss": 0.1895, "step": 27920 }, { "epoch": 3.793677204658902, "grad_norm": 0.7246065735816956, "learning_rate": 3.583377563214567e-05, "loss": 0.2007, "step": 27930 }, { "epoch": 3.795035485075894, "grad_norm": 0.41204482316970825, "learning_rate": 3.5795886223907036e-05, "loss": 0.1931, "step": 27940 }, { "epoch": 3.796393765492886, "grad_norm": 0.5219478011131287, "learning_rate": 3.5758005686252114e-05, "loss": 0.1958, "step": 27950 }, { "epoch": 3.797752045909878, "grad_norm": 0.4794066548347473, "learning_rate": 3.57201340428376e-05, "loss": 0.1926, "step": 27960 }, { "epoch": 3.79911032632687, "grad_norm": 1.1426516771316528, "learning_rate": 3.568227131731464e-05, "loss": 0.1956, "step": 27970 }, { "epoch": 3.800468606743862, "grad_norm": 0.5026823878288269, "learning_rate": 3.5644417533328814e-05, "loss": 0.1917, "step": 27980 }, { "epoch": 3.8018268871608543, "grad_norm": 0.44425731897354126, "learning_rate": 3.560657271452009e-05, "loss": 0.1863, "step": 27990 }, { "epoch": 3.8031851675778463, "grad_norm": 0.5948101878166199, "learning_rate": 3.556873688452291e-05, "loss": 0.1876, "step": 28000 }, { "epoch": 3.8045434479948383, "grad_norm": 0.436874121427536, "learning_rate": 3.553091006696603e-05, "loss": 0.1948, "step": 28010 }, { "epoch": 3.805901728411831, "grad_norm": 0.5469005107879639, "learning_rate": 3.5493092285472606e-05, "loss": 0.1984, "step": 28020 }, { "epoch": 3.807260008828823, "grad_norm": 0.44551706314086914, "learning_rate": 3.545528356366015e-05, "loss": 0.1992, "step": 28030 }, { "epoch": 3.808618289245815, "grad_norm": 0.4338882267475128, "learning_rate": 3.541748392514049e-05, "loss": 0.2015, "step": 28040 }, { "epoch": 3.809976569662807, "grad_norm": 0.45955491065979004, "learning_rate": 3.5379693393519835e-05, "loss": 0.1945, "step": 28050 }, { "epoch": 3.811334850079799, "grad_norm": 0.43848133087158203, "learning_rate": 3.534191199239867e-05, "loss": 0.1864, "step": 28060 }, { "epoch": 3.812693130496791, "grad_norm": 0.4811517596244812, "learning_rate": 3.530413974537179e-05, "loss": 0.1864, "step": 28070 }, { "epoch": 3.814051410913783, "grad_norm": 0.379125714302063, "learning_rate": 3.526637667602826e-05, "loss": 0.1911, "step": 28080 }, { "epoch": 3.815409691330775, "grad_norm": 0.498092919588089, "learning_rate": 3.522862280795142e-05, "loss": 0.1962, "step": 28090 }, { "epoch": 3.816767971747767, "grad_norm": 0.42782503366470337, "learning_rate": 3.5190878164718854e-05, "loss": 0.1979, "step": 28100 }, { "epoch": 3.8181262521647596, "grad_norm": 0.6947119832038879, "learning_rate": 3.515314276990243e-05, "loss": 0.1903, "step": 28110 }, { "epoch": 3.8194845325817512, "grad_norm": 0.43547698855400085, "learning_rate": 3.511541664706818e-05, "loss": 0.2007, "step": 28120 }, { "epoch": 3.8208428129987437, "grad_norm": 0.7258159518241882, "learning_rate": 3.507769981977638e-05, "loss": 0.1972, "step": 28130 }, { "epoch": 3.8222010934157358, "grad_norm": 0.7074345946311951, "learning_rate": 3.5039992311581484e-05, "loss": 0.1891, "step": 28140 }, { "epoch": 3.823559373832728, "grad_norm": 0.5319786667823792, "learning_rate": 3.500229414603216e-05, "loss": 0.1972, "step": 28150 }, { "epoch": 3.82491765424972, "grad_norm": 0.49471431970596313, "learning_rate": 3.496460534667117e-05, "loss": 0.1997, "step": 28160 }, { "epoch": 3.826275934666712, "grad_norm": 0.5680634379386902, "learning_rate": 3.4926925937035525e-05, "loss": 0.1931, "step": 28170 }, { "epoch": 3.827634215083704, "grad_norm": 0.9858710765838623, "learning_rate": 3.488925594065631e-05, "loss": 0.2065, "step": 28180 }, { "epoch": 3.828992495500696, "grad_norm": 0.47975149750709534, "learning_rate": 3.485159538105871e-05, "loss": 0.1985, "step": 28190 }, { "epoch": 3.8303507759176885, "grad_norm": 0.5283595323562622, "learning_rate": 3.481394428176208e-05, "loss": 0.2006, "step": 28200 }, { "epoch": 3.83170905633468, "grad_norm": 0.5704883933067322, "learning_rate": 3.4776302666279816e-05, "loss": 0.1854, "step": 28210 }, { "epoch": 3.8330673367516725, "grad_norm": 0.5378320217132568, "learning_rate": 3.4738670558119426e-05, "loss": 0.194, "step": 28220 }, { "epoch": 3.8344256171686646, "grad_norm": 0.5502563714981079, "learning_rate": 3.4701047980782466e-05, "loss": 0.1893, "step": 28230 }, { "epoch": 3.8357838975856566, "grad_norm": 0.4513683617115021, "learning_rate": 3.466343495776452e-05, "loss": 0.1878, "step": 28240 }, { "epoch": 3.8371421780026487, "grad_norm": 0.4697418510913849, "learning_rate": 3.462583151255524e-05, "loss": 0.2004, "step": 28250 }, { "epoch": 3.8385004584196407, "grad_norm": 0.40381988883018494, "learning_rate": 3.4588237668638284e-05, "loss": 0.1952, "step": 28260 }, { "epoch": 3.8398587388366328, "grad_norm": 0.5215148329734802, "learning_rate": 3.455065344949129e-05, "loss": 0.1995, "step": 28270 }, { "epoch": 3.841217019253625, "grad_norm": 0.44988131523132324, "learning_rate": 3.451307887858593e-05, "loss": 0.1985, "step": 28280 }, { "epoch": 3.842575299670617, "grad_norm": 0.3212624490261078, "learning_rate": 3.4475513979387816e-05, "loss": 0.1943, "step": 28290 }, { "epoch": 3.843933580087609, "grad_norm": 0.5113000869750977, "learning_rate": 3.443795877535654e-05, "loss": 0.191, "step": 28300 }, { "epoch": 3.8452918605046014, "grad_norm": 0.4543994665145874, "learning_rate": 3.440041328994562e-05, "loss": 0.1926, "step": 28310 }, { "epoch": 3.8466501409215934, "grad_norm": 0.9766450524330139, "learning_rate": 3.436287754660251e-05, "loss": 0.188, "step": 28320 }, { "epoch": 3.8480084213385854, "grad_norm": 0.6753997206687927, "learning_rate": 3.4325351568768585e-05, "loss": 0.1987, "step": 28330 }, { "epoch": 3.8493667017555775, "grad_norm": 0.6376868486404419, "learning_rate": 3.428783537987914e-05, "loss": 0.1915, "step": 28340 }, { "epoch": 3.8507249821725695, "grad_norm": 2.4130966663360596, "learning_rate": 3.425032900336332e-05, "loss": 0.217, "step": 28350 }, { "epoch": 3.8520832625895616, "grad_norm": 0.43693625926971436, "learning_rate": 3.421283246264417e-05, "loss": 0.1972, "step": 28360 }, { "epoch": 3.8534415430065536, "grad_norm": 0.5474570393562317, "learning_rate": 3.417534578113858e-05, "loss": 0.1967, "step": 28370 }, { "epoch": 3.8547998234235457, "grad_norm": 0.4217793643474579, "learning_rate": 3.413786898225727e-05, "loss": 0.1858, "step": 28380 }, { "epoch": 3.8561581038405377, "grad_norm": 0.4522024691104889, "learning_rate": 3.410040208940483e-05, "loss": 0.1836, "step": 28390 }, { "epoch": 3.85751638425753, "grad_norm": 0.6693591475486755, "learning_rate": 3.406294512597962e-05, "loss": 0.1861, "step": 28400 }, { "epoch": 3.858874664674522, "grad_norm": 0.4359399080276489, "learning_rate": 3.4025498115373825e-05, "loss": 0.1969, "step": 28410 }, { "epoch": 3.8602329450915143, "grad_norm": 0.9366066455841064, "learning_rate": 3.3988061080973396e-05, "loss": 0.1987, "step": 28420 }, { "epoch": 3.8615912255085063, "grad_norm": 0.5528479814529419, "learning_rate": 3.395063404615805e-05, "loss": 0.1961, "step": 28430 }, { "epoch": 3.8629495059254984, "grad_norm": 0.5974676609039307, "learning_rate": 3.391321703430131e-05, "loss": 0.2017, "step": 28440 }, { "epoch": 3.8643077863424904, "grad_norm": 0.5372784733772278, "learning_rate": 3.3875810068770374e-05, "loss": 0.1826, "step": 28450 }, { "epoch": 3.8656660667594824, "grad_norm": 0.6506730318069458, "learning_rate": 3.38384131729262e-05, "loss": 0.2017, "step": 28460 }, { "epoch": 3.8670243471764745, "grad_norm": 0.34969112277030945, "learning_rate": 3.3801026370123455e-05, "loss": 0.194, "step": 28470 }, { "epoch": 3.8683826275934665, "grad_norm": 0.6351760029792786, "learning_rate": 3.3763649683710504e-05, "loss": 0.1886, "step": 28480 }, { "epoch": 3.869740908010459, "grad_norm": 0.927112340927124, "learning_rate": 3.3726283137029375e-05, "loss": 0.2045, "step": 28490 }, { "epoch": 3.8710991884274506, "grad_norm": 0.6057062149047852, "learning_rate": 3.3688926753415776e-05, "loss": 0.1914, "step": 28500 }, { "epoch": 3.872457468844443, "grad_norm": 1.257717251777649, "learning_rate": 3.365158055619909e-05, "loss": 0.1956, "step": 28510 }, { "epoch": 3.873815749261435, "grad_norm": 0.8058252334594727, "learning_rate": 3.36142445687023e-05, "loss": 0.1876, "step": 28520 }, { "epoch": 3.875174029678427, "grad_norm": 0.4556853771209717, "learning_rate": 3.3576918814242055e-05, "loss": 0.1933, "step": 28530 }, { "epoch": 3.876532310095419, "grad_norm": 0.5344199538230896, "learning_rate": 3.353960331612856e-05, "loss": 0.1893, "step": 28540 }, { "epoch": 3.8778905905124113, "grad_norm": 0.5023429989814758, "learning_rate": 3.350229809766565e-05, "loss": 0.1941, "step": 28550 }, { "epoch": 3.8792488709294033, "grad_norm": 0.615887463092804, "learning_rate": 3.346500318215076e-05, "loss": 0.191, "step": 28560 }, { "epoch": 3.8806071513463953, "grad_norm": 0.39724475145339966, "learning_rate": 3.3427718592874844e-05, "loss": 0.19, "step": 28570 }, { "epoch": 3.881965431763388, "grad_norm": 0.5248691439628601, "learning_rate": 3.3390444353122444e-05, "loss": 0.1918, "step": 28580 }, { "epoch": 3.8833237121803794, "grad_norm": 0.5181042551994324, "learning_rate": 3.335318048617161e-05, "loss": 0.1943, "step": 28590 }, { "epoch": 3.884681992597372, "grad_norm": 0.7149196863174438, "learning_rate": 3.331592701529392e-05, "loss": 0.1864, "step": 28600 }, { "epoch": 3.886040273014364, "grad_norm": 0.6122139096260071, "learning_rate": 3.3278683963754484e-05, "loss": 0.1902, "step": 28610 }, { "epoch": 3.887398553431356, "grad_norm": 0.5293213725090027, "learning_rate": 3.324145135481189e-05, "loss": 0.2077, "step": 28620 }, { "epoch": 3.888756833848348, "grad_norm": 0.6245774030685425, "learning_rate": 3.3204229211718205e-05, "loss": 0.1889, "step": 28630 }, { "epoch": 3.89011511426534, "grad_norm": 2.725646495819092, "learning_rate": 3.316701755771894e-05, "loss": 0.1999, "step": 28640 }, { "epoch": 3.891473394682332, "grad_norm": 0.5951151251792908, "learning_rate": 3.312981641605307e-05, "loss": 0.1989, "step": 28650 }, { "epoch": 3.892831675099324, "grad_norm": 0.45930951833724976, "learning_rate": 3.309262580995305e-05, "loss": 0.2022, "step": 28660 }, { "epoch": 3.894189955516316, "grad_norm": 0.9762306809425354, "learning_rate": 3.305544576264465e-05, "loss": 0.1909, "step": 28670 }, { "epoch": 3.8955482359333082, "grad_norm": 0.5934500694274902, "learning_rate": 3.3018276297347165e-05, "loss": 0.1978, "step": 28680 }, { "epoch": 3.8969065163503007, "grad_norm": 0.5352463722229004, "learning_rate": 3.29811174372732e-05, "loss": 0.1888, "step": 28690 }, { "epoch": 3.8982647967672928, "grad_norm": 0.5188794732093811, "learning_rate": 3.294396920562878e-05, "loss": 0.1927, "step": 28700 }, { "epoch": 3.899623077184285, "grad_norm": 0.47732335329055786, "learning_rate": 3.290683162561325e-05, "loss": 0.1863, "step": 28710 }, { "epoch": 3.900981357601277, "grad_norm": 0.5634704232215881, "learning_rate": 3.2869704720419335e-05, "loss": 0.2, "step": 28720 }, { "epoch": 3.902339638018269, "grad_norm": 0.7620969414710999, "learning_rate": 3.283258851323311e-05, "loss": 0.1894, "step": 28730 }, { "epoch": 3.903697918435261, "grad_norm": 0.5450012683868408, "learning_rate": 3.279548302723393e-05, "loss": 0.1959, "step": 28740 }, { "epoch": 3.905056198852253, "grad_norm": 0.41271594166755676, "learning_rate": 3.2758388285594467e-05, "loss": 0.1895, "step": 28750 }, { "epoch": 3.906414479269245, "grad_norm": 0.7059533596038818, "learning_rate": 3.2721304311480685e-05, "loss": 0.1948, "step": 28760 }, { "epoch": 3.907772759686237, "grad_norm": 0.3881911039352417, "learning_rate": 3.268423112805183e-05, "loss": 0.1836, "step": 28770 }, { "epoch": 3.9091310401032295, "grad_norm": 0.39738622307777405, "learning_rate": 3.264716875846042e-05, "loss": 0.1941, "step": 28780 }, { "epoch": 3.910489320520221, "grad_norm": 1.4927008152008057, "learning_rate": 3.261011722585217e-05, "loss": 0.199, "step": 28790 }, { "epoch": 3.9118476009372136, "grad_norm": 0.40918684005737305, "learning_rate": 3.257307655336609e-05, "loss": 0.1991, "step": 28800 }, { "epoch": 3.9132058813542057, "grad_norm": 0.4959438145160675, "learning_rate": 3.253604676413437e-05, "loss": 0.1896, "step": 28810 }, { "epoch": 3.9145641617711977, "grad_norm": 2.22213077545166, "learning_rate": 3.2499027881282395e-05, "loss": 0.1973, "step": 28820 }, { "epoch": 3.9159224421881897, "grad_norm": 0.5552440285682678, "learning_rate": 3.24620199279288e-05, "loss": 0.2085, "step": 28830 }, { "epoch": 3.917280722605182, "grad_norm": 0.5708045959472656, "learning_rate": 3.24250229271853e-05, "loss": 0.1857, "step": 28840 }, { "epoch": 3.918639003022174, "grad_norm": 0.6041545867919922, "learning_rate": 3.238803690215686e-05, "loss": 0.1929, "step": 28850 }, { "epoch": 3.919997283439166, "grad_norm": 0.6538639664649963, "learning_rate": 3.235106187594153e-05, "loss": 0.2029, "step": 28860 }, { "epoch": 3.9213555638561584, "grad_norm": 4.158440589904785, "learning_rate": 3.23140978716305e-05, "loss": 0.2081, "step": 28870 }, { "epoch": 3.92271384427315, "grad_norm": 0.6636366844177246, "learning_rate": 3.2277144912308135e-05, "loss": 0.1834, "step": 28880 }, { "epoch": 3.9240721246901424, "grad_norm": 0.4854392409324646, "learning_rate": 3.22402030210518e-05, "loss": 0.2056, "step": 28890 }, { "epoch": 3.9254304051071345, "grad_norm": 0.7103421688079834, "learning_rate": 3.2203272220932045e-05, "loss": 0.1999, "step": 28900 }, { "epoch": 3.9267886855241265, "grad_norm": 0.7924665212631226, "learning_rate": 3.216635253501244e-05, "loss": 0.1924, "step": 28910 }, { "epoch": 3.9281469659411186, "grad_norm": 0.7611386775970459, "learning_rate": 3.212944398634963e-05, "loss": 0.1988, "step": 28920 }, { "epoch": 3.9295052463581106, "grad_norm": 0.48958566784858704, "learning_rate": 3.20925465979933e-05, "loss": 0.1895, "step": 28930 }, { "epoch": 3.9308635267751026, "grad_norm": 0.4677891135215759, "learning_rate": 3.205566039298614e-05, "loss": 0.1982, "step": 28940 }, { "epoch": 3.9322218071920947, "grad_norm": 0.4329497218132019, "learning_rate": 3.201878539436394e-05, "loss": 0.1978, "step": 28950 }, { "epoch": 3.933580087609087, "grad_norm": 0.5542562007904053, "learning_rate": 3.198192162515538e-05, "loss": 0.2006, "step": 28960 }, { "epoch": 3.9349383680260788, "grad_norm": 0.9243258237838745, "learning_rate": 3.194506910838221e-05, "loss": 0.2012, "step": 28970 }, { "epoch": 3.9362966484430713, "grad_norm": 0.6060301065444946, "learning_rate": 3.1908227867059125e-05, "loss": 0.1953, "step": 28980 }, { "epoch": 3.9376549288600633, "grad_norm": 0.38401123881340027, "learning_rate": 3.187139792419375e-05, "loss": 0.1919, "step": 28990 }, { "epoch": 3.9390132092770553, "grad_norm": 0.44552549719810486, "learning_rate": 3.1834579302786724e-05, "loss": 0.1992, "step": 29000 }, { "epoch": 3.9403714896940474, "grad_norm": 0.5105125308036804, "learning_rate": 3.179777202583154e-05, "loss": 0.2003, "step": 29010 }, { "epoch": 3.9417297701110394, "grad_norm": 0.3535793125629425, "learning_rate": 3.1760976116314656e-05, "loss": 0.1969, "step": 29020 }, { "epoch": 3.9430880505280315, "grad_norm": 0.4657975733280182, "learning_rate": 3.1724191597215405e-05, "loss": 0.1877, "step": 29030 }, { "epoch": 3.9444463309450235, "grad_norm": 0.5319719910621643, "learning_rate": 3.168741849150601e-05, "loss": 0.1914, "step": 29040 }, { "epoch": 3.9458046113620155, "grad_norm": 0.5233412981033325, "learning_rate": 3.16506568221516e-05, "loss": 0.1925, "step": 29050 }, { "epoch": 3.9471628917790076, "grad_norm": 0.5102574229240417, "learning_rate": 3.16139066121101e-05, "loss": 0.1882, "step": 29060 }, { "epoch": 3.948521172196, "grad_norm": 0.3507876992225647, "learning_rate": 3.157716788433235e-05, "loss": 0.1977, "step": 29070 }, { "epoch": 3.949879452612992, "grad_norm": 0.5857633352279663, "learning_rate": 3.154044066176195e-05, "loss": 0.1993, "step": 29080 }, { "epoch": 3.951237733029984, "grad_norm": 0.7576076984405518, "learning_rate": 3.150372496733534e-05, "loss": 0.1894, "step": 29090 }, { "epoch": 3.952596013446976, "grad_norm": 0.6432096362113953, "learning_rate": 3.146702082398182e-05, "loss": 0.1996, "step": 29100 }, { "epoch": 3.9539542938639682, "grad_norm": 0.8041906952857971, "learning_rate": 3.1430328254623375e-05, "loss": 0.19, "step": 29110 }, { "epoch": 3.9553125742809603, "grad_norm": 0.49847447872161865, "learning_rate": 3.139364728217483e-05, "loss": 0.1844, "step": 29120 }, { "epoch": 3.9566708546979523, "grad_norm": 0.4631551504135132, "learning_rate": 3.1356977929543736e-05, "loss": 0.1897, "step": 29130 }, { "epoch": 3.9580291351149444, "grad_norm": 0.5182698369026184, "learning_rate": 3.132032021963041e-05, "loss": 0.1891, "step": 29140 }, { "epoch": 3.9593874155319364, "grad_norm": 0.5683391690254211, "learning_rate": 3.128367417532789e-05, "loss": 0.197, "step": 29150 }, { "epoch": 3.960745695948929, "grad_norm": 0.4268258512020111, "learning_rate": 3.124703981952191e-05, "loss": 0.1858, "step": 29160 }, { "epoch": 3.9621039763659205, "grad_norm": 0.7582992315292358, "learning_rate": 3.121041717509095e-05, "loss": 0.198, "step": 29170 }, { "epoch": 3.963462256782913, "grad_norm": 0.45430299639701843, "learning_rate": 3.117380626490611e-05, "loss": 0.1867, "step": 29180 }, { "epoch": 3.964820537199905, "grad_norm": 0.5447841286659241, "learning_rate": 3.1137207111831236e-05, "loss": 0.1897, "step": 29190 }, { "epoch": 3.966178817616897, "grad_norm": 0.5303439497947693, "learning_rate": 3.110061973872277e-05, "loss": 0.1827, "step": 29200 }, { "epoch": 3.967537098033889, "grad_norm": 1.0494788885116577, "learning_rate": 3.106404416842982e-05, "loss": 0.187, "step": 29210 }, { "epoch": 3.968895378450881, "grad_norm": 0.5895728468894958, "learning_rate": 3.102748042379415e-05, "loss": 0.1942, "step": 29220 }, { "epoch": 3.970253658867873, "grad_norm": 0.5475305318832397, "learning_rate": 3.0990928527650086e-05, "loss": 0.1972, "step": 29230 }, { "epoch": 3.9716119392848652, "grad_norm": 0.41807126998901367, "learning_rate": 3.09543885028246e-05, "loss": 0.1946, "step": 29240 }, { "epoch": 3.9729702197018577, "grad_norm": 0.5113938450813293, "learning_rate": 3.0917860372137225e-05, "loss": 0.1984, "step": 29250 }, { "epoch": 3.9743285001188493, "grad_norm": 0.558836042881012, "learning_rate": 3.0881344158400066e-05, "loss": 0.1984, "step": 29260 }, { "epoch": 3.975686780535842, "grad_norm": 0.5120525360107422, "learning_rate": 3.084483988441782e-05, "loss": 0.1889, "step": 29270 }, { "epoch": 3.977045060952834, "grad_norm": 0.7400079965591431, "learning_rate": 3.0808347572987665e-05, "loss": 0.1848, "step": 29280 }, { "epoch": 3.978403341369826, "grad_norm": 0.4202435612678528, "learning_rate": 3.077186724689938e-05, "loss": 0.189, "step": 29290 }, { "epoch": 3.979761621786818, "grad_norm": 0.6050130128860474, "learning_rate": 3.073539892893519e-05, "loss": 0.1983, "step": 29300 }, { "epoch": 3.98111990220381, "grad_norm": 0.8170284628868103, "learning_rate": 3.0698942641869865e-05, "loss": 0.1862, "step": 29310 }, { "epoch": 3.982478182620802, "grad_norm": 0.3944275677204132, "learning_rate": 3.0662498408470676e-05, "loss": 0.1873, "step": 29320 }, { "epoch": 3.983836463037794, "grad_norm": 0.5249157547950745, "learning_rate": 3.062606625149729e-05, "loss": 0.187, "step": 29330 }, { "epoch": 3.985194743454786, "grad_norm": 0.544728696346283, "learning_rate": 3.058964619370194e-05, "loss": 0.1894, "step": 29340 }, { "epoch": 3.986553023871778, "grad_norm": 0.4942173361778259, "learning_rate": 3.05532382578292e-05, "loss": 0.1884, "step": 29350 }, { "epoch": 3.9879113042887706, "grad_norm": 0.585983157157898, "learning_rate": 3.0516842466616146e-05, "loss": 0.1977, "step": 29360 }, { "epoch": 3.9892695847057627, "grad_norm": 0.899857223033905, "learning_rate": 3.0480458842792235e-05, "loss": 0.19, "step": 29370 }, { "epoch": 3.9906278651227547, "grad_norm": 0.47013363242149353, "learning_rate": 3.0444087409079325e-05, "loss": 0.1978, "step": 29380 }, { "epoch": 3.9919861455397467, "grad_norm": 0.5096186399459839, "learning_rate": 3.0407728188191697e-05, "loss": 0.1889, "step": 29390 }, { "epoch": 3.9933444259567388, "grad_norm": 0.5189212560653687, "learning_rate": 3.0371381202835947e-05, "loss": 0.1973, "step": 29400 }, { "epoch": 3.994702706373731, "grad_norm": 0.3805292844772339, "learning_rate": 3.0335046475711084e-05, "loss": 0.1926, "step": 29410 }, { "epoch": 3.996060986790723, "grad_norm": 0.4320145845413208, "learning_rate": 3.029872402950842e-05, "loss": 0.1909, "step": 29420 }, { "epoch": 3.997419267207715, "grad_norm": 0.6815786957740784, "learning_rate": 3.0262413886911617e-05, "loss": 0.1969, "step": 29430 }, { "epoch": 3.998777547624707, "grad_norm": 0.573527991771698, "learning_rate": 3.0226116070596678e-05, "loss": 0.1988, "step": 29440 }, { "epoch": 4.000135828041699, "grad_norm": 0.7034412622451782, "learning_rate": 3.018983060323185e-05, "loss": 0.1872, "step": 29450 }, { "epoch": 4.001494108458691, "grad_norm": 0.45167267322540283, "learning_rate": 3.0153557507477725e-05, "loss": 0.169, "step": 29460 }, { "epoch": 4.0028523888756835, "grad_norm": 0.7938618063926697, "learning_rate": 3.0117296805987123e-05, "loss": 0.1682, "step": 29470 }, { "epoch": 4.004210669292675, "grad_norm": 0.5057234764099121, "learning_rate": 3.0081048521405148e-05, "loss": 0.1659, "step": 29480 }, { "epoch": 4.005568949709668, "grad_norm": 0.3979319632053375, "learning_rate": 3.004481267636917e-05, "loss": 0.1693, "step": 29490 }, { "epoch": 4.00692723012666, "grad_norm": 0.6141130924224854, "learning_rate": 3.0008589293508737e-05, "loss": 0.1614, "step": 29500 }, { "epoch": 4.008285510543652, "grad_norm": 1.1070421934127808, "learning_rate": 2.9972378395445677e-05, "loss": 0.1618, "step": 29510 }, { "epoch": 4.009643790960644, "grad_norm": 0.5344529747962952, "learning_rate": 2.9936180004793944e-05, "loss": 0.1664, "step": 29520 }, { "epoch": 4.011002071377636, "grad_norm": 0.5652298927307129, "learning_rate": 2.989999414415976e-05, "loss": 0.1617, "step": 29530 }, { "epoch": 4.012360351794628, "grad_norm": 0.9921879172325134, "learning_rate": 2.9863820836141477e-05, "loss": 0.1709, "step": 29540 }, { "epoch": 4.01371863221162, "grad_norm": 0.4918419122695923, "learning_rate": 2.9827660103329603e-05, "loss": 0.1702, "step": 29550 }, { "epoch": 4.015076912628612, "grad_norm": 0.5340962409973145, "learning_rate": 2.979151196830684e-05, "loss": 0.1531, "step": 29560 }, { "epoch": 4.016435193045604, "grad_norm": 0.5817250609397888, "learning_rate": 2.975537645364796e-05, "loss": 0.164, "step": 29570 }, { "epoch": 4.017793473462596, "grad_norm": 0.8375174403190613, "learning_rate": 2.9719253581919905e-05, "loss": 0.1756, "step": 29580 }, { "epoch": 4.019151753879588, "grad_norm": 0.3941054046154022, "learning_rate": 2.9683143375681675e-05, "loss": 0.1646, "step": 29590 }, { "epoch": 4.0205100342965805, "grad_norm": 0.9846612811088562, "learning_rate": 2.9647045857484408e-05, "loss": 0.1717, "step": 29600 }, { "epoch": 4.021868314713573, "grad_norm": 0.5890139937400818, "learning_rate": 2.96109610498713e-05, "loss": 0.1765, "step": 29610 }, { "epoch": 4.023226595130565, "grad_norm": 0.5731610655784607, "learning_rate": 2.9574888975377572e-05, "loss": 0.1717, "step": 29620 }, { "epoch": 4.024584875547557, "grad_norm": 0.6309247016906738, "learning_rate": 2.953882965653057e-05, "loss": 0.1694, "step": 29630 }, { "epoch": 4.025943155964549, "grad_norm": 0.46173331141471863, "learning_rate": 2.9502783115849586e-05, "loss": 0.174, "step": 29640 }, { "epoch": 4.027301436381541, "grad_norm": 0.8179555535316467, "learning_rate": 2.9466749375845992e-05, "loss": 0.1741, "step": 29650 }, { "epoch": 4.028659716798533, "grad_norm": 0.7822527289390564, "learning_rate": 2.9430728459023172e-05, "loss": 0.1806, "step": 29660 }, { "epoch": 4.030017997215525, "grad_norm": 0.5075510740280151, "learning_rate": 2.939472038787644e-05, "loss": 0.1704, "step": 29670 }, { "epoch": 4.031376277632517, "grad_norm": 0.7502756714820862, "learning_rate": 2.9358725184893166e-05, "loss": 0.1744, "step": 29680 }, { "epoch": 4.032734558049509, "grad_norm": 0.7350807189941406, "learning_rate": 2.932274287255261e-05, "loss": 0.1634, "step": 29690 }, { "epoch": 4.034092838466502, "grad_norm": 0.8352128863334656, "learning_rate": 2.9286773473326034e-05, "loss": 0.1684, "step": 29700 }, { "epoch": 4.035451118883493, "grad_norm": 0.9080410599708557, "learning_rate": 2.9250817009676634e-05, "loss": 0.1836, "step": 29710 }, { "epoch": 4.036809399300486, "grad_norm": 0.782817006111145, "learning_rate": 2.9214873504059474e-05, "loss": 0.1745, "step": 29720 }, { "epoch": 4.0381676797174775, "grad_norm": 0.5535351037979126, "learning_rate": 2.917894297892161e-05, "loss": 0.1521, "step": 29730 }, { "epoch": 4.03952596013447, "grad_norm": 0.5197460055351257, "learning_rate": 2.914302545670189e-05, "loss": 0.1689, "step": 29740 }, { "epoch": 4.040884240551462, "grad_norm": 0.5161351561546326, "learning_rate": 2.9107120959831152e-05, "loss": 0.1775, "step": 29750 }, { "epoch": 4.042242520968454, "grad_norm": 0.7122552990913391, "learning_rate": 2.9071229510731997e-05, "loss": 0.1784, "step": 29760 }, { "epoch": 4.043600801385446, "grad_norm": 0.6414073705673218, "learning_rate": 2.9035351131818943e-05, "loss": 0.1682, "step": 29770 }, { "epoch": 4.044959081802438, "grad_norm": 0.4504986107349396, "learning_rate": 2.899948584549835e-05, "loss": 0.1697, "step": 29780 }, { "epoch": 4.046317362219431, "grad_norm": 0.6033002138137817, "learning_rate": 2.8963633674168334e-05, "loss": 0.162, "step": 29790 }, { "epoch": 4.047675642636422, "grad_norm": 1.0106849670410156, "learning_rate": 2.8927794640218918e-05, "loss": 0.169, "step": 29800 }, { "epoch": 4.049033923053415, "grad_norm": 0.573704719543457, "learning_rate": 2.8891968766031813e-05, "loss": 0.1681, "step": 29810 }, { "epoch": 4.050392203470406, "grad_norm": 0.5816409587860107, "learning_rate": 2.8856156073980596e-05, "loss": 0.1663, "step": 29820 }, { "epoch": 4.051750483887399, "grad_norm": 0.5351368188858032, "learning_rate": 2.8820356586430595e-05, "loss": 0.1629, "step": 29830 }, { "epoch": 4.05310876430439, "grad_norm": 0.5784529447555542, "learning_rate": 2.8784570325738853e-05, "loss": 0.167, "step": 29840 }, { "epoch": 4.054467044721383, "grad_norm": 0.6685323119163513, "learning_rate": 2.8748797314254204e-05, "loss": 0.1585, "step": 29850 }, { "epoch": 4.0558253251383745, "grad_norm": 0.6005459427833557, "learning_rate": 2.8713037574317146e-05, "loss": 0.1642, "step": 29860 }, { "epoch": 4.057183605555367, "grad_norm": 1.6828547716140747, "learning_rate": 2.8677291128259947e-05, "loss": 0.1534, "step": 29870 }, { "epoch": 4.058541885972359, "grad_norm": 0.45910412073135376, "learning_rate": 2.864155799840657e-05, "loss": 0.166, "step": 29880 }, { "epoch": 4.059900166389351, "grad_norm": 0.8455206751823425, "learning_rate": 2.860583820707261e-05, "loss": 0.165, "step": 29890 }, { "epoch": 4.0612584468063435, "grad_norm": 0.4674511253833771, "learning_rate": 2.857013177656539e-05, "loss": 0.1646, "step": 29900 }, { "epoch": 4.062616727223335, "grad_norm": 0.4515868127346039, "learning_rate": 2.8534438729183844e-05, "loss": 0.1642, "step": 29910 }, { "epoch": 4.063975007640328, "grad_norm": 0.5765043497085571, "learning_rate": 2.8498759087218586e-05, "loss": 0.1652, "step": 29920 }, { "epoch": 4.065333288057319, "grad_norm": 0.5878356099128723, "learning_rate": 2.846309287295182e-05, "loss": 0.1767, "step": 29930 }, { "epoch": 4.066691568474312, "grad_norm": 0.6735464930534363, "learning_rate": 2.8427440108657388e-05, "loss": 0.1576, "step": 29940 }, { "epoch": 4.068049848891303, "grad_norm": 0.5343856811523438, "learning_rate": 2.8391800816600766e-05, "loss": 0.168, "step": 29950 }, { "epoch": 4.069408129308296, "grad_norm": 0.5110285878181458, "learning_rate": 2.8356175019038922e-05, "loss": 0.1528, "step": 29960 }, { "epoch": 4.070766409725287, "grad_norm": 0.5929447412490845, "learning_rate": 2.8320562738220514e-05, "loss": 0.1622, "step": 29970 }, { "epoch": 4.07212469014228, "grad_norm": 0.6369814872741699, "learning_rate": 2.828496399638564e-05, "loss": 0.1659, "step": 29980 }, { "epoch": 4.073482970559272, "grad_norm": 0.45320719480514526, "learning_rate": 2.824937881576603e-05, "loss": 0.163, "step": 29990 }, { "epoch": 4.074841250976264, "grad_norm": 0.9153100252151489, "learning_rate": 2.8213807218584943e-05, "loss": 0.1717, "step": 30000 }, { "epoch": 4.076199531393256, "grad_norm": 0.3995494246482849, "learning_rate": 2.8178249227057073e-05, "loss": 0.1573, "step": 30010 }, { "epoch": 4.077557811810248, "grad_norm": 0.4510035514831543, "learning_rate": 2.8142704863388725e-05, "loss": 0.179, "step": 30020 }, { "epoch": 4.0789160922272405, "grad_norm": 0.8538302779197693, "learning_rate": 2.8107174149777593e-05, "loss": 0.1728, "step": 30030 }, { "epoch": 4.080274372644232, "grad_norm": 0.5420547723770142, "learning_rate": 2.8071657108412926e-05, "loss": 0.1792, "step": 30040 }, { "epoch": 4.081632653061225, "grad_norm": 0.43045514822006226, "learning_rate": 2.8036153761475416e-05, "loss": 0.1515, "step": 30050 }, { "epoch": 4.082990933478216, "grad_norm": 1.8254318237304688, "learning_rate": 2.8000664131137146e-05, "loss": 0.1688, "step": 30060 }, { "epoch": 4.084349213895209, "grad_norm": 0.5328717827796936, "learning_rate": 2.7965188239561712e-05, "loss": 0.1639, "step": 30070 }, { "epoch": 4.085707494312201, "grad_norm": 0.5663987398147583, "learning_rate": 2.7929726108904075e-05, "loss": 0.181, "step": 30080 }, { "epoch": 4.087065774729193, "grad_norm": 0.4411056935787201, "learning_rate": 2.7894277761310643e-05, "loss": 0.1509, "step": 30090 }, { "epoch": 4.088424055146185, "grad_norm": 0.49102333188056946, "learning_rate": 2.7858843218919168e-05, "loss": 0.1783, "step": 30100 }, { "epoch": 4.089782335563177, "grad_norm": 0.7254617214202881, "learning_rate": 2.782342250385882e-05, "loss": 0.1508, "step": 30110 }, { "epoch": 4.091140615980169, "grad_norm": 0.5456966757774353, "learning_rate": 2.778801563825014e-05, "loss": 0.1561, "step": 30120 }, { "epoch": 4.092498896397161, "grad_norm": 0.5639074444770813, "learning_rate": 2.775262264420496e-05, "loss": 0.1658, "step": 30130 }, { "epoch": 4.093857176814153, "grad_norm": 0.5060279369354248, "learning_rate": 2.7717243543826544e-05, "loss": 0.1767, "step": 30140 }, { "epoch": 4.095215457231145, "grad_norm": 0.7700660228729248, "learning_rate": 2.7681878359209373e-05, "loss": 0.1696, "step": 30150 }, { "epoch": 4.0965737376481375, "grad_norm": 0.8763545155525208, "learning_rate": 2.7646527112439315e-05, "loss": 0.1666, "step": 30160 }, { "epoch": 4.09793201806513, "grad_norm": 0.4519575238227844, "learning_rate": 2.761118982559353e-05, "loss": 0.1634, "step": 30170 }, { "epoch": 4.099290298482122, "grad_norm": 0.5301604270935059, "learning_rate": 2.757586652074039e-05, "loss": 0.1644, "step": 30180 }, { "epoch": 4.100648578899114, "grad_norm": 0.5083214640617371, "learning_rate": 2.7540557219939634e-05, "loss": 0.168, "step": 30190 }, { "epoch": 4.102006859316106, "grad_norm": 2.6981632709503174, "learning_rate": 2.7505261945242168e-05, "loss": 0.1647, "step": 30200 }, { "epoch": 4.103365139733098, "grad_norm": 0.5181933045387268, "learning_rate": 2.746998071869018e-05, "loss": 0.1714, "step": 30210 }, { "epoch": 4.10472342015009, "grad_norm": 0.7825883626937866, "learning_rate": 2.7434713562317116e-05, "loss": 0.1618, "step": 30220 }, { "epoch": 4.106081700567082, "grad_norm": 0.5884459614753723, "learning_rate": 2.7399460498147555e-05, "loss": 0.1667, "step": 30230 }, { "epoch": 4.107439980984074, "grad_norm": 0.5161358118057251, "learning_rate": 2.7364221548197354e-05, "loss": 0.1619, "step": 30240 }, { "epoch": 4.108798261401066, "grad_norm": 0.8127596378326416, "learning_rate": 2.73289967344735e-05, "loss": 0.1698, "step": 30250 }, { "epoch": 4.110156541818059, "grad_norm": 0.6807304620742798, "learning_rate": 2.7293786078974192e-05, "loss": 0.1736, "step": 30260 }, { "epoch": 4.11151482223505, "grad_norm": 0.5027347803115845, "learning_rate": 2.7258589603688733e-05, "loss": 0.1702, "step": 30270 }, { "epoch": 4.112873102652043, "grad_norm": 0.4940623342990875, "learning_rate": 2.722340733059764e-05, "loss": 0.1632, "step": 30280 }, { "epoch": 4.1142313830690345, "grad_norm": 0.8470526337623596, "learning_rate": 2.7188239281672535e-05, "loss": 0.1576, "step": 30290 }, { "epoch": 4.115589663486027, "grad_norm": 0.5877329707145691, "learning_rate": 2.7153085478876106e-05, "loss": 0.1718, "step": 30300 }, { "epoch": 4.1169479439030185, "grad_norm": 0.5968334078788757, "learning_rate": 2.711794594416224e-05, "loss": 0.1591, "step": 30310 }, { "epoch": 4.118306224320011, "grad_norm": 0.5395082235336304, "learning_rate": 2.70828206994758e-05, "loss": 0.1651, "step": 30320 }, { "epoch": 4.119664504737003, "grad_norm": 0.5285475850105286, "learning_rate": 2.7047709766752814e-05, "loss": 0.1647, "step": 30330 }, { "epoch": 4.121022785153995, "grad_norm": 0.814047634601593, "learning_rate": 2.7012613167920374e-05, "loss": 0.1723, "step": 30340 }, { "epoch": 4.122381065570987, "grad_norm": 0.5822237133979797, "learning_rate": 2.6977530924896545e-05, "loss": 0.1624, "step": 30350 }, { "epoch": 4.123739345987979, "grad_norm": 0.6879976987838745, "learning_rate": 2.6942463059590496e-05, "loss": 0.1605, "step": 30360 }, { "epoch": 4.125097626404972, "grad_norm": 0.6605875492095947, "learning_rate": 2.690740959390236e-05, "loss": 0.1604, "step": 30370 }, { "epoch": 4.126455906821963, "grad_norm": 0.527746319770813, "learning_rate": 2.6872370549723334e-05, "loss": 0.165, "step": 30380 }, { "epoch": 4.127814187238956, "grad_norm": 0.6420402526855469, "learning_rate": 2.6837345948935606e-05, "loss": 0.1683, "step": 30390 }, { "epoch": 4.129172467655947, "grad_norm": 0.5550116300582886, "learning_rate": 2.6802335813412273e-05, "loss": 0.1697, "step": 30400 }, { "epoch": 4.13053074807294, "grad_norm": 0.5557520389556885, "learning_rate": 2.6767340165017495e-05, "loss": 0.1701, "step": 30410 }, { "epoch": 4.1318890284899314, "grad_norm": 0.5602233409881592, "learning_rate": 2.673235902560629e-05, "loss": 0.1707, "step": 30420 }, { "epoch": 4.133247308906924, "grad_norm": 0.5476872324943542, "learning_rate": 2.6697392417024702e-05, "loss": 0.1656, "step": 30430 }, { "epoch": 4.1346055893239155, "grad_norm": 0.46781355142593384, "learning_rate": 2.666244036110963e-05, "loss": 0.1534, "step": 30440 }, { "epoch": 4.135963869740908, "grad_norm": 0.48015445470809937, "learning_rate": 2.6627502879688924e-05, "loss": 0.172, "step": 30450 }, { "epoch": 4.1373221501579005, "grad_norm": 0.7135910987854004, "learning_rate": 2.6592579994581346e-05, "loss": 0.1747, "step": 30460 }, { "epoch": 4.138680430574892, "grad_norm": 1.0229312181472778, "learning_rate": 2.6557671727596483e-05, "loss": 0.1788, "step": 30470 }, { "epoch": 4.140038710991885, "grad_norm": 0.8014886379241943, "learning_rate": 2.6522778100534872e-05, "loss": 0.1664, "step": 30480 }, { "epoch": 4.141396991408876, "grad_norm": 0.9365777969360352, "learning_rate": 2.6487899135187815e-05, "loss": 0.1521, "step": 30490 }, { "epoch": 4.142755271825869, "grad_norm": 0.7152336239814758, "learning_rate": 2.6453034853337534e-05, "loss": 0.1684, "step": 30500 }, { "epoch": 4.14411355224286, "grad_norm": 0.5933672189712524, "learning_rate": 2.6418185276757075e-05, "loss": 0.1668, "step": 30510 }, { "epoch": 4.145471832659853, "grad_norm": 0.6173017621040344, "learning_rate": 2.638335042721024e-05, "loss": 0.1802, "step": 30520 }, { "epoch": 4.146830113076844, "grad_norm": 0.7069457173347473, "learning_rate": 2.6348530326451704e-05, "loss": 0.1652, "step": 30530 }, { "epoch": 4.148188393493837, "grad_norm": 0.8189671635627747, "learning_rate": 2.6313724996226875e-05, "loss": 0.1567, "step": 30540 }, { "epoch": 4.149546673910829, "grad_norm": 0.678489089012146, "learning_rate": 2.6278934458271997e-05, "loss": 0.1782, "step": 30550 }, { "epoch": 4.150904954327821, "grad_norm": 0.46090465784072876, "learning_rate": 2.6244158734313994e-05, "loss": 0.164, "step": 30560 }, { "epoch": 4.152263234744813, "grad_norm": 0.5694794654846191, "learning_rate": 2.620939784607063e-05, "loss": 0.1642, "step": 30570 }, { "epoch": 4.153621515161805, "grad_norm": 0.8122572302818298, "learning_rate": 2.617465181525035e-05, "loss": 0.1683, "step": 30580 }, { "epoch": 4.1549797955787975, "grad_norm": 1.0129097700119019, "learning_rate": 2.6139920663552307e-05, "loss": 0.171, "step": 30590 }, { "epoch": 4.156338075995789, "grad_norm": 0.5238208770751953, "learning_rate": 2.610520441266642e-05, "loss": 0.168, "step": 30600 }, { "epoch": 4.157696356412782, "grad_norm": 0.6048885583877563, "learning_rate": 2.607050308427324e-05, "loss": 0.1718, "step": 30610 }, { "epoch": 4.159054636829773, "grad_norm": 0.43715083599090576, "learning_rate": 2.6035816700044036e-05, "loss": 0.1583, "step": 30620 }, { "epoch": 4.160412917246766, "grad_norm": 0.5260584950447083, "learning_rate": 2.600114528164075e-05, "loss": 0.1692, "step": 30630 }, { "epoch": 4.161771197663757, "grad_norm": 0.6103253364562988, "learning_rate": 2.596648885071593e-05, "loss": 0.1663, "step": 30640 }, { "epoch": 4.16312947808075, "grad_norm": 0.4400300085544586, "learning_rate": 2.5931847428912842e-05, "loss": 0.1626, "step": 30650 }, { "epoch": 4.164487758497742, "grad_norm": 0.5935326218605042, "learning_rate": 2.5897221037865284e-05, "loss": 0.1711, "step": 30660 }, { "epoch": 4.165846038914734, "grad_norm": 0.641808807849884, "learning_rate": 2.5862609699197742e-05, "loss": 0.173, "step": 30670 }, { "epoch": 4.167204319331726, "grad_norm": 0.5049773454666138, "learning_rate": 2.58280134345253e-05, "loss": 0.1565, "step": 30680 }, { "epoch": 4.168562599748718, "grad_norm": 0.656670868396759, "learning_rate": 2.5793432265453553e-05, "loss": 0.1671, "step": 30690 }, { "epoch": 4.16992088016571, "grad_norm": 0.6015728116035461, "learning_rate": 2.5758866213578763e-05, "loss": 0.1643, "step": 30700 }, { "epoch": 4.171279160582702, "grad_norm": 0.5588725805282593, "learning_rate": 2.5724315300487677e-05, "loss": 0.1691, "step": 30710 }, { "epoch": 4.1726374409996945, "grad_norm": 0.5576444268226624, "learning_rate": 2.5689779547757652e-05, "loss": 0.168, "step": 30720 }, { "epoch": 4.173995721416686, "grad_norm": 0.565733015537262, "learning_rate": 2.565525897695651e-05, "loss": 0.1703, "step": 30730 }, { "epoch": 4.1753540018336786, "grad_norm": 0.7770469784736633, "learning_rate": 2.5620753609642645e-05, "loss": 0.1725, "step": 30740 }, { "epoch": 4.176712282250671, "grad_norm": 0.6189258098602295, "learning_rate": 2.5586263467364945e-05, "loss": 0.1679, "step": 30750 }, { "epoch": 4.178070562667663, "grad_norm": 0.5456731915473938, "learning_rate": 2.5551788571662762e-05, "loss": 0.1715, "step": 30760 }, { "epoch": 4.179428843084655, "grad_norm": 0.7505424618721008, "learning_rate": 2.5517328944065972e-05, "loss": 0.1705, "step": 30770 }, { "epoch": 4.180787123501647, "grad_norm": 0.611724853515625, "learning_rate": 2.5482884606094858e-05, "loss": 0.1793, "step": 30780 }, { "epoch": 4.182145403918639, "grad_norm": 0.9286504983901978, "learning_rate": 2.54484555792602e-05, "loss": 0.1662, "step": 30790 }, { "epoch": 4.183503684335631, "grad_norm": 0.8131747841835022, "learning_rate": 2.5414041885063232e-05, "loss": 0.1685, "step": 30800 }, { "epoch": 4.184861964752623, "grad_norm": 0.6744845509529114, "learning_rate": 2.537964354499554e-05, "loss": 0.1682, "step": 30810 }, { "epoch": 4.186220245169615, "grad_norm": 0.6201196312904358, "learning_rate": 2.5345260580539197e-05, "loss": 0.1659, "step": 30820 }, { "epoch": 4.187578525586607, "grad_norm": 0.5531781315803528, "learning_rate": 2.531089301316662e-05, "loss": 0.1669, "step": 30830 }, { "epoch": 4.1889368060036, "grad_norm": 0.5611926317214966, "learning_rate": 2.5276540864340636e-05, "loss": 0.1753, "step": 30840 }, { "epoch": 4.1902950864205915, "grad_norm": 0.9091290235519409, "learning_rate": 2.524220415551447e-05, "loss": 0.1752, "step": 30850 }, { "epoch": 4.191653366837584, "grad_norm": 0.5343112349510193, "learning_rate": 2.520788290813162e-05, "loss": 0.1705, "step": 30860 }, { "epoch": 4.1930116472545755, "grad_norm": 0.41048046946525574, "learning_rate": 2.5173577143626037e-05, "loss": 0.1728, "step": 30870 }, { "epoch": 4.194369927671568, "grad_norm": 0.6168935298919678, "learning_rate": 2.5139286883421896e-05, "loss": 0.1636, "step": 30880 }, { "epoch": 4.19572820808856, "grad_norm": 0.5662907958030701, "learning_rate": 2.5105012148933793e-05, "loss": 0.1683, "step": 30890 }, { "epoch": 4.197086488505552, "grad_norm": 0.6171367764472961, "learning_rate": 2.5070752961566522e-05, "loss": 0.1686, "step": 30900 }, { "epoch": 4.198444768922544, "grad_norm": 0.5312259793281555, "learning_rate": 2.5036509342715264e-05, "loss": 0.1694, "step": 30910 }, { "epoch": 4.199803049339536, "grad_norm": 0.6353703141212463, "learning_rate": 2.500228131376543e-05, "loss": 0.174, "step": 30920 }, { "epoch": 4.201161329756529, "grad_norm": 0.5192369222640991, "learning_rate": 2.4968068896092677e-05, "loss": 0.1629, "step": 30930 }, { "epoch": 4.20251961017352, "grad_norm": 0.7301083207130432, "learning_rate": 2.4933872111062965e-05, "loss": 0.1718, "step": 30940 }, { "epoch": 4.203877890590513, "grad_norm": 0.7821075916290283, "learning_rate": 2.4899690980032435e-05, "loss": 0.1733, "step": 30950 }, { "epoch": 4.205236171007504, "grad_norm": 0.7978521585464478, "learning_rate": 2.486552552434749e-05, "loss": 0.1613, "step": 30960 }, { "epoch": 4.206594451424497, "grad_norm": 0.4985819458961487, "learning_rate": 2.4831375765344757e-05, "loss": 0.1631, "step": 30970 }, { "epoch": 4.207952731841488, "grad_norm": 0.6599612236022949, "learning_rate": 2.4797241724350988e-05, "loss": 0.1741, "step": 30980 }, { "epoch": 4.209311012258481, "grad_norm": 0.5237391591072083, "learning_rate": 2.476312342268322e-05, "loss": 0.1596, "step": 30990 }, { "epoch": 4.2106692926754725, "grad_norm": 0.5528895854949951, "learning_rate": 2.4729020881648563e-05, "loss": 0.1648, "step": 31000 }, { "epoch": 4.212027573092465, "grad_norm": 3.066981792449951, "learning_rate": 2.4694934122544345e-05, "loss": 0.178, "step": 31010 }, { "epoch": 4.213385853509457, "grad_norm": 0.6425188779830933, "learning_rate": 2.4660863166658037e-05, "loss": 0.1688, "step": 31020 }, { "epoch": 4.214744133926449, "grad_norm": 0.5000406503677368, "learning_rate": 2.46268080352672e-05, "loss": 0.1642, "step": 31030 }, { "epoch": 4.216102414343442, "grad_norm": 0.501499593257904, "learning_rate": 2.4592768749639576e-05, "loss": 0.171, "step": 31040 }, { "epoch": 4.217460694760433, "grad_norm": 0.6208752989768982, "learning_rate": 2.455874533103293e-05, "loss": 0.1664, "step": 31050 }, { "epoch": 4.218818975177426, "grad_norm": 0.5702804923057556, "learning_rate": 2.452473780069519e-05, "loss": 0.1682, "step": 31060 }, { "epoch": 4.220177255594417, "grad_norm": 0.5080788135528564, "learning_rate": 2.4490746179864314e-05, "loss": 0.159, "step": 31070 }, { "epoch": 4.22153553601141, "grad_norm": 0.7375909686088562, "learning_rate": 2.4456770489768356e-05, "loss": 0.1687, "step": 31080 }, { "epoch": 4.222893816428401, "grad_norm": 0.5774389505386353, "learning_rate": 2.442281075162542e-05, "loss": 0.1705, "step": 31090 }, { "epoch": 4.224252096845394, "grad_norm": 0.6072754263877869, "learning_rate": 2.4388866986643604e-05, "loss": 0.1802, "step": 31100 }, { "epoch": 4.225610377262385, "grad_norm": 0.5900959372520447, "learning_rate": 2.4354939216021106e-05, "loss": 0.1681, "step": 31110 }, { "epoch": 4.226968657679378, "grad_norm": 0.8211285471916199, "learning_rate": 2.432102746094605e-05, "loss": 0.1804, "step": 31120 }, { "epoch": 4.22832693809637, "grad_norm": 0.5616700649261475, "learning_rate": 2.428713174259662e-05, "loss": 0.1632, "step": 31130 }, { "epoch": 4.229685218513362, "grad_norm": 0.697199285030365, "learning_rate": 2.4253252082140982e-05, "loss": 0.1656, "step": 31140 }, { "epoch": 4.2310434989303545, "grad_norm": 0.45483529567718506, "learning_rate": 2.4219388500737223e-05, "loss": 0.1653, "step": 31150 }, { "epoch": 4.232401779347346, "grad_norm": 0.5100761651992798, "learning_rate": 2.418554101953346e-05, "loss": 0.1719, "step": 31160 }, { "epoch": 4.233760059764339, "grad_norm": 0.6037428379058838, "learning_rate": 2.4151709659667683e-05, "loss": 0.1596, "step": 31170 }, { "epoch": 4.23511834018133, "grad_norm": 0.4498038589954376, "learning_rate": 2.4117894442267864e-05, "loss": 0.1678, "step": 31180 }, { "epoch": 4.236476620598323, "grad_norm": 0.6231310963630676, "learning_rate": 2.4084095388451893e-05, "loss": 0.1624, "step": 31190 }, { "epoch": 4.237834901015314, "grad_norm": 0.5154946446418762, "learning_rate": 2.4050312519327527e-05, "loss": 0.1709, "step": 31200 }, { "epoch": 4.239193181432307, "grad_norm": 0.8087997436523438, "learning_rate": 2.401654585599247e-05, "loss": 0.166, "step": 31210 }, { "epoch": 4.240551461849299, "grad_norm": 0.49772873520851135, "learning_rate": 2.3982795419534243e-05, "loss": 0.159, "step": 31220 }, { "epoch": 4.241909742266291, "grad_norm": 0.6828603744506836, "learning_rate": 2.39490612310303e-05, "loss": 0.1634, "step": 31230 }, { "epoch": 4.243268022683283, "grad_norm": 0.446718692779541, "learning_rate": 2.391534331154788e-05, "loss": 0.1497, "step": 31240 }, { "epoch": 4.244626303100275, "grad_norm": 0.5224425196647644, "learning_rate": 2.3881641682144107e-05, "loss": 0.1768, "step": 31250 }, { "epoch": 4.245984583517267, "grad_norm": 0.5529202818870544, "learning_rate": 2.3847956363865946e-05, "loss": 0.1617, "step": 31260 }, { "epoch": 4.247342863934259, "grad_norm": 0.7164201140403748, "learning_rate": 2.3814287377750112e-05, "loss": 0.1563, "step": 31270 }, { "epoch": 4.2487011443512515, "grad_norm": 0.4769268035888672, "learning_rate": 2.3780634744823193e-05, "loss": 0.1477, "step": 31280 }, { "epoch": 4.250059424768243, "grad_norm": 1.210301399230957, "learning_rate": 2.3746998486101494e-05, "loss": 0.1691, "step": 31290 }, { "epoch": 4.2514177051852355, "grad_norm": 0.7250655293464661, "learning_rate": 2.3713378622591153e-05, "loss": 0.163, "step": 31300 }, { "epoch": 4.252775985602227, "grad_norm": 0.5009139180183411, "learning_rate": 2.3679775175288056e-05, "loss": 0.1721, "step": 31310 }, { "epoch": 4.25413426601922, "grad_norm": 0.5932921767234802, "learning_rate": 2.3646188165177796e-05, "loss": 0.1646, "step": 31320 }, { "epoch": 4.255492546436212, "grad_norm": 0.5983490347862244, "learning_rate": 2.3612617613235772e-05, "loss": 0.1704, "step": 31330 }, { "epoch": 4.256850826853204, "grad_norm": 0.618791401386261, "learning_rate": 2.3579063540427022e-05, "loss": 0.1672, "step": 31340 }, { "epoch": 4.258209107270196, "grad_norm": 0.6889152526855469, "learning_rate": 2.3545525967706372e-05, "loss": 0.166, "step": 31350 }, { "epoch": 4.259567387687188, "grad_norm": 0.5493060350418091, "learning_rate": 2.351200491601827e-05, "loss": 0.1646, "step": 31360 }, { "epoch": 4.26092566810418, "grad_norm": 0.5521674752235413, "learning_rate": 2.3478500406296906e-05, "loss": 0.1673, "step": 31370 }, { "epoch": 4.262283948521172, "grad_norm": 0.958859384059906, "learning_rate": 2.344501245946613e-05, "loss": 0.1763, "step": 31380 }, { "epoch": 4.263642228938164, "grad_norm": 0.5471612215042114, "learning_rate": 2.3411541096439404e-05, "loss": 0.1587, "step": 31390 }, { "epoch": 4.265000509355156, "grad_norm": 0.5544211268424988, "learning_rate": 2.3378086338119897e-05, "loss": 0.1619, "step": 31400 }, { "epoch": 4.266358789772148, "grad_norm": 0.3699759840965271, "learning_rate": 2.3344648205400345e-05, "loss": 0.1603, "step": 31410 }, { "epoch": 4.267717070189141, "grad_norm": 0.8980101346969604, "learning_rate": 2.3311226719163138e-05, "loss": 0.1746, "step": 31420 }, { "epoch": 4.2690753506061325, "grad_norm": 1.098261833190918, "learning_rate": 2.3277821900280288e-05, "loss": 0.1803, "step": 31430 }, { "epoch": 4.270433631023125, "grad_norm": 0.49599990248680115, "learning_rate": 2.3244433769613344e-05, "loss": 0.155, "step": 31440 }, { "epoch": 4.271791911440117, "grad_norm": 0.5414462685585022, "learning_rate": 2.3211062348013496e-05, "loss": 0.1585, "step": 31450 }, { "epoch": 4.273150191857109, "grad_norm": 0.6064419746398926, "learning_rate": 2.3177707656321425e-05, "loss": 0.175, "step": 31460 }, { "epoch": 4.274508472274101, "grad_norm": 0.5545496940612793, "learning_rate": 2.3144369715367425e-05, "loss": 0.1642, "step": 31470 }, { "epoch": 4.275866752691093, "grad_norm": 0.5725846886634827, "learning_rate": 2.3111048545971336e-05, "loss": 0.1673, "step": 31480 }, { "epoch": 4.277225033108085, "grad_norm": 0.532606303691864, "learning_rate": 2.3077744168942444e-05, "loss": 0.1674, "step": 31490 }, { "epoch": 4.278583313525077, "grad_norm": 0.7247141003608704, "learning_rate": 2.3044456605079655e-05, "loss": 0.1595, "step": 31500 }, { "epoch": 4.27994159394207, "grad_norm": 0.8863220810890198, "learning_rate": 2.3011185875171277e-05, "loss": 0.163, "step": 31510 }, { "epoch": 4.281299874359061, "grad_norm": 0.6545773148536682, "learning_rate": 2.2977931999995182e-05, "loss": 0.1682, "step": 31520 }, { "epoch": 4.282658154776054, "grad_norm": 0.656083345413208, "learning_rate": 2.2944695000318654e-05, "loss": 0.1617, "step": 31530 }, { "epoch": 4.284016435193045, "grad_norm": 0.5452845096588135, "learning_rate": 2.2911474896898483e-05, "loss": 0.165, "step": 31540 }, { "epoch": 4.285374715610038, "grad_norm": 0.9341265559196472, "learning_rate": 2.2878271710480907e-05, "loss": 0.1636, "step": 31550 }, { "epoch": 4.2867329960270295, "grad_norm": 0.6055859923362732, "learning_rate": 2.2845085461801558e-05, "loss": 0.1662, "step": 31560 }, { "epoch": 4.288091276444022, "grad_norm": 0.5085821747779846, "learning_rate": 2.281191617158554e-05, "loss": 0.171, "step": 31570 }, { "epoch": 4.289449556861014, "grad_norm": 0.5167649984359741, "learning_rate": 2.2778763860547316e-05, "loss": 0.1541, "step": 31580 }, { "epoch": 4.290807837278006, "grad_norm": 0.6356558799743652, "learning_rate": 2.2745628549390775e-05, "loss": 0.1674, "step": 31590 }, { "epoch": 4.292166117694999, "grad_norm": 0.7117891907691956, "learning_rate": 2.2712510258809223e-05, "loss": 0.1739, "step": 31600 }, { "epoch": 4.29352439811199, "grad_norm": 0.5158246755599976, "learning_rate": 2.2679409009485254e-05, "loss": 0.1675, "step": 31610 }, { "epoch": 4.294882678528983, "grad_norm": 1.4153114557266235, "learning_rate": 2.2646324822090896e-05, "loss": 0.1705, "step": 31620 }, { "epoch": 4.296240958945974, "grad_norm": 0.6010976433753967, "learning_rate": 2.2613257717287474e-05, "loss": 0.1711, "step": 31630 }, { "epoch": 4.297599239362967, "grad_norm": 0.7498173713684082, "learning_rate": 2.258020771572566e-05, "loss": 0.1554, "step": 31640 }, { "epoch": 4.298957519779958, "grad_norm": 0.9472950100898743, "learning_rate": 2.254717483804547e-05, "loss": 0.1626, "step": 31650 }, { "epoch": 4.300315800196951, "grad_norm": 0.47160205245018005, "learning_rate": 2.2514159104876177e-05, "loss": 0.1669, "step": 31660 }, { "epoch": 4.301674080613942, "grad_norm": 0.5426551699638367, "learning_rate": 2.248116053683639e-05, "loss": 0.166, "step": 31670 }, { "epoch": 4.303032361030935, "grad_norm": 0.4763004183769226, "learning_rate": 2.2448179154533967e-05, "loss": 0.1636, "step": 31680 }, { "epoch": 4.304390641447927, "grad_norm": 0.5501003861427307, "learning_rate": 2.2415214978566068e-05, "loss": 0.1638, "step": 31690 }, { "epoch": 4.305748921864919, "grad_norm": 0.5634255409240723, "learning_rate": 2.2382268029519054e-05, "loss": 0.1763, "step": 31700 }, { "epoch": 4.3071072022819115, "grad_norm": 0.5280802845954895, "learning_rate": 2.2349338327968582e-05, "loss": 0.1638, "step": 31710 }, { "epoch": 4.308465482698903, "grad_norm": 0.5604649186134338, "learning_rate": 2.2316425894479535e-05, "loss": 0.1615, "step": 31720 }, { "epoch": 4.3098237631158955, "grad_norm": 0.595178484916687, "learning_rate": 2.2283530749605954e-05, "loss": 0.1607, "step": 31730 }, { "epoch": 4.311182043532887, "grad_norm": 0.7753090858459473, "learning_rate": 2.2250652913891157e-05, "loss": 0.1691, "step": 31740 }, { "epoch": 4.31254032394988, "grad_norm": 0.9635034203529358, "learning_rate": 2.221779240786758e-05, "loss": 0.1634, "step": 31750 }, { "epoch": 4.313898604366871, "grad_norm": 0.8232958316802979, "learning_rate": 2.218494925205691e-05, "loss": 0.1623, "step": 31760 }, { "epoch": 4.315256884783864, "grad_norm": 0.5422998070716858, "learning_rate": 2.2152123466969955e-05, "loss": 0.1657, "step": 31770 }, { "epoch": 4.316615165200855, "grad_norm": 0.6714988350868225, "learning_rate": 2.2119315073106663e-05, "loss": 0.1635, "step": 31780 }, { "epoch": 4.317973445617848, "grad_norm": 0.5279560685157776, "learning_rate": 2.208652409095617e-05, "loss": 0.1663, "step": 31790 }, { "epoch": 4.31933172603484, "grad_norm": 0.9500426650047302, "learning_rate": 2.2053750540996683e-05, "loss": 0.1803, "step": 31800 }, { "epoch": 4.320690006451832, "grad_norm": 0.8492748737335205, "learning_rate": 2.2020994443695558e-05, "loss": 0.1682, "step": 31810 }, { "epoch": 4.322048286868824, "grad_norm": 0.6694868206977844, "learning_rate": 2.1988255819509256e-05, "loss": 0.1663, "step": 31820 }, { "epoch": 4.323406567285816, "grad_norm": 0.6423572301864624, "learning_rate": 2.1955534688883283e-05, "loss": 0.165, "step": 31830 }, { "epoch": 4.3247648477028084, "grad_norm": 0.5355410575866699, "learning_rate": 2.1922831072252288e-05, "loss": 0.1767, "step": 31840 }, { "epoch": 4.3261231281198, "grad_norm": 0.5823588371276855, "learning_rate": 2.18901449900399e-05, "loss": 0.1704, "step": 31850 }, { "epoch": 4.3274814085367925, "grad_norm": 0.5742313861846924, "learning_rate": 2.1857476462658882e-05, "loss": 0.1692, "step": 31860 }, { "epoch": 4.328839688953784, "grad_norm": 0.4707489013671875, "learning_rate": 2.182482551051096e-05, "loss": 0.1725, "step": 31870 }, { "epoch": 4.330197969370777, "grad_norm": 0.7265713214874268, "learning_rate": 2.1792192153986923e-05, "loss": 0.1741, "step": 31880 }, { "epoch": 4.331556249787769, "grad_norm": 0.6107183694839478, "learning_rate": 2.17595764134666e-05, "loss": 0.1623, "step": 31890 }, { "epoch": 4.332914530204761, "grad_norm": 0.5376812815666199, "learning_rate": 2.172697830931873e-05, "loss": 0.1615, "step": 31900 }, { "epoch": 4.334272810621753, "grad_norm": 1.0954182147979736, "learning_rate": 2.1694397861901146e-05, "loss": 0.171, "step": 31910 }, { "epoch": 4.335631091038745, "grad_norm": 0.6104332208633423, "learning_rate": 2.166183509156055e-05, "loss": 0.1675, "step": 31920 }, { "epoch": 4.336989371455737, "grad_norm": 0.558191180229187, "learning_rate": 2.162929001863268e-05, "loss": 0.1703, "step": 31930 }, { "epoch": 4.338347651872729, "grad_norm": 0.6322271823883057, "learning_rate": 2.1596762663442218e-05, "loss": 0.1714, "step": 31940 }, { "epoch": 4.339705932289721, "grad_norm": 0.5672459602355957, "learning_rate": 2.1564253046302718e-05, "loss": 0.156, "step": 31950 }, { "epoch": 4.341064212706713, "grad_norm": 0.6341519951820374, "learning_rate": 2.1531761187516736e-05, "loss": 0.1559, "step": 31960 }, { "epoch": 4.342422493123705, "grad_norm": 0.6674389243125916, "learning_rate": 2.149928710737566e-05, "loss": 0.1723, "step": 31970 }, { "epoch": 4.343780773540697, "grad_norm": 0.5246990919113159, "learning_rate": 2.146683082615984e-05, "loss": 0.1644, "step": 31980 }, { "epoch": 4.3451390539576895, "grad_norm": 0.549720048904419, "learning_rate": 2.14343923641385e-05, "loss": 0.155, "step": 31990 }, { "epoch": 4.346497334374682, "grad_norm": 1.2151511907577515, "learning_rate": 2.140197174156969e-05, "loss": 0.1653, "step": 32000 }, { "epoch": 4.347855614791674, "grad_norm": 0.4734196066856384, "learning_rate": 2.1369568978700382e-05, "loss": 0.1654, "step": 32010 }, { "epoch": 4.349213895208666, "grad_norm": 1.0287611484527588, "learning_rate": 2.1337184095766327e-05, "loss": 0.1813, "step": 32020 }, { "epoch": 4.350572175625658, "grad_norm": 0.6608217358589172, "learning_rate": 2.1304817112992182e-05, "loss": 0.1576, "step": 32030 }, { "epoch": 4.35193045604265, "grad_norm": 0.5484539866447449, "learning_rate": 2.127246805059135e-05, "loss": 0.168, "step": 32040 }, { "epoch": 4.353288736459642, "grad_norm": 0.42511603236198425, "learning_rate": 2.124013692876611e-05, "loss": 0.1644, "step": 32050 }, { "epoch": 4.354647016876634, "grad_norm": 0.6009181141853333, "learning_rate": 2.1207823767707512e-05, "loss": 0.1596, "step": 32060 }, { "epoch": 4.356005297293626, "grad_norm": 0.6520244479179382, "learning_rate": 2.1175528587595362e-05, "loss": 0.1636, "step": 32070 }, { "epoch": 4.357363577710618, "grad_norm": 0.5215667486190796, "learning_rate": 2.1143251408598293e-05, "loss": 0.1615, "step": 32080 }, { "epoch": 4.358721858127611, "grad_norm": 1.3559465408325195, "learning_rate": 2.1110992250873625e-05, "loss": 0.1646, "step": 32090 }, { "epoch": 4.360080138544602, "grad_norm": 0.8427206873893738, "learning_rate": 2.1078751134567482e-05, "loss": 0.164, "step": 32100 }, { "epoch": 4.361438418961595, "grad_norm": 0.5089706182479858, "learning_rate": 2.104652807981472e-05, "loss": 0.1676, "step": 32110 }, { "epoch": 4.3627966993785865, "grad_norm": 0.717731773853302, "learning_rate": 2.101432310673886e-05, "loss": 0.1641, "step": 32120 }, { "epoch": 4.364154979795579, "grad_norm": 0.5962165594100952, "learning_rate": 2.098213623545221e-05, "loss": 0.1668, "step": 32130 }, { "epoch": 4.365513260212571, "grad_norm": 0.6332141757011414, "learning_rate": 2.094996748605568e-05, "loss": 0.1767, "step": 32140 }, { "epoch": 4.366871540629563, "grad_norm": 0.4057539999485016, "learning_rate": 2.0917816878638964e-05, "loss": 0.1684, "step": 32150 }, { "epoch": 4.368229821046555, "grad_norm": 0.7728405594825745, "learning_rate": 2.0885684433280333e-05, "loss": 0.1691, "step": 32160 }, { "epoch": 4.369588101463547, "grad_norm": 0.4701368510723114, "learning_rate": 2.085357017004678e-05, "loss": 0.1635, "step": 32170 }, { "epoch": 4.37094638188054, "grad_norm": 0.7179633975028992, "learning_rate": 2.082147410899394e-05, "loss": 0.1725, "step": 32180 }, { "epoch": 4.372304662297531, "grad_norm": 0.5675747990608215, "learning_rate": 2.0789396270166017e-05, "loss": 0.1768, "step": 32190 }, { "epoch": 4.373662942714524, "grad_norm": 0.6417014002799988, "learning_rate": 2.0757336673595928e-05, "loss": 0.1638, "step": 32200 }, { "epoch": 4.375021223131515, "grad_norm": 0.5225614309310913, "learning_rate": 2.0725295339305108e-05, "loss": 0.1772, "step": 32210 }, { "epoch": 4.376379503548508, "grad_norm": 0.4688889980316162, "learning_rate": 2.069327228730365e-05, "loss": 0.1739, "step": 32220 }, { "epoch": 4.377737783965499, "grad_norm": 0.5557004809379578, "learning_rate": 2.0661267537590224e-05, "loss": 0.1609, "step": 32230 }, { "epoch": 4.379096064382492, "grad_norm": 1.0586512088775635, "learning_rate": 2.0629281110152014e-05, "loss": 0.1724, "step": 32240 }, { "epoch": 4.3804543447994835, "grad_norm": 0.5625683665275574, "learning_rate": 2.059731302496485e-05, "loss": 0.155, "step": 32250 }, { "epoch": 4.381812625216476, "grad_norm": 0.7775335311889648, "learning_rate": 2.0565363301993013e-05, "loss": 0.169, "step": 32260 }, { "epoch": 4.3831709056334685, "grad_norm": 0.46491679549217224, "learning_rate": 2.053343196118938e-05, "loss": 0.1675, "step": 32270 }, { "epoch": 4.38452918605046, "grad_norm": 0.5385048985481262, "learning_rate": 2.0501519022495357e-05, "loss": 0.1645, "step": 32280 }, { "epoch": 4.3858874664674525, "grad_norm": 0.8897777795791626, "learning_rate": 2.0469624505840795e-05, "loss": 0.1671, "step": 32290 }, { "epoch": 4.387245746884444, "grad_norm": 0.45839646458625793, "learning_rate": 2.0437748431144104e-05, "loss": 0.16, "step": 32300 }, { "epoch": 4.388604027301437, "grad_norm": 0.6626864671707153, "learning_rate": 2.0405890818312123e-05, "loss": 0.169, "step": 32310 }, { "epoch": 4.389962307718428, "grad_norm": 0.6356166005134583, "learning_rate": 2.0374051687240227e-05, "loss": 0.1648, "step": 32320 }, { "epoch": 4.391320588135421, "grad_norm": 0.6504536271095276, "learning_rate": 2.034223105781216e-05, "loss": 0.1576, "step": 32330 }, { "epoch": 4.392678868552412, "grad_norm": 0.6453339457511902, "learning_rate": 2.031042894990019e-05, "loss": 0.1596, "step": 32340 }, { "epoch": 4.394037148969405, "grad_norm": 1.211305022239685, "learning_rate": 2.0278645383365014e-05, "loss": 0.1643, "step": 32350 }, { "epoch": 4.395395429386397, "grad_norm": 0.581154465675354, "learning_rate": 2.0246880378055676e-05, "loss": 0.167, "step": 32360 }, { "epoch": 4.396753709803389, "grad_norm": 0.5713620185852051, "learning_rate": 2.0215133953809723e-05, "loss": 0.1771, "step": 32370 }, { "epoch": 4.398111990220381, "grad_norm": 0.8235989212989807, "learning_rate": 2.0183406130453015e-05, "loss": 0.1685, "step": 32380 }, { "epoch": 4.399470270637373, "grad_norm": 0.669984757900238, "learning_rate": 2.015169692779985e-05, "loss": 0.1655, "step": 32390 }, { "epoch": 4.400828551054365, "grad_norm": 0.9109957218170166, "learning_rate": 2.0120006365652905e-05, "loss": 0.1642, "step": 32400 }, { "epoch": 4.402186831471357, "grad_norm": 0.5684950351715088, "learning_rate": 2.0088334463803148e-05, "loss": 0.1601, "step": 32410 }, { "epoch": 4.4035451118883495, "grad_norm": 0.7670163512229919, "learning_rate": 2.005668124202998e-05, "loss": 0.1662, "step": 32420 }, { "epoch": 4.404903392305341, "grad_norm": 0.544593334197998, "learning_rate": 2.0025046720101053e-05, "loss": 0.1711, "step": 32430 }, { "epoch": 4.406261672722334, "grad_norm": 0.5127115845680237, "learning_rate": 1.99934309177724e-05, "loss": 0.1776, "step": 32440 }, { "epoch": 4.407619953139326, "grad_norm": 0.6263406276702881, "learning_rate": 1.9961833854788363e-05, "loss": 0.1736, "step": 32450 }, { "epoch": 4.408978233556318, "grad_norm": 0.5455638766288757, "learning_rate": 1.9930255550881533e-05, "loss": 0.1726, "step": 32460 }, { "epoch": 4.41033651397331, "grad_norm": 0.7321581840515137, "learning_rate": 1.989869602577284e-05, "loss": 0.1608, "step": 32470 }, { "epoch": 4.411694794390302, "grad_norm": 0.669871985912323, "learning_rate": 1.9867155299171436e-05, "loss": 0.1682, "step": 32480 }, { "epoch": 4.413053074807294, "grad_norm": 0.4270327687263489, "learning_rate": 1.983563339077479e-05, "loss": 0.1649, "step": 32490 }, { "epoch": 4.414411355224286, "grad_norm": 0.7234973311424255, "learning_rate": 1.9804130320268556e-05, "loss": 0.1703, "step": 32500 }, { "epoch": 4.415769635641278, "grad_norm": 0.7450284957885742, "learning_rate": 1.977264610732667e-05, "loss": 0.1604, "step": 32510 }, { "epoch": 4.41712791605827, "grad_norm": 0.7085921764373779, "learning_rate": 1.9741180771611296e-05, "loss": 0.173, "step": 32520 }, { "epoch": 4.418486196475262, "grad_norm": 0.6125777959823608, "learning_rate": 1.9709734332772755e-05, "loss": 0.1619, "step": 32530 }, { "epoch": 4.419844476892254, "grad_norm": 1.6051397323608398, "learning_rate": 1.9678306810449642e-05, "loss": 0.1684, "step": 32540 }, { "epoch": 4.4212027573092465, "grad_norm": 0.5509023070335388, "learning_rate": 1.9646898224268657e-05, "loss": 0.1724, "step": 32550 }, { "epoch": 4.422561037726239, "grad_norm": 0.5400683879852295, "learning_rate": 1.9615508593844727e-05, "loss": 0.1549, "step": 32560 }, { "epoch": 4.423919318143231, "grad_norm": 0.42700257897377014, "learning_rate": 1.9584137938780967e-05, "loss": 0.1654, "step": 32570 }, { "epoch": 4.425277598560223, "grad_norm": 0.48438775539398193, "learning_rate": 1.955278627866855e-05, "loss": 0.1729, "step": 32580 }, { "epoch": 4.426635878977215, "grad_norm": 0.5966241359710693, "learning_rate": 1.9521453633086883e-05, "loss": 0.1651, "step": 32590 }, { "epoch": 4.427994159394207, "grad_norm": 0.6944745779037476, "learning_rate": 1.9490140021603427e-05, "loss": 0.1656, "step": 32600 }, { "epoch": 4.429352439811199, "grad_norm": 0.7081323862075806, "learning_rate": 1.945884546377379e-05, "loss": 0.1688, "step": 32610 }, { "epoch": 4.430710720228191, "grad_norm": 0.7327597141265869, "learning_rate": 1.9427569979141703e-05, "loss": 0.1709, "step": 32620 }, { "epoch": 4.432069000645183, "grad_norm": 0.5223390460014343, "learning_rate": 1.9396313587238925e-05, "loss": 0.1662, "step": 32630 }, { "epoch": 4.433427281062175, "grad_norm": 0.6429123282432556, "learning_rate": 1.9365076307585344e-05, "loss": 0.1637, "step": 32640 }, { "epoch": 4.434785561479167, "grad_norm": 0.44571611285209656, "learning_rate": 1.9333858159688885e-05, "loss": 0.1669, "step": 32650 }, { "epoch": 4.436143841896159, "grad_norm": 0.590838611125946, "learning_rate": 1.9302659163045543e-05, "loss": 0.1635, "step": 32660 }, { "epoch": 4.437502122313152, "grad_norm": 0.4490809738636017, "learning_rate": 1.927147933713932e-05, "loss": 0.1681, "step": 32670 }, { "epoch": 4.4388604027301435, "grad_norm": 0.5307814478874207, "learning_rate": 1.924031870144229e-05, "loss": 0.1757, "step": 32680 }, { "epoch": 4.440218683147136, "grad_norm": 0.5582832098007202, "learning_rate": 1.920917727541453e-05, "loss": 0.1492, "step": 32690 }, { "epoch": 4.441576963564128, "grad_norm": 0.5961710214614868, "learning_rate": 1.9178055078504083e-05, "loss": 0.164, "step": 32700 }, { "epoch": 4.44293524398112, "grad_norm": 0.6558330655097961, "learning_rate": 1.9146952130147043e-05, "loss": 0.1686, "step": 32710 }, { "epoch": 4.444293524398112, "grad_norm": 1.052640438079834, "learning_rate": 1.911586844976742e-05, "loss": 0.1742, "step": 32720 }, { "epoch": 4.445651804815104, "grad_norm": 0.596874475479126, "learning_rate": 1.9084804056777243e-05, "loss": 0.1731, "step": 32730 }, { "epoch": 4.447010085232096, "grad_norm": 0.6708084940910339, "learning_rate": 1.905375897057649e-05, "loss": 0.173, "step": 32740 }, { "epoch": 4.448368365649088, "grad_norm": 0.5713144540786743, "learning_rate": 1.9022733210553035e-05, "loss": 0.1601, "step": 32750 }, { "epoch": 4.449726646066081, "grad_norm": 0.5117421746253967, "learning_rate": 1.8991726796082747e-05, "loss": 0.1687, "step": 32760 }, { "epoch": 4.451084926483072, "grad_norm": 1.153026819229126, "learning_rate": 1.8960739746529354e-05, "loss": 0.1595, "step": 32770 }, { "epoch": 4.452443206900065, "grad_norm": 1.223427653312683, "learning_rate": 1.892977208124453e-05, "loss": 0.1602, "step": 32780 }, { "epoch": 4.453801487317056, "grad_norm": 0.6640663146972656, "learning_rate": 1.8898823819567845e-05, "loss": 0.1602, "step": 32790 }, { "epoch": 4.455159767734049, "grad_norm": 0.5294244289398193, "learning_rate": 1.8867894980826717e-05, "loss": 0.1541, "step": 32800 }, { "epoch": 4.4565180481510405, "grad_norm": 1.0433814525604248, "learning_rate": 1.8836985584336482e-05, "loss": 0.1687, "step": 32810 }, { "epoch": 4.457876328568033, "grad_norm": 0.6480174660682678, "learning_rate": 1.8806095649400268e-05, "loss": 0.1872, "step": 32820 }, { "epoch": 4.4592346089850246, "grad_norm": 0.8383079171180725, "learning_rate": 1.8775225195309136e-05, "loss": 0.1634, "step": 32830 }, { "epoch": 4.460592889402017, "grad_norm": 0.7397571802139282, "learning_rate": 1.8744374241341888e-05, "loss": 0.1701, "step": 32840 }, { "epoch": 4.4619511698190095, "grad_norm": 0.41833463311195374, "learning_rate": 1.8713542806765216e-05, "loss": 0.1739, "step": 32850 }, { "epoch": 4.463309450236001, "grad_norm": 0.7098782658576965, "learning_rate": 1.868273091083362e-05, "loss": 0.1649, "step": 32860 }, { "epoch": 4.464667730652994, "grad_norm": 0.6409628391265869, "learning_rate": 1.8651938572789335e-05, "loss": 0.1622, "step": 32870 }, { "epoch": 4.466026011069985, "grad_norm": 0.731085479259491, "learning_rate": 1.8621165811862464e-05, "loss": 0.1745, "step": 32880 }, { "epoch": 4.467384291486978, "grad_norm": 0.43102625012397766, "learning_rate": 1.859041264727081e-05, "loss": 0.1573, "step": 32890 }, { "epoch": 4.468742571903969, "grad_norm": 0.7829004526138306, "learning_rate": 1.855967909821999e-05, "loss": 0.1582, "step": 32900 }, { "epoch": 4.470100852320962, "grad_norm": 0.5068439841270447, "learning_rate": 1.8528965183903368e-05, "loss": 0.1629, "step": 32910 }, { "epoch": 4.471459132737953, "grad_norm": 2.2748525142669678, "learning_rate": 1.8498270923501998e-05, "loss": 0.1752, "step": 32920 }, { "epoch": 4.472817413154946, "grad_norm": 0.7181318402290344, "learning_rate": 1.8467596336184732e-05, "loss": 0.172, "step": 32930 }, { "epoch": 4.474175693571938, "grad_norm": 0.5548234581947327, "learning_rate": 1.8436941441108062e-05, "loss": 0.171, "step": 32940 }, { "epoch": 4.47553397398893, "grad_norm": 0.5120195150375366, "learning_rate": 1.8406306257416243e-05, "loss": 0.1644, "step": 32950 }, { "epoch": 4.476892254405922, "grad_norm": 0.44900014996528625, "learning_rate": 1.8375690804241175e-05, "loss": 0.1636, "step": 32960 }, { "epoch": 4.478250534822914, "grad_norm": 0.5695343613624573, "learning_rate": 1.8345095100702457e-05, "loss": 0.1595, "step": 32970 }, { "epoch": 4.4796088152399065, "grad_norm": 0.899222195148468, "learning_rate": 1.8314519165907385e-05, "loss": 0.1742, "step": 32980 }, { "epoch": 4.480967095656898, "grad_norm": 0.4434829652309418, "learning_rate": 1.8283963018950833e-05, "loss": 0.1676, "step": 32990 }, { "epoch": 4.482325376073891, "grad_norm": 0.464828222990036, "learning_rate": 1.8253426678915398e-05, "loss": 0.1663, "step": 33000 }, { "epoch": 4.483683656490882, "grad_norm": 0.5626402497291565, "learning_rate": 1.822291016487123e-05, "loss": 0.1578, "step": 33010 }, { "epoch": 4.485041936907875, "grad_norm": 3.5070629119873047, "learning_rate": 1.8192413495876172e-05, "loss": 0.1609, "step": 33020 }, { "epoch": 4.486400217324867, "grad_norm": 0.6563196778297424, "learning_rate": 1.816193669097564e-05, "loss": 0.169, "step": 33030 }, { "epoch": 4.487758497741859, "grad_norm": 0.7069793343544006, "learning_rate": 1.813147976920262e-05, "loss": 0.1716, "step": 33040 }, { "epoch": 4.489116778158851, "grad_norm": 0.47433218359947205, "learning_rate": 1.810104274957774e-05, "loss": 0.1543, "step": 33050 }, { "epoch": 4.490475058575843, "grad_norm": 0.6356958746910095, "learning_rate": 1.8070625651109125e-05, "loss": 0.1723, "step": 33060 }, { "epoch": 4.491833338992835, "grad_norm": 0.5651212334632874, "learning_rate": 1.8040228492792517e-05, "loss": 0.1734, "step": 33070 }, { "epoch": 4.493191619409827, "grad_norm": 0.78204345703125, "learning_rate": 1.8009851293611206e-05, "loss": 0.1701, "step": 33080 }, { "epoch": 4.494549899826819, "grad_norm": 0.5171128511428833, "learning_rate": 1.7979494072535964e-05, "loss": 0.1622, "step": 33090 }, { "epoch": 4.495908180243811, "grad_norm": 0.8997629880905151, "learning_rate": 1.794915684852515e-05, "loss": 0.1592, "step": 33100 }, { "epoch": 4.4972664606608035, "grad_norm": 0.4721558392047882, "learning_rate": 1.7918839640524575e-05, "loss": 0.1644, "step": 33110 }, { "epoch": 4.498624741077796, "grad_norm": 0.4583549499511719, "learning_rate": 1.788854246746762e-05, "loss": 0.1571, "step": 33120 }, { "epoch": 4.499983021494788, "grad_norm": 0.8111376166343689, "learning_rate": 1.785826534827507e-05, "loss": 0.1648, "step": 33130 }, { "epoch": 4.50134130191178, "grad_norm": 0.50780189037323, "learning_rate": 1.7828008301855252e-05, "loss": 0.1615, "step": 33140 }, { "epoch": 4.502699582328772, "grad_norm": 0.5960315465927124, "learning_rate": 1.7797771347103953e-05, "loss": 0.1697, "step": 33150 }, { "epoch": 4.504057862745764, "grad_norm": 0.6676151752471924, "learning_rate": 1.776755450290436e-05, "loss": 0.1678, "step": 33160 }, { "epoch": 4.505416143162756, "grad_norm": 0.4964967668056488, "learning_rate": 1.773735778812717e-05, "loss": 0.1663, "step": 33170 }, { "epoch": 4.506774423579748, "grad_norm": 0.44060632586479187, "learning_rate": 1.7707181221630443e-05, "loss": 0.1554, "step": 33180 }, { "epoch": 4.50813270399674, "grad_norm": 1.4161025285720825, "learning_rate": 1.7677024822259703e-05, "loss": 0.1573, "step": 33190 }, { "epoch": 4.509490984413732, "grad_norm": 0.8226635456085205, "learning_rate": 1.7646888608847878e-05, "loss": 0.1646, "step": 33200 }, { "epoch": 4.510849264830725, "grad_norm": 0.5835804343223572, "learning_rate": 1.7616772600215255e-05, "loss": 0.1719, "step": 33210 }, { "epoch": 4.512207545247716, "grad_norm": 0.6695581674575806, "learning_rate": 1.7586676815169546e-05, "loss": 0.1619, "step": 33220 }, { "epoch": 4.513565825664708, "grad_norm": 0.6437219977378845, "learning_rate": 1.7556601272505786e-05, "loss": 0.1652, "step": 33230 }, { "epoch": 4.5149241060817005, "grad_norm": 0.7457780838012695, "learning_rate": 1.7526545991006406e-05, "loss": 0.179, "step": 33240 }, { "epoch": 4.516282386498693, "grad_norm": 0.7112127542495728, "learning_rate": 1.749651098944119e-05, "loss": 0.1684, "step": 33250 }, { "epoch": 4.5176406669156846, "grad_norm": 0.6236027479171753, "learning_rate": 1.7466496286567206e-05, "loss": 0.1546, "step": 33260 }, { "epoch": 4.518998947332677, "grad_norm": 0.5385364294052124, "learning_rate": 1.743650190112891e-05, "loss": 0.161, "step": 33270 }, { "epoch": 4.520357227749669, "grad_norm": 0.55323326587677, "learning_rate": 1.7406527851858007e-05, "loss": 0.1556, "step": 33280 }, { "epoch": 4.521715508166661, "grad_norm": 0.7752742171287537, "learning_rate": 1.737657415747357e-05, "loss": 0.1782, "step": 33290 }, { "epoch": 4.523073788583653, "grad_norm": 1.1179612874984741, "learning_rate": 1.7346640836681876e-05, "loss": 0.1665, "step": 33300 }, { "epoch": 4.524432069000645, "grad_norm": 1.0611122846603394, "learning_rate": 1.7316727908176556e-05, "loss": 0.1667, "step": 33310 }, { "epoch": 4.525790349417637, "grad_norm": 0.633258581161499, "learning_rate": 1.7286835390638484e-05, "loss": 0.1728, "step": 33320 }, { "epoch": 4.527148629834629, "grad_norm": 0.5175571441650391, "learning_rate": 1.725696330273575e-05, "loss": 0.1644, "step": 33330 }, { "epoch": 4.528506910251622, "grad_norm": 0.6574620008468628, "learning_rate": 1.7227111663123748e-05, "loss": 0.1693, "step": 33340 }, { "epoch": 4.529865190668613, "grad_norm": 0.6107839941978455, "learning_rate": 1.7197280490445023e-05, "loss": 0.1645, "step": 33350 }, { "epoch": 4.531223471085606, "grad_norm": 0.6350055932998657, "learning_rate": 1.716746980332941e-05, "loss": 0.1615, "step": 33360 }, { "epoch": 4.5325817515025975, "grad_norm": 2.897418737411499, "learning_rate": 1.7137679620393944e-05, "loss": 0.1646, "step": 33370 }, { "epoch": 4.53394003191959, "grad_norm": 0.6261113882064819, "learning_rate": 1.7107909960242798e-05, "loss": 0.1743, "step": 33380 }, { "epoch": 4.5352983123365815, "grad_norm": 0.5011479258537292, "learning_rate": 1.7078160841467388e-05, "loss": 0.1644, "step": 33390 }, { "epoch": 4.536656592753574, "grad_norm": 0.46666860580444336, "learning_rate": 1.704843228264626e-05, "loss": 0.1621, "step": 33400 }, { "epoch": 4.538014873170566, "grad_norm": 0.7982969284057617, "learning_rate": 1.701872430234515e-05, "loss": 0.1572, "step": 33410 }, { "epoch": 4.539373153587558, "grad_norm": 0.5445136427879333, "learning_rate": 1.6989036919116947e-05, "loss": 0.1723, "step": 33420 }, { "epoch": 4.540731434004551, "grad_norm": 0.4761747419834137, "learning_rate": 1.6959370151501624e-05, "loss": 0.1753, "step": 33430 }, { "epoch": 4.542089714421542, "grad_norm": 0.6295275688171387, "learning_rate": 1.692972401802636e-05, "loss": 0.1698, "step": 33440 }, { "epoch": 4.543447994838535, "grad_norm": 0.6199374198913574, "learning_rate": 1.6900098537205367e-05, "loss": 0.164, "step": 33450 }, { "epoch": 4.544806275255526, "grad_norm": 0.860784113407135, "learning_rate": 1.6870493727540032e-05, "loss": 0.1673, "step": 33460 }, { "epoch": 4.546164555672519, "grad_norm": 0.7105755805969238, "learning_rate": 1.6840909607518757e-05, "loss": 0.1646, "step": 33470 }, { "epoch": 4.54752283608951, "grad_norm": 0.5949261784553528, "learning_rate": 1.681134619561709e-05, "loss": 0.1746, "step": 33480 }, { "epoch": 4.548881116506503, "grad_norm": 0.5077519416809082, "learning_rate": 1.678180351029763e-05, "loss": 0.1688, "step": 33490 }, { "epoch": 4.550239396923494, "grad_norm": 0.8956555128097534, "learning_rate": 1.6752281570009985e-05, "loss": 0.1836, "step": 33500 }, { "epoch": 4.551597677340487, "grad_norm": 0.7535531520843506, "learning_rate": 1.6722780393190896e-05, "loss": 0.1699, "step": 33510 }, { "epoch": 4.552955957757479, "grad_norm": 0.5078708529472351, "learning_rate": 1.6693299998264035e-05, "loss": 0.1701, "step": 33520 }, { "epoch": 4.554314238174471, "grad_norm": 0.617891252040863, "learning_rate": 1.666384040364017e-05, "loss": 0.1583, "step": 33530 }, { "epoch": 4.5556725185914635, "grad_norm": 0.5161946415901184, "learning_rate": 1.6634401627717073e-05, "loss": 0.171, "step": 33540 }, { "epoch": 4.557030799008455, "grad_norm": 1.0166009664535522, "learning_rate": 1.6604983688879466e-05, "loss": 0.1551, "step": 33550 }, { "epoch": 4.558389079425448, "grad_norm": 0.672971248626709, "learning_rate": 1.6575586605499116e-05, "loss": 0.1455, "step": 33560 }, { "epoch": 4.559747359842439, "grad_norm": 0.5853837132453918, "learning_rate": 1.6546210395934707e-05, "loss": 0.1643, "step": 33570 }, { "epoch": 4.561105640259432, "grad_norm": 0.6301414370536804, "learning_rate": 1.6516855078531946e-05, "loss": 0.1673, "step": 33580 }, { "epoch": 4.562463920676423, "grad_norm": 0.6064209342002869, "learning_rate": 1.6487520671623468e-05, "loss": 0.1667, "step": 33590 }, { "epoch": 4.563822201093416, "grad_norm": 0.5823454260826111, "learning_rate": 1.6458207193528817e-05, "loss": 0.1591, "step": 33600 }, { "epoch": 4.565180481510408, "grad_norm": 0.7163525819778442, "learning_rate": 1.642891466255454e-05, "loss": 0.1749, "step": 33610 }, { "epoch": 4.5665387619274, "grad_norm": 0.811215877532959, "learning_rate": 1.6399643096994016e-05, "loss": 0.1549, "step": 33620 }, { "epoch": 4.567897042344392, "grad_norm": 0.7072502374649048, "learning_rate": 1.637039251512761e-05, "loss": 0.1634, "step": 33630 }, { "epoch": 4.569255322761384, "grad_norm": 1.1604541540145874, "learning_rate": 1.634116293522251e-05, "loss": 0.1584, "step": 33640 }, { "epoch": 4.570613603178376, "grad_norm": 0.5894860625267029, "learning_rate": 1.6311954375532846e-05, "loss": 0.1658, "step": 33650 }, { "epoch": 4.571971883595368, "grad_norm": 0.45093613862991333, "learning_rate": 1.6282766854299612e-05, "loss": 0.1457, "step": 33660 }, { "epoch": 4.5733301640123605, "grad_norm": 0.5360625386238098, "learning_rate": 1.6253600389750623e-05, "loss": 0.159, "step": 33670 }, { "epoch": 4.574688444429352, "grad_norm": 0.4936796724796295, "learning_rate": 1.6224455000100598e-05, "loss": 0.1656, "step": 33680 }, { "epoch": 4.576046724846345, "grad_norm": 0.39945951104164124, "learning_rate": 1.6195330703551037e-05, "loss": 0.1609, "step": 33690 }, { "epoch": 4.577405005263337, "grad_norm": 1.4398943185806274, "learning_rate": 1.6166227518290318e-05, "loss": 0.1652, "step": 33700 }, { "epoch": 4.578763285680329, "grad_norm": 0.7713378071784973, "learning_rate": 1.613714546249363e-05, "loss": 0.1684, "step": 33710 }, { "epoch": 4.580121566097321, "grad_norm": 0.453446626663208, "learning_rate": 1.6108084554322917e-05, "loss": 0.1658, "step": 33720 }, { "epoch": 4.581479846514313, "grad_norm": 0.453668475151062, "learning_rate": 1.6079044811926986e-05, "loss": 0.1521, "step": 33730 }, { "epoch": 4.582838126931305, "grad_norm": 0.5438262224197388, "learning_rate": 1.6050026253441357e-05, "loss": 0.1589, "step": 33740 }, { "epoch": 4.584196407348297, "grad_norm": 0.5807858109474182, "learning_rate": 1.6021028896988383e-05, "loss": 0.1621, "step": 33750 }, { "epoch": 4.585554687765289, "grad_norm": 0.5498211979866028, "learning_rate": 1.5992052760677116e-05, "loss": 0.1696, "step": 33760 }, { "epoch": 4.586912968182281, "grad_norm": 0.6865106821060181, "learning_rate": 1.5963097862603417e-05, "loss": 0.1668, "step": 33770 }, { "epoch": 4.588271248599273, "grad_norm": 0.719539225101471, "learning_rate": 1.5934164220849847e-05, "loss": 0.1626, "step": 33780 }, { "epoch": 4.589629529016266, "grad_norm": 0.573159396648407, "learning_rate": 1.5905251853485682e-05, "loss": 0.1748, "step": 33790 }, { "epoch": 4.5909878094332575, "grad_norm": 2.006422996520996, "learning_rate": 1.5876360778566952e-05, "loss": 0.1705, "step": 33800 }, { "epoch": 4.59234608985025, "grad_norm": 0.497975617647171, "learning_rate": 1.584749101413634e-05, "loss": 0.1677, "step": 33810 }, { "epoch": 4.5937043702672415, "grad_norm": 0.4816487729549408, "learning_rate": 1.5818642578223263e-05, "loss": 0.1564, "step": 33820 }, { "epoch": 4.595062650684234, "grad_norm": 0.6890683174133301, "learning_rate": 1.578981548884382e-05, "loss": 0.1668, "step": 33830 }, { "epoch": 4.596420931101226, "grad_norm": 0.5515402555465698, "learning_rate": 1.5761009764000718e-05, "loss": 0.1716, "step": 33840 }, { "epoch": 4.597779211518218, "grad_norm": 0.5155599117279053, "learning_rate": 1.5732225421683406e-05, "loss": 0.1622, "step": 33850 }, { "epoch": 4.59913749193521, "grad_norm": 0.527674674987793, "learning_rate": 1.5703462479867908e-05, "loss": 0.1646, "step": 33860 }, { "epoch": 4.600495772352202, "grad_norm": 0.631135880947113, "learning_rate": 1.5674720956516925e-05, "loss": 0.1706, "step": 33870 }, { "epoch": 4.601854052769195, "grad_norm": 0.6878489851951599, "learning_rate": 1.5646000869579784e-05, "loss": 0.1698, "step": 33880 }, { "epoch": 4.603212333186186, "grad_norm": 0.6347100138664246, "learning_rate": 1.5617302236992388e-05, "loss": 0.1541, "step": 33890 }, { "epoch": 4.604570613603179, "grad_norm": 0.6483649611473083, "learning_rate": 1.558862507667729e-05, "loss": 0.1623, "step": 33900 }, { "epoch": 4.60592889402017, "grad_norm": 0.6460469961166382, "learning_rate": 1.5559969406543572e-05, "loss": 0.1632, "step": 33910 }, { "epoch": 4.607287174437163, "grad_norm": 0.7640321254730225, "learning_rate": 1.553133524448697e-05, "loss": 0.1569, "step": 33920 }, { "epoch": 4.608645454854154, "grad_norm": 0.8464639782905579, "learning_rate": 1.550272260838972e-05, "loss": 0.17, "step": 33930 }, { "epoch": 4.610003735271147, "grad_norm": 0.5716574788093567, "learning_rate": 1.5474131516120654e-05, "loss": 0.1687, "step": 33940 }, { "epoch": 4.6113620156881385, "grad_norm": 0.5935233235359192, "learning_rate": 1.5445561985535157e-05, "loss": 0.1558, "step": 33950 }, { "epoch": 4.612720296105131, "grad_norm": 0.7016568779945374, "learning_rate": 1.54170140344751e-05, "loss": 0.1685, "step": 33960 }, { "epoch": 4.614078576522123, "grad_norm": 0.5030527114868164, "learning_rate": 1.5388487680768943e-05, "loss": 0.165, "step": 33970 }, { "epoch": 4.615436856939115, "grad_norm": 0.875801146030426, "learning_rate": 1.5359982942231586e-05, "loss": 0.1657, "step": 33980 }, { "epoch": 4.616795137356107, "grad_norm": 1.0403450727462769, "learning_rate": 1.5331499836664493e-05, "loss": 0.1696, "step": 33990 }, { "epoch": 4.618153417773099, "grad_norm": 0.5583453178405762, "learning_rate": 1.5303038381855605e-05, "loss": 0.1556, "step": 34000 }, { "epoch": 4.619511698190092, "grad_norm": 0.4675743281841278, "learning_rate": 1.52745985955793e-05, "loss": 0.1646, "step": 34010 }, { "epoch": 4.620869978607083, "grad_norm": 0.857879102230072, "learning_rate": 1.5246180495596474e-05, "loss": 0.1702, "step": 34020 }, { "epoch": 4.622228259024076, "grad_norm": 0.7318080067634583, "learning_rate": 1.5217784099654448e-05, "loss": 0.1679, "step": 34030 }, { "epoch": 4.623586539441067, "grad_norm": 0.5780463218688965, "learning_rate": 1.5189409425487e-05, "loss": 0.1527, "step": 34040 }, { "epoch": 4.62494481985806, "grad_norm": 1.1190916299819946, "learning_rate": 1.5161056490814374e-05, "loss": 0.176, "step": 34050 }, { "epoch": 4.626303100275051, "grad_norm": 0.5986526608467102, "learning_rate": 1.513272531334316e-05, "loss": 0.1661, "step": 34060 }, { "epoch": 4.627661380692044, "grad_norm": 2.6642589569091797, "learning_rate": 1.5104415910766445e-05, "loss": 0.1626, "step": 34070 }, { "epoch": 4.6290196611090355, "grad_norm": 0.47352272272109985, "learning_rate": 1.5076128300763649e-05, "loss": 0.1516, "step": 34080 }, { "epoch": 4.630377941526028, "grad_norm": 1.12067449092865, "learning_rate": 1.5047862501000642e-05, "loss": 0.1582, "step": 34090 }, { "epoch": 4.6317362219430205, "grad_norm": 1.5456739664077759, "learning_rate": 1.501961852912962e-05, "loss": 0.1578, "step": 34100 }, { "epoch": 4.633094502360012, "grad_norm": 0.7881343960762024, "learning_rate": 1.4991396402789177e-05, "loss": 0.1544, "step": 34110 }, { "epoch": 4.634452782777005, "grad_norm": 0.5856311917304993, "learning_rate": 1.4963196139604274e-05, "loss": 0.1605, "step": 34120 }, { "epoch": 4.635811063193996, "grad_norm": 0.6069117188453674, "learning_rate": 1.4935017757186182e-05, "loss": 0.1616, "step": 34130 }, { "epoch": 4.637169343610989, "grad_norm": 0.5950397849082947, "learning_rate": 1.490686127313255e-05, "loss": 0.1777, "step": 34140 }, { "epoch": 4.63852762402798, "grad_norm": 0.7278192043304443, "learning_rate": 1.4878726705027301e-05, "loss": 0.178, "step": 34150 }, { "epoch": 4.639885904444973, "grad_norm": 0.527841329574585, "learning_rate": 1.4850614070440722e-05, "loss": 0.1603, "step": 34160 }, { "epoch": 4.641244184861964, "grad_norm": 0.43936434388160706, "learning_rate": 1.4822523386929382e-05, "loss": 0.1549, "step": 34170 }, { "epoch": 4.642602465278957, "grad_norm": 0.6575285196304321, "learning_rate": 1.4794454672036123e-05, "loss": 0.163, "step": 34180 }, { "epoch": 4.643960745695949, "grad_norm": 0.6868091225624084, "learning_rate": 1.4766407943290106e-05, "loss": 0.158, "step": 34190 }, { "epoch": 4.645319026112941, "grad_norm": 0.6083295345306396, "learning_rate": 1.4738383218206713e-05, "loss": 0.1674, "step": 34200 }, { "epoch": 4.646677306529933, "grad_norm": 0.7053366303443909, "learning_rate": 1.471038051428762e-05, "loss": 0.167, "step": 34210 }, { "epoch": 4.648035586946925, "grad_norm": 0.8434534668922424, "learning_rate": 1.4682399849020767e-05, "loss": 0.1591, "step": 34220 }, { "epoch": 4.6493938673639175, "grad_norm": 0.45199939608573914, "learning_rate": 1.4654441239880268e-05, "loss": 0.1605, "step": 34230 }, { "epoch": 4.650752147780909, "grad_norm": 0.5764057040214539, "learning_rate": 1.4626504704326533e-05, "loss": 0.1576, "step": 34240 }, { "epoch": 4.6521104281979015, "grad_norm": 0.5556119084358215, "learning_rate": 1.4598590259806117e-05, "loss": 0.1639, "step": 34250 }, { "epoch": 4.653468708614893, "grad_norm": 0.6463999152183533, "learning_rate": 1.4570697923751858e-05, "loss": 0.1662, "step": 34260 }, { "epoch": 4.654826989031886, "grad_norm": 0.6903064250946045, "learning_rate": 1.4542827713582708e-05, "loss": 0.1564, "step": 34270 }, { "epoch": 4.656185269448878, "grad_norm": 0.5735841989517212, "learning_rate": 1.4514979646703846e-05, "loss": 0.1674, "step": 34280 }, { "epoch": 4.65754354986587, "grad_norm": 0.5662961006164551, "learning_rate": 1.4487153740506649e-05, "loss": 0.1539, "step": 34290 }, { "epoch": 4.658901830282862, "grad_norm": 0.4179501235485077, "learning_rate": 1.4459350012368572e-05, "loss": 0.1562, "step": 34300 }, { "epoch": 4.660260110699854, "grad_norm": 0.6051515936851501, "learning_rate": 1.4431568479653301e-05, "loss": 0.1592, "step": 34310 }, { "epoch": 4.661618391116846, "grad_norm": 0.4820345342159271, "learning_rate": 1.4403809159710607e-05, "loss": 0.1533, "step": 34320 }, { "epoch": 4.662976671533838, "grad_norm": 0.8186147809028625, "learning_rate": 1.4376072069876411e-05, "loss": 0.1615, "step": 34330 }, { "epoch": 4.66433495195083, "grad_norm": 0.6060168147087097, "learning_rate": 1.4348357227472775e-05, "loss": 0.167, "step": 34340 }, { "epoch": 4.665693232367822, "grad_norm": 0.4769505560398102, "learning_rate": 1.4320664649807808e-05, "loss": 0.1625, "step": 34350 }, { "epoch": 4.6670515127848144, "grad_norm": 0.6420239806175232, "learning_rate": 1.4292994354175782e-05, "loss": 0.1706, "step": 34360 }, { "epoch": 4.668409793201807, "grad_norm": 0.8922885656356812, "learning_rate": 1.4265346357856985e-05, "loss": 0.1637, "step": 34370 }, { "epoch": 4.6697680736187985, "grad_norm": 0.6866846680641174, "learning_rate": 1.4237720678117827e-05, "loss": 0.1564, "step": 34380 }, { "epoch": 4.671126354035791, "grad_norm": 0.626465916633606, "learning_rate": 1.4210117332210787e-05, "loss": 0.1567, "step": 34390 }, { "epoch": 4.672484634452783, "grad_norm": 0.4682192802429199, "learning_rate": 1.4182536337374346e-05, "loss": 0.1516, "step": 34400 }, { "epoch": 4.673842914869775, "grad_norm": 1.2784432172775269, "learning_rate": 1.415497771083309e-05, "loss": 0.1635, "step": 34410 }, { "epoch": 4.675201195286767, "grad_norm": 0.4827069640159607, "learning_rate": 1.412744146979757e-05, "loss": 0.1609, "step": 34420 }, { "epoch": 4.676559475703759, "grad_norm": 0.3975920081138611, "learning_rate": 1.4099927631464427e-05, "loss": 0.1518, "step": 34430 }, { "epoch": 4.677917756120751, "grad_norm": 0.5734313726425171, "learning_rate": 1.4072436213016238e-05, "loss": 0.1687, "step": 34440 }, { "epoch": 4.679276036537743, "grad_norm": 0.44706955552101135, "learning_rate": 1.404496723162163e-05, "loss": 0.1713, "step": 34450 }, { "epoch": 4.680634316954736, "grad_norm": 0.6072577834129333, "learning_rate": 1.4017520704435227e-05, "loss": 0.1648, "step": 34460 }, { "epoch": 4.681992597371727, "grad_norm": 1.1175315380096436, "learning_rate": 1.3990096648597572e-05, "loss": 0.1516, "step": 34470 }, { "epoch": 4.68335087778872, "grad_norm": 0.5524747967720032, "learning_rate": 1.3962695081235244e-05, "loss": 0.158, "step": 34480 }, { "epoch": 4.684709158205711, "grad_norm": 0.8691211938858032, "learning_rate": 1.3935316019460703e-05, "loss": 0.1655, "step": 34490 }, { "epoch": 4.686067438622704, "grad_norm": 0.5302237272262573, "learning_rate": 1.3907959480372412e-05, "loss": 0.1619, "step": 34500 }, { "epoch": 4.6874257190396955, "grad_norm": 0.7053501605987549, "learning_rate": 1.388062548105477e-05, "loss": 0.16, "step": 34510 }, { "epoch": 4.688783999456688, "grad_norm": 1.6987684965133667, "learning_rate": 1.3853314038578053e-05, "loss": 0.1645, "step": 34520 }, { "epoch": 4.69014227987368, "grad_norm": 0.43552130460739136, "learning_rate": 1.3826025169998497e-05, "loss": 0.16, "step": 34530 }, { "epoch": 4.691500560290672, "grad_norm": 0.48575031757354736, "learning_rate": 1.37987588923582e-05, "loss": 0.1494, "step": 34540 }, { "epoch": 4.692858840707665, "grad_norm": 0.6372866034507751, "learning_rate": 1.3771515222685193e-05, "loss": 0.1774, "step": 34550 }, { "epoch": 4.694217121124656, "grad_norm": 0.4559236466884613, "learning_rate": 1.374429417799335e-05, "loss": 0.1628, "step": 34560 }, { "epoch": 4.695575401541649, "grad_norm": 0.48662522435188293, "learning_rate": 1.371709577528244e-05, "loss": 0.1638, "step": 34570 }, { "epoch": 4.69693368195864, "grad_norm": 0.6365074515342712, "learning_rate": 1.3689920031538095e-05, "loss": 0.1705, "step": 34580 }, { "epoch": 4.698291962375633, "grad_norm": 0.5268241763114929, "learning_rate": 1.366276696373177e-05, "loss": 0.1688, "step": 34590 }, { "epoch": 4.699650242792624, "grad_norm": 0.567867636680603, "learning_rate": 1.3635636588820793e-05, "loss": 0.1603, "step": 34600 }, { "epoch": 4.701008523209617, "grad_norm": 0.5404268503189087, "learning_rate": 1.3608528923748281e-05, "loss": 0.1643, "step": 34610 }, { "epoch": 4.702366803626608, "grad_norm": 0.6218478083610535, "learning_rate": 1.3581443985443199e-05, "loss": 0.155, "step": 34620 }, { "epoch": 4.703725084043601, "grad_norm": 0.417737752199173, "learning_rate": 1.3554381790820326e-05, "loss": 0.1609, "step": 34630 }, { "epoch": 4.705083364460593, "grad_norm": 0.7840763330459595, "learning_rate": 1.3527342356780188e-05, "loss": 0.1731, "step": 34640 }, { "epoch": 4.706441644877585, "grad_norm": 0.7058510184288025, "learning_rate": 1.3500325700209166e-05, "loss": 0.1602, "step": 34650 }, { "epoch": 4.707799925294577, "grad_norm": 0.49378862977027893, "learning_rate": 1.3473331837979348e-05, "loss": 0.1631, "step": 34660 }, { "epoch": 4.709158205711569, "grad_norm": 0.4783988296985626, "learning_rate": 1.344636078694863e-05, "loss": 0.1756, "step": 34670 }, { "epoch": 4.7105164861285616, "grad_norm": 0.6812304854393005, "learning_rate": 1.341941256396067e-05, "loss": 0.1669, "step": 34680 }, { "epoch": 4.711874766545553, "grad_norm": 0.9664261341094971, "learning_rate": 1.3392487185844821e-05, "loss": 0.1724, "step": 34690 }, { "epoch": 4.713233046962546, "grad_norm": 0.5158857107162476, "learning_rate": 1.3365584669416226e-05, "loss": 0.1614, "step": 34700 }, { "epoch": 4.714591327379537, "grad_norm": 0.5690522789955139, "learning_rate": 1.3338705031475695e-05, "loss": 0.1651, "step": 34710 }, { "epoch": 4.71594960779653, "grad_norm": 0.5796026587486267, "learning_rate": 1.3311848288809813e-05, "loss": 0.1611, "step": 34720 }, { "epoch": 4.717307888213521, "grad_norm": 1.0131943225860596, "learning_rate": 1.3285014458190798e-05, "loss": 0.1595, "step": 34730 }, { "epoch": 4.718666168630514, "grad_norm": 0.667529284954071, "learning_rate": 1.3258203556376603e-05, "loss": 0.1633, "step": 34740 }, { "epoch": 4.720024449047505, "grad_norm": 0.7869409918785095, "learning_rate": 1.323141560011088e-05, "loss": 0.166, "step": 34750 }, { "epoch": 4.721382729464498, "grad_norm": 0.5796931982040405, "learning_rate": 1.3204650606122886e-05, "loss": 0.1548, "step": 34760 }, { "epoch": 4.72274100988149, "grad_norm": 0.5693459510803223, "learning_rate": 1.3177908591127607e-05, "loss": 0.1607, "step": 34770 }, { "epoch": 4.724099290298482, "grad_norm": 0.5409191846847534, "learning_rate": 1.3151189571825622e-05, "loss": 0.1682, "step": 34780 }, { "epoch": 4.7254575707154745, "grad_norm": 0.7495957612991333, "learning_rate": 1.3124493564903185e-05, "loss": 0.1624, "step": 34790 }, { "epoch": 4.726815851132466, "grad_norm": 0.5959596633911133, "learning_rate": 1.3097820587032184e-05, "loss": 0.1643, "step": 34800 }, { "epoch": 4.7281741315494585, "grad_norm": 0.6415248513221741, "learning_rate": 1.3071170654870075e-05, "loss": 0.1544, "step": 34810 }, { "epoch": 4.72953241196645, "grad_norm": 0.5395938754081726, "learning_rate": 1.3044543785059988e-05, "loss": 0.1595, "step": 34820 }, { "epoch": 4.730890692383443, "grad_norm": 0.42933663725852966, "learning_rate": 1.3017939994230588e-05, "loss": 0.1666, "step": 34830 }, { "epoch": 4.732248972800434, "grad_norm": 0.7761124968528748, "learning_rate": 1.2991359298996174e-05, "loss": 0.1611, "step": 34840 }, { "epoch": 4.733607253217427, "grad_norm": 0.7527860403060913, "learning_rate": 1.2964801715956626e-05, "loss": 0.1613, "step": 34850 }, { "epoch": 4.734965533634419, "grad_norm": 0.7709342241287231, "learning_rate": 1.2938267261697323e-05, "loss": 0.1595, "step": 34860 }, { "epoch": 4.736323814051411, "grad_norm": 0.6054039597511292, "learning_rate": 1.2911755952789289e-05, "loss": 0.1653, "step": 34870 }, { "epoch": 4.737682094468403, "grad_norm": 0.652448832988739, "learning_rate": 1.2885267805789014e-05, "loss": 0.1558, "step": 34880 }, { "epoch": 4.739040374885395, "grad_norm": 0.6069378852844238, "learning_rate": 1.2858802837238592e-05, "loss": 0.1664, "step": 34890 }, { "epoch": 4.740398655302387, "grad_norm": 0.8066210746765137, "learning_rate": 1.2832361063665582e-05, "loss": 0.161, "step": 34900 }, { "epoch": 4.741756935719379, "grad_norm": 0.5868949890136719, "learning_rate": 1.2805942501583096e-05, "loss": 0.1626, "step": 34910 }, { "epoch": 4.743115216136371, "grad_norm": 0.6322632431983948, "learning_rate": 1.2779547167489753e-05, "loss": 0.1601, "step": 34920 }, { "epoch": 4.744473496553363, "grad_norm": 0.4563515782356262, "learning_rate": 1.2753175077869628e-05, "loss": 0.1537, "step": 34930 }, { "epoch": 4.7458317769703555, "grad_norm": 1.7508584260940552, "learning_rate": 1.2726826249192325e-05, "loss": 0.1509, "step": 34940 }, { "epoch": 4.747190057387348, "grad_norm": 1.264068603515625, "learning_rate": 1.2700500697912881e-05, "loss": 0.1574, "step": 34950 }, { "epoch": 4.74854833780434, "grad_norm": 1.1057806015014648, "learning_rate": 1.2674198440471824e-05, "loss": 0.1608, "step": 34960 }, { "epoch": 4.749906618221332, "grad_norm": 0.596179723739624, "learning_rate": 1.2647919493295141e-05, "loss": 0.1566, "step": 34970 }, { "epoch": 4.751264898638324, "grad_norm": 0.6845816969871521, "learning_rate": 1.2621663872794215e-05, "loss": 0.1612, "step": 34980 }, { "epoch": 4.752623179055316, "grad_norm": 0.5531831383705139, "learning_rate": 1.2595431595365931e-05, "loss": 0.1675, "step": 34990 }, { "epoch": 4.753981459472308, "grad_norm": 0.4727727472782135, "learning_rate": 1.2569222677392523e-05, "loss": 0.1577, "step": 35000 }, { "epoch": 4.7553397398893, "grad_norm": 0.5658878684043884, "learning_rate": 1.2543037135241681e-05, "loss": 0.163, "step": 35010 }, { "epoch": 4.756698020306292, "grad_norm": 0.5696451663970947, "learning_rate": 1.2516874985266508e-05, "loss": 0.163, "step": 35020 }, { "epoch": 4.758056300723284, "grad_norm": 0.6106990575790405, "learning_rate": 1.2490736243805451e-05, "loss": 0.1591, "step": 35030 }, { "epoch": 4.759414581140277, "grad_norm": 0.9818544983863831, "learning_rate": 1.2464620927182392e-05, "loss": 0.1709, "step": 35040 }, { "epoch": 4.760772861557268, "grad_norm": 0.4751497507095337, "learning_rate": 1.2438529051706527e-05, "loss": 0.16, "step": 35050 }, { "epoch": 4.762131141974261, "grad_norm": 0.5972309708595276, "learning_rate": 1.2412460633672469e-05, "loss": 0.1661, "step": 35060 }, { "epoch": 4.7634894223912525, "grad_norm": 0.49316176772117615, "learning_rate": 1.238641568936013e-05, "loss": 0.1625, "step": 35070 }, { "epoch": 4.764847702808245, "grad_norm": 0.5284515023231506, "learning_rate": 1.2360394235034805e-05, "loss": 0.1778, "step": 35080 }, { "epoch": 4.766205983225237, "grad_norm": 0.4695063531398773, "learning_rate": 1.2334396286947114e-05, "loss": 0.1649, "step": 35090 }, { "epoch": 4.767564263642229, "grad_norm": 0.5033496022224426, "learning_rate": 1.2308421861332952e-05, "loss": 0.1629, "step": 35100 }, { "epoch": 4.768922544059221, "grad_norm": 0.8004832863807678, "learning_rate": 1.2282470974413597e-05, "loss": 0.1648, "step": 35110 }, { "epoch": 4.770280824476213, "grad_norm": 0.606992244720459, "learning_rate": 1.2256543642395547e-05, "loss": 0.1634, "step": 35120 }, { "epoch": 4.771639104893206, "grad_norm": 0.5384237766265869, "learning_rate": 1.2230639881470652e-05, "loss": 0.1593, "step": 35130 }, { "epoch": 4.772997385310197, "grad_norm": 0.7339364290237427, "learning_rate": 1.2204759707816027e-05, "loss": 0.1644, "step": 35140 }, { "epoch": 4.77435566572719, "grad_norm": 0.8693205118179321, "learning_rate": 1.2178903137594034e-05, "loss": 0.1574, "step": 35150 }, { "epoch": 4.775713946144181, "grad_norm": 0.6276689171791077, "learning_rate": 1.2153070186952325e-05, "loss": 0.1611, "step": 35160 }, { "epoch": 4.777072226561174, "grad_norm": 0.4424166679382324, "learning_rate": 1.212726087202376e-05, "loss": 0.1639, "step": 35170 }, { "epoch": 4.778430506978165, "grad_norm": 0.6010932326316833, "learning_rate": 1.2101475208926483e-05, "loss": 0.1602, "step": 35180 }, { "epoch": 4.779788787395158, "grad_norm": 0.7447344660758972, "learning_rate": 1.2075713213763863e-05, "loss": 0.1688, "step": 35190 }, { "epoch": 4.7811470678121495, "grad_norm": 0.728084146976471, "learning_rate": 1.2049974902624445e-05, "loss": 0.1592, "step": 35200 }, { "epoch": 4.782505348229142, "grad_norm": 0.6185115575790405, "learning_rate": 1.2024260291582034e-05, "loss": 0.1629, "step": 35210 }, { "epoch": 4.7838636286461345, "grad_norm": 1.0574746131896973, "learning_rate": 1.1998569396695596e-05, "loss": 0.1685, "step": 35220 }, { "epoch": 4.785221909063126, "grad_norm": 0.5105334520339966, "learning_rate": 1.197290223400932e-05, "loss": 0.1516, "step": 35230 }, { "epoch": 4.7865801894801185, "grad_norm": 0.519932210445404, "learning_rate": 1.1947258819552532e-05, "loss": 0.1583, "step": 35240 }, { "epoch": 4.78793846989711, "grad_norm": 0.614289402961731, "learning_rate": 1.1921639169339765e-05, "loss": 0.1675, "step": 35250 }, { "epoch": 4.789296750314103, "grad_norm": 0.5006380677223206, "learning_rate": 1.1896043299370707e-05, "loss": 0.1691, "step": 35260 }, { "epoch": 4.790655030731094, "grad_norm": 0.577024519443512, "learning_rate": 1.187047122563017e-05, "loss": 0.1578, "step": 35270 }, { "epoch": 4.792013311148087, "grad_norm": 0.8405871987342834, "learning_rate": 1.1844922964088134e-05, "loss": 0.1703, "step": 35280 }, { "epoch": 4.793371591565078, "grad_norm": 0.8341783285140991, "learning_rate": 1.1819398530699677e-05, "loss": 0.1711, "step": 35290 }, { "epoch": 4.794729871982071, "grad_norm": 0.505587637424469, "learning_rate": 1.1793897941405019e-05, "loss": 0.1669, "step": 35300 }, { "epoch": 4.796088152399063, "grad_norm": 0.7752902507781982, "learning_rate": 1.1768421212129505e-05, "loss": 0.1634, "step": 35310 }, { "epoch": 4.797446432816055, "grad_norm": 0.5521948933601379, "learning_rate": 1.1742968358783524e-05, "loss": 0.1664, "step": 35320 }, { "epoch": 4.7988047132330465, "grad_norm": 0.5903010964393616, "learning_rate": 1.1717539397262622e-05, "loss": 0.159, "step": 35330 }, { "epoch": 4.800162993650039, "grad_norm": 0.5774504542350769, "learning_rate": 1.1692134343447365e-05, "loss": 0.1537, "step": 35340 }, { "epoch": 4.801521274067031, "grad_norm": 2.01104474067688, "learning_rate": 1.1666753213203429e-05, "loss": 0.1651, "step": 35350 }, { "epoch": 4.802879554484023, "grad_norm": 0.5805357694625854, "learning_rate": 1.1641396022381523e-05, "loss": 0.1685, "step": 35360 }, { "epoch": 4.8042378349010155, "grad_norm": 2.7348990440368652, "learning_rate": 1.1616062786817422e-05, "loss": 0.1609, "step": 35370 }, { "epoch": 4.805596115318007, "grad_norm": 0.6244687438011169, "learning_rate": 1.1590753522331948e-05, "loss": 0.1586, "step": 35380 }, { "epoch": 4.806954395735, "grad_norm": 0.8948203921318054, "learning_rate": 1.1565468244730921e-05, "loss": 0.1633, "step": 35390 }, { "epoch": 4.808312676151991, "grad_norm": 0.5942915678024292, "learning_rate": 1.1540206969805223e-05, "loss": 0.1592, "step": 35400 }, { "epoch": 4.809670956568984, "grad_norm": 0.6881776452064514, "learning_rate": 1.1514969713330698e-05, "loss": 0.162, "step": 35410 }, { "epoch": 4.811029236985975, "grad_norm": 5.975242614746094, "learning_rate": 1.1489756491068231e-05, "loss": 0.1589, "step": 35420 }, { "epoch": 4.812387517402968, "grad_norm": 0.5144063830375671, "learning_rate": 1.1464567318763692e-05, "loss": 0.1735, "step": 35430 }, { "epoch": 4.81374579781996, "grad_norm": 1.0597695112228394, "learning_rate": 1.1439402212147904e-05, "loss": 0.1628, "step": 35440 }, { "epoch": 4.815104078236952, "grad_norm": 0.5507410168647766, "learning_rate": 1.1414261186936697e-05, "loss": 0.1629, "step": 35450 }, { "epoch": 4.816462358653944, "grad_norm": 0.7200136184692383, "learning_rate": 1.1389144258830825e-05, "loss": 0.1625, "step": 35460 }, { "epoch": 4.817820639070936, "grad_norm": 0.8538552522659302, "learning_rate": 1.1364051443516022e-05, "loss": 0.1579, "step": 35470 }, { "epoch": 4.819178919487928, "grad_norm": 0.4982856214046478, "learning_rate": 1.1338982756662963e-05, "loss": 0.1661, "step": 35480 }, { "epoch": 4.82053719990492, "grad_norm": 0.5936764478683472, "learning_rate": 1.1313938213927228e-05, "loss": 0.1727, "step": 35490 }, { "epoch": 4.8218954803219125, "grad_norm": 0.6253868937492371, "learning_rate": 1.1288917830949353e-05, "loss": 0.177, "step": 35500 }, { "epoch": 4.823253760738904, "grad_norm": 0.6642293930053711, "learning_rate": 1.1263921623354751e-05, "loss": 0.1601, "step": 35510 }, { "epoch": 4.824612041155897, "grad_norm": 0.6503024697303772, "learning_rate": 1.1238949606753774e-05, "loss": 0.1657, "step": 35520 }, { "epoch": 4.825970321572889, "grad_norm": 0.5134065747261047, "learning_rate": 1.121400179674163e-05, "loss": 0.1558, "step": 35530 }, { "epoch": 4.827328601989881, "grad_norm": 0.9033194184303284, "learning_rate": 1.1189078208898434e-05, "loss": 0.1648, "step": 35540 }, { "epoch": 4.828686882406873, "grad_norm": 0.4886474311351776, "learning_rate": 1.1164178858789182e-05, "loss": 0.158, "step": 35550 }, { "epoch": 4.830045162823865, "grad_norm": 0.5793920755386353, "learning_rate": 1.1139303761963694e-05, "loss": 0.1662, "step": 35560 }, { "epoch": 4.831403443240857, "grad_norm": 0.7469372153282166, "learning_rate": 1.1114452933956699e-05, "loss": 0.1544, "step": 35570 }, { "epoch": 4.832761723657849, "grad_norm": 0.7387930750846863, "learning_rate": 1.1089626390287706e-05, "loss": 0.1739, "step": 35580 }, { "epoch": 4.834120004074841, "grad_norm": 0.6153668165206909, "learning_rate": 1.106482414646111e-05, "loss": 0.1621, "step": 35590 }, { "epoch": 4.835478284491833, "grad_norm": 1.2095433473587036, "learning_rate": 1.1040046217966126e-05, "loss": 0.1575, "step": 35600 }, { "epoch": 4.836836564908825, "grad_norm": 0.8400002717971802, "learning_rate": 1.1015292620276741e-05, "loss": 0.17, "step": 35610 }, { "epoch": 4.838194845325818, "grad_norm": 0.528081476688385, "learning_rate": 1.0990563368851803e-05, "loss": 0.1731, "step": 35620 }, { "epoch": 4.8395531257428095, "grad_norm": 0.8104569315910339, "learning_rate": 1.0965858479134905e-05, "loss": 0.1816, "step": 35630 }, { "epoch": 4.840911406159802, "grad_norm": 0.5874035954475403, "learning_rate": 1.0941177966554466e-05, "loss": 0.1619, "step": 35640 }, { "epoch": 4.842269686576794, "grad_norm": 0.510002851486206, "learning_rate": 1.0916521846523669e-05, "loss": 0.1664, "step": 35650 }, { "epoch": 4.843627966993786, "grad_norm": 0.5611934065818787, "learning_rate": 1.0891890134440442e-05, "loss": 0.1582, "step": 35660 }, { "epoch": 4.844986247410778, "grad_norm": 0.557730495929718, "learning_rate": 1.0867282845687516e-05, "loss": 0.1673, "step": 35670 }, { "epoch": 4.84634452782777, "grad_norm": 0.6499781012535095, "learning_rate": 1.0842699995632305e-05, "loss": 0.1755, "step": 35680 }, { "epoch": 4.847702808244762, "grad_norm": 0.5237849354743958, "learning_rate": 1.081814159962703e-05, "loss": 0.1594, "step": 35690 }, { "epoch": 4.849061088661754, "grad_norm": 0.49147915840148926, "learning_rate": 1.0793607673008582e-05, "loss": 0.1519, "step": 35700 }, { "epoch": 4.850419369078747, "grad_norm": 0.6024288535118103, "learning_rate": 1.0769098231098607e-05, "loss": 0.1607, "step": 35710 }, { "epoch": 4.851777649495738, "grad_norm": 0.6725016832351685, "learning_rate": 1.0744613289203465e-05, "loss": 0.1644, "step": 35720 }, { "epoch": 4.853135929912731, "grad_norm": 0.5368932485580444, "learning_rate": 1.0720152862614169e-05, "loss": 0.1615, "step": 35730 }, { "epoch": 4.854494210329722, "grad_norm": 0.7718627452850342, "learning_rate": 1.0695716966606483e-05, "loss": 0.1794, "step": 35740 }, { "epoch": 4.855852490746715, "grad_norm": 0.5342059135437012, "learning_rate": 1.0671305616440797e-05, "loss": 0.1736, "step": 35750 }, { "epoch": 4.8572107711637065, "grad_norm": 0.6256279349327087, "learning_rate": 1.0646918827362207e-05, "loss": 0.1627, "step": 35760 }, { "epoch": 4.858569051580699, "grad_norm": 0.6146807670593262, "learning_rate": 1.0622556614600471e-05, "loss": 0.179, "step": 35770 }, { "epoch": 4.859927331997691, "grad_norm": 1.091754674911499, "learning_rate": 1.0598218993369963e-05, "loss": 0.1572, "step": 35780 }, { "epoch": 4.861285612414683, "grad_norm": 1.0663893222808838, "learning_rate": 1.0573905978869747e-05, "loss": 0.1643, "step": 35790 }, { "epoch": 4.8626438928316755, "grad_norm": 1.0168269872665405, "learning_rate": 1.0549617586283478e-05, "loss": 0.1625, "step": 35800 }, { "epoch": 4.864002173248667, "grad_norm": 0.5805482864379883, "learning_rate": 1.0525353830779455e-05, "loss": 0.1644, "step": 35810 }, { "epoch": 4.86536045366566, "grad_norm": 0.9953058362007141, "learning_rate": 1.0501114727510613e-05, "loss": 0.1579, "step": 35820 }, { "epoch": 4.866718734082651, "grad_norm": 0.8332403898239136, "learning_rate": 1.047690029161444e-05, "loss": 0.1679, "step": 35830 }, { "epoch": 4.868077014499644, "grad_norm": 0.568920910358429, "learning_rate": 1.0452710538213068e-05, "loss": 0.1649, "step": 35840 }, { "epoch": 4.869435294916635, "grad_norm": 1.0456722974777222, "learning_rate": 1.0428545482413166e-05, "loss": 0.1673, "step": 35850 }, { "epoch": 4.870793575333628, "grad_norm": 0.6308647394180298, "learning_rate": 1.0404405139306039e-05, "loss": 0.1555, "step": 35860 }, { "epoch": 4.872151855750619, "grad_norm": 0.5413314700126648, "learning_rate": 1.0380289523967496e-05, "loss": 0.1621, "step": 35870 }, { "epoch": 4.873510136167612, "grad_norm": 0.6325806379318237, "learning_rate": 1.035619865145795e-05, "loss": 0.1673, "step": 35880 }, { "epoch": 4.874868416584604, "grad_norm": 0.5445653200149536, "learning_rate": 1.0332132536822353e-05, "loss": 0.1597, "step": 35890 }, { "epoch": 4.876226697001596, "grad_norm": 0.9030548334121704, "learning_rate": 1.0308091195090164e-05, "loss": 0.1609, "step": 35900 }, { "epoch": 4.877584977418588, "grad_norm": 0.6538652181625366, "learning_rate": 1.0284074641275426e-05, "loss": 0.1549, "step": 35910 }, { "epoch": 4.87894325783558, "grad_norm": 0.8065443634986877, "learning_rate": 1.0260082890376637e-05, "loss": 0.162, "step": 35920 }, { "epoch": 4.8803015382525725, "grad_norm": 0.6210176348686218, "learning_rate": 1.0236115957376852e-05, "loss": 0.1577, "step": 35930 }, { "epoch": 4.881659818669564, "grad_norm": 0.5182929635047913, "learning_rate": 1.0212173857243634e-05, "loss": 0.1621, "step": 35940 }, { "epoch": 4.883018099086557, "grad_norm": 0.5413863658905029, "learning_rate": 1.0188256604928986e-05, "loss": 0.165, "step": 35950 }, { "epoch": 4.884376379503548, "grad_norm": 0.516472339630127, "learning_rate": 1.0164364215369459e-05, "loss": 0.1544, "step": 35960 }, { "epoch": 4.885734659920541, "grad_norm": 0.5075691342353821, "learning_rate": 1.0140496703486007e-05, "loss": 0.1663, "step": 35970 }, { "epoch": 4.887092940337533, "grad_norm": 0.7271299958229065, "learning_rate": 1.0116654084184103e-05, "loss": 0.1644, "step": 35980 }, { "epoch": 4.888451220754525, "grad_norm": 0.836216390132904, "learning_rate": 1.0092836372353664e-05, "loss": 0.1606, "step": 35990 }, { "epoch": 4.889809501171517, "grad_norm": 0.5730876326560974, "learning_rate": 1.006904358286902e-05, "loss": 0.1567, "step": 36000 }, { "epoch": 4.891167781588509, "grad_norm": 1.3280506134033203, "learning_rate": 1.0045275730588987e-05, "loss": 0.1722, "step": 36010 }, { "epoch": 4.892526062005501, "grad_norm": 0.5543367862701416, "learning_rate": 1.0021532830356755e-05, "loss": 0.1598, "step": 36020 }, { "epoch": 4.893884342422493, "grad_norm": 0.6130557060241699, "learning_rate": 9.99781489699998e-06, "loss": 0.1578, "step": 36030 }, { "epoch": 4.895242622839485, "grad_norm": 0.7871922850608826, "learning_rate": 9.974121945330678e-06, "loss": 0.1557, "step": 36040 }, { "epoch": 4.896600903256477, "grad_norm": 1.0579646825790405, "learning_rate": 9.950453990145303e-06, "loss": 0.1668, "step": 36050 }, { "epoch": 4.8979591836734695, "grad_norm": 0.47942331433296204, "learning_rate": 9.926811046224693e-06, "loss": 0.1662, "step": 36060 }, { "epoch": 4.899317464090461, "grad_norm": 0.5645143985748291, "learning_rate": 9.903193128334038e-06, "loss": 0.1537, "step": 36070 }, { "epoch": 4.900675744507454, "grad_norm": 0.5454254150390625, "learning_rate": 9.879600251222936e-06, "loss": 0.1626, "step": 36080 }, { "epoch": 4.902034024924445, "grad_norm": 0.6218956112861633, "learning_rate": 9.856032429625312e-06, "loss": 0.1605, "step": 36090 }, { "epoch": 4.903392305341438, "grad_norm": 0.7107550501823425, "learning_rate": 9.832489678259465e-06, "loss": 0.1625, "step": 36100 }, { "epoch": 4.90475058575843, "grad_norm": 0.586273193359375, "learning_rate": 9.808972011828055e-06, "loss": 0.154, "step": 36110 }, { "epoch": 4.906108866175422, "grad_norm": 0.7544655203819275, "learning_rate": 9.78547944501802e-06, "loss": 0.157, "step": 36120 }, { "epoch": 4.907467146592414, "grad_norm": 0.5245651006698608, "learning_rate": 9.76201199250069e-06, "loss": 0.1605, "step": 36130 }, { "epoch": 4.908825427009406, "grad_norm": 0.5093498826026917, "learning_rate": 9.738569668931651e-06, "loss": 0.1514, "step": 36140 }, { "epoch": 4.910183707426398, "grad_norm": 0.565960168838501, "learning_rate": 9.715152488950846e-06, "loss": 0.1698, "step": 36150 }, { "epoch": 4.91154198784339, "grad_norm": 0.627091646194458, "learning_rate": 9.691760467182475e-06, "loss": 0.1627, "step": 36160 }, { "epoch": 4.912900268260382, "grad_norm": 0.44851282238960266, "learning_rate": 9.668393618235045e-06, "loss": 0.1653, "step": 36170 }, { "epoch": 4.914258548677374, "grad_norm": 0.7051156759262085, "learning_rate": 9.645051956701367e-06, "loss": 0.1608, "step": 36180 }, { "epoch": 4.9156168290943665, "grad_norm": 0.517364501953125, "learning_rate": 9.621735497158462e-06, "loss": 0.1547, "step": 36190 }, { "epoch": 4.916975109511359, "grad_norm": 0.6160323023796082, "learning_rate": 9.598444254167682e-06, "loss": 0.161, "step": 36200 }, { "epoch": 4.918333389928351, "grad_norm": 0.5606710314750671, "learning_rate": 9.575178242274563e-06, "loss": 0.1591, "step": 36210 }, { "epoch": 4.919691670345343, "grad_norm": 0.6951674222946167, "learning_rate": 9.551937476008932e-06, "loss": 0.1618, "step": 36220 }, { "epoch": 4.921049950762335, "grad_norm": 0.7252928018569946, "learning_rate": 9.528721969884846e-06, "loss": 0.1595, "step": 36230 }, { "epoch": 4.922408231179327, "grad_norm": 1.1623382568359375, "learning_rate": 9.505531738400559e-06, "loss": 0.1629, "step": 36240 }, { "epoch": 4.923766511596319, "grad_norm": 0.514342188835144, "learning_rate": 9.482366796038572e-06, "loss": 0.1526, "step": 36250 }, { "epoch": 4.925124792013311, "grad_norm": 1.1574760675430298, "learning_rate": 9.45922715726556e-06, "loss": 0.1649, "step": 36260 }, { "epoch": 4.926483072430303, "grad_norm": 0.5672377347946167, "learning_rate": 9.436112836532424e-06, "loss": 0.1588, "step": 36270 }, { "epoch": 4.927841352847295, "grad_norm": 0.42687317728996277, "learning_rate": 9.413023848274254e-06, "loss": 0.1659, "step": 36280 }, { "epoch": 4.929199633264288, "grad_norm": 0.46467864513397217, "learning_rate": 9.389960206910292e-06, "loss": 0.15, "step": 36290 }, { "epoch": 4.930557913681279, "grad_norm": 0.5235816836357117, "learning_rate": 9.36692192684399e-06, "loss": 0.1614, "step": 36300 }, { "epoch": 4.931916194098272, "grad_norm": 0.6426771283149719, "learning_rate": 9.34390902246291e-06, "loss": 0.1729, "step": 36310 }, { "epoch": 4.9332744745152635, "grad_norm": 0.7690723538398743, "learning_rate": 9.320921508138831e-06, "loss": 0.1571, "step": 36320 }, { "epoch": 4.934632754932256, "grad_norm": 0.5093239545822144, "learning_rate": 9.297959398227606e-06, "loss": 0.1665, "step": 36330 }, { "epoch": 4.9359910353492475, "grad_norm": 3.1544127464294434, "learning_rate": 9.275022707069276e-06, "loss": 0.1578, "step": 36340 }, { "epoch": 4.93734931576624, "grad_norm": 0.6152190566062927, "learning_rate": 9.252111448988004e-06, "loss": 0.1632, "step": 36350 }, { "epoch": 4.938707596183232, "grad_norm": 0.6729171276092529, "learning_rate": 9.229225638292028e-06, "loss": 0.1619, "step": 36360 }, { "epoch": 4.940065876600224, "grad_norm": 0.6777529120445251, "learning_rate": 9.206365289273744e-06, "loss": 0.1571, "step": 36370 }, { "epoch": 4.941424157017217, "grad_norm": 0.6416372060775757, "learning_rate": 9.183530416209601e-06, "loss": 0.1604, "step": 36380 }, { "epoch": 4.942782437434208, "grad_norm": 0.534941554069519, "learning_rate": 9.160721033360182e-06, "loss": 0.1525, "step": 36390 }, { "epoch": 4.944140717851201, "grad_norm": 0.9152023196220398, "learning_rate": 9.137937154970128e-06, "loss": 0.1637, "step": 36400 }, { "epoch": 4.945498998268192, "grad_norm": 0.5389887690544128, "learning_rate": 9.115178795268142e-06, "loss": 0.1515, "step": 36410 }, { "epoch": 4.946857278685185, "grad_norm": 0.5866769552230835, "learning_rate": 9.092445968467029e-06, "loss": 0.1584, "step": 36420 }, { "epoch": 4.948215559102176, "grad_norm": 0.9754812121391296, "learning_rate": 9.069738688763585e-06, "loss": 0.1586, "step": 36430 }, { "epoch": 4.949573839519169, "grad_norm": 0.5507887005805969, "learning_rate": 9.047056970338718e-06, "loss": 0.1542, "step": 36440 }, { "epoch": 4.9509321199361604, "grad_norm": 0.5044301152229309, "learning_rate": 9.024400827357344e-06, "loss": 0.1768, "step": 36450 }, { "epoch": 4.952290400353153, "grad_norm": 0.8221532702445984, "learning_rate": 9.00177027396839e-06, "loss": 0.1585, "step": 36460 }, { "epoch": 4.953648680770145, "grad_norm": 0.7588173151016235, "learning_rate": 8.979165324304839e-06, "loss": 0.162, "step": 36470 }, { "epoch": 4.955006961187137, "grad_norm": 0.611163318157196, "learning_rate": 8.956585992483646e-06, "loss": 0.1758, "step": 36480 }, { "epoch": 4.9563652416041295, "grad_norm": 0.3973206877708435, "learning_rate": 8.934032292605804e-06, "loss": 0.1592, "step": 36490 }, { "epoch": 4.957723522021121, "grad_norm": 1.590853214263916, "learning_rate": 8.911504238756257e-06, "loss": 0.1534, "step": 36500 }, { "epoch": 4.959081802438114, "grad_norm": 0.7048749327659607, "learning_rate": 8.889001845003975e-06, "loss": 0.1558, "step": 36510 }, { "epoch": 4.960440082855105, "grad_norm": 0.693286120891571, "learning_rate": 8.866525125401892e-06, "loss": 0.1536, "step": 36520 }, { "epoch": 4.961798363272098, "grad_norm": 0.5135152339935303, "learning_rate": 8.844074093986875e-06, "loss": 0.1562, "step": 36530 }, { "epoch": 4.963156643689089, "grad_norm": 0.6123496890068054, "learning_rate": 8.821648764779805e-06, "loss": 0.1616, "step": 36540 }, { "epoch": 4.964514924106082, "grad_norm": 0.7153862118721008, "learning_rate": 8.799249151785444e-06, "loss": 0.1672, "step": 36550 }, { "epoch": 4.965873204523074, "grad_norm": 0.5913252234458923, "learning_rate": 8.776875268992557e-06, "loss": 0.1501, "step": 36560 }, { "epoch": 4.967231484940066, "grad_norm": 0.7356968522071838, "learning_rate": 8.754527130373823e-06, "loss": 0.1524, "step": 36570 }, { "epoch": 4.968589765357058, "grad_norm": 1.486419677734375, "learning_rate": 8.732204749885804e-06, "loss": 0.1537, "step": 36580 }, { "epoch": 4.96994804577405, "grad_norm": 0.470673531293869, "learning_rate": 8.709908141469042e-06, "loss": 0.1481, "step": 36590 }, { "epoch": 4.971306326191042, "grad_norm": 0.6168349981307983, "learning_rate": 8.687637319047915e-06, "loss": 0.1672, "step": 36600 }, { "epoch": 4.972664606608034, "grad_norm": 0.7776844501495361, "learning_rate": 8.665392296530744e-06, "loss": 0.1568, "step": 36610 }, { "epoch": 4.9740228870250265, "grad_norm": 0.4439985156059265, "learning_rate": 8.643173087809742e-06, "loss": 0.1642, "step": 36620 }, { "epoch": 4.975381167442018, "grad_norm": 0.6040493249893188, "learning_rate": 8.620979706760963e-06, "loss": 0.1642, "step": 36630 }, { "epoch": 4.976739447859011, "grad_norm": 0.7068130373954773, "learning_rate": 8.59881216724437e-06, "loss": 0.1596, "step": 36640 }, { "epoch": 4.978097728276003, "grad_norm": 3.9417293071746826, "learning_rate": 8.576670483103744e-06, "loss": 0.1707, "step": 36650 }, { "epoch": 4.979456008692995, "grad_norm": 0.5553402304649353, "learning_rate": 8.554554668166775e-06, "loss": 0.1564, "step": 36660 }, { "epoch": 4.980814289109987, "grad_norm": 0.6733876466751099, "learning_rate": 8.532464736244944e-06, "loss": 0.1593, "step": 36670 }, { "epoch": 4.982172569526979, "grad_norm": 0.9246433973312378, "learning_rate": 8.510400701133603e-06, "loss": 0.1642, "step": 36680 }, { "epoch": 4.983530849943971, "grad_norm": 0.6739228367805481, "learning_rate": 8.48836257661193e-06, "loss": 0.1628, "step": 36690 }, { "epoch": 4.984889130360963, "grad_norm": 0.8586932420730591, "learning_rate": 8.466350376442894e-06, "loss": 0.161, "step": 36700 }, { "epoch": 4.986247410777955, "grad_norm": 0.699489951133728, "learning_rate": 8.444364114373304e-06, "loss": 0.1633, "step": 36710 }, { "epoch": 4.987605691194947, "grad_norm": 0.8245803117752075, "learning_rate": 8.42240380413375e-06, "loss": 0.1513, "step": 36720 }, { "epoch": 4.988963971611939, "grad_norm": 0.6739000082015991, "learning_rate": 8.40046945943862e-06, "loss": 0.1593, "step": 36730 }, { "epoch": 4.990322252028932, "grad_norm": 1.1003326177597046, "learning_rate": 8.378561093986103e-06, "loss": 0.1614, "step": 36740 }, { "epoch": 4.9916805324459235, "grad_norm": 0.5290356874465942, "learning_rate": 8.356678721458138e-06, "loss": 0.1643, "step": 36750 }, { "epoch": 4.993038812862915, "grad_norm": 0.5315836071968079, "learning_rate": 8.334822355520455e-06, "loss": 0.1596, "step": 36760 }, { "epoch": 4.9943970932799076, "grad_norm": 0.9868863224983215, "learning_rate": 8.312992009822517e-06, "loss": 0.1489, "step": 36770 }, { "epoch": 4.9957553736969, "grad_norm": 0.5166692733764648, "learning_rate": 8.291187697997555e-06, "loss": 0.1627, "step": 36780 }, { "epoch": 4.997113654113892, "grad_norm": 0.541694164276123, "learning_rate": 8.26940943366255e-06, "loss": 0.165, "step": 36790 }, { "epoch": 4.998471934530884, "grad_norm": 0.5839493870735168, "learning_rate": 8.247657230418183e-06, "loss": 0.1664, "step": 36800 }, { "epoch": 4.999830214947876, "grad_norm": 0.4947507977485657, "learning_rate": 8.225931101848905e-06, "loss": 0.1737, "step": 36810 }, { "epoch": 5.001188495364868, "grad_norm": 0.46263548731803894, "learning_rate": 8.204231061522839e-06, "loss": 0.1562, "step": 36820 }, { "epoch": 5.00254677578186, "grad_norm": 0.4207012951374054, "learning_rate": 8.182557122991846e-06, "loss": 0.1318, "step": 36830 }, { "epoch": 5.003905056198852, "grad_norm": 0.5723206400871277, "learning_rate": 8.160909299791458e-06, "loss": 0.1501, "step": 36840 }, { "epoch": 5.005263336615845, "grad_norm": 0.5125661492347717, "learning_rate": 8.139287605440937e-06, "loss": 0.1469, "step": 36850 }, { "epoch": 5.006621617032836, "grad_norm": 0.49459606409072876, "learning_rate": 8.117692053443205e-06, "loss": 0.1426, "step": 36860 }, { "epoch": 5.007979897449829, "grad_norm": 0.9116210341453552, "learning_rate": 8.096122657284838e-06, "loss": 0.1407, "step": 36870 }, { "epoch": 5.0093381778668205, "grad_norm": 1.0275055170059204, "learning_rate": 8.074579430436124e-06, "loss": 0.1483, "step": 36880 }, { "epoch": 5.010696458283813, "grad_norm": 0.8762903809547424, "learning_rate": 8.053062386350957e-06, "loss": 0.1326, "step": 36890 }, { "epoch": 5.0120547387008045, "grad_norm": 0.854550838470459, "learning_rate": 8.031571538466914e-06, "loss": 0.1392, "step": 36900 }, { "epoch": 5.013413019117797, "grad_norm": 0.9890286326408386, "learning_rate": 8.010106900205221e-06, "loss": 0.1524, "step": 36910 }, { "epoch": 5.014771299534789, "grad_norm": 0.7599340677261353, "learning_rate": 7.98866848497069e-06, "loss": 0.1486, "step": 36920 }, { "epoch": 5.016129579951781, "grad_norm": 0.682043731212616, "learning_rate": 7.96725630615181e-06, "loss": 0.149, "step": 36930 }, { "epoch": 5.017487860368773, "grad_norm": 0.6591745615005493, "learning_rate": 7.94587037712063e-06, "loss": 0.1446, "step": 36940 }, { "epoch": 5.018846140785765, "grad_norm": 0.6125190258026123, "learning_rate": 7.924510711232869e-06, "loss": 0.142, "step": 36950 }, { "epoch": 5.020204421202758, "grad_norm": 0.5530497431755066, "learning_rate": 7.903177321827776e-06, "loss": 0.1348, "step": 36960 }, { "epoch": 5.021562701619749, "grad_norm": 0.7608485817909241, "learning_rate": 7.88187022222825e-06, "loss": 0.1568, "step": 36970 }, { "epoch": 5.022920982036742, "grad_norm": 0.8525851964950562, "learning_rate": 7.860589425740744e-06, "loss": 0.1446, "step": 36980 }, { "epoch": 5.024279262453733, "grad_norm": 0.7583865523338318, "learning_rate": 7.839334945655275e-06, "loss": 0.1352, "step": 36990 }, { "epoch": 5.025637542870726, "grad_norm": 0.6898939609527588, "learning_rate": 7.81810679524545e-06, "loss": 0.1479, "step": 37000 }, { "epoch": 5.026995823287717, "grad_norm": 2.2655410766601562, "learning_rate": 7.79690498776841e-06, "loss": 0.1377, "step": 37010 }, { "epoch": 5.02835410370471, "grad_norm": 0.8028996586799622, "learning_rate": 7.775729536464855e-06, "loss": 0.1547, "step": 37020 }, { "epoch": 5.0297123841217015, "grad_norm": 0.8012681603431702, "learning_rate": 7.754580454559046e-06, "loss": 0.1488, "step": 37030 }, { "epoch": 5.031070664538694, "grad_norm": 0.6341643929481506, "learning_rate": 7.733457755258722e-06, "loss": 0.1424, "step": 37040 }, { "epoch": 5.0324289449556865, "grad_norm": 0.8970765471458435, "learning_rate": 7.712361451755213e-06, "loss": 0.1481, "step": 37050 }, { "epoch": 5.033787225372678, "grad_norm": 1.7224271297454834, "learning_rate": 7.691291557223302e-06, "loss": 0.145, "step": 37060 }, { "epoch": 5.035145505789671, "grad_norm": 0.5768775939941406, "learning_rate": 7.670248084821318e-06, "loss": 0.1365, "step": 37070 }, { "epoch": 5.036503786206662, "grad_norm": 0.6703713536262512, "learning_rate": 7.649231047691097e-06, "loss": 0.1381, "step": 37080 }, { "epoch": 5.037862066623655, "grad_norm": 0.6902905106544495, "learning_rate": 7.628240458957919e-06, "loss": 0.1363, "step": 37090 }, { "epoch": 5.039220347040646, "grad_norm": 0.7954835891723633, "learning_rate": 7.607276331730606e-06, "loss": 0.1412, "step": 37100 }, { "epoch": 5.040578627457639, "grad_norm": 1.274601936340332, "learning_rate": 7.58633867910139e-06, "loss": 0.1407, "step": 37110 }, { "epoch": 5.04193690787463, "grad_norm": 0.690719723701477, "learning_rate": 7.56542751414604e-06, "loss": 0.1333, "step": 37120 }, { "epoch": 5.043295188291623, "grad_norm": 0.7551559209823608, "learning_rate": 7.544542849923719e-06, "loss": 0.1468, "step": 37130 }, { "epoch": 5.044653468708615, "grad_norm": 1.6297558546066284, "learning_rate": 7.52368469947708e-06, "loss": 0.1454, "step": 37140 }, { "epoch": 5.046011749125607, "grad_norm": 1.0627553462982178, "learning_rate": 7.5028530758322166e-06, "loss": 0.1433, "step": 37150 }, { "epoch": 5.047370029542599, "grad_norm": 2.9289660453796387, "learning_rate": 7.482047991998625e-06, "loss": 0.1459, "step": 37160 }, { "epoch": 5.048728309959591, "grad_norm": 0.7092175483703613, "learning_rate": 7.461269460969267e-06, "loss": 0.1403, "step": 37170 }, { "epoch": 5.0500865903765835, "grad_norm": 0.7403029799461365, "learning_rate": 7.440517495720489e-06, "loss": 0.1413, "step": 37180 }, { "epoch": 5.051444870793575, "grad_norm": 0.9704974889755249, "learning_rate": 7.4197921092120595e-06, "loss": 0.1431, "step": 37190 }, { "epoch": 5.052803151210568, "grad_norm": 0.819027304649353, "learning_rate": 7.399093314387173e-06, "loss": 0.1402, "step": 37200 }, { "epoch": 5.054161431627559, "grad_norm": 0.5768560171127319, "learning_rate": 7.378421124172363e-06, "loss": 0.1411, "step": 37210 }, { "epoch": 5.055519712044552, "grad_norm": 0.6238333582878113, "learning_rate": 7.357775551477608e-06, "loss": 0.1339, "step": 37220 }, { "epoch": 5.056877992461544, "grad_norm": 0.4659234285354614, "learning_rate": 7.337156609196205e-06, "loss": 0.1424, "step": 37230 }, { "epoch": 5.058236272878536, "grad_norm": 0.8960242867469788, "learning_rate": 7.316564310204865e-06, "loss": 0.1467, "step": 37240 }, { "epoch": 5.059594553295528, "grad_norm": 0.470428466796875, "learning_rate": 7.295998667363657e-06, "loss": 0.1496, "step": 37250 }, { "epoch": 5.06095283371252, "grad_norm": 0.8899978995323181, "learning_rate": 7.275459693515968e-06, "loss": 0.1362, "step": 37260 }, { "epoch": 5.062311114129512, "grad_norm": 0.9902434349060059, "learning_rate": 7.254947401488576e-06, "loss": 0.1418, "step": 37270 }, { "epoch": 5.063669394546504, "grad_norm": 0.6578777432441711, "learning_rate": 7.234461804091552e-06, "loss": 0.1404, "step": 37280 }, { "epoch": 5.065027674963496, "grad_norm": 0.7439103126525879, "learning_rate": 7.214002914118334e-06, "loss": 0.1369, "step": 37290 }, { "epoch": 5.066385955380488, "grad_norm": 1.0886120796203613, "learning_rate": 7.193570744345646e-06, "loss": 0.1465, "step": 37300 }, { "epoch": 5.0677442357974805, "grad_norm": 0.7392677068710327, "learning_rate": 7.173165307533553e-06, "loss": 0.1438, "step": 37310 }, { "epoch": 5.069102516214472, "grad_norm": 0.8386593461036682, "learning_rate": 7.1527866164254275e-06, "loss": 0.149, "step": 37320 }, { "epoch": 5.0704607966314645, "grad_norm": 0.8222172856330872, "learning_rate": 7.1324346837478964e-06, "loss": 0.144, "step": 37330 }, { "epoch": 5.071819077048457, "grad_norm": 0.645042359828949, "learning_rate": 7.112109522210941e-06, "loss": 0.138, "step": 37340 }, { "epoch": 5.073177357465449, "grad_norm": 0.9194858074188232, "learning_rate": 7.091811144507759e-06, "loss": 0.1452, "step": 37350 }, { "epoch": 5.074535637882441, "grad_norm": 0.6993178129196167, "learning_rate": 7.07153956331486e-06, "loss": 0.1415, "step": 37360 }, { "epoch": 5.075893918299433, "grad_norm": 0.7300678491592407, "learning_rate": 7.051294791292029e-06, "loss": 0.139, "step": 37370 }, { "epoch": 5.077252198716425, "grad_norm": 0.7479437589645386, "learning_rate": 7.031076841082257e-06, "loss": 0.1506, "step": 37380 }, { "epoch": 5.078610479133417, "grad_norm": 0.6381216645240784, "learning_rate": 7.0108857253118494e-06, "loss": 0.136, "step": 37390 }, { "epoch": 5.079968759550409, "grad_norm": 0.8349984884262085, "learning_rate": 6.990721456590294e-06, "loss": 0.1492, "step": 37400 }, { "epoch": 5.081327039967401, "grad_norm": 0.8060391545295715, "learning_rate": 6.970584047510348e-06, "loss": 0.1443, "step": 37410 }, { "epoch": 5.082685320384393, "grad_norm": 0.5884799957275391, "learning_rate": 6.9504735106480035e-06, "loss": 0.1455, "step": 37420 }, { "epoch": 5.084043600801386, "grad_norm": 0.662348210811615, "learning_rate": 6.930389858562425e-06, "loss": 0.1346, "step": 37430 }, { "epoch": 5.085401881218377, "grad_norm": 0.5954204797744751, "learning_rate": 6.910333103796041e-06, "loss": 0.1427, "step": 37440 }, { "epoch": 5.08676016163537, "grad_norm": 0.6940977573394775, "learning_rate": 6.890303258874431e-06, "loss": 0.1389, "step": 37450 }, { "epoch": 5.0881184420523615, "grad_norm": 0.8290256261825562, "learning_rate": 6.870300336306423e-06, "loss": 0.1389, "step": 37460 }, { "epoch": 5.089476722469354, "grad_norm": 1.0125911235809326, "learning_rate": 6.850324348583975e-06, "loss": 0.1508, "step": 37470 }, { "epoch": 5.090835002886346, "grad_norm": 0.6352442502975464, "learning_rate": 6.8303753081822684e-06, "loss": 0.1325, "step": 37480 }, { "epoch": 5.092193283303338, "grad_norm": 1.411110281944275, "learning_rate": 6.810453227559649e-06, "loss": 0.1533, "step": 37490 }, { "epoch": 5.09355156372033, "grad_norm": 0.848087728023529, "learning_rate": 6.790558119157597e-06, "loss": 0.1476, "step": 37500 }, { "epoch": 5.094909844137322, "grad_norm": 0.7833274006843567, "learning_rate": 6.770689995400786e-06, "loss": 0.1389, "step": 37510 }, { "epoch": 5.096268124554315, "grad_norm": 0.7175629138946533, "learning_rate": 6.750848868696996e-06, "loss": 0.143, "step": 37520 }, { "epoch": 5.097626404971306, "grad_norm": 0.5630800724029541, "learning_rate": 6.731034751437193e-06, "loss": 0.1431, "step": 37530 }, { "epoch": 5.098984685388299, "grad_norm": 0.8204418420791626, "learning_rate": 6.711247655995451e-06, "loss": 0.1486, "step": 37540 }, { "epoch": 5.10034296580529, "grad_norm": 2.381701946258545, "learning_rate": 6.6914875947289565e-06, "loss": 0.1354, "step": 37550 }, { "epoch": 5.101701246222283, "grad_norm": 0.6926611661911011, "learning_rate": 6.671754579978046e-06, "loss": 0.1446, "step": 37560 }, { "epoch": 5.103059526639274, "grad_norm": 0.7546995878219604, "learning_rate": 6.6520486240661315e-06, "loss": 0.1481, "step": 37570 }, { "epoch": 5.104417807056267, "grad_norm": 0.9238495230674744, "learning_rate": 6.632369739299743e-06, "loss": 0.1522, "step": 37580 }, { "epoch": 5.1057760874732585, "grad_norm": 0.8862663507461548, "learning_rate": 6.612717937968516e-06, "loss": 0.131, "step": 37590 }, { "epoch": 5.107134367890251, "grad_norm": 0.6986141204833984, "learning_rate": 6.5930932323451435e-06, "loss": 0.1507, "step": 37600 }, { "epoch": 5.108492648307243, "grad_norm": 0.6973748803138733, "learning_rate": 6.573495634685422e-06, "loss": 0.1363, "step": 37610 }, { "epoch": 5.109850928724235, "grad_norm": 1.6127837896347046, "learning_rate": 6.553925157228202e-06, "loss": 0.1382, "step": 37620 }, { "epoch": 5.111209209141228, "grad_norm": 0.7637509107589722, "learning_rate": 6.5343818121954225e-06, "loss": 0.1385, "step": 37630 }, { "epoch": 5.112567489558219, "grad_norm": 0.9822803735733032, "learning_rate": 6.514865611792032e-06, "loss": 0.1421, "step": 37640 }, { "epoch": 5.113925769975212, "grad_norm": 0.6650543212890625, "learning_rate": 6.495376568206069e-06, "loss": 0.1433, "step": 37650 }, { "epoch": 5.115284050392203, "grad_norm": 0.5677294731140137, "learning_rate": 6.475914693608609e-06, "loss": 0.1427, "step": 37660 }, { "epoch": 5.116642330809196, "grad_norm": 0.6195065379142761, "learning_rate": 6.456480000153731e-06, "loss": 0.1391, "step": 37670 }, { "epoch": 5.118000611226187, "grad_norm": 1.1822123527526855, "learning_rate": 6.437072499978581e-06, "loss": 0.1447, "step": 37680 }, { "epoch": 5.11935889164318, "grad_norm": 0.7930792570114136, "learning_rate": 6.417692205203268e-06, "loss": 0.1425, "step": 37690 }, { "epoch": 5.120717172060171, "grad_norm": 0.7906290292739868, "learning_rate": 6.398339127930969e-06, "loss": 0.1441, "step": 37700 }, { "epoch": 5.122075452477164, "grad_norm": 0.5696882009506226, "learning_rate": 6.379013280247831e-06, "loss": 0.1488, "step": 37710 }, { "epoch": 5.123433732894156, "grad_norm": 0.5877124667167664, "learning_rate": 6.359714674222994e-06, "loss": 0.1337, "step": 37720 }, { "epoch": 5.124792013311148, "grad_norm": 0.6102612614631653, "learning_rate": 6.3404433219086065e-06, "loss": 0.139, "step": 37730 }, { "epoch": 5.1261502937281405, "grad_norm": 0.8546424508094788, "learning_rate": 6.321199235339764e-06, "loss": 0.1446, "step": 37740 }, { "epoch": 5.127508574145132, "grad_norm": 0.7998032569885254, "learning_rate": 6.301982426534575e-06, "loss": 0.1438, "step": 37750 }, { "epoch": 5.1288668545621245, "grad_norm": 0.9482899904251099, "learning_rate": 6.282792907494073e-06, "loss": 0.1491, "step": 37760 }, { "epoch": 5.130225134979116, "grad_norm": 0.667029857635498, "learning_rate": 6.263630690202277e-06, "loss": 0.1328, "step": 37770 }, { "epoch": 5.131583415396109, "grad_norm": 1.4296046495437622, "learning_rate": 6.244495786626159e-06, "loss": 0.1434, "step": 37780 }, { "epoch": 5.1329416958131, "grad_norm": 1.0259934663772583, "learning_rate": 6.225388208715593e-06, "loss": 0.1389, "step": 37790 }, { "epoch": 5.134299976230093, "grad_norm": 0.9490211606025696, "learning_rate": 6.206307968403441e-06, "loss": 0.1437, "step": 37800 }, { "epoch": 5.135658256647085, "grad_norm": 0.6145749688148499, "learning_rate": 6.187255077605447e-06, "loss": 0.1439, "step": 37810 }, { "epoch": 5.137016537064077, "grad_norm": 0.7513242959976196, "learning_rate": 6.168229548220305e-06, "loss": 0.1407, "step": 37820 }, { "epoch": 5.138374817481069, "grad_norm": 0.9439026117324829, "learning_rate": 6.149231392129617e-06, "loss": 0.1492, "step": 37830 }, { "epoch": 5.139733097898061, "grad_norm": 1.287014126777649, "learning_rate": 6.130260621197875e-06, "loss": 0.1491, "step": 37840 }, { "epoch": 5.141091378315053, "grad_norm": 0.43638214468955994, "learning_rate": 6.111317247272491e-06, "loss": 0.1488, "step": 37850 }, { "epoch": 5.142449658732045, "grad_norm": 0.5103422403335571, "learning_rate": 6.092401282183735e-06, "loss": 0.1405, "step": 37860 }, { "epoch": 5.1438079391490374, "grad_norm": 0.7239353060722351, "learning_rate": 6.073512737744791e-06, "loss": 0.1281, "step": 37870 }, { "epoch": 5.145166219566029, "grad_norm": 1.149536371231079, "learning_rate": 6.054651625751717e-06, "loss": 0.1459, "step": 37880 }, { "epoch": 5.1465244999830215, "grad_norm": 0.5514386296272278, "learning_rate": 6.035817957983408e-06, "loss": 0.1469, "step": 37890 }, { "epoch": 5.147882780400014, "grad_norm": 0.7567188739776611, "learning_rate": 6.017011746201673e-06, "loss": 0.1476, "step": 37900 }, { "epoch": 5.149241060817006, "grad_norm": 0.9504964351654053, "learning_rate": 5.998233002151104e-06, "loss": 0.1511, "step": 37910 }, { "epoch": 5.150599341233998, "grad_norm": 0.5733868479728699, "learning_rate": 5.979481737559217e-06, "loss": 0.1447, "step": 37920 }, { "epoch": 5.15195762165099, "grad_norm": 0.5770232677459717, "learning_rate": 5.960757964136293e-06, "loss": 0.1407, "step": 37930 }, { "epoch": 5.153315902067982, "grad_norm": 0.6442888379096985, "learning_rate": 5.9420616935754955e-06, "loss": 0.1457, "step": 37940 }, { "epoch": 5.154674182484974, "grad_norm": 0.8004508018493652, "learning_rate": 5.923392937552813e-06, "loss": 0.1397, "step": 37950 }, { "epoch": 5.156032462901966, "grad_norm": 0.38998937606811523, "learning_rate": 5.904751707727002e-06, "loss": 0.131, "step": 37960 }, { "epoch": 5.157390743318958, "grad_norm": 0.8198990821838379, "learning_rate": 5.886138015739689e-06, "loss": 0.1435, "step": 37970 }, { "epoch": 5.15874902373595, "grad_norm": 1.5486451387405396, "learning_rate": 5.8675518732152545e-06, "loss": 0.1377, "step": 37980 }, { "epoch": 5.160107304152943, "grad_norm": 0.6733680963516235, "learning_rate": 5.848993291760907e-06, "loss": 0.1486, "step": 37990 }, { "epoch": 5.161465584569934, "grad_norm": 1.045994520187378, "learning_rate": 5.830462282966631e-06, "loss": 0.1577, "step": 38000 }, { "epoch": 5.162823864986927, "grad_norm": 0.678294837474823, "learning_rate": 5.811958858405176e-06, "loss": 0.1507, "step": 38010 }, { "epoch": 5.1641821454039185, "grad_norm": 1.1025445461273193, "learning_rate": 5.7934830296321016e-06, "loss": 0.1526, "step": 38020 }, { "epoch": 5.165540425820911, "grad_norm": 0.6187464594841003, "learning_rate": 5.775034808185692e-06, "loss": 0.1479, "step": 38030 }, { "epoch": 5.166898706237903, "grad_norm": 0.7098843455314636, "learning_rate": 5.756614205587024e-06, "loss": 0.1317, "step": 38040 }, { "epoch": 5.168256986654895, "grad_norm": 1.228255271911621, "learning_rate": 5.738221233339913e-06, "loss": 0.1351, "step": 38050 }, { "epoch": 5.169615267071887, "grad_norm": 0.5622893571853638, "learning_rate": 5.719855902930904e-06, "loss": 0.1326, "step": 38060 }, { "epoch": 5.170973547488879, "grad_norm": 0.6769603490829468, "learning_rate": 5.701518225829311e-06, "loss": 0.1515, "step": 38070 }, { "epoch": 5.172331827905871, "grad_norm": 2.274784564971924, "learning_rate": 5.683208213487146e-06, "loss": 0.1452, "step": 38080 }, { "epoch": 5.173690108322863, "grad_norm": 1.1002285480499268, "learning_rate": 5.664925877339172e-06, "loss": 0.1191, "step": 38090 }, { "epoch": 5.175048388739856, "grad_norm": 0.550876259803772, "learning_rate": 5.6466712288028575e-06, "loss": 0.1459, "step": 38100 }, { "epoch": 5.176406669156847, "grad_norm": 1.1047718524932861, "learning_rate": 5.628444279278361e-06, "loss": 0.1315, "step": 38110 }, { "epoch": 5.17776494957384, "grad_norm": 0.5925779938697815, "learning_rate": 5.6102450401485814e-06, "loss": 0.1456, "step": 38120 }, { "epoch": 5.179123229990831, "grad_norm": 1.0348660945892334, "learning_rate": 5.5920735227790675e-06, "loss": 0.1504, "step": 38130 }, { "epoch": 5.180481510407824, "grad_norm": 0.9550721049308777, "learning_rate": 5.573929738518102e-06, "loss": 0.1437, "step": 38140 }, { "epoch": 5.1818397908248155, "grad_norm": 0.640778124332428, "learning_rate": 5.555813698696599e-06, "loss": 0.1418, "step": 38150 }, { "epoch": 5.183198071241808, "grad_norm": 0.750571608543396, "learning_rate": 5.537725414628192e-06, "loss": 0.1437, "step": 38160 }, { "epoch": 5.1845563516588, "grad_norm": 0.8570418357849121, "learning_rate": 5.5196648976091555e-06, "loss": 0.143, "step": 38170 }, { "epoch": 5.185914632075792, "grad_norm": 1.2913432121276855, "learning_rate": 5.501632158918424e-06, "loss": 0.1346, "step": 38180 }, { "epoch": 5.1872729124927845, "grad_norm": 0.783355712890625, "learning_rate": 5.483627209817599e-06, "loss": 0.1637, "step": 38190 }, { "epoch": 5.188631192909776, "grad_norm": 0.9120215773582458, "learning_rate": 5.4656500615508976e-06, "loss": 0.1423, "step": 38200 }, { "epoch": 5.189989473326769, "grad_norm": 0.7750624418258667, "learning_rate": 5.447700725345201e-06, "loss": 0.1349, "step": 38210 }, { "epoch": 5.19134775374376, "grad_norm": 0.5739421248435974, "learning_rate": 5.429779212410036e-06, "loss": 0.1378, "step": 38220 }, { "epoch": 5.192706034160753, "grad_norm": 0.6783469319343567, "learning_rate": 5.4118855339375e-06, "loss": 0.1451, "step": 38230 }, { "epoch": 5.194064314577744, "grad_norm": 0.9806886315345764, "learning_rate": 5.394019701102359e-06, "loss": 0.154, "step": 38240 }, { "epoch": 5.195422594994737, "grad_norm": 1.297784447669983, "learning_rate": 5.376181725061963e-06, "loss": 0.1418, "step": 38250 }, { "epoch": 5.196780875411728, "grad_norm": 1.681824803352356, "learning_rate": 5.35837161695627e-06, "loss": 0.1435, "step": 38260 }, { "epoch": 5.198139155828721, "grad_norm": 0.9883407354354858, "learning_rate": 5.3405893879078515e-06, "loss": 0.1397, "step": 38270 }, { "epoch": 5.1994974362457125, "grad_norm": 0.6000626683235168, "learning_rate": 5.322835049021829e-06, "loss": 0.1462, "step": 38280 }, { "epoch": 5.200855716662705, "grad_norm": 1.2666126489639282, "learning_rate": 5.305108611385956e-06, "loss": 0.1412, "step": 38290 }, { "epoch": 5.2022139970796974, "grad_norm": 0.7814596891403198, "learning_rate": 5.2874100860705154e-06, "loss": 0.1497, "step": 38300 }, { "epoch": 5.203572277496689, "grad_norm": 0.6896006464958191, "learning_rate": 5.269739484128383e-06, "loss": 0.1396, "step": 38310 }, { "epoch": 5.2049305579136815, "grad_norm": 0.9841863512992859, "learning_rate": 5.252096816595009e-06, "loss": 0.1503, "step": 38320 }, { "epoch": 5.206288838330673, "grad_norm": 0.9172872304916382, "learning_rate": 5.234482094488358e-06, "loss": 0.1492, "step": 38330 }, { "epoch": 5.207647118747666, "grad_norm": 1.1239937543869019, "learning_rate": 5.216895328808985e-06, "loss": 0.14, "step": 38340 }, { "epoch": 5.209005399164657, "grad_norm": 0.928043782711029, "learning_rate": 5.19933653053995e-06, "loss": 0.144, "step": 38350 }, { "epoch": 5.21036367958165, "grad_norm": 0.7799134850502014, "learning_rate": 5.181805710646881e-06, "loss": 0.1397, "step": 38360 }, { "epoch": 5.211721959998641, "grad_norm": 1.1627172231674194, "learning_rate": 5.1643028800779034e-06, "loss": 0.1443, "step": 38370 }, { "epoch": 5.213080240415634, "grad_norm": 0.5821877121925354, "learning_rate": 5.14682804976368e-06, "loss": 0.1369, "step": 38380 }, { "epoch": 5.214438520832626, "grad_norm": 0.7261438965797424, "learning_rate": 5.129381230617392e-06, "loss": 0.1491, "step": 38390 }, { "epoch": 5.215796801249618, "grad_norm": 0.5427941083908081, "learning_rate": 5.111962433534706e-06, "loss": 0.1379, "step": 38400 }, { "epoch": 5.21715508166661, "grad_norm": 0.7005975842475891, "learning_rate": 5.094571669393816e-06, "loss": 0.145, "step": 38410 }, { "epoch": 5.218513362083602, "grad_norm": 0.6331013441085815, "learning_rate": 5.0772089490553735e-06, "loss": 0.1379, "step": 38420 }, { "epoch": 5.219871642500594, "grad_norm": 1.0737696886062622, "learning_rate": 5.0598742833625534e-06, "loss": 0.1446, "step": 38430 }, { "epoch": 5.221229922917586, "grad_norm": 0.7997241616249084, "learning_rate": 5.0425676831409975e-06, "loss": 0.1514, "step": 38440 }, { "epoch": 5.2225882033345785, "grad_norm": 0.7141030430793762, "learning_rate": 5.025289159198798e-06, "loss": 0.1434, "step": 38450 }, { "epoch": 5.22394648375157, "grad_norm": 1.0287553071975708, "learning_rate": 5.008038722326563e-06, "loss": 0.1371, "step": 38460 }, { "epoch": 5.225304764168563, "grad_norm": 0.7938691973686218, "learning_rate": 4.990816383297298e-06, "loss": 0.1456, "step": 38470 }, { "epoch": 5.226663044585555, "grad_norm": 0.5560646057128906, "learning_rate": 4.973622152866503e-06, "loss": 0.1439, "step": 38480 }, { "epoch": 5.228021325002547, "grad_norm": 0.7456498742103577, "learning_rate": 4.956456041772128e-06, "loss": 0.1373, "step": 38490 }, { "epoch": 5.229379605419539, "grad_norm": 1.7046796083450317, "learning_rate": 4.939318060734533e-06, "loss": 0.141, "step": 38500 }, { "epoch": 5.230737885836531, "grad_norm": 0.8667698502540588, "learning_rate": 4.922208220456537e-06, "loss": 0.1407, "step": 38510 }, { "epoch": 5.232096166253523, "grad_norm": 1.1419817209243774, "learning_rate": 4.90512653162335e-06, "loss": 0.1376, "step": 38520 }, { "epoch": 5.233454446670515, "grad_norm": 0.7932004332542419, "learning_rate": 4.888073004902649e-06, "loss": 0.1434, "step": 38530 }, { "epoch": 5.234812727087507, "grad_norm": 0.4832431972026825, "learning_rate": 4.871047650944494e-06, "loss": 0.1408, "step": 38540 }, { "epoch": 5.236171007504499, "grad_norm": 1.0320626497268677, "learning_rate": 4.854050480381345e-06, "loss": 0.1341, "step": 38550 }, { "epoch": 5.237529287921491, "grad_norm": 0.771741509437561, "learning_rate": 4.837081503828089e-06, "loss": 0.1484, "step": 38560 }, { "epoch": 5.238887568338484, "grad_norm": 1.4774073362350464, "learning_rate": 4.8201407318819725e-06, "loss": 0.1479, "step": 38570 }, { "epoch": 5.2402458487554755, "grad_norm": 0.7428338527679443, "learning_rate": 4.803228175122659e-06, "loss": 0.1378, "step": 38580 }, { "epoch": 5.241604129172468, "grad_norm": 1.107413411140442, "learning_rate": 4.786343844112157e-06, "loss": 0.1494, "step": 38590 }, { "epoch": 5.24296240958946, "grad_norm": 0.9535856246948242, "learning_rate": 4.769487749394885e-06, "loss": 0.1467, "step": 38600 }, { "epoch": 5.244320690006452, "grad_norm": 0.6381769776344299, "learning_rate": 4.752659901497608e-06, "loss": 0.1422, "step": 38610 }, { "epoch": 5.245678970423444, "grad_norm": 0.7972875237464905, "learning_rate": 4.735860310929447e-06, "loss": 0.1485, "step": 38620 }, { "epoch": 5.247037250840436, "grad_norm": 0.817256510257721, "learning_rate": 4.7190889881818916e-06, "loss": 0.1363, "step": 38630 }, { "epoch": 5.248395531257428, "grad_norm": 1.5639797449111938, "learning_rate": 4.7023459437287475e-06, "loss": 0.1348, "step": 38640 }, { "epoch": 5.24975381167442, "grad_norm": 0.7771941423416138, "learning_rate": 4.685631188026196e-06, "loss": 0.1385, "step": 38650 }, { "epoch": 5.251112092091413, "grad_norm": 0.9127650260925293, "learning_rate": 4.668944731512748e-06, "loss": 0.1417, "step": 38660 }, { "epoch": 5.252470372508404, "grad_norm": 0.8027299642562866, "learning_rate": 4.6522865846092115e-06, "loss": 0.1284, "step": 38670 }, { "epoch": 5.253828652925397, "grad_norm": 0.7825043201446533, "learning_rate": 4.635656757718742e-06, "loss": 0.1391, "step": 38680 }, { "epoch": 5.255186933342388, "grad_norm": 0.9476396441459656, "learning_rate": 4.619055261226795e-06, "loss": 0.1381, "step": 38690 }, { "epoch": 5.256545213759381, "grad_norm": 0.7814838886260986, "learning_rate": 4.602482105501144e-06, "loss": 0.1524, "step": 38700 }, { "epoch": 5.2579034941763725, "grad_norm": 1.931306004524231, "learning_rate": 4.585937300891863e-06, "loss": 0.1448, "step": 38710 }, { "epoch": 5.259261774593365, "grad_norm": 0.5851762890815735, "learning_rate": 4.569420857731299e-06, "loss": 0.1433, "step": 38720 }, { "epoch": 5.260620055010357, "grad_norm": 0.514948844909668, "learning_rate": 4.55293278633413e-06, "loss": 0.1481, "step": 38730 }, { "epoch": 5.261978335427349, "grad_norm": 0.5826024413108826, "learning_rate": 4.536473096997262e-06, "loss": 0.1412, "step": 38740 }, { "epoch": 5.263336615844341, "grad_norm": 0.892630398273468, "learning_rate": 4.52004179999993e-06, "loss": 0.1419, "step": 38750 }, { "epoch": 5.264694896261333, "grad_norm": 0.6151211857795715, "learning_rate": 4.503638905603586e-06, "loss": 0.1447, "step": 38760 }, { "epoch": 5.266053176678326, "grad_norm": 0.827334463596344, "learning_rate": 4.487264424051985e-06, "loss": 0.1502, "step": 38770 }, { "epoch": 5.267411457095317, "grad_norm": 1.0067636966705322, "learning_rate": 4.470918365571131e-06, "loss": 0.1371, "step": 38780 }, { "epoch": 5.26876973751231, "grad_norm": 0.8165988326072693, "learning_rate": 4.454600740369247e-06, "loss": 0.1314, "step": 38790 }, { "epoch": 5.270128017929301, "grad_norm": 0.8858462572097778, "learning_rate": 4.438311558636843e-06, "loss": 0.1397, "step": 38800 }, { "epoch": 5.271486298346294, "grad_norm": 0.7622038722038269, "learning_rate": 4.422050830546626e-06, "loss": 0.1455, "step": 38810 }, { "epoch": 5.272844578763285, "grad_norm": 0.7795936465263367, "learning_rate": 4.405818566253561e-06, "loss": 0.1492, "step": 38820 }, { "epoch": 5.274202859180278, "grad_norm": 1.0271930694580078, "learning_rate": 4.3896147758948405e-06, "loss": 0.144, "step": 38830 }, { "epoch": 5.2755611395972695, "grad_norm": 1.0789967775344849, "learning_rate": 4.373439469589841e-06, "loss": 0.145, "step": 38840 }, { "epoch": 5.276919420014262, "grad_norm": 1.0067511796951294, "learning_rate": 4.3572926574401816e-06, "loss": 0.1313, "step": 38850 }, { "epoch": 5.278277700431254, "grad_norm": 0.7815563082695007, "learning_rate": 4.341174349529675e-06, "loss": 0.152, "step": 38860 }, { "epoch": 5.279635980848246, "grad_norm": 1.0038480758666992, "learning_rate": 4.325084555924336e-06, "loss": 0.1412, "step": 38870 }, { "epoch": 5.2809942612652385, "grad_norm": 0.883963406085968, "learning_rate": 4.309023286672375e-06, "loss": 0.1421, "step": 38880 }, { "epoch": 5.28235254168223, "grad_norm": 0.8793511390686035, "learning_rate": 4.292990551804171e-06, "loss": 0.1336, "step": 38890 }, { "epoch": 5.283710822099223, "grad_norm": 1.020180344581604, "learning_rate": 4.276986361332314e-06, "loss": 0.1522, "step": 38900 }, { "epoch": 5.285069102516214, "grad_norm": 0.8277050852775574, "learning_rate": 4.261010725251535e-06, "loss": 0.1399, "step": 38910 }, { "epoch": 5.286427382933207, "grad_norm": 1.0563033819198608, "learning_rate": 4.24506365353875e-06, "loss": 0.1426, "step": 38920 }, { "epoch": 5.287785663350198, "grad_norm": 1.2410494089126587, "learning_rate": 4.229145156153053e-06, "loss": 0.1567, "step": 38930 }, { "epoch": 5.289143943767191, "grad_norm": 0.828594982624054, "learning_rate": 4.213255243035641e-06, "loss": 0.1403, "step": 38940 }, { "epoch": 5.290502224184182, "grad_norm": 0.8019607067108154, "learning_rate": 4.197393924109927e-06, "loss": 0.1435, "step": 38950 }, { "epoch": 5.291860504601175, "grad_norm": 0.7575341463088989, "learning_rate": 4.181561209281404e-06, "loss": 0.1332, "step": 38960 }, { "epoch": 5.293218785018167, "grad_norm": 1.3513734340667725, "learning_rate": 4.1657571084377515e-06, "loss": 0.1443, "step": 38970 }, { "epoch": 5.294577065435159, "grad_norm": 0.8904944658279419, "learning_rate": 4.14998163144874e-06, "loss": 0.1429, "step": 38980 }, { "epoch": 5.295935345852151, "grad_norm": 0.6302757263183594, "learning_rate": 4.134234788166297e-06, "loss": 0.1377, "step": 38990 }, { "epoch": 5.297293626269143, "grad_norm": 0.4782788157463074, "learning_rate": 4.118516588424448e-06, "loss": 0.1371, "step": 39000 }, { "epoch": 5.2986519066861355, "grad_norm": 8.423318862915039, "learning_rate": 4.1028270420393315e-06, "loss": 0.1417, "step": 39010 }, { "epoch": 5.300010187103127, "grad_norm": 0.7367261052131653, "learning_rate": 4.087166158809208e-06, "loss": 0.146, "step": 39020 }, { "epoch": 5.30136846752012, "grad_norm": 0.7856121063232422, "learning_rate": 4.071533948514411e-06, "loss": 0.1439, "step": 39030 }, { "epoch": 5.302726747937111, "grad_norm": 1.6741793155670166, "learning_rate": 4.055930420917387e-06, "loss": 0.1455, "step": 39040 }, { "epoch": 5.304085028354104, "grad_norm": 0.7891892194747925, "learning_rate": 4.040355585762667e-06, "loss": 0.137, "step": 39050 }, { "epoch": 5.305443308771096, "grad_norm": 0.5782830715179443, "learning_rate": 4.024809452776862e-06, "loss": 0.133, "step": 39060 }, { "epoch": 5.306801589188088, "grad_norm": 1.617783546447754, "learning_rate": 4.00929203166866e-06, "loss": 0.1497, "step": 39070 }, { "epoch": 5.30815986960508, "grad_norm": 4.137258052825928, "learning_rate": 3.993803332128804e-06, "loss": 0.1452, "step": 39080 }, { "epoch": 5.309518150022072, "grad_norm": 0.7682560086250305, "learning_rate": 3.978343363830117e-06, "loss": 0.1366, "step": 39090 }, { "epoch": 5.310876430439064, "grad_norm": 1.0354138612747192, "learning_rate": 3.962912136427488e-06, "loss": 0.1472, "step": 39100 }, { "epoch": 5.312234710856056, "grad_norm": 3.6736700534820557, "learning_rate": 3.947509659557824e-06, "loss": 0.1373, "step": 39110 }, { "epoch": 5.313592991273048, "grad_norm": 0.6498751044273376, "learning_rate": 3.9321359428401095e-06, "loss": 0.1416, "step": 39120 }, { "epoch": 5.31495127169004, "grad_norm": 0.8131852149963379, "learning_rate": 3.916790995875342e-06, "loss": 0.1469, "step": 39130 }, { "epoch": 5.3163095521070325, "grad_norm": 1.581787347793579, "learning_rate": 3.901474828246571e-06, "loss": 0.1411, "step": 39140 }, { "epoch": 5.317667832524025, "grad_norm": 1.466425895690918, "learning_rate": 3.8861874495188744e-06, "loss": 0.1381, "step": 39150 }, { "epoch": 5.319026112941017, "grad_norm": 1.0851690769195557, "learning_rate": 3.870928869239332e-06, "loss": 0.1397, "step": 39160 }, { "epoch": 5.320384393358009, "grad_norm": 0.5887765884399414, "learning_rate": 3.85569909693706e-06, "loss": 0.1521, "step": 39170 }, { "epoch": 5.321742673775001, "grad_norm": 0.8750267624855042, "learning_rate": 3.840498142123161e-06, "loss": 0.1398, "step": 39180 }, { "epoch": 5.323100954191993, "grad_norm": 0.8229497075080872, "learning_rate": 3.8253260142907756e-06, "loss": 0.1414, "step": 39190 }, { "epoch": 5.324459234608985, "grad_norm": 1.1904927492141724, "learning_rate": 3.81018272291499e-06, "loss": 0.1407, "step": 39200 }, { "epoch": 5.325817515025977, "grad_norm": 0.8182803392410278, "learning_rate": 3.7950682774529347e-06, "loss": 0.1408, "step": 39210 }, { "epoch": 5.327175795442969, "grad_norm": 0.9044128060340881, "learning_rate": 3.779982687343703e-06, "loss": 0.1342, "step": 39220 }, { "epoch": 5.328534075859961, "grad_norm": 0.7536550760269165, "learning_rate": 3.764925962008353e-06, "loss": 0.1379, "step": 39230 }, { "epoch": 5.329892356276954, "grad_norm": 0.613448441028595, "learning_rate": 3.7498981108499476e-06, "loss": 0.1483, "step": 39240 }, { "epoch": 5.331250636693945, "grad_norm": 0.8056625723838806, "learning_rate": 3.73489914325349e-06, "loss": 0.1384, "step": 39250 }, { "epoch": 5.332608917110938, "grad_norm": 0.6624165773391724, "learning_rate": 3.7199290685859534e-06, "loss": 0.1429, "step": 39260 }, { "epoch": 5.3339671975279295, "grad_norm": 1.4389772415161133, "learning_rate": 3.704987896196288e-06, "loss": 0.1454, "step": 39270 }, { "epoch": 5.335325477944922, "grad_norm": 0.8927149772644043, "learning_rate": 3.690075635415352e-06, "loss": 0.1359, "step": 39280 }, { "epoch": 5.3366837583619136, "grad_norm": 1.0769869089126587, "learning_rate": 3.6751922955560014e-06, "loss": 0.1451, "step": 39290 }, { "epoch": 5.338042038778906, "grad_norm": 0.9463263750076294, "learning_rate": 3.6603378859129732e-06, "loss": 0.1373, "step": 39300 }, { "epoch": 5.339400319195898, "grad_norm": 0.822681188583374, "learning_rate": 3.6455124157629805e-06, "loss": 0.1502, "step": 39310 }, { "epoch": 5.34075859961289, "grad_norm": 0.6290082931518555, "learning_rate": 3.630715894364656e-06, "loss": 0.1409, "step": 39320 }, { "epoch": 5.342116880029883, "grad_norm": 0.9389230608940125, "learning_rate": 3.6159483309585298e-06, "loss": 0.1441, "step": 39330 }, { "epoch": 5.343475160446874, "grad_norm": 1.4326623678207397, "learning_rate": 3.601209734767086e-06, "loss": 0.1435, "step": 39340 }, { "epoch": 5.344833440863867, "grad_norm": 3.936899423599243, "learning_rate": 3.586500114994673e-06, "loss": 0.1302, "step": 39350 }, { "epoch": 5.346191721280858, "grad_norm": 0.7288305759429932, "learning_rate": 3.5718194808275764e-06, "loss": 0.1362, "step": 39360 }, { "epoch": 5.347550001697851, "grad_norm": 0.5292114615440369, "learning_rate": 3.557167841433984e-06, "loss": 0.1446, "step": 39370 }, { "epoch": 5.348908282114842, "grad_norm": 0.5131908655166626, "learning_rate": 3.5425452059639374e-06, "loss": 0.1374, "step": 39380 }, { "epoch": 5.350266562531835, "grad_norm": 0.6277468204498291, "learning_rate": 3.5279515835494213e-06, "loss": 0.1352, "step": 39390 }, { "epoch": 5.3516248429488265, "grad_norm": 0.8893325328826904, "learning_rate": 3.51338698330424e-06, "loss": 0.1541, "step": 39400 }, { "epoch": 5.352983123365819, "grad_norm": 0.753708004951477, "learning_rate": 3.498851414324128e-06, "loss": 0.1384, "step": 39410 }, { "epoch": 5.354341403782811, "grad_norm": 0.8656032681465149, "learning_rate": 3.4843448856866467e-06, "loss": 0.1463, "step": 39420 }, { "epoch": 5.355699684199803, "grad_norm": 1.1776996850967407, "learning_rate": 3.4698674064512493e-06, "loss": 0.1234, "step": 39430 }, { "epoch": 5.3570579646167955, "grad_norm": 0.9559494256973267, "learning_rate": 3.455418985659248e-06, "loss": 0.1442, "step": 39440 }, { "epoch": 5.358416245033787, "grad_norm": 0.5854634642601013, "learning_rate": 3.4409996323337747e-06, "loss": 0.1389, "step": 39450 }, { "epoch": 5.35977452545078, "grad_norm": 1.229901671409607, "learning_rate": 3.4266093554798595e-06, "loss": 0.1376, "step": 39460 }, { "epoch": 5.361132805867771, "grad_norm": 0.7570446729660034, "learning_rate": 3.4122481640843195e-06, "loss": 0.1406, "step": 39470 }, { "epoch": 5.362491086284764, "grad_norm": 0.6286010146141052, "learning_rate": 3.3979160671158526e-06, "loss": 0.1446, "step": 39480 }, { "epoch": 5.363849366701755, "grad_norm": 0.8642194271087646, "learning_rate": 3.3836130735249715e-06, "loss": 0.14, "step": 39490 }, { "epoch": 5.365207647118748, "grad_norm": 1.0615088939666748, "learning_rate": 3.369339192243992e-06, "loss": 0.1502, "step": 39500 }, { "epoch": 5.366565927535739, "grad_norm": 1.0258418321609497, "learning_rate": 3.3550944321870892e-06, "loss": 0.1297, "step": 39510 }, { "epoch": 5.367924207952732, "grad_norm": 0.7431660890579224, "learning_rate": 3.340878802250219e-06, "loss": 0.1581, "step": 39520 }, { "epoch": 5.369282488369724, "grad_norm": 0.8179650902748108, "learning_rate": 3.3266923113111525e-06, "loss": 0.138, "step": 39530 }, { "epoch": 5.370640768786716, "grad_norm": 0.9062204957008362, "learning_rate": 3.312534968229486e-06, "loss": 0.153, "step": 39540 }, { "epoch": 5.371999049203708, "grad_norm": 2.9295246601104736, "learning_rate": 3.2984067818465747e-06, "loss": 0.1441, "step": 39550 }, { "epoch": 5.3733573296207, "grad_norm": 0.9988417625427246, "learning_rate": 3.2843077609855997e-06, "loss": 0.1363, "step": 39560 }, { "epoch": 5.3747156100376925, "grad_norm": 0.5613035559654236, "learning_rate": 3.2702379144515017e-06, "loss": 0.1374, "step": 39570 }, { "epoch": 5.376073890454684, "grad_norm": 0.5888733267784119, "learning_rate": 3.256197251031018e-06, "loss": 0.1422, "step": 39580 }, { "epoch": 5.377432170871677, "grad_norm": 1.2092944383621216, "learning_rate": 3.242185779492668e-06, "loss": 0.1367, "step": 39590 }, { "epoch": 5.378790451288668, "grad_norm": 0.5143150687217712, "learning_rate": 3.228203508586708e-06, "loss": 0.138, "step": 39600 }, { "epoch": 5.380148731705661, "grad_norm": 0.9525545239448547, "learning_rate": 3.2142504470451975e-06, "loss": 0.1337, "step": 39610 }, { "epoch": 5.381507012122652, "grad_norm": 0.8415699601173401, "learning_rate": 3.200326603581921e-06, "loss": 0.1366, "step": 39620 }, { "epoch": 5.382865292539645, "grad_norm": 0.5390661358833313, "learning_rate": 3.186431986892441e-06, "loss": 0.1358, "step": 39630 }, { "epoch": 5.384223572956637, "grad_norm": 0.9183427691459656, "learning_rate": 3.1725666056540536e-06, "loss": 0.1321, "step": 39640 }, { "epoch": 5.385581853373629, "grad_norm": 0.6886507868766785, "learning_rate": 3.1587304685257948e-06, "loss": 0.1433, "step": 39650 }, { "epoch": 5.386940133790621, "grad_norm": 1.4288815259933472, "learning_rate": 3.1449235841484626e-06, "loss": 0.1454, "step": 39660 }, { "epoch": 5.388298414207613, "grad_norm": 0.8222140669822693, "learning_rate": 3.1311459611445493e-06, "loss": 0.1438, "step": 39670 }, { "epoch": 5.389656694624605, "grad_norm": 0.8812934756278992, "learning_rate": 3.117397608118311e-06, "loss": 0.1516, "step": 39680 }, { "epoch": 5.391014975041597, "grad_norm": 0.8469387888908386, "learning_rate": 3.1036785336556797e-06, "loss": 0.14, "step": 39690 }, { "epoch": 5.3923732554585895, "grad_norm": 0.8838472366333008, "learning_rate": 3.0899887463243405e-06, "loss": 0.1444, "step": 39700 }, { "epoch": 5.393731535875581, "grad_norm": 1.0929521322250366, "learning_rate": 3.0763282546736884e-06, "loss": 0.1429, "step": 39710 }, { "epoch": 5.395089816292574, "grad_norm": 0.9416700005531311, "learning_rate": 3.0626970672347878e-06, "loss": 0.1454, "step": 39720 }, { "epoch": 5.396448096709566, "grad_norm": 1.0522996187210083, "learning_rate": 3.0490951925204514e-06, "loss": 0.1521, "step": 39730 }, { "epoch": 5.397806377126558, "grad_norm": 1.015074610710144, "learning_rate": 3.035522639025129e-06, "loss": 0.134, "step": 39740 }, { "epoch": 5.39916465754355, "grad_norm": 0.6179034113883972, "learning_rate": 3.0219794152250126e-06, "loss": 0.1427, "step": 39750 }, { "epoch": 5.400522937960542, "grad_norm": 0.790790319442749, "learning_rate": 3.008465529577953e-06, "loss": 0.1392, "step": 39760 }, { "epoch": 5.401881218377534, "grad_norm": 1.7955784797668457, "learning_rate": 2.9949809905234714e-06, "loss": 0.136, "step": 39770 }, { "epoch": 5.403239498794526, "grad_norm": 0.6470680236816406, "learning_rate": 2.9815258064827766e-06, "loss": 0.1355, "step": 39780 }, { "epoch": 5.404597779211518, "grad_norm": 0.616600513458252, "learning_rate": 2.9680999858587354e-06, "loss": 0.1355, "step": 39790 }, { "epoch": 5.40595605962851, "grad_norm": 0.5880200862884521, "learning_rate": 2.9547035370358857e-06, "loss": 0.1317, "step": 39800 }, { "epoch": 5.407314340045502, "grad_norm": 0.8058574199676514, "learning_rate": 2.9413364683804235e-06, "loss": 0.135, "step": 39810 }, { "epoch": 5.408672620462495, "grad_norm": 0.9309934377670288, "learning_rate": 2.9279987882401826e-06, "loss": 0.1347, "step": 39820 }, { "epoch": 5.4100309008794865, "grad_norm": 0.9143241047859192, "learning_rate": 2.914690504944656e-06, "loss": 0.1351, "step": 39830 }, { "epoch": 5.411389181296479, "grad_norm": 0.6954854130744934, "learning_rate": 2.901411626804962e-06, "loss": 0.1492, "step": 39840 }, { "epoch": 5.4127474617134705, "grad_norm": 0.808152973651886, "learning_rate": 2.888162162113889e-06, "loss": 0.1472, "step": 39850 }, { "epoch": 5.414105742130463, "grad_norm": 0.7980167865753174, "learning_rate": 2.8749421191458127e-06, "loss": 0.1439, "step": 39860 }, { "epoch": 5.415464022547455, "grad_norm": 0.504129946231842, "learning_rate": 2.861751506156768e-06, "loss": 0.1292, "step": 39870 }, { "epoch": 5.416822302964447, "grad_norm": 0.897083044052124, "learning_rate": 2.848590331384404e-06, "loss": 0.1349, "step": 39880 }, { "epoch": 5.418180583381439, "grad_norm": 0.5962047576904297, "learning_rate": 2.8354586030479692e-06, "loss": 0.1454, "step": 39890 }, { "epoch": 5.419538863798431, "grad_norm": 0.7385744452476501, "learning_rate": 2.8223563293483467e-06, "loss": 0.1315, "step": 39900 }, { "epoch": 5.420897144215424, "grad_norm": 0.809693455696106, "learning_rate": 2.809283518467992e-06, "loss": 0.1309, "step": 39910 }, { "epoch": 5.422255424632415, "grad_norm": 2.121685266494751, "learning_rate": 2.796240178570997e-06, "loss": 0.1489, "step": 39920 }, { "epoch": 5.423613705049408, "grad_norm": 1.4547982215881348, "learning_rate": 2.7832263178030393e-06, "loss": 0.1427, "step": 39930 }, { "epoch": 5.424971985466399, "grad_norm": 1.1966248750686646, "learning_rate": 2.770241944291363e-06, "loss": 0.1418, "step": 39940 }, { "epoch": 5.426330265883392, "grad_norm": 0.9525652527809143, "learning_rate": 2.757287066144826e-06, "loss": 0.1498, "step": 39950 }, { "epoch": 5.427688546300383, "grad_norm": 0.7026509642601013, "learning_rate": 2.7443616914538394e-06, "loss": 0.1544, "step": 39960 }, { "epoch": 5.429046826717376, "grad_norm": 1.148956298828125, "learning_rate": 2.7314658282904194e-06, "loss": 0.1509, "step": 39970 }, { "epoch": 5.4304051071343675, "grad_norm": 0.7740558385848999, "learning_rate": 2.7185994847081343e-06, "loss": 0.1427, "step": 39980 }, { "epoch": 5.43176338755136, "grad_norm": 0.7306461334228516, "learning_rate": 2.7057626687421123e-06, "loss": 0.1516, "step": 39990 }, { "epoch": 5.4331216679683525, "grad_norm": 0.9118457436561584, "learning_rate": 2.692955388409063e-06, "loss": 0.1414, "step": 40000 }, { "epoch": 5.434479948385344, "grad_norm": 0.7637365460395813, "learning_rate": 2.680177651707211e-06, "loss": 0.1405, "step": 40010 }, { "epoch": 5.435838228802337, "grad_norm": 0.7411639094352722, "learning_rate": 2.6674294666163834e-06, "loss": 0.15, "step": 40020 }, { "epoch": 5.437196509219328, "grad_norm": 1.4028271436691284, "learning_rate": 2.6547108410979073e-06, "loss": 0.1435, "step": 40030 }, { "epoch": 5.438554789636321, "grad_norm": 0.619103193283081, "learning_rate": 2.6420217830946727e-06, "loss": 0.1402, "step": 40040 }, { "epoch": 5.439913070053312, "grad_norm": 2.4378459453582764, "learning_rate": 2.6293623005311017e-06, "loss": 0.1371, "step": 40050 }, { "epoch": 5.441271350470305, "grad_norm": 0.9855973720550537, "learning_rate": 2.6167324013131324e-06, "loss": 0.1426, "step": 40060 }, { "epoch": 5.442629630887296, "grad_norm": 1.3349634408950806, "learning_rate": 2.604132093328254e-06, "loss": 0.153, "step": 40070 }, { "epoch": 5.443987911304289, "grad_norm": 0.7881361842155457, "learning_rate": 2.5915613844454457e-06, "loss": 0.1437, "step": 40080 }, { "epoch": 5.445346191721281, "grad_norm": 0.633462131023407, "learning_rate": 2.5790202825152267e-06, "loss": 0.1338, "step": 40090 }, { "epoch": 5.446704472138273, "grad_norm": 0.6354900598526001, "learning_rate": 2.5665087953696208e-06, "loss": 0.1373, "step": 40100 }, { "epoch": 5.448062752555265, "grad_norm": 1.0948253870010376, "learning_rate": 2.554026930822134e-06, "loss": 0.1442, "step": 40110 }, { "epoch": 5.449421032972257, "grad_norm": 0.5755667686462402, "learning_rate": 2.5415746966678144e-06, "loss": 0.1388, "step": 40120 }, { "epoch": 5.4507793133892495, "grad_norm": 0.6431179046630859, "learning_rate": 2.529152100683163e-06, "loss": 0.1428, "step": 40130 }, { "epoch": 5.452137593806241, "grad_norm": 0.6096735596656799, "learning_rate": 2.5167591506261967e-06, "loss": 0.1385, "step": 40140 }, { "epoch": 5.453495874223234, "grad_norm": 0.5845221877098083, "learning_rate": 2.5043958542364244e-06, "loss": 0.1453, "step": 40150 }, { "epoch": 5.454854154640225, "grad_norm": 1.6177302598953247, "learning_rate": 2.492062219234814e-06, "loss": 0.1398, "step": 40160 }, { "epoch": 5.456212435057218, "grad_norm": 0.8732364177703857, "learning_rate": 2.4797582533238216e-06, "loss": 0.1294, "step": 40170 }, { "epoch": 5.457570715474209, "grad_norm": 1.359415054321289, "learning_rate": 2.4674839641873725e-06, "loss": 0.1437, "step": 40180 }, { "epoch": 5.458928995891202, "grad_norm": 0.6213569641113281, "learning_rate": 2.4552393594908572e-06, "loss": 0.1372, "step": 40190 }, { "epoch": 5.460287276308194, "grad_norm": 1.103348731994629, "learning_rate": 2.443024446881137e-06, "loss": 0.1296, "step": 40200 }, { "epoch": 5.461645556725186, "grad_norm": 0.5422453880310059, "learning_rate": 2.4308392339865147e-06, "loss": 0.1421, "step": 40210 }, { "epoch": 5.463003837142178, "grad_norm": 0.8007174134254456, "learning_rate": 2.4186837284167708e-06, "loss": 0.1355, "step": 40220 }, { "epoch": 5.46436211755917, "grad_norm": 0.5631441473960876, "learning_rate": 2.406557937763093e-06, "loss": 0.1371, "step": 40230 }, { "epoch": 5.465720397976162, "grad_norm": 1.0136383771896362, "learning_rate": 2.394461869598158e-06, "loss": 0.145, "step": 40240 }, { "epoch": 5.467078678393154, "grad_norm": 0.852955162525177, "learning_rate": 2.3823955314760395e-06, "loss": 0.1461, "step": 40250 }, { "epoch": 5.4684369588101465, "grad_norm": 0.9001619219779968, "learning_rate": 2.370358930932265e-06, "loss": 0.1483, "step": 40260 }, { "epoch": 5.469795239227138, "grad_norm": 0.5431816577911377, "learning_rate": 2.3583520754838105e-06, "loss": 0.1359, "step": 40270 }, { "epoch": 5.4711535196441305, "grad_norm": 0.9713765978813171, "learning_rate": 2.3463749726290286e-06, "loss": 0.1315, "step": 40280 }, { "epoch": 5.472511800061123, "grad_norm": 0.7597293257713318, "learning_rate": 2.33442762984773e-06, "loss": 0.1343, "step": 40290 }, { "epoch": 5.473870080478115, "grad_norm": 0.8606007695198059, "learning_rate": 2.322510054601118e-06, "loss": 0.1303, "step": 40300 }, { "epoch": 5.475228360895107, "grad_norm": 0.9651880264282227, "learning_rate": 2.3106222543318235e-06, "loss": 0.1362, "step": 40310 }, { "epoch": 5.476586641312099, "grad_norm": 0.7233141660690308, "learning_rate": 2.298764236463874e-06, "loss": 0.1463, "step": 40320 }, { "epoch": 5.477944921729091, "grad_norm": 0.6589844226837158, "learning_rate": 2.286936008402685e-06, "loss": 0.1449, "step": 40330 }, { "epoch": 5.479303202146083, "grad_norm": 0.6830262541770935, "learning_rate": 2.275137577535097e-06, "loss": 0.1447, "step": 40340 }, { "epoch": 5.480661482563075, "grad_norm": 0.7699103951454163, "learning_rate": 2.2633689512293053e-06, "loss": 0.1492, "step": 40350 }, { "epoch": 5.482019762980067, "grad_norm": 2.2510061264038086, "learning_rate": 2.251630136834931e-06, "loss": 0.1401, "step": 40360 }, { "epoch": 5.483378043397059, "grad_norm": 1.3840720653533936, "learning_rate": 2.2399211416829545e-06, "loss": 0.1482, "step": 40370 }, { "epoch": 5.484736323814051, "grad_norm": 0.8986865878105164, "learning_rate": 2.2282419730857263e-06, "loss": 0.1385, "step": 40380 }, { "epoch": 5.4860946042310434, "grad_norm": 0.8684307336807251, "learning_rate": 2.216592638337001e-06, "loss": 0.1492, "step": 40390 }, { "epoch": 5.487452884648036, "grad_norm": 0.8904423713684082, "learning_rate": 2.204973144711864e-06, "loss": 0.1425, "step": 40400 }, { "epoch": 5.4888111650650275, "grad_norm": 0.8869350552558899, "learning_rate": 2.1933834994667833e-06, "loss": 0.1412, "step": 40410 }, { "epoch": 5.49016944548202, "grad_norm": 0.9250911474227905, "learning_rate": 2.181823709839603e-06, "loss": 0.1426, "step": 40420 }, { "epoch": 5.491527725899012, "grad_norm": 0.6771318316459656, "learning_rate": 2.1702937830494873e-06, "loss": 0.1419, "step": 40430 }, { "epoch": 5.492886006316004, "grad_norm": 0.4874940812587738, "learning_rate": 2.1587937262969815e-06, "loss": 0.1427, "step": 40440 }, { "epoch": 5.494244286732996, "grad_norm": 0.9255322813987732, "learning_rate": 2.147323546763952e-06, "loss": 0.1418, "step": 40450 }, { "epoch": 5.495602567149988, "grad_norm": 0.9342231154441833, "learning_rate": 2.1358832516136352e-06, "loss": 0.1388, "step": 40460 }, { "epoch": 5.49696084756698, "grad_norm": 1.3004857301712036, "learning_rate": 2.124472847990566e-06, "loss": 0.1347, "step": 40470 }, { "epoch": 5.498319127983972, "grad_norm": 0.6441742777824402, "learning_rate": 2.113092343020645e-06, "loss": 0.1347, "step": 40480 }, { "epoch": 5.499677408400965, "grad_norm": 0.5843059420585632, "learning_rate": 2.101741743811092e-06, "loss": 0.1469, "step": 40490 }, { "epoch": 5.501035688817956, "grad_norm": 1.0365034341812134, "learning_rate": 2.0904210574504424e-06, "loss": 0.1411, "step": 40500 }, { "epoch": 5.502393969234949, "grad_norm": 0.5202316641807556, "learning_rate": 2.079130291008563e-06, "loss": 0.1394, "step": 40510 }, { "epoch": 5.50375224965194, "grad_norm": 1.0431113243103027, "learning_rate": 2.067869451536619e-06, "loss": 0.1435, "step": 40520 }, { "epoch": 5.505110530068933, "grad_norm": 0.7671029567718506, "learning_rate": 2.056638546067102e-06, "loss": 0.1472, "step": 40530 }, { "epoch": 5.5064688104859245, "grad_norm": 0.9073117971420288, "learning_rate": 2.0454375816138073e-06, "loss": 0.1317, "step": 40540 }, { "epoch": 5.507827090902917, "grad_norm": 1.0308116674423218, "learning_rate": 2.0342665651718163e-06, "loss": 0.1576, "step": 40550 }, { "epoch": 5.509185371319909, "grad_norm": 0.7800028324127197, "learning_rate": 2.0231255037175267e-06, "loss": 0.1496, "step": 40560 }, { "epoch": 5.510543651736901, "grad_norm": 0.7173296213150024, "learning_rate": 2.0120144042086176e-06, "loss": 0.1248, "step": 40570 }, { "epoch": 5.511901932153894, "grad_norm": 0.6283446550369263, "learning_rate": 2.0009332735840602e-06, "loss": 0.1282, "step": 40580 }, { "epoch": 5.513260212570885, "grad_norm": 0.9575423002243042, "learning_rate": 1.9898821187641194e-06, "loss": 0.1426, "step": 40590 }, { "epoch": 5.514618492987878, "grad_norm": 0.7109612226486206, "learning_rate": 1.9788609466503182e-06, "loss": 0.13, "step": 40600 }, { "epoch": 5.515976773404869, "grad_norm": 0.8733770847320557, "learning_rate": 1.9678697641254797e-06, "loss": 0.1475, "step": 40610 }, { "epoch": 5.517335053821862, "grad_norm": 1.5246310234069824, "learning_rate": 1.956908578053668e-06, "loss": 0.1552, "step": 40620 }, { "epoch": 5.518693334238853, "grad_norm": 0.9033196568489075, "learning_rate": 1.9459773952802463e-06, "loss": 0.1421, "step": 40630 }, { "epoch": 5.520051614655846, "grad_norm": 0.7101077437400818, "learning_rate": 1.9350762226318263e-06, "loss": 0.1307, "step": 40640 }, { "epoch": 5.521409895072837, "grad_norm": 0.7877579927444458, "learning_rate": 1.9242050669162738e-06, "loss": 0.145, "step": 40650 }, { "epoch": 5.52276817548983, "grad_norm": 0.8262950778007507, "learning_rate": 1.9133639349227183e-06, "loss": 0.1399, "step": 40660 }, { "epoch": 5.524126455906822, "grad_norm": 0.7963125109672546, "learning_rate": 1.9025528334215227e-06, "loss": 0.146, "step": 40670 }, { "epoch": 5.525484736323814, "grad_norm": 1.7347514629364014, "learning_rate": 1.8917717691643244e-06, "loss": 0.1368, "step": 40680 }, { "epoch": 5.5268430167408065, "grad_norm": 1.538240909576416, "learning_rate": 1.8810207488839716e-06, "loss": 0.1443, "step": 40690 }, { "epoch": 5.528201297157798, "grad_norm": 0.4666368365287781, "learning_rate": 1.8702997792945653e-06, "loss": 0.1176, "step": 40700 }, { "epoch": 5.5295595775747906, "grad_norm": 0.5673195123672485, "learning_rate": 1.8596088670914447e-06, "loss": 0.1424, "step": 40710 }, { "epoch": 5.530917857991782, "grad_norm": 0.7812908887863159, "learning_rate": 1.8489480189511578e-06, "loss": 0.151, "step": 40720 }, { "epoch": 5.532276138408775, "grad_norm": 0.9959322810173035, "learning_rate": 1.8383172415315064e-06, "loss": 0.1323, "step": 40730 }, { "epoch": 5.533634418825766, "grad_norm": 1.1396373510360718, "learning_rate": 1.827716541471486e-06, "loss": 0.1432, "step": 40740 }, { "epoch": 5.534992699242759, "grad_norm": 0.6364880204200745, "learning_rate": 1.817145925391317e-06, "loss": 0.1503, "step": 40750 }, { "epoch": 5.536350979659751, "grad_norm": 0.8780985474586487, "learning_rate": 1.806605399892447e-06, "loss": 0.1345, "step": 40760 }, { "epoch": 5.537709260076743, "grad_norm": 1.238878607749939, "learning_rate": 1.7960949715575104e-06, "loss": 0.1424, "step": 40770 }, { "epoch": 5.539067540493735, "grad_norm": 1.4905586242675781, "learning_rate": 1.7856146469503564e-06, "loss": 0.1368, "step": 40780 }, { "epoch": 5.540425820910727, "grad_norm": 0.9282931685447693, "learning_rate": 1.7751644326160277e-06, "loss": 0.1418, "step": 40790 }, { "epoch": 5.541784101327719, "grad_norm": 0.8026456236839294, "learning_rate": 1.7647443350807758e-06, "loss": 0.137, "step": 40800 }, { "epoch": 5.543142381744711, "grad_norm": 0.6562375426292419, "learning_rate": 1.7543543608520285e-06, "loss": 0.142, "step": 40810 }, { "epoch": 5.5445006621617035, "grad_norm": 0.6791468858718872, "learning_rate": 1.7439945164184124e-06, "loss": 0.139, "step": 40820 }, { "epoch": 5.545858942578695, "grad_norm": 0.9385637044906616, "learning_rate": 1.733664808249741e-06, "loss": 0.1401, "step": 40830 }, { "epoch": 5.5472172229956875, "grad_norm": 2.758802890777588, "learning_rate": 1.7233652427969816e-06, "loss": 0.135, "step": 40840 }, { "epoch": 5.54857550341268, "grad_norm": 0.6583212018013, "learning_rate": 1.7130958264923113e-06, "loss": 0.1375, "step": 40850 }, { "epoch": 5.549933783829672, "grad_norm": 0.9166052937507629, "learning_rate": 1.7028565657490614e-06, "loss": 0.1467, "step": 40860 }, { "epoch": 5.551292064246664, "grad_norm": 0.8201191425323486, "learning_rate": 1.6926474669617277e-06, "loss": 0.1456, "step": 40870 }, { "epoch": 5.552650344663656, "grad_norm": 0.8237752318382263, "learning_rate": 1.6824685365059766e-06, "loss": 0.1446, "step": 40880 }, { "epoch": 5.554008625080648, "grad_norm": 0.7636487483978271, "learning_rate": 1.6723197807386348e-06, "loss": 0.1404, "step": 40890 }, { "epoch": 5.55536690549764, "grad_norm": 0.6516261100769043, "learning_rate": 1.6622012059976877e-06, "loss": 0.147, "step": 40900 }, { "epoch": 5.556725185914632, "grad_norm": 0.8793277144432068, "learning_rate": 1.6521128186022529e-06, "loss": 0.1487, "step": 40910 }, { "epoch": 5.558083466331624, "grad_norm": 0.783223032951355, "learning_rate": 1.6420546248526126e-06, "loss": 0.1424, "step": 40920 }, { "epoch": 5.559441746748616, "grad_norm": 0.7025136351585388, "learning_rate": 1.6320266310302034e-06, "loss": 0.1477, "step": 40930 }, { "epoch": 5.560800027165609, "grad_norm": 0.7817115783691406, "learning_rate": 1.6220288433975717e-06, "loss": 0.1444, "step": 40940 }, { "epoch": 5.5621583075826, "grad_norm": 0.8330286145210266, "learning_rate": 1.6120612681984337e-06, "loss": 0.1444, "step": 40950 }, { "epoch": 5.563516587999592, "grad_norm": 0.8443673253059387, "learning_rate": 1.6021239116576047e-06, "loss": 0.1406, "step": 40960 }, { "epoch": 5.5648748684165845, "grad_norm": 1.01258385181427, "learning_rate": 1.5922167799810483e-06, "loss": 0.1366, "step": 40970 }, { "epoch": 5.566233148833577, "grad_norm": 0.786291778087616, "learning_rate": 1.5823398793558597e-06, "loss": 0.1498, "step": 40980 }, { "epoch": 5.567591429250569, "grad_norm": 0.6068903207778931, "learning_rate": 1.5724932159502215e-06, "loss": 0.1442, "step": 40990 }, { "epoch": 5.568949709667561, "grad_norm": 0.6410410404205322, "learning_rate": 1.5626767959134758e-06, "loss": 0.1338, "step": 41000 }, { "epoch": 5.570307990084553, "grad_norm": 0.9610052108764648, "learning_rate": 1.5528906253760466e-06, "loss": 0.1407, "step": 41010 }, { "epoch": 5.571666270501545, "grad_norm": 0.7195908427238464, "learning_rate": 1.5431347104494675e-06, "loss": 0.1509, "step": 41020 }, { "epoch": 5.573024550918537, "grad_norm": 0.9794278740882874, "learning_rate": 1.5334090572264093e-06, "loss": 0.1376, "step": 41030 }, { "epoch": 5.574382831335529, "grad_norm": 0.7396292686462402, "learning_rate": 1.5237136717805966e-06, "loss": 0.1372, "step": 41040 }, { "epoch": 5.575741111752521, "grad_norm": 0.5611283183097839, "learning_rate": 1.514048560166892e-06, "loss": 0.1439, "step": 41050 }, { "epoch": 5.577099392169513, "grad_norm": 1.484043002128601, "learning_rate": 1.5044137284212278e-06, "loss": 0.1471, "step": 41060 }, { "epoch": 5.578457672586506, "grad_norm": 1.1138473749160767, "learning_rate": 1.4948091825606358e-06, "loss": 0.1503, "step": 41070 }, { "epoch": 5.579815953003497, "grad_norm": 0.7002197504043579, "learning_rate": 1.4852349285832345e-06, "loss": 0.1445, "step": 41080 }, { "epoch": 5.58117423342049, "grad_norm": 1.2219966650009155, "learning_rate": 1.4756909724682188e-06, "loss": 0.1336, "step": 41090 }, { "epoch": 5.5825325138374815, "grad_norm": 1.0335873365402222, "learning_rate": 1.4661773201758766e-06, "loss": 0.1412, "step": 41100 }, { "epoch": 5.583890794254474, "grad_norm": 1.8247549533843994, "learning_rate": 1.456693977647544e-06, "loss": 0.1438, "step": 41110 }, { "epoch": 5.585249074671466, "grad_norm": 0.8330690264701843, "learning_rate": 1.4472409508056617e-06, "loss": 0.1406, "step": 41120 }, { "epoch": 5.586607355088458, "grad_norm": 0.8167879581451416, "learning_rate": 1.437818245553707e-06, "loss": 0.1371, "step": 41130 }, { "epoch": 5.58796563550545, "grad_norm": 0.8568784594535828, "learning_rate": 1.4284258677762397e-06, "loss": 0.1455, "step": 41140 }, { "epoch": 5.589323915922442, "grad_norm": 0.908466100692749, "learning_rate": 1.4190638233388842e-06, "loss": 0.1295, "step": 41150 }, { "epoch": 5.590682196339435, "grad_norm": 0.9794155955314636, "learning_rate": 1.4097321180883028e-06, "loss": 0.1365, "step": 41160 }, { "epoch": 5.592040476756426, "grad_norm": 0.9148662090301514, "learning_rate": 1.400430757852228e-06, "loss": 0.1394, "step": 41170 }, { "epoch": 5.593398757173419, "grad_norm": 0.5634327530860901, "learning_rate": 1.3911597484394246e-06, "loss": 0.1327, "step": 41180 }, { "epoch": 5.59475703759041, "grad_norm": 0.7486961483955383, "learning_rate": 1.381919095639722e-06, "loss": 0.1519, "step": 41190 }, { "epoch": 5.596115318007403, "grad_norm": 1.0208015441894531, "learning_rate": 1.3727088052239823e-06, "loss": 0.139, "step": 41200 }, { "epoch": 5.597473598424394, "grad_norm": 0.6587234139442444, "learning_rate": 1.3635288829440984e-06, "loss": 0.1409, "step": 41210 }, { "epoch": 5.598831878841387, "grad_norm": 0.6429316401481628, "learning_rate": 1.3543793345330125e-06, "loss": 0.1437, "step": 41220 }, { "epoch": 5.6001901592583785, "grad_norm": 0.4531972110271454, "learning_rate": 1.345260165704687e-06, "loss": 0.1281, "step": 41230 }, { "epoch": 5.601548439675371, "grad_norm": 1.1498427391052246, "learning_rate": 1.3361713821541223e-06, "loss": 0.1394, "step": 41240 }, { "epoch": 5.6029067200923635, "grad_norm": 0.9409658312797546, "learning_rate": 1.327112989557333e-06, "loss": 0.152, "step": 41250 }, { "epoch": 5.604265000509355, "grad_norm": 1.0199317932128906, "learning_rate": 1.3180849935713612e-06, "loss": 0.1367, "step": 41260 }, { "epoch": 5.6056232809263475, "grad_norm": 2.0520832538604736, "learning_rate": 1.3090873998342578e-06, "loss": 0.1412, "step": 41270 }, { "epoch": 5.606981561343339, "grad_norm": 1.5013713836669922, "learning_rate": 1.3001202139650947e-06, "loss": 0.1396, "step": 41280 }, { "epoch": 5.608339841760332, "grad_norm": 0.7270447015762329, "learning_rate": 1.291183441563959e-06, "loss": 0.1368, "step": 41290 }, { "epoch": 5.609698122177323, "grad_norm": 1.0119043588638306, "learning_rate": 1.282277088211925e-06, "loss": 0.1544, "step": 41300 }, { "epoch": 5.611056402594316, "grad_norm": 0.8099943399429321, "learning_rate": 1.2734011594710937e-06, "loss": 0.1461, "step": 41310 }, { "epoch": 5.612414683011307, "grad_norm": 1.1126813888549805, "learning_rate": 1.2645556608845476e-06, "loss": 0.1392, "step": 41320 }, { "epoch": 5.6137729634283, "grad_norm": 0.6711204648017883, "learning_rate": 1.2557405979763737e-06, "loss": 0.1229, "step": 41330 }, { "epoch": 5.615131243845292, "grad_norm": 0.9795059561729431, "learning_rate": 1.2469559762516514e-06, "loss": 0.1327, "step": 41340 }, { "epoch": 5.616489524262284, "grad_norm": 1.2105903625488281, "learning_rate": 1.2382018011964425e-06, "loss": 0.1419, "step": 41350 }, { "epoch": 5.617847804679276, "grad_norm": 0.7697600722312927, "learning_rate": 1.229478078277807e-06, "loss": 0.1387, "step": 41360 }, { "epoch": 5.619206085096268, "grad_norm": 0.7865532040596008, "learning_rate": 1.220784812943776e-06, "loss": 0.1425, "step": 41370 }, { "epoch": 5.62056436551326, "grad_norm": 0.9119434356689453, "learning_rate": 1.2121220106233678e-06, "loss": 0.1356, "step": 41380 }, { "epoch": 5.621922645930252, "grad_norm": 0.8327426910400391, "learning_rate": 1.2034896767265658e-06, "loss": 0.1408, "step": 41390 }, { "epoch": 5.6232809263472445, "grad_norm": 0.6340413093566895, "learning_rate": 1.1948878166443357e-06, "loss": 0.1445, "step": 41400 }, { "epoch": 5.624639206764236, "grad_norm": 0.9182793498039246, "learning_rate": 1.186316435748608e-06, "loss": 0.143, "step": 41410 }, { "epoch": 5.625997487181229, "grad_norm": 0.7707954049110413, "learning_rate": 1.1777755393922841e-06, "loss": 0.1296, "step": 41420 }, { "epoch": 5.627355767598221, "grad_norm": 0.6941982507705688, "learning_rate": 1.1692651329092197e-06, "loss": 0.1323, "step": 41430 }, { "epoch": 5.628714048015213, "grad_norm": 0.7435147166252136, "learning_rate": 1.16078522161423e-06, "loss": 0.1424, "step": 41440 }, { "epoch": 5.630072328432205, "grad_norm": 1.1416733264923096, "learning_rate": 1.1523358108030847e-06, "loss": 0.1383, "step": 41450 }, { "epoch": 5.631430608849197, "grad_norm": 0.7104542255401611, "learning_rate": 1.1439169057525233e-06, "loss": 0.1338, "step": 41460 }, { "epoch": 5.632788889266189, "grad_norm": 0.7697431445121765, "learning_rate": 1.135528511720202e-06, "loss": 0.1508, "step": 41470 }, { "epoch": 5.634147169683181, "grad_norm": 1.0849742889404297, "learning_rate": 1.1271706339447463e-06, "loss": 0.1392, "step": 41480 }, { "epoch": 5.635505450100173, "grad_norm": 0.8856390118598938, "learning_rate": 1.1188432776457313e-06, "loss": 0.1415, "step": 41490 }, { "epoch": 5.636863730517165, "grad_norm": 0.7224352359771729, "learning_rate": 1.1105464480236417e-06, "loss": 0.1242, "step": 41500 }, { "epoch": 5.638222010934157, "grad_norm": 1.3479745388031006, "learning_rate": 1.1022801502599212e-06, "loss": 0.1407, "step": 41510 }, { "epoch": 5.63958029135115, "grad_norm": 0.8452427983283997, "learning_rate": 1.0940443895169406e-06, "loss": 0.1488, "step": 41520 }, { "epoch": 5.6409385717681415, "grad_norm": 1.0003329515457153, "learning_rate": 1.0858391709379912e-06, "loss": 0.1417, "step": 41530 }, { "epoch": 5.642296852185134, "grad_norm": 0.7275543212890625, "learning_rate": 1.0776644996473129e-06, "loss": 0.1454, "step": 41540 }, { "epoch": 5.643655132602126, "grad_norm": 0.6780206561088562, "learning_rate": 1.0695203807500442e-06, "loss": 0.1364, "step": 41550 }, { "epoch": 5.645013413019118, "grad_norm": 0.5707554221153259, "learning_rate": 1.0614068193322556e-06, "loss": 0.1283, "step": 41560 }, { "epoch": 5.64637169343611, "grad_norm": 1.0737545490264893, "learning_rate": 1.053323820460922e-06, "loss": 0.1487, "step": 41570 }, { "epoch": 5.647729973853102, "grad_norm": 0.6037214994430542, "learning_rate": 1.0452713891839549e-06, "loss": 0.1417, "step": 41580 }, { "epoch": 5.649088254270094, "grad_norm": 0.9591968059539795, "learning_rate": 1.03724953053016e-06, "loss": 0.1327, "step": 41590 }, { "epoch": 5.650446534687086, "grad_norm": 0.8833913803100586, "learning_rate": 1.0292582495092463e-06, "loss": 0.1413, "step": 41600 }, { "epoch": 5.651804815104079, "grad_norm": 0.598792552947998, "learning_rate": 1.021297551111844e-06, "loss": 0.1438, "step": 41610 }, { "epoch": 5.65316309552107, "grad_norm": 0.782558798789978, "learning_rate": 1.0133674403094594e-06, "loss": 0.1381, "step": 41620 }, { "epoch": 5.654521375938062, "grad_norm": 1.026680827140808, "learning_rate": 1.005467922054526e-06, "loss": 0.1428, "step": 41630 }, { "epoch": 5.655879656355054, "grad_norm": 0.6625080704689026, "learning_rate": 9.975990012803415e-07, "loss": 0.1307, "step": 41640 }, { "epoch": 5.657237936772047, "grad_norm": 0.843893826007843, "learning_rate": 9.897606829011197e-07, "loss": 0.1322, "step": 41650 }, { "epoch": 5.6585962171890385, "grad_norm": 0.80437833070755, "learning_rate": 9.819529718119557e-07, "loss": 0.1359, "step": 41660 }, { "epoch": 5.659954497606031, "grad_norm": 1.378403663635254, "learning_rate": 9.741758728888218e-07, "loss": 0.135, "step": 41670 }, { "epoch": 5.661312778023023, "grad_norm": 1.3878700733184814, "learning_rate": 9.664293909885824e-07, "loss": 0.1463, "step": 41680 }, { "epoch": 5.662671058440015, "grad_norm": 0.9381775856018066, "learning_rate": 9.587135309489736e-07, "loss": 0.1522, "step": 41690 }, { "epoch": 5.664029338857007, "grad_norm": 0.8933408260345459, "learning_rate": 9.510282975886187e-07, "loss": 0.1419, "step": 41700 }, { "epoch": 5.665387619273999, "grad_norm": 0.6789520978927612, "learning_rate": 9.43373695707006e-07, "loss": 0.139, "step": 41710 }, { "epoch": 5.666745899690991, "grad_norm": 1.2759653329849243, "learning_rate": 9.357497300844897e-07, "loss": 0.1435, "step": 41720 }, { "epoch": 5.668104180107983, "grad_norm": 0.8718083500862122, "learning_rate": 9.28156405482311e-07, "loss": 0.1424, "step": 41730 }, { "epoch": 5.669462460524976, "grad_norm": 1.0086135864257812, "learning_rate": 9.205937266425491e-07, "loss": 0.1307, "step": 41740 }, { "epoch": 5.670820740941967, "grad_norm": 0.989074170589447, "learning_rate": 9.130616982881646e-07, "loss": 0.1462, "step": 41750 }, { "epoch": 5.67217902135896, "grad_norm": 0.5956922173500061, "learning_rate": 9.055603251229727e-07, "loss": 0.1356, "step": 41760 }, { "epoch": 5.673537301775951, "grad_norm": 1.2609509229660034, "learning_rate": 8.980896118316373e-07, "loss": 0.1455, "step": 41770 }, { "epoch": 5.674895582192944, "grad_norm": 1.9751479625701904, "learning_rate": 8.906495630796819e-07, "loss": 0.1449, "step": 41780 }, { "epoch": 5.6762538626099355, "grad_norm": 0.7886742353439331, "learning_rate": 8.832401835134674e-07, "loss": 0.157, "step": 41790 }, { "epoch": 5.677612143026928, "grad_norm": 1.421133279800415, "learning_rate": 8.758614777602259e-07, "loss": 0.1458, "step": 41800 }, { "epoch": 5.67897042344392, "grad_norm": 0.8877543807029724, "learning_rate": 8.685134504280046e-07, "loss": 0.1405, "step": 41810 }, { "epoch": 5.680328703860912, "grad_norm": 1.050431251525879, "learning_rate": 8.611961061057161e-07, "loss": 0.142, "step": 41820 }, { "epoch": 5.6816869842779045, "grad_norm": 0.6102004647254944, "learning_rate": 8.539094493630995e-07, "loss": 0.1426, "step": 41830 }, { "epoch": 5.683045264694896, "grad_norm": 0.7038069367408752, "learning_rate": 8.466534847507201e-07, "loss": 0.1506, "step": 41840 }, { "epoch": 5.684403545111889, "grad_norm": 1.0042662620544434, "learning_rate": 8.394282168000089e-07, "loss": 0.1435, "step": 41850 }, { "epoch": 5.68576182552888, "grad_norm": 0.8187046647071838, "learning_rate": 8.322336500231787e-07, "loss": 0.1435, "step": 41860 }, { "epoch": 5.687120105945873, "grad_norm": 0.5849819779396057, "learning_rate": 8.250697889133075e-07, "loss": 0.1338, "step": 41870 }, { "epoch": 5.688478386362864, "grad_norm": 0.8545428514480591, "learning_rate": 8.179366379442888e-07, "loss": 0.1486, "step": 41880 }, { "epoch": 5.689836666779857, "grad_norm": 0.9905210137367249, "learning_rate": 8.108342015708204e-07, "loss": 0.1524, "step": 41890 }, { "epoch": 5.691194947196848, "grad_norm": 0.7423296570777893, "learning_rate": 8.037624842284486e-07, "loss": 0.1434, "step": 41900 }, { "epoch": 5.692553227613841, "grad_norm": 0.8121839761734009, "learning_rate": 7.967214903334963e-07, "loss": 0.1451, "step": 41910 }, { "epoch": 5.693911508030833, "grad_norm": 0.9413681626319885, "learning_rate": 7.89711224283135e-07, "loss": 0.1457, "step": 41920 }, { "epoch": 5.695269788447825, "grad_norm": 1.148727536201477, "learning_rate": 7.827316904553295e-07, "loss": 0.1406, "step": 41930 }, { "epoch": 5.696628068864817, "grad_norm": 0.5580938458442688, "learning_rate": 7.75782893208854e-07, "loss": 0.1392, "step": 41940 }, { "epoch": 5.697986349281809, "grad_norm": 0.9795404076576233, "learning_rate": 7.688648368832929e-07, "loss": 0.1484, "step": 41950 }, { "epoch": 5.6993446296988015, "grad_norm": 0.693550705909729, "learning_rate": 7.619775257990236e-07, "loss": 0.1423, "step": 41960 }, { "epoch": 5.700702910115793, "grad_norm": 0.6401588916778564, "learning_rate": 7.551209642572277e-07, "loss": 0.1389, "step": 41970 }, { "epoch": 5.702061190532786, "grad_norm": 2.711390256881714, "learning_rate": 7.482951565398855e-07, "loss": 0.133, "step": 41980 }, { "epoch": 5.703419470949777, "grad_norm": 0.763937771320343, "learning_rate": 7.415001069097593e-07, "loss": 0.1432, "step": 41990 }, { "epoch": 5.70477775136677, "grad_norm": 0.6249594688415527, "learning_rate": 7.347358196104326e-07, "loss": 0.1405, "step": 42000 }, { "epoch": 5.706136031783762, "grad_norm": 0.591541588306427, "learning_rate": 7.280022988662371e-07, "loss": 0.1373, "step": 42010 }, { "epoch": 5.707494312200754, "grad_norm": 1.2406282424926758, "learning_rate": 7.21299548882326e-07, "loss": 0.1504, "step": 42020 }, { "epoch": 5.708852592617746, "grad_norm": 1.3114311695098877, "learning_rate": 7.146275738446062e-07, "loss": 0.1643, "step": 42030 }, { "epoch": 5.710210873034738, "grad_norm": 0.7713119387626648, "learning_rate": 7.079863779197948e-07, "loss": 0.1325, "step": 42040 }, { "epoch": 5.71156915345173, "grad_norm": 0.6382005214691162, "learning_rate": 7.013759652553631e-07, "loss": 0.1449, "step": 42050 }, { "epoch": 5.712927433868722, "grad_norm": 0.7733715772628784, "learning_rate": 6.947963399795642e-07, "loss": 0.1302, "step": 42060 }, { "epoch": 5.714285714285714, "grad_norm": 0.7549878358840942, "learning_rate": 6.882475062014392e-07, "loss": 0.1491, "step": 42070 }, { "epoch": 5.715643994702706, "grad_norm": 0.8239365220069885, "learning_rate": 6.817294680107778e-07, "loss": 0.1353, "step": 42080 }, { "epoch": 5.7170022751196985, "grad_norm": 0.576304018497467, "learning_rate": 6.752422294781569e-07, "loss": 0.1422, "step": 42090 }, { "epoch": 5.718360555536691, "grad_norm": 0.9369167685508728, "learning_rate": 6.687857946548914e-07, "loss": 0.14, "step": 42100 }, { "epoch": 5.719718835953683, "grad_norm": 0.838309645652771, "learning_rate": 6.62360167573095e-07, "loss": 0.143, "step": 42110 }, { "epoch": 5.721077116370675, "grad_norm": 0.8450754284858704, "learning_rate": 6.559653522456188e-07, "loss": 0.1503, "step": 42120 }, { "epoch": 5.722435396787667, "grad_norm": 2.7430906295776367, "learning_rate": 6.496013526660683e-07, "loss": 0.1348, "step": 42130 }, { "epoch": 5.723793677204659, "grad_norm": 0.9229729175567627, "learning_rate": 6.432681728088252e-07, "loss": 0.1422, "step": 42140 }, { "epoch": 5.725151957621651, "grad_norm": 0.7425034642219543, "learning_rate": 6.369658166290038e-07, "loss": 0.1426, "step": 42150 }, { "epoch": 5.726510238038643, "grad_norm": 0.8837848901748657, "learning_rate": 6.306942880624778e-07, "loss": 0.1356, "step": 42160 }, { "epoch": 5.727868518455635, "grad_norm": 0.7442014813423157, "learning_rate": 6.244535910258698e-07, "loss": 0.1396, "step": 42170 }, { "epoch": 5.729226798872627, "grad_norm": 3.721750259399414, "learning_rate": 6.1824372941654e-07, "loss": 0.1608, "step": 42180 }, { "epoch": 5.73058507928962, "grad_norm": 0.9450153708457947, "learning_rate": 6.120647071126029e-07, "loss": 0.141, "step": 42190 }, { "epoch": 5.731943359706611, "grad_norm": 0.9872137308120728, "learning_rate": 6.059165279729051e-07, "loss": 0.1449, "step": 42200 }, { "epoch": 5.733301640123604, "grad_norm": 0.8139085173606873, "learning_rate": 5.997991958370309e-07, "loss": 0.1308, "step": 42210 }, { "epoch": 5.7346599205405955, "grad_norm": 0.9036415815353394, "learning_rate": 5.93712714525313e-07, "loss": 0.1382, "step": 42220 }, { "epoch": 5.736018200957588, "grad_norm": 1.0338029861450195, "learning_rate": 5.876570878387944e-07, "loss": 0.1409, "step": 42230 }, { "epoch": 5.73737648137458, "grad_norm": 0.6889071464538574, "learning_rate": 5.816323195592776e-07, "loss": 0.1293, "step": 42240 }, { "epoch": 5.738734761791572, "grad_norm": 0.9058610200881958, "learning_rate": 5.756384134492698e-07, "loss": 0.1431, "step": 42250 }, { "epoch": 5.740093042208564, "grad_norm": 0.9461732506752014, "learning_rate": 5.696753732520154e-07, "loss": 0.15, "step": 42260 }, { "epoch": 5.741451322625556, "grad_norm": 1.431554913520813, "learning_rate": 5.637432026914802e-07, "loss": 0.1434, "step": 42270 }, { "epoch": 5.742809603042549, "grad_norm": 1.1538783311843872, "learning_rate": 5.57841905472356e-07, "loss": 0.1452, "step": 42280 }, { "epoch": 5.74416788345954, "grad_norm": 0.8694644570350647, "learning_rate": 5.519714852800506e-07, "loss": 0.1363, "step": 42290 }, { "epoch": 5.745526163876532, "grad_norm": 1.0254439115524292, "learning_rate": 5.461319457806869e-07, "loss": 0.1376, "step": 42300 }, { "epoch": 5.746884444293524, "grad_norm": 0.6621975898742676, "learning_rate": 5.403232906211031e-07, "loss": 0.1442, "step": 42310 }, { "epoch": 5.748242724710517, "grad_norm": 0.6652215123176575, "learning_rate": 5.345455234288477e-07, "loss": 0.1321, "step": 42320 }, { "epoch": 5.749601005127508, "grad_norm": 0.8143565058708191, "learning_rate": 5.287986478121898e-07, "loss": 0.1372, "step": 42330 }, { "epoch": 5.750959285544501, "grad_norm": 1.171067476272583, "learning_rate": 5.230826673601031e-07, "loss": 0.1393, "step": 42340 }, { "epoch": 5.7523175659614925, "grad_norm": 0.9404630064964294, "learning_rate": 5.173975856422542e-07, "loss": 0.1439, "step": 42350 }, { "epoch": 5.753675846378485, "grad_norm": 0.856653094291687, "learning_rate": 5.117434062090309e-07, "loss": 0.138, "step": 42360 }, { "epoch": 5.7550341267954765, "grad_norm": 0.9202766418457031, "learning_rate": 5.061201325915032e-07, "loss": 0.1464, "step": 42370 }, { "epoch": 5.756392407212469, "grad_norm": 0.4832250773906708, "learning_rate": 5.005277683014564e-07, "loss": 0.1458, "step": 42380 }, { "epoch": 5.757750687629461, "grad_norm": 0.833764910697937, "learning_rate": 4.949663168313745e-07, "loss": 0.1358, "step": 42390 }, { "epoch": 5.759108968046453, "grad_norm": 0.4593173861503601, "learning_rate": 4.894357816544126e-07, "loss": 0.1457, "step": 42400 }, { "epoch": 5.760467248463446, "grad_norm": 0.702305257320404, "learning_rate": 4.839361662244523e-07, "loss": 0.129, "step": 42410 }, { "epoch": 5.761825528880437, "grad_norm": 0.7938226461410522, "learning_rate": 4.784674739760353e-07, "loss": 0.1415, "step": 42420 }, { "epoch": 5.76318380929743, "grad_norm": 0.499076783657074, "learning_rate": 4.7302970832441287e-07, "loss": 0.1413, "step": 42430 }, { "epoch": 5.764542089714421, "grad_norm": 0.9278774857521057, "learning_rate": 4.6762287266550187e-07, "loss": 0.1492, "step": 42440 }, { "epoch": 5.765900370131414, "grad_norm": 0.6545235514640808, "learning_rate": 4.622469703759291e-07, "loss": 0.1284, "step": 42450 }, { "epoch": 5.767258650548405, "grad_norm": 1.0153961181640625, "learning_rate": 4.5690200481297575e-07, "loss": 0.1402, "step": 42460 }, { "epoch": 5.768616930965398, "grad_norm": 0.7216044664382935, "learning_rate": 4.5158797931462717e-07, "loss": 0.1448, "step": 42470 }, { "epoch": 5.7699752113823894, "grad_norm": 1.2465262413024902, "learning_rate": 4.463048971995287e-07, "loss": 0.1427, "step": 42480 }, { "epoch": 5.771333491799382, "grad_norm": 0.6985843777656555, "learning_rate": 4.4105276176700793e-07, "loss": 0.1471, "step": 42490 }, { "epoch": 5.772691772216374, "grad_norm": 4.030367851257324, "learning_rate": 4.358315762970633e-07, "loss": 0.146, "step": 42500 }, { "epoch": 5.774050052633366, "grad_norm": 1.2481906414031982, "learning_rate": 4.306413440503754e-07, "loss": 0.1545, "step": 42510 }, { "epoch": 5.7754083330503585, "grad_norm": 0.9436084628105164, "learning_rate": 4.2548206826827916e-07, "loss": 0.1367, "step": 42520 }, { "epoch": 5.77676661346735, "grad_norm": 0.8290557861328125, "learning_rate": 4.2035375217279184e-07, "loss": 0.1375, "step": 42530 }, { "epoch": 5.778124893884343, "grad_norm": 0.8138709664344788, "learning_rate": 4.152563989665792e-07, "loss": 0.1478, "step": 42540 }, { "epoch": 5.779483174301334, "grad_norm": 0.7453136444091797, "learning_rate": 4.101900118329838e-07, "loss": 0.1368, "step": 42550 }, { "epoch": 5.780841454718327, "grad_norm": 0.7559126019477844, "learning_rate": 4.051545939360024e-07, "loss": 0.1368, "step": 42560 }, { "epoch": 5.782199735135318, "grad_norm": 0.7320087552070618, "learning_rate": 4.0015014842029743e-07, "loss": 0.1311, "step": 42570 }, { "epoch": 5.783558015552311, "grad_norm": 0.6499378085136414, "learning_rate": 3.9517667841119097e-07, "loss": 0.1342, "step": 42580 }, { "epoch": 5.784916295969303, "grad_norm": 0.6821578145027161, "learning_rate": 3.9023418701463754e-07, "loss": 0.1435, "step": 42590 }, { "epoch": 5.786274576386295, "grad_norm": 0.7514090538024902, "learning_rate": 3.8532267731728465e-07, "loss": 0.1355, "step": 42600 }, { "epoch": 5.787632856803287, "grad_norm": 0.9936935305595398, "learning_rate": 3.804421523863899e-07, "loss": 0.1395, "step": 42610 }, { "epoch": 5.788991137220279, "grad_norm": 0.7715401649475098, "learning_rate": 3.7559261526989297e-07, "loss": 0.1451, "step": 42620 }, { "epoch": 5.790349417637271, "grad_norm": 0.6973790526390076, "learning_rate": 3.707740689963712e-07, "loss": 0.1391, "step": 42630 }, { "epoch": 5.791707698054263, "grad_norm": 1.2675719261169434, "learning_rate": 3.6598651657503423e-07, "loss": 0.1456, "step": 42640 }, { "epoch": 5.7930659784712555, "grad_norm": 0.9026110768318176, "learning_rate": 3.6122996099575147e-07, "loss": 0.1397, "step": 42650 }, { "epoch": 5.794424258888247, "grad_norm": 0.6314105987548828, "learning_rate": 3.5650440522903563e-07, "loss": 0.1399, "step": 42660 }, { "epoch": 5.79578253930524, "grad_norm": 0.6825308203697205, "learning_rate": 3.5180985222603155e-07, "loss": 0.1396, "step": 42670 }, { "epoch": 5.797140819722232, "grad_norm": 1.6709537506103516, "learning_rate": 3.4714630491852197e-07, "loss": 0.1427, "step": 42680 }, { "epoch": 5.798499100139224, "grad_norm": 0.45494750142097473, "learning_rate": 3.425137662189326e-07, "loss": 0.135, "step": 42690 }, { "epoch": 5.799857380556216, "grad_norm": 0.6043773889541626, "learning_rate": 3.3791223902032155e-07, "loss": 0.1418, "step": 42700 }, { "epoch": 5.801215660973208, "grad_norm": 0.6432932615280151, "learning_rate": 3.3334172619637893e-07, "loss": 0.1319, "step": 42710 }, { "epoch": 5.8025739413902, "grad_norm": 0.6700814962387085, "learning_rate": 3.288022306014327e-07, "loss": 0.137, "step": 42720 }, { "epoch": 5.803932221807192, "grad_norm": 0.9799638390541077, "learning_rate": 3.242937550704206e-07, "loss": 0.1452, "step": 42730 }, { "epoch": 5.805290502224184, "grad_norm": 0.7717625498771667, "learning_rate": 3.198163024189349e-07, "loss": 0.1409, "step": 42740 }, { "epoch": 5.806648782641176, "grad_norm": 0.9460979104042053, "learning_rate": 3.1536987544317777e-07, "loss": 0.1398, "step": 42750 }, { "epoch": 5.808007063058168, "grad_norm": 0.7904230356216431, "learning_rate": 3.109544769199724e-07, "loss": 0.1312, "step": 42760 }, { "epoch": 5.809365343475161, "grad_norm": 1.0997706651687622, "learning_rate": 3.0657010960677967e-07, "loss": 0.1276, "step": 42770 }, { "epoch": 5.8107236238921525, "grad_norm": 0.6207943558692932, "learning_rate": 3.0221677624167054e-07, "loss": 0.1394, "step": 42780 }, { "epoch": 5.812081904309145, "grad_norm": 0.7280292510986328, "learning_rate": 2.9789447954333137e-07, "loss": 0.1403, "step": 42790 }, { "epoch": 5.8134401847261366, "grad_norm": 0.5742581486701965, "learning_rate": 2.936032222110752e-07, "loss": 0.1449, "step": 42800 }, { "epoch": 5.814798465143129, "grad_norm": 0.740777850151062, "learning_rate": 2.8934300692482484e-07, "loss": 0.1345, "step": 42810 }, { "epoch": 5.816156745560121, "grad_norm": 0.8938523530960083, "learning_rate": 2.8511383634512446e-07, "loss": 0.1387, "step": 42820 }, { "epoch": 5.817515025977113, "grad_norm": 0.8195593953132629, "learning_rate": 2.809157131131168e-07, "loss": 0.1473, "step": 42830 }, { "epoch": 5.818873306394105, "grad_norm": 0.7836967706680298, "learning_rate": 2.767486398505714e-07, "loss": 0.1514, "step": 42840 }, { "epoch": 5.820231586811097, "grad_norm": 0.7907763719558716, "learning_rate": 2.726126191598566e-07, "loss": 0.1448, "step": 42850 }, { "epoch": 5.82158986722809, "grad_norm": 1.0470409393310547, "learning_rate": 2.685076536239506e-07, "loss": 0.1401, "step": 42860 }, { "epoch": 5.822948147645081, "grad_norm": 0.7466161251068115, "learning_rate": 2.644337458064361e-07, "loss": 0.1353, "step": 42870 }, { "epoch": 5.824306428062074, "grad_norm": 1.5047842264175415, "learning_rate": 2.603908982515058e-07, "loss": 0.1297, "step": 42880 }, { "epoch": 5.825664708479065, "grad_norm": 0.6874199509620667, "learning_rate": 2.563791134839455e-07, "loss": 0.1426, "step": 42890 }, { "epoch": 5.827022988896058, "grad_norm": 0.9091113805770874, "learning_rate": 2.523983940091457e-07, "loss": 0.1488, "step": 42900 }, { "epoch": 5.8283812693130495, "grad_norm": 0.7393041253089905, "learning_rate": 2.48448742313101e-07, "loss": 0.1487, "step": 42910 }, { "epoch": 5.829739549730042, "grad_norm": 1.4209617376327515, "learning_rate": 2.445301608624051e-07, "loss": 0.1396, "step": 42920 }, { "epoch": 5.8310978301470335, "grad_norm": 1.0118939876556396, "learning_rate": 2.4064265210423376e-07, "loss": 0.1404, "step": 42930 }, { "epoch": 5.832456110564026, "grad_norm": 0.7043196558952332, "learning_rate": 2.3678621846637828e-07, "loss": 0.1386, "step": 42940 }, { "epoch": 5.8338143909810185, "grad_norm": 0.6645058989524841, "learning_rate": 2.329608623572066e-07, "loss": 0.1399, "step": 42950 }, { "epoch": 5.83517267139801, "grad_norm": 0.9369907975196838, "learning_rate": 2.2916658616568553e-07, "loss": 0.138, "step": 42960 }, { "epoch": 5.836530951815003, "grad_norm": 0.6439912915229797, "learning_rate": 2.254033922613752e-07, "loss": 0.1381, "step": 42970 }, { "epoch": 5.837889232231994, "grad_norm": 0.5435279607772827, "learning_rate": 2.216712829944123e-07, "loss": 0.1353, "step": 42980 }, { "epoch": 5.839247512648987, "grad_norm": 0.9074403643608093, "learning_rate": 2.179702606955325e-07, "loss": 0.1401, "step": 42990 }, { "epoch": 5.840605793065978, "grad_norm": 0.9018656015396118, "learning_rate": 2.1430032767605358e-07, "loss": 0.1534, "step": 43000 }, { "epoch": 5.841964073482971, "grad_norm": 0.6688591837882996, "learning_rate": 2.1066148622788662e-07, "loss": 0.1379, "step": 43010 }, { "epoch": 5.843322353899962, "grad_norm": 0.8841167092323303, "learning_rate": 2.070537386235083e-07, "loss": 0.1375, "step": 43020 }, { "epoch": 5.844680634316955, "grad_norm": 1.3263598680496216, "learning_rate": 2.0347708711598302e-07, "loss": 0.132, "step": 43030 }, { "epoch": 5.846038914733947, "grad_norm": 0.9241674542427063, "learning_rate": 1.9993153393896847e-07, "loss": 0.1456, "step": 43040 }, { "epoch": 5.847397195150939, "grad_norm": 0.9472998380661011, "learning_rate": 1.964170813066879e-07, "loss": 0.1447, "step": 43050 }, { "epoch": 5.8487554755679305, "grad_norm": 0.7257258892059326, "learning_rate": 1.9293373141394122e-07, "loss": 0.1474, "step": 43060 }, { "epoch": 5.850113755984923, "grad_norm": 0.8458983898162842, "learning_rate": 1.8948148643611608e-07, "loss": 0.1469, "step": 43070 }, { "epoch": 5.8514720364019155, "grad_norm": 0.6000439524650574, "learning_rate": 1.860603485291601e-07, "loss": 0.1356, "step": 43080 }, { "epoch": 5.852830316818907, "grad_norm": 0.8602129220962524, "learning_rate": 1.8267031982960315e-07, "loss": 0.1433, "step": 43090 }, { "epoch": 5.8541885972359, "grad_norm": 0.6988477110862732, "learning_rate": 1.7931140245454614e-07, "loss": 0.1348, "step": 43100 }, { "epoch": 5.855546877652891, "grad_norm": 1.3469178676605225, "learning_rate": 1.7598359850166113e-07, "loss": 0.1494, "step": 43110 }, { "epoch": 5.856905158069884, "grad_norm": 0.7703179717063904, "learning_rate": 1.7268691004919123e-07, "loss": 0.1338, "step": 43120 }, { "epoch": 5.858263438486875, "grad_norm": 0.6967087388038635, "learning_rate": 1.6942133915593962e-07, "loss": 0.1404, "step": 43130 }, { "epoch": 5.859621718903868, "grad_norm": 0.8643378615379333, "learning_rate": 1.6618688786128046e-07, "loss": 0.1387, "step": 43140 }, { "epoch": 5.860979999320859, "grad_norm": 0.9114177823066711, "learning_rate": 1.6298355818515908e-07, "loss": 0.1476, "step": 43150 }, { "epoch": 5.862338279737852, "grad_norm": 1.366724967956543, "learning_rate": 1.598113521280864e-07, "loss": 0.1394, "step": 43160 }, { "epoch": 5.863696560154844, "grad_norm": 0.7426915764808655, "learning_rate": 1.5667027167111658e-07, "loss": 0.1421, "step": 43170 }, { "epoch": 5.865054840571836, "grad_norm": 1.5762697458267212, "learning_rate": 1.535603187758916e-07, "loss": 0.1482, "step": 43180 }, { "epoch": 5.866413120988828, "grad_norm": 0.6786098480224609, "learning_rate": 1.504814953845912e-07, "loss": 0.1445, "step": 43190 }, { "epoch": 5.86777140140582, "grad_norm": 0.48607930541038513, "learning_rate": 1.4743380341997738e-07, "loss": 0.1419, "step": 43200 }, { "epoch": 5.8691296818228125, "grad_norm": 0.9134004712104797, "learning_rate": 1.4441724478535534e-07, "loss": 0.1318, "step": 43210 }, { "epoch": 5.870487962239804, "grad_norm": 0.8437184691429138, "learning_rate": 1.4143182136457934e-07, "loss": 0.128, "step": 43220 }, { "epoch": 5.8718462426567966, "grad_norm": 1.0287580490112305, "learning_rate": 1.384775350220857e-07, "loss": 0.1429, "step": 43230 }, { "epoch": 5.873204523073788, "grad_norm": 0.9547917246818542, "learning_rate": 1.3555438760284311e-07, "loss": 0.1332, "step": 43240 }, { "epoch": 5.874562803490781, "grad_norm": 0.7700017094612122, "learning_rate": 1.3266238093238015e-07, "loss": 0.135, "step": 43250 }, { "epoch": 5.875921083907773, "grad_norm": 0.502143383026123, "learning_rate": 1.2980151681677431e-07, "loss": 0.1304, "step": 43260 }, { "epoch": 5.877279364324765, "grad_norm": 1.0156567096710205, "learning_rate": 1.2697179704266871e-07, "loss": 0.1501, "step": 43270 }, { "epoch": 5.878637644741757, "grad_norm": 0.5612376928329468, "learning_rate": 1.241732233772386e-07, "loss": 0.1373, "step": 43280 }, { "epoch": 5.879995925158749, "grad_norm": 0.9121813178062439, "learning_rate": 1.2140579756821924e-07, "loss": 0.1426, "step": 43290 }, { "epoch": 5.881354205575741, "grad_norm": 0.7083497047424316, "learning_rate": 1.1866952134388931e-07, "loss": 0.1421, "step": 43300 }, { "epoch": 5.882712485992733, "grad_norm": 0.6439822316169739, "learning_rate": 1.1596439641307077e-07, "loss": 0.1354, "step": 43310 }, { "epoch": 5.884070766409725, "grad_norm": 0.6784988641738892, "learning_rate": 1.132904244651456e-07, "loss": 0.1415, "step": 43320 }, { "epoch": 5.885429046826717, "grad_norm": 0.8995897769927979, "learning_rate": 1.1064760717002243e-07, "loss": 0.1378, "step": 43330 }, { "epoch": 5.8867873272437095, "grad_norm": 1.5300796031951904, "learning_rate": 1.0803594617816437e-07, "loss": 0.1444, "step": 43340 }, { "epoch": 5.888145607660702, "grad_norm": 1.277114748954773, "learning_rate": 1.0545544312057787e-07, "loss": 0.1486, "step": 43350 }, { "epoch": 5.8895038880776935, "grad_norm": 0.8330977559089661, "learning_rate": 1.0290609960880159e-07, "loss": 0.138, "step": 43360 }, { "epoch": 5.890862168494686, "grad_norm": 0.5882835388183594, "learning_rate": 1.0038791723492868e-07, "loss": 0.154, "step": 43370 }, { "epoch": 5.892220448911678, "grad_norm": 0.6834270358085632, "learning_rate": 9.790089757157894e-08, "loss": 0.1328, "step": 43380 }, { "epoch": 5.89357872932867, "grad_norm": 0.6746418476104736, "learning_rate": 9.544504217192107e-08, "loss": 0.1387, "step": 43390 }, { "epoch": 5.894937009745662, "grad_norm": 0.7515066862106323, "learning_rate": 9.302035256965048e-08, "loss": 0.1425, "step": 43400 }, { "epoch": 5.896295290162654, "grad_norm": 0.814963698387146, "learning_rate": 9.062683027901142e-08, "loss": 0.1387, "step": 43410 }, { "epoch": 5.897653570579646, "grad_norm": 0.7498738169670105, "learning_rate": 8.826447679476934e-08, "loss": 0.1489, "step": 43420 }, { "epoch": 5.899011850996638, "grad_norm": 0.9613010287284851, "learning_rate": 8.593329359224411e-08, "loss": 0.1344, "step": 43430 }, { "epoch": 5.900370131413631, "grad_norm": 0.6479271650314331, "learning_rate": 8.363328212727117e-08, "loss": 0.1455, "step": 43440 }, { "epoch": 5.901728411830622, "grad_norm": 1.2872908115386963, "learning_rate": 8.136444383622933e-08, "loss": 0.1371, "step": 43450 }, { "epoch": 5.903086692247615, "grad_norm": 0.8949052095413208, "learning_rate": 7.912678013602404e-08, "loss": 0.1449, "step": 43460 }, { "epoch": 5.904444972664606, "grad_norm": 0.8261060118675232, "learning_rate": 7.692029242409304e-08, "loss": 0.1377, "step": 43470 }, { "epoch": 5.905803253081599, "grad_norm": 0.7346044778823853, "learning_rate": 7.474498207841185e-08, "loss": 0.1366, "step": 43480 }, { "epoch": 5.9071615334985905, "grad_norm": 1.4242463111877441, "learning_rate": 7.260085045747156e-08, "loss": 0.1492, "step": 43490 }, { "epoch": 5.908519813915583, "grad_norm": 0.9833252429962158, "learning_rate": 7.04878989003066e-08, "loss": 0.1351, "step": 43500 }, { "epoch": 5.909878094332575, "grad_norm": 0.6278096437454224, "learning_rate": 6.840612872647256e-08, "loss": 0.1337, "step": 43510 }, { "epoch": 5.911236374749567, "grad_norm": 0.5873438715934753, "learning_rate": 6.635554123604614e-08, "loss": 0.1432, "step": 43520 }, { "epoch": 5.91259465516656, "grad_norm": 1.2704215049743652, "learning_rate": 6.433613770963631e-08, "loss": 0.1503, "step": 43530 }, { "epoch": 5.913952935583551, "grad_norm": 0.7020022869110107, "learning_rate": 6.234791940837869e-08, "loss": 0.1417, "step": 43540 }, { "epoch": 5.915311216000544, "grad_norm": 1.0867749452590942, "learning_rate": 6.039088757393563e-08, "loss": 0.1376, "step": 43550 }, { "epoch": 5.916669496417535, "grad_norm": 0.8384567499160767, "learning_rate": 5.846504342847947e-08, "loss": 0.1502, "step": 43560 }, { "epoch": 5.918027776834528, "grad_norm": 0.879112958908081, "learning_rate": 5.657038817472593e-08, "loss": 0.1359, "step": 43570 }, { "epoch": 5.919386057251519, "grad_norm": 1.3054834604263306, "learning_rate": 5.470692299589519e-08, "loss": 0.14, "step": 43580 }, { "epoch": 5.920744337668512, "grad_norm": 0.6657294631004333, "learning_rate": 5.287464905573969e-08, "loss": 0.1484, "step": 43590 }, { "epoch": 5.922102618085503, "grad_norm": 0.6832488179206848, "learning_rate": 5.107356749853298e-08, "loss": 0.1415, "step": 43600 }, { "epoch": 5.923460898502496, "grad_norm": 0.6521658301353455, "learning_rate": 4.930367944905312e-08, "loss": 0.1373, "step": 43610 }, { "epoch": 5.924819178919488, "grad_norm": 0.7443221211433411, "learning_rate": 4.756498601262704e-08, "loss": 0.1368, "step": 43620 }, { "epoch": 5.92617745933648, "grad_norm": 0.8846837282180786, "learning_rate": 4.585748827506398e-08, "loss": 0.137, "step": 43630 }, { "epoch": 5.9275357397534725, "grad_norm": 0.7875809669494629, "learning_rate": 4.4181187302722025e-08, "loss": 0.1512, "step": 43640 }, { "epoch": 5.928894020170464, "grad_norm": 0.7589226961135864, "learning_rate": 4.253608414246379e-08, "loss": 0.1386, "step": 43650 }, { "epoch": 5.930252300587457, "grad_norm": 0.7143007516860962, "learning_rate": 4.092217982166191e-08, "loss": 0.1329, "step": 43660 }, { "epoch": 5.931610581004448, "grad_norm": 1.0409319400787354, "learning_rate": 3.933947534822124e-08, "loss": 0.1411, "step": 43670 }, { "epoch": 5.932968861421441, "grad_norm": 0.6191720366477966, "learning_rate": 3.778797171055115e-08, "loss": 0.1351, "step": 43680 }, { "epoch": 5.934327141838432, "grad_norm": 0.8280099034309387, "learning_rate": 3.626766987757657e-08, "loss": 0.1371, "step": 43690 }, { "epoch": 5.935685422255425, "grad_norm": 1.2657291889190674, "learning_rate": 3.477857079873803e-08, "loss": 0.1494, "step": 43700 }, { "epoch": 5.937043702672417, "grad_norm": 1.0027809143066406, "learning_rate": 3.332067540399164e-08, "loss": 0.1431, "step": 43710 }, { "epoch": 5.938401983089409, "grad_norm": 0.6957676410675049, "learning_rate": 3.189398460380355e-08, "loss": 0.1567, "step": 43720 }, { "epoch": 5.9397602635064, "grad_norm": 0.4745795726776123, "learning_rate": 3.049849928915549e-08, "loss": 0.1466, "step": 43730 }, { "epoch": 5.941118543923393, "grad_norm": 0.7248262166976929, "learning_rate": 2.913422033153923e-08, "loss": 0.1311, "step": 43740 }, { "epoch": 5.942476824340385, "grad_norm": 0.8796412944793701, "learning_rate": 2.7801148582956572e-08, "loss": 0.1324, "step": 43750 }, { "epoch": 5.943835104757377, "grad_norm": 0.8329164981842041, "learning_rate": 2.649928487591935e-08, "loss": 0.1489, "step": 43760 }, { "epoch": 5.9451933851743695, "grad_norm": 2.182788133621216, "learning_rate": 2.5228630023454992e-08, "loss": 0.1483, "step": 43770 }, { "epoch": 5.946551665591361, "grad_norm": 0.9235145449638367, "learning_rate": 2.39891848190954e-08, "loss": 0.1323, "step": 43780 }, { "epoch": 5.9479099460083535, "grad_norm": 0.5664082765579224, "learning_rate": 2.278095003688807e-08, "loss": 0.1508, "step": 43790 }, { "epoch": 5.949268226425345, "grad_norm": 0.7792126536369324, "learning_rate": 2.160392643138498e-08, "loss": 0.1397, "step": 43800 }, { "epoch": 5.950626506842338, "grad_norm": 0.9972976446151733, "learning_rate": 2.0458114737642586e-08, "loss": 0.1591, "step": 43810 }, { "epoch": 5.951984787259329, "grad_norm": 0.7094663381576538, "learning_rate": 1.9343515671232935e-08, "loss": 0.139, "step": 43820 }, { "epoch": 5.953343067676322, "grad_norm": 0.8355711102485657, "learning_rate": 1.826012992823256e-08, "loss": 0.1396, "step": 43830 }, { "epoch": 5.954701348093314, "grad_norm": 1.016047716140747, "learning_rate": 1.7207958185222472e-08, "loss": 0.1368, "step": 43840 }, { "epoch": 5.956059628510306, "grad_norm": 1.1851811408996582, "learning_rate": 1.618700109929372e-08, "loss": 0.143, "step": 43850 }, { "epoch": 5.957417908927298, "grad_norm": 0.8799508810043335, "learning_rate": 1.5197259308036283e-08, "loss": 0.1488, "step": 43860 }, { "epoch": 5.95877618934429, "grad_norm": 0.7225849032402039, "learning_rate": 1.423873342956128e-08, "loss": 0.1409, "step": 43870 }, { "epoch": 5.960134469761282, "grad_norm": 0.7054046988487244, "learning_rate": 1.331142406247321e-08, "loss": 0.1463, "step": 43880 }, { "epoch": 5.961492750178274, "grad_norm": 0.7974108457565308, "learning_rate": 1.2415331785881057e-08, "loss": 0.1353, "step": 43890 }, { "epoch": 5.962851030595266, "grad_norm": 0.9054345488548279, "learning_rate": 1.1550457159403838e-08, "loss": 0.133, "step": 43900 }, { "epoch": 5.964209311012258, "grad_norm": 0.7178274393081665, "learning_rate": 1.0716800723165054e-08, "loss": 0.1388, "step": 43910 }, { "epoch": 5.9655675914292505, "grad_norm": 0.7498517036437988, "learning_rate": 9.914362997787142e-09, "loss": 0.129, "step": 43920 }, { "epoch": 5.966925871846243, "grad_norm": 0.7260889410972595, "learning_rate": 9.143144484397015e-09, "loss": 0.1398, "step": 43930 }, { "epoch": 5.968284152263235, "grad_norm": 0.7839494943618774, "learning_rate": 8.403145664626077e-09, "loss": 0.1315, "step": 43940 }, { "epoch": 5.969642432680227, "grad_norm": 1.13367760181427, "learning_rate": 7.694367000615765e-09, "loss": 0.1352, "step": 43950 }, { "epoch": 5.971000713097219, "grad_norm": 0.9072442054748535, "learning_rate": 7.016808935000896e-09, "loss": 0.1381, "step": 43960 }, { "epoch": 5.972358993514211, "grad_norm": 0.7867223024368286, "learning_rate": 6.370471890926322e-09, "loss": 0.1361, "step": 43970 }, { "epoch": 5.973717273931203, "grad_norm": 0.7834551334381104, "learning_rate": 5.7553562720247256e-09, "loss": 0.154, "step": 43980 }, { "epoch": 5.975075554348195, "grad_norm": 0.7563917636871338, "learning_rate": 5.171462462444376e-09, "loss": 0.1367, "step": 43990 }, { "epoch": 5.976433834765187, "grad_norm": 0.7050895094871521, "learning_rate": 4.6187908268380265e-09, "loss": 0.1323, "step": 44000 }, { "epoch": 5.977792115182179, "grad_norm": 3.11155104637146, "learning_rate": 4.0973417103462584e-09, "loss": 0.1361, "step": 44010 }, { "epoch": 5.979150395599172, "grad_norm": 0.8904614448547363, "learning_rate": 3.6071154386252413e-09, "loss": 0.1392, "step": 44020 }, { "epoch": 5.980508676016163, "grad_norm": 0.6612969636917114, "learning_rate": 3.1481123178189743e-09, "loss": 0.141, "step": 44030 }, { "epoch": 5.981866956433156, "grad_norm": 0.8515172004699707, "learning_rate": 2.7203326345759394e-09, "loss": 0.1381, "step": 44040 }, { "epoch": 5.9832252368501475, "grad_norm": 0.525837242603302, "learning_rate": 2.3237766560602058e-09, "loss": 0.1321, "step": 44050 }, { "epoch": 5.98458351726714, "grad_norm": 1.2149951457977295, "learning_rate": 1.95844462991257e-09, "loss": 0.1328, "step": 44060 }, { "epoch": 5.985941797684132, "grad_norm": 1.3056334257125854, "learning_rate": 1.6243367842894153e-09, "loss": 0.131, "step": 44070 }, { "epoch": 5.987300078101124, "grad_norm": 0.8542829751968384, "learning_rate": 1.3214533278460562e-09, "loss": 0.1392, "step": 44080 }, { "epoch": 5.988658358518116, "grad_norm": 0.623806357383728, "learning_rate": 1.0497944497367406e-09, "loss": 0.1446, "step": 44090 }, { "epoch": 5.990016638935108, "grad_norm": 1.1721960306167603, "learning_rate": 8.093603196090982e-10, "loss": 0.1318, "step": 44100 }, { "epoch": 5.991374919352101, "grad_norm": 0.9991965889930725, "learning_rate": 6.001510876152416e-10, "loss": 0.1391, "step": 44110 }, { "epoch": 5.992733199769092, "grad_norm": 0.5916539430618286, "learning_rate": 4.2216688441731876e-10, "loss": 0.1423, "step": 44120 }, { "epoch": 5.994091480186085, "grad_norm": 0.7762668132781982, "learning_rate": 2.754078211597566e-10, "loss": 0.148, "step": 44130 }, { "epoch": 5.995449760603076, "grad_norm": 0.5913887023925781, "learning_rate": 1.5987398950256804e-10, "loss": 0.135, "step": 44140 }, { "epoch": 5.996808041020069, "grad_norm": 0.5935466289520264, "learning_rate": 7.556546158804523e-11, "loss": 0.1385, "step": 44150 }, { "epoch": 5.99816632143706, "grad_norm": 0.7687810063362122, "learning_rate": 2.24822900685151e-11, "loss": 0.1431, "step": 44160 }, { "epoch": 5.999524601854053, "grad_norm": 0.8409439325332642, "learning_rate": 6.245081007882902e-13, "loss": 0.1415, "step": 44170 } ], "logging_steps": 10, "max_steps": 44172, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.466008313374494e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }