diff --git "a/checkpoint-23280/trainer_state.json" "b/checkpoint-23280/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-23280/trainer_state.json" @@ -0,0 +1,6585 @@ +{ + "best_metric": 0.08581268042325974, + "best_model_checkpoint": "autotrain-ai-image-detect-20250613-0139/checkpoint-23280", + "epoch": 2.0, + "eval_steps": 500, + "global_step": 23280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002147766323024055, + "grad_norm": 6.364386558532715, + "learning_rate": 1.9999993676736793e-05, + "loss": 0.0701, + "step": 25 + }, + { + "epoch": 0.00429553264604811, + "grad_norm": 0.2656271457672119, + "learning_rate": 1.9999974706955162e-05, + "loss": 0.0549, + "step": 50 + }, + { + "epoch": 0.006443298969072165, + "grad_norm": 0.06899937242269516, + "learning_rate": 1.99999430906791e-05, + "loss": 0.0363, + "step": 75 + }, + { + "epoch": 0.00859106529209622, + "grad_norm": 0.31279855966567993, + "learning_rate": 1.9999898827948593e-05, + "loss": 0.0537, + "step": 100 + }, + { + "epoch": 0.010738831615120275, + "grad_norm": 7.762655258178711, + "learning_rate": 1.9999841918819616e-05, + "loss": 0.067, + "step": 125 + }, + { + "epoch": 0.01288659793814433, + "grad_norm": 0.12942329049110413, + "learning_rate": 1.9999772363364142e-05, + "loss": 0.0708, + "step": 150 + }, + { + "epoch": 0.015034364261168385, + "grad_norm": 0.1635713428258896, + "learning_rate": 1.9999690161670123e-05, + "loss": 0.0304, + "step": 175 + }, + { + "epoch": 0.01718213058419244, + "grad_norm": 6.841973781585693, + "learning_rate": 1.999959531384153e-05, + "loss": 0.0564, + "step": 200 + }, + { + "epoch": 0.019329896907216496, + "grad_norm": 4.627384185791016, + "learning_rate": 1.9999487819998307e-05, + "loss": 0.0631, + "step": 225 + }, + { + "epoch": 0.02147766323024055, + "grad_norm": 26.23409652709961, + "learning_rate": 1.9999367680276395e-05, + "loss": 0.0525, + "step": 250 + }, + { + "epoch": 0.023625429553264604, + "grad_norm": 0.18105703592300415, + "learning_rate": 1.999923489482773e-05, + "loss": 0.0626, + "step": 275 + }, + { + "epoch": 0.02577319587628866, + "grad_norm": 8.290199279785156, + "learning_rate": 1.999908946382024e-05, + "loss": 0.0489, + "step": 300 + }, + { + "epoch": 0.027920962199312716, + "grad_norm": 0.10118798166513443, + "learning_rate": 1.9998931387437845e-05, + "loss": 0.0274, + "step": 325 + }, + { + "epoch": 0.03006872852233677, + "grad_norm": 0.40575110912323, + "learning_rate": 1.9998760665880453e-05, + "loss": 0.0646, + "step": 350 + }, + { + "epoch": 0.03221649484536082, + "grad_norm": 0.7302772402763367, + "learning_rate": 1.9998577299363974e-05, + "loss": 0.0367, + "step": 375 + }, + { + "epoch": 0.03436426116838488, + "grad_norm": 4.77200174331665, + "learning_rate": 1.9998381288120296e-05, + "loss": 0.0967, + "step": 400 + }, + { + "epoch": 0.03651202749140894, + "grad_norm": 0.13119499385356903, + "learning_rate": 1.999817263239731e-05, + "loss": 0.0387, + "step": 425 + }, + { + "epoch": 0.03865979381443299, + "grad_norm": 0.1904808133840561, + "learning_rate": 1.999795133245889e-05, + "loss": 0.063, + "step": 450 + }, + { + "epoch": 0.040807560137457045, + "grad_norm": 4.139963150024414, + "learning_rate": 1.999771738858491e-05, + "loss": 0.038, + "step": 475 + }, + { + "epoch": 0.0429553264604811, + "grad_norm": 5.248744964599609, + "learning_rate": 1.999747080107122e-05, + "loss": 0.0566, + "step": 500 + }, + { + "epoch": 0.045103092783505154, + "grad_norm": 0.14466960728168488, + "learning_rate": 1.999721157022967e-05, + "loss": 0.0574, + "step": 525 + }, + { + "epoch": 0.04725085910652921, + "grad_norm": 9.33919906616211, + "learning_rate": 1.9996939696388095e-05, + "loss": 0.0508, + "step": 550 + }, + { + "epoch": 0.04939862542955326, + "grad_norm": 0.19377951323986053, + "learning_rate": 1.999665517989033e-05, + "loss": 0.0577, + "step": 575 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 0.4148195683956146, + "learning_rate": 1.9996358021096174e-05, + "loss": 0.0629, + "step": 600 + }, + { + "epoch": 0.05369415807560137, + "grad_norm": 0.21487022936344147, + "learning_rate": 1.9996048220381447e-05, + "loss": 0.0364, + "step": 625 + }, + { + "epoch": 0.05584192439862543, + "grad_norm": 0.5803439617156982, + "learning_rate": 1.9995725778137927e-05, + "loss": 0.0182, + "step": 650 + }, + { + "epoch": 0.05798969072164949, + "grad_norm": 0.16263632476329803, + "learning_rate": 1.9995390694773396e-05, + "loss": 0.0697, + "step": 675 + }, + { + "epoch": 0.06013745704467354, + "grad_norm": 0.5192539691925049, + "learning_rate": 1.9995042970711615e-05, + "loss": 0.0755, + "step": 700 + }, + { + "epoch": 0.062285223367697595, + "grad_norm": 0.4817010760307312, + "learning_rate": 1.999468260639234e-05, + "loss": 0.0742, + "step": 725 + }, + { + "epoch": 0.06443298969072164, + "grad_norm": 3.310624837875366, + "learning_rate": 1.9994309602271302e-05, + "loss": 0.0801, + "step": 750 + }, + { + "epoch": 0.0665807560137457, + "grad_norm": 9.354548454284668, + "learning_rate": 1.9993923958820224e-05, + "loss": 0.0639, + "step": 775 + }, + { + "epoch": 0.06872852233676977, + "grad_norm": 5.356099605560303, + "learning_rate": 1.9993525676526807e-05, + "loss": 0.0575, + "step": 800 + }, + { + "epoch": 0.07087628865979381, + "grad_norm": 0.3128518760204315, + "learning_rate": 1.9993114755894745e-05, + "loss": 0.0243, + "step": 825 + }, + { + "epoch": 0.07302405498281787, + "grad_norm": 0.6620001792907715, + "learning_rate": 1.9992691197443707e-05, + "loss": 0.0459, + "step": 850 + }, + { + "epoch": 0.07517182130584192, + "grad_norm": 64.59632110595703, + "learning_rate": 1.9992255001709345e-05, + "loss": 0.0597, + "step": 875 + }, + { + "epoch": 0.07731958762886598, + "grad_norm": 33.163761138916016, + "learning_rate": 1.9991806169243302e-05, + "loss": 0.0523, + "step": 900 + }, + { + "epoch": 0.07946735395189003, + "grad_norm": 0.0949297621846199, + "learning_rate": 1.999134470061319e-05, + "loss": 0.0541, + "step": 925 + }, + { + "epoch": 0.08161512027491409, + "grad_norm": 11.246814727783203, + "learning_rate": 1.9990870596402607e-05, + "loss": 0.0466, + "step": 950 + }, + { + "epoch": 0.08376288659793814, + "grad_norm": 0.07439606636762619, + "learning_rate": 1.999038385721113e-05, + "loss": 0.0531, + "step": 975 + }, + { + "epoch": 0.0859106529209622, + "grad_norm": 6.2220683097839355, + "learning_rate": 1.9989884483654315e-05, + "loss": 0.0696, + "step": 1000 + }, + { + "epoch": 0.08805841924398626, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.9989372476363698e-05, + "loss": 0.0508, + "step": 1025 + }, + { + "epoch": 0.09020618556701031, + "grad_norm": 9.115586280822754, + "learning_rate": 1.998884783598679e-05, + "loss": 0.0763, + "step": 1050 + }, + { + "epoch": 0.09235395189003437, + "grad_norm": 0.41690298914909363, + "learning_rate": 1.9988310563187077e-05, + "loss": 0.0575, + "step": 1075 + }, + { + "epoch": 0.09450171821305842, + "grad_norm": 3.7026329040527344, + "learning_rate": 1.9987760658644023e-05, + "loss": 0.0566, + "step": 1100 + }, + { + "epoch": 0.09664948453608248, + "grad_norm": 0.2436683177947998, + "learning_rate": 1.9987198123053066e-05, + "loss": 0.0475, + "step": 1125 + }, + { + "epoch": 0.09879725085910653, + "grad_norm": 0.301731139421463, + "learning_rate": 1.998662295712562e-05, + "loss": 0.0351, + "step": 1150 + }, + { + "epoch": 0.10094501718213059, + "grad_norm": 9.841277122497559, + "learning_rate": 1.998603516158907e-05, + "loss": 0.0337, + "step": 1175 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 15.753165245056152, + "learning_rate": 1.998543473718677e-05, + "loss": 0.0591, + "step": 1200 + }, + { + "epoch": 0.1052405498281787, + "grad_norm": 39.0996208190918, + "learning_rate": 1.9984821684678052e-05, + "loss": 0.0487, + "step": 1225 + }, + { + "epoch": 0.10738831615120274, + "grad_norm": 0.09229845553636551, + "learning_rate": 1.998419600483821e-05, + "loss": 0.0664, + "step": 1250 + }, + { + "epoch": 0.1095360824742268, + "grad_norm": 0.6484953165054321, + "learning_rate": 1.998355769845852e-05, + "loss": 0.0544, + "step": 1275 + }, + { + "epoch": 0.11168384879725086, + "grad_norm": 6.235883712768555, + "learning_rate": 1.998290676634621e-05, + "loss": 0.0688, + "step": 1300 + }, + { + "epoch": 0.11383161512027491, + "grad_norm": 0.06320979446172714, + "learning_rate": 1.9982243209324485e-05, + "loss": 0.0291, + "step": 1325 + }, + { + "epoch": 0.11597938144329897, + "grad_norm": 4.2293901443481445, + "learning_rate": 1.9981567028232514e-05, + "loss": 0.0578, + "step": 1350 + }, + { + "epoch": 0.11812714776632302, + "grad_norm": 1.0895273685455322, + "learning_rate": 1.998087822392543e-05, + "loss": 0.0664, + "step": 1375 + }, + { + "epoch": 0.12027491408934708, + "grad_norm": 0.5239425897598267, + "learning_rate": 1.9980176797274335e-05, + "loss": 0.0603, + "step": 1400 + }, + { + "epoch": 0.12242268041237113, + "grad_norm": 1.0259077548980713, + "learning_rate": 1.997946274916629e-05, + "loss": 0.0465, + "step": 1425 + }, + { + "epoch": 0.12457044673539519, + "grad_norm": 3.306816577911377, + "learning_rate": 1.9978736080504313e-05, + "loss": 0.0396, + "step": 1450 + }, + { + "epoch": 0.12671821305841924, + "grad_norm": 12.068175315856934, + "learning_rate": 1.997799679220739e-05, + "loss": 0.0315, + "step": 1475 + }, + { + "epoch": 0.12886597938144329, + "grad_norm": 0.3856680393218994, + "learning_rate": 1.9977244885210467e-05, + "loss": 0.0633, + "step": 1500 + }, + { + "epoch": 0.13101374570446736, + "grad_norm": 0.6916818022727966, + "learning_rate": 1.9976480360464442e-05, + "loss": 0.0566, + "step": 1525 + }, + { + "epoch": 0.1331615120274914, + "grad_norm": 9.237444877624512, + "learning_rate": 1.997570321893617e-05, + "loss": 0.0296, + "step": 1550 + }, + { + "epoch": 0.13530927835051546, + "grad_norm": 3.657514810562134, + "learning_rate": 1.9974913461608473e-05, + "loss": 0.0671, + "step": 1575 + }, + { + "epoch": 0.13745704467353953, + "grad_norm": 1.2361929416656494, + "learning_rate": 1.9974111089480112e-05, + "loss": 0.0481, + "step": 1600 + }, + { + "epoch": 0.13960481099656358, + "grad_norm": 5.634768009185791, + "learning_rate": 1.9973296103565817e-05, + "loss": 0.0586, + "step": 1625 + }, + { + "epoch": 0.14175257731958762, + "grad_norm": 0.4086708724498749, + "learning_rate": 1.9972468504896253e-05, + "loss": 0.0368, + "step": 1650 + }, + { + "epoch": 0.14390034364261167, + "grad_norm": 0.1444201022386551, + "learning_rate": 1.9971628294518052e-05, + "loss": 0.0657, + "step": 1675 + }, + { + "epoch": 0.14604810996563575, + "grad_norm": 6.272415637969971, + "learning_rate": 1.997077547349378e-05, + "loss": 0.068, + "step": 1700 + }, + { + "epoch": 0.1481958762886598, + "grad_norm": 0.2670227885246277, + "learning_rate": 1.996991004290197e-05, + "loss": 0.061, + "step": 1725 + }, + { + "epoch": 0.15034364261168384, + "grad_norm": 0.14512453973293304, + "learning_rate": 1.9969032003837085e-05, + "loss": 0.0491, + "step": 1750 + }, + { + "epoch": 0.15249140893470792, + "grad_norm": 5.123754024505615, + "learning_rate": 1.996814135740954e-05, + "loss": 0.0236, + "step": 1775 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 4.357936859130859, + "learning_rate": 1.9967238104745695e-05, + "loss": 0.061, + "step": 1800 + }, + { + "epoch": 0.156786941580756, + "grad_norm": 8.39400863647461, + "learning_rate": 1.996632224698785e-05, + "loss": 0.0354, + "step": 1825 + }, + { + "epoch": 0.15893470790378006, + "grad_norm": 0.050370946526527405, + "learning_rate": 1.996539378529425e-05, + "loss": 0.0405, + "step": 1850 + }, + { + "epoch": 0.16108247422680413, + "grad_norm": 1.0384316444396973, + "learning_rate": 1.9964452720839065e-05, + "loss": 0.0383, + "step": 1875 + }, + { + "epoch": 0.16323024054982818, + "grad_norm": 0.08300001919269562, + "learning_rate": 1.9963499054812424e-05, + "loss": 0.065, + "step": 1900 + }, + { + "epoch": 0.16537800687285223, + "grad_norm": 0.14855168759822845, + "learning_rate": 1.9962532788420386e-05, + "loss": 0.0346, + "step": 1925 + }, + { + "epoch": 0.16752577319587628, + "grad_norm": 4.275923728942871, + "learning_rate": 1.9961553922884938e-05, + "loss": 0.0541, + "step": 1950 + }, + { + "epoch": 0.16967353951890035, + "grad_norm": 0.1353452056646347, + "learning_rate": 1.9960562459444006e-05, + "loss": 0.0565, + "step": 1975 + }, + { + "epoch": 0.1718213058419244, + "grad_norm": 1.0562896728515625, + "learning_rate": 1.9959558399351443e-05, + "loss": 0.0347, + "step": 2000 + }, + { + "epoch": 0.17396907216494845, + "grad_norm": 0.2936221659183502, + "learning_rate": 1.995854174387704e-05, + "loss": 0.0648, + "step": 2025 + }, + { + "epoch": 0.17611683848797252, + "grad_norm": 0.3627230226993561, + "learning_rate": 1.9957512494306516e-05, + "loss": 0.0404, + "step": 2050 + }, + { + "epoch": 0.17826460481099657, + "grad_norm": 50.099117279052734, + "learning_rate": 1.995647065194151e-05, + "loss": 0.0634, + "step": 2075 + }, + { + "epoch": 0.18041237113402062, + "grad_norm": 0.4619729220867157, + "learning_rate": 1.995541621809959e-05, + "loss": 0.0492, + "step": 2100 + }, + { + "epoch": 0.18256013745704466, + "grad_norm": 0.06161491572856903, + "learning_rate": 1.995434919411425e-05, + "loss": 0.0487, + "step": 2125 + }, + { + "epoch": 0.18470790378006874, + "grad_norm": 2.6800079345703125, + "learning_rate": 1.99532695813349e-05, + "loss": 0.0563, + "step": 2150 + }, + { + "epoch": 0.18685567010309279, + "grad_norm": 0.10793239623308182, + "learning_rate": 1.9952177381126885e-05, + "loss": 0.059, + "step": 2175 + }, + { + "epoch": 0.18900343642611683, + "grad_norm": 23.19139289855957, + "learning_rate": 1.9951072594871452e-05, + "loss": 0.0569, + "step": 2200 + }, + { + "epoch": 0.19115120274914088, + "grad_norm": 0.14359937608242035, + "learning_rate": 1.9949955223965775e-05, + "loss": 0.0444, + "step": 2225 + }, + { + "epoch": 0.19329896907216496, + "grad_norm": 17.149826049804688, + "learning_rate": 1.9948825269822934e-05, + "loss": 0.0402, + "step": 2250 + }, + { + "epoch": 0.195446735395189, + "grad_norm": 5.4898271560668945, + "learning_rate": 1.994768273387194e-05, + "loss": 0.059, + "step": 2275 + }, + { + "epoch": 0.19759450171821305, + "grad_norm": 8.055678367614746, + "learning_rate": 1.994652761755769e-05, + "loss": 0.0943, + "step": 2300 + }, + { + "epoch": 0.19974226804123713, + "grad_norm": 0.5743317008018494, + "learning_rate": 1.994535992234102e-05, + "loss": 0.0257, + "step": 2325 + }, + { + "epoch": 0.20189003436426117, + "grad_norm": 4.2678704261779785, + "learning_rate": 1.9944179649698645e-05, + "loss": 0.0615, + "step": 2350 + }, + { + "epoch": 0.20403780068728522, + "grad_norm": 1.0461511611938477, + "learning_rate": 1.9942986801123206e-05, + "loss": 0.0411, + "step": 2375 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 8.589263916015625, + "learning_rate": 1.9941781378123244e-05, + "loss": 0.0446, + "step": 2400 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 0.23947636783123016, + "learning_rate": 1.9940563382223196e-05, + "loss": 0.0311, + "step": 2425 + }, + { + "epoch": 0.2104810996563574, + "grad_norm": 35.49877166748047, + "learning_rate": 1.9939332814963407e-05, + "loss": 0.0506, + "step": 2450 + }, + { + "epoch": 0.21262886597938144, + "grad_norm": 0.10725884884595871, + "learning_rate": 1.993808967790012e-05, + "loss": 0.0551, + "step": 2475 + }, + { + "epoch": 0.21477663230240548, + "grad_norm": 0.4474234879016876, + "learning_rate": 1.9936833972605462e-05, + "loss": 0.0487, + "step": 2500 + }, + { + "epoch": 0.21692439862542956, + "grad_norm": 0.14895235002040863, + "learning_rate": 1.993556570066747e-05, + "loss": 0.0554, + "step": 2525 + }, + { + "epoch": 0.2190721649484536, + "grad_norm": 0.17111554741859436, + "learning_rate": 1.9934284863690073e-05, + "loss": 0.0239, + "step": 2550 + }, + { + "epoch": 0.22121993127147765, + "grad_norm": 9.492555618286133, + "learning_rate": 1.9932991463293076e-05, + "loss": 0.0424, + "step": 2575 + }, + { + "epoch": 0.22336769759450173, + "grad_norm": 0.26110920310020447, + "learning_rate": 1.9931685501112183e-05, + "loss": 0.0455, + "step": 2600 + }, + { + "epoch": 0.22551546391752578, + "grad_norm": 0.3599640429019928, + "learning_rate": 1.9930366978798983e-05, + "loss": 0.054, + "step": 2625 + }, + { + "epoch": 0.22766323024054982, + "grad_norm": 0.11810922622680664, + "learning_rate": 1.9929035898020954e-05, + "loss": 0.0792, + "step": 2650 + }, + { + "epoch": 0.22981099656357387, + "grad_norm": 2.6308884620666504, + "learning_rate": 1.9927692260461442e-05, + "loss": 0.0462, + "step": 2675 + }, + { + "epoch": 0.23195876288659795, + "grad_norm": 1.7579152584075928, + "learning_rate": 1.9926336067819686e-05, + "loss": 0.0661, + "step": 2700 + }, + { + "epoch": 0.234106529209622, + "grad_norm": 0.07637064903974533, + "learning_rate": 1.99249673218108e-05, + "loss": 0.0576, + "step": 2725 + }, + { + "epoch": 0.23625429553264604, + "grad_norm": 13.395700454711914, + "learning_rate": 1.992358602416577e-05, + "loss": 0.0325, + "step": 2750 + }, + { + "epoch": 0.23840206185567012, + "grad_norm": 6.431772232055664, + "learning_rate": 1.9922192176631464e-05, + "loss": 0.0416, + "step": 2775 + }, + { + "epoch": 0.24054982817869416, + "grad_norm": 2.539883613586426, + "learning_rate": 1.9920785780970604e-05, + "loss": 0.072, + "step": 2800 + }, + { + "epoch": 0.2426975945017182, + "grad_norm": 4.053053855895996, + "learning_rate": 1.99193668389618e-05, + "loss": 0.052, + "step": 2825 + }, + { + "epoch": 0.24484536082474226, + "grad_norm": 0.2745245099067688, + "learning_rate": 1.9917935352399516e-05, + "loss": 0.0598, + "step": 2850 + }, + { + "epoch": 0.24699312714776633, + "grad_norm": 0.47292354702949524, + "learning_rate": 1.991649132309409e-05, + "loss": 0.0548, + "step": 2875 + }, + { + "epoch": 0.24914089347079038, + "grad_norm": 4.954909324645996, + "learning_rate": 1.9915034752871714e-05, + "loss": 0.0432, + "step": 2900 + }, + { + "epoch": 0.25128865979381443, + "grad_norm": 0.3622566759586334, + "learning_rate": 1.9913565643574447e-05, + "loss": 0.0527, + "step": 2925 + }, + { + "epoch": 0.2534364261168385, + "grad_norm": 4.326613426208496, + "learning_rate": 1.9912083997060194e-05, + "loss": 0.0643, + "step": 2950 + }, + { + "epoch": 0.2555841924398625, + "grad_norm": 8.708691596984863, + "learning_rate": 1.9910589815202735e-05, + "loss": 0.0344, + "step": 2975 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 3.4240548610687256, + "learning_rate": 1.9909083099891682e-05, + "loss": 0.051, + "step": 3000 + }, + { + "epoch": 0.2598797250859107, + "grad_norm": 3.4485340118408203, + "learning_rate": 1.990756385303251e-05, + "loss": 0.0365, + "step": 3025 + }, + { + "epoch": 0.2620274914089347, + "grad_norm": 0.16023315489292145, + "learning_rate": 1.9906032076546537e-05, + "loss": 0.0601, + "step": 3050 + }, + { + "epoch": 0.26417525773195877, + "grad_norm": 0.09343978762626648, + "learning_rate": 1.990448777237093e-05, + "loss": 0.0583, + "step": 3075 + }, + { + "epoch": 0.2663230240549828, + "grad_norm": 15.7915678024292, + "learning_rate": 1.9902930942458694e-05, + "loss": 0.0586, + "step": 3100 + }, + { + "epoch": 0.26847079037800686, + "grad_norm": 0.13562452793121338, + "learning_rate": 1.990136158877868e-05, + "loss": 0.0255, + "step": 3125 + }, + { + "epoch": 0.2706185567010309, + "grad_norm": 2.393234968185425, + "learning_rate": 1.9899779713315577e-05, + "loss": 0.0427, + "step": 3150 + }, + { + "epoch": 0.27276632302405496, + "grad_norm": 0.21304355561733246, + "learning_rate": 1.9898185318069905e-05, + "loss": 0.0538, + "step": 3175 + }, + { + "epoch": 0.27491408934707906, + "grad_norm": 0.05964520201086998, + "learning_rate": 1.9896578405058028e-05, + "loss": 0.024, + "step": 3200 + }, + { + "epoch": 0.2770618556701031, + "grad_norm": 10.167670249938965, + "learning_rate": 1.989495897631212e-05, + "loss": 0.0485, + "step": 3225 + }, + { + "epoch": 0.27920962199312716, + "grad_norm": 0.0654948353767395, + "learning_rate": 1.9893327033880205e-05, + "loss": 0.0516, + "step": 3250 + }, + { + "epoch": 0.2813573883161512, + "grad_norm": 2.6573526859283447, + "learning_rate": 1.9891682579826123e-05, + "loss": 0.0581, + "step": 3275 + }, + { + "epoch": 0.28350515463917525, + "grad_norm": 5.646691799163818, + "learning_rate": 1.989002561622953e-05, + "loss": 0.0656, + "step": 3300 + }, + { + "epoch": 0.2856529209621993, + "grad_norm": 5.568312644958496, + "learning_rate": 1.9888356145185917e-05, + "loss": 0.0265, + "step": 3325 + }, + { + "epoch": 0.28780068728522334, + "grad_norm": 4.8628458976745605, + "learning_rate": 1.988667416880658e-05, + "loss": 0.0843, + "step": 3350 + }, + { + "epoch": 0.28994845360824745, + "grad_norm": 0.8384003639221191, + "learning_rate": 1.988497968921864e-05, + "loss": 0.0629, + "step": 3375 + }, + { + "epoch": 0.2920962199312715, + "grad_norm": 0.11907734721899033, + "learning_rate": 1.9883272708565022e-05, + "loss": 0.0415, + "step": 3400 + }, + { + "epoch": 0.29424398625429554, + "grad_norm": 0.22190101444721222, + "learning_rate": 1.9881553229004466e-05, + "loss": 0.0527, + "step": 3425 + }, + { + "epoch": 0.2963917525773196, + "grad_norm": 0.0642390251159668, + "learning_rate": 1.9879821252711507e-05, + "loss": 0.0528, + "step": 3450 + }, + { + "epoch": 0.29853951890034364, + "grad_norm": 0.09625408798456192, + "learning_rate": 1.9878076781876506e-05, + "loss": 0.0662, + "step": 3475 + }, + { + "epoch": 0.3006872852233677, + "grad_norm": 0.6517542600631714, + "learning_rate": 1.9876319818705604e-05, + "loss": 0.0504, + "step": 3500 + }, + { + "epoch": 0.30283505154639173, + "grad_norm": 0.5447396636009216, + "learning_rate": 1.9874550365420758e-05, + "loss": 0.0289, + "step": 3525 + }, + { + "epoch": 0.30498281786941583, + "grad_norm": 12.05667495727539, + "learning_rate": 1.98727684242597e-05, + "loss": 0.0651, + "step": 3550 + }, + { + "epoch": 0.3071305841924399, + "grad_norm": 0.1834930032491684, + "learning_rate": 1.9870973997475976e-05, + "loss": 0.0385, + "step": 3575 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 1.2707126140594482, + "learning_rate": 1.9869167087338908e-05, + "loss": 0.0374, + "step": 3600 + }, + { + "epoch": 0.311426116838488, + "grad_norm": 10.986795425415039, + "learning_rate": 1.986734769613361e-05, + "loss": 0.0676, + "step": 3625 + }, + { + "epoch": 0.313573883161512, + "grad_norm": 11.88626480102539, + "learning_rate": 1.9865515826160984e-05, + "loss": 0.0697, + "step": 3650 + }, + { + "epoch": 0.31572164948453607, + "grad_norm": 0.3198440968990326, + "learning_rate": 1.9863671479737704e-05, + "loss": 0.0622, + "step": 3675 + }, + { + "epoch": 0.3178694158075601, + "grad_norm": 4.663450241088867, + "learning_rate": 1.986181465919623e-05, + "loss": 0.0457, + "step": 3700 + }, + { + "epoch": 0.32001718213058417, + "grad_norm": 3.393342971801758, + "learning_rate": 1.9859945366884795e-05, + "loss": 0.0545, + "step": 3725 + }, + { + "epoch": 0.32216494845360827, + "grad_norm": 12.382338523864746, + "learning_rate": 1.9858063605167403e-05, + "loss": 0.0575, + "step": 3750 + }, + { + "epoch": 0.3243127147766323, + "grad_norm": 0.1436670422554016, + "learning_rate": 1.985616937642383e-05, + "loss": 0.0393, + "step": 3775 + }, + { + "epoch": 0.32646048109965636, + "grad_norm": 0.2215060293674469, + "learning_rate": 1.985426268304962e-05, + "loss": 0.0182, + "step": 3800 + }, + { + "epoch": 0.3286082474226804, + "grad_norm": 0.037068601697683334, + "learning_rate": 1.9852343527456074e-05, + "loss": 0.0635, + "step": 3825 + }, + { + "epoch": 0.33075601374570446, + "grad_norm": 0.357901394367218, + "learning_rate": 1.9850411912070253e-05, + "loss": 0.095, + "step": 3850 + }, + { + "epoch": 0.3329037800687285, + "grad_norm": 0.22573980689048767, + "learning_rate": 1.9848467839334994e-05, + "loss": 0.0851, + "step": 3875 + }, + { + "epoch": 0.33505154639175255, + "grad_norm": 0.2808251976966858, + "learning_rate": 1.9846511311708857e-05, + "loss": 0.0628, + "step": 3900 + }, + { + "epoch": 0.33719931271477666, + "grad_norm": 0.20651878416538239, + "learning_rate": 1.984454233166618e-05, + "loss": 0.054, + "step": 3925 + }, + { + "epoch": 0.3393470790378007, + "grad_norm": 0.49057453870773315, + "learning_rate": 1.9842560901697038e-05, + "loss": 0.059, + "step": 3950 + }, + { + "epoch": 0.34149484536082475, + "grad_norm": 0.10894645750522614, + "learning_rate": 1.9840567024307248e-05, + "loss": 0.0307, + "step": 3975 + }, + { + "epoch": 0.3436426116838488, + "grad_norm": 0.11885436624288559, + "learning_rate": 1.9838560702018373e-05, + "loss": 0.0477, + "step": 4000 + }, + { + "epoch": 0.34579037800687284, + "grad_norm": 0.4040253758430481, + "learning_rate": 1.9836541937367718e-05, + "loss": 0.0731, + "step": 4025 + }, + { + "epoch": 0.3479381443298969, + "grad_norm": 0.4288731813430786, + "learning_rate": 1.9834510732908314e-05, + "loss": 0.0615, + "step": 4050 + }, + { + "epoch": 0.35008591065292094, + "grad_norm": 0.19315102696418762, + "learning_rate": 1.9832467091208935e-05, + "loss": 0.0303, + "step": 4075 + }, + { + "epoch": 0.35223367697594504, + "grad_norm": 8.324795722961426, + "learning_rate": 1.983041101485407e-05, + "loss": 0.049, + "step": 4100 + }, + { + "epoch": 0.3543814432989691, + "grad_norm": 0.07661579549312592, + "learning_rate": 1.9828342506443946e-05, + "loss": 0.0231, + "step": 4125 + }, + { + "epoch": 0.35652920962199314, + "grad_norm": 0.3669668734073639, + "learning_rate": 1.982626156859451e-05, + "loss": 0.0293, + "step": 4150 + }, + { + "epoch": 0.3586769759450172, + "grad_norm": 18.510757446289062, + "learning_rate": 1.9824168203937417e-05, + "loss": 0.0689, + "step": 4175 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 0.06907837092876434, + "learning_rate": 1.9822062415120053e-05, + "loss": 0.072, + "step": 4200 + }, + { + "epoch": 0.3629725085910653, + "grad_norm": 2.708010196685791, + "learning_rate": 1.9819944204805513e-05, + "loss": 0.058, + "step": 4225 + }, + { + "epoch": 0.3651202749140893, + "grad_norm": 9.297883033752441, + "learning_rate": 1.9817813575672587e-05, + "loss": 0.04, + "step": 4250 + }, + { + "epoch": 0.36726804123711343, + "grad_norm": 0.18147027492523193, + "learning_rate": 1.9815670530415788e-05, + "loss": 0.0594, + "step": 4275 + }, + { + "epoch": 0.3694158075601375, + "grad_norm": 4.933472156524658, + "learning_rate": 1.9813515071745324e-05, + "loss": 0.0442, + "step": 4300 + }, + { + "epoch": 0.3715635738831615, + "grad_norm": 12.08565616607666, + "learning_rate": 1.9811347202387098e-05, + "loss": 0.0402, + "step": 4325 + }, + { + "epoch": 0.37371134020618557, + "grad_norm": 7.418493747711182, + "learning_rate": 1.9809166925082714e-05, + "loss": 0.0803, + "step": 4350 + }, + { + "epoch": 0.3758591065292096, + "grad_norm": 0.521247923374176, + "learning_rate": 1.980697424258946e-05, + "loss": 0.063, + "step": 4375 + }, + { + "epoch": 0.37800687285223367, + "grad_norm": 20.276582717895508, + "learning_rate": 1.980476915768033e-05, + "loss": 0.0419, + "step": 4400 + }, + { + "epoch": 0.3801546391752577, + "grad_norm": 66.68013763427734, + "learning_rate": 1.9802551673143976e-05, + "loss": 0.0519, + "step": 4425 + }, + { + "epoch": 0.38230240549828176, + "grad_norm": 0.04855303838849068, + "learning_rate": 1.9800321791784757e-05, + "loss": 0.0647, + "step": 4450 + }, + { + "epoch": 0.38445017182130586, + "grad_norm": 7.420501708984375, + "learning_rate": 1.979807951642269e-05, + "loss": 0.0669, + "step": 4475 + }, + { + "epoch": 0.3865979381443299, + "grad_norm": 5.69240665435791, + "learning_rate": 1.9795824849893483e-05, + "loss": 0.0574, + "step": 4500 + }, + { + "epoch": 0.38874570446735396, + "grad_norm": 0.3805040419101715, + "learning_rate": 1.9793557795048498e-05, + "loss": 0.0543, + "step": 4525 + }, + { + "epoch": 0.390893470790378, + "grad_norm": 1.0642169713974, + "learning_rate": 1.9791278354754774e-05, + "loss": 0.0223, + "step": 4550 + }, + { + "epoch": 0.39304123711340205, + "grad_norm": 0.12806548178195953, + "learning_rate": 1.9788986531895015e-05, + "loss": 0.0355, + "step": 4575 + }, + { + "epoch": 0.3951890034364261, + "grad_norm": 8.487398147583008, + "learning_rate": 1.9786682329367578e-05, + "loss": 0.0531, + "step": 4600 + }, + { + "epoch": 0.39733676975945015, + "grad_norm": 5.226919174194336, + "learning_rate": 1.978436575008648e-05, + "loss": 0.0709, + "step": 4625 + }, + { + "epoch": 0.39948453608247425, + "grad_norm": 0.8841233849525452, + "learning_rate": 1.9782036796981384e-05, + "loss": 0.0507, + "step": 4650 + }, + { + "epoch": 0.4016323024054983, + "grad_norm": 0.4147562086582184, + "learning_rate": 1.9779695472997617e-05, + "loss": 0.0335, + "step": 4675 + }, + { + "epoch": 0.40378006872852235, + "grad_norm": 0.04043642431497574, + "learning_rate": 1.9777341781096128e-05, + "loss": 0.0568, + "step": 4700 + }, + { + "epoch": 0.4059278350515464, + "grad_norm": 8.132497787475586, + "learning_rate": 1.977497572425353e-05, + "loss": 0.0542, + "step": 4725 + }, + { + "epoch": 0.40807560137457044, + "grad_norm": 2.5598886013031006, + "learning_rate": 1.9772597305462056e-05, + "loss": 0.0785, + "step": 4750 + }, + { + "epoch": 0.4102233676975945, + "grad_norm": 0.4135844111442566, + "learning_rate": 1.9770206527729586e-05, + "loss": 0.0371, + "step": 4775 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 6.21074104309082, + "learning_rate": 1.9767803394079618e-05, + "loss": 0.075, + "step": 4800 + }, + { + "epoch": 0.41451890034364264, + "grad_norm": 20.720787048339844, + "learning_rate": 1.9765387907551283e-05, + "loss": 0.0458, + "step": 4825 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 22.962743759155273, + "learning_rate": 1.9762960071199334e-05, + "loss": 0.0435, + "step": 4850 + }, + { + "epoch": 0.41881443298969073, + "grad_norm": 0.20661643147468567, + "learning_rate": 1.976051988809414e-05, + "loss": 0.0299, + "step": 4875 + }, + { + "epoch": 0.4209621993127148, + "grad_norm": 7.80222749710083, + "learning_rate": 1.9758067361321683e-05, + "loss": 0.0582, + "step": 4900 + }, + { + "epoch": 0.4231099656357388, + "grad_norm": 16.07340431213379, + "learning_rate": 1.975560249398356e-05, + "loss": 0.0378, + "step": 4925 + }, + { + "epoch": 0.4252577319587629, + "grad_norm": 3.892580509185791, + "learning_rate": 1.975312528919697e-05, + "loss": 0.0805, + "step": 4950 + }, + { + "epoch": 0.4274054982817869, + "grad_norm": 3.822619676589966, + "learning_rate": 1.9750635750094722e-05, + "loss": 0.0543, + "step": 4975 + }, + { + "epoch": 0.42955326460481097, + "grad_norm": 0.48507124185562134, + "learning_rate": 1.974813387982521e-05, + "loss": 0.0738, + "step": 5000 + }, + { + "epoch": 0.43170103092783507, + "grad_norm": 4.88975191116333, + "learning_rate": 1.974561968155243e-05, + "loss": 0.0423, + "step": 5025 + }, + { + "epoch": 0.4338487972508591, + "grad_norm": 0.11655927449464798, + "learning_rate": 1.9743093158455983e-05, + "loss": 0.0283, + "step": 5050 + }, + { + "epoch": 0.43599656357388317, + "grad_norm": 21.77886390686035, + "learning_rate": 1.9740554313731032e-05, + "loss": 0.0405, + "step": 5075 + }, + { + "epoch": 0.4381443298969072, + "grad_norm": 0.7759548425674438, + "learning_rate": 1.973800315058833e-05, + "loss": 0.0461, + "step": 5100 + }, + { + "epoch": 0.44029209621993126, + "grad_norm": 0.43608465790748596, + "learning_rate": 1.9735439672254225e-05, + "loss": 0.0483, + "step": 5125 + }, + { + "epoch": 0.4424398625429553, + "grad_norm": 0.35271695256233215, + "learning_rate": 1.9732863881970614e-05, + "loss": 0.0539, + "step": 5150 + }, + { + "epoch": 0.44458762886597936, + "grad_norm": 0.04264714568853378, + "learning_rate": 1.9730275782994984e-05, + "loss": 0.0342, + "step": 5175 + }, + { + "epoch": 0.44673539518900346, + "grad_norm": 0.17068693041801453, + "learning_rate": 1.9727675378600382e-05, + "loss": 0.0338, + "step": 5200 + }, + { + "epoch": 0.4488831615120275, + "grad_norm": 0.1435108780860901, + "learning_rate": 1.9725062672075413e-05, + "loss": 0.0972, + "step": 5225 + }, + { + "epoch": 0.45103092783505155, + "grad_norm": 6.6026105880737305, + "learning_rate": 1.9722437666724245e-05, + "loss": 0.0469, + "step": 5250 + }, + { + "epoch": 0.4531786941580756, + "grad_norm": 0.22810663282871246, + "learning_rate": 1.97198003658666e-05, + "loss": 0.0478, + "step": 5275 + }, + { + "epoch": 0.45532646048109965, + "grad_norm": 0.12249985337257385, + "learning_rate": 1.9717150772837738e-05, + "loss": 0.0722, + "step": 5300 + }, + { + "epoch": 0.4574742268041237, + "grad_norm": 6.071200847625732, + "learning_rate": 1.9714488890988485e-05, + "loss": 0.0421, + "step": 5325 + }, + { + "epoch": 0.45962199312714774, + "grad_norm": 0.4488905966281891, + "learning_rate": 1.9711814723685192e-05, + "loss": 0.0331, + "step": 5350 + }, + { + "epoch": 0.46176975945017185, + "grad_norm": 0.9306180477142334, + "learning_rate": 1.9709128274309754e-05, + "loss": 0.05, + "step": 5375 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 1.6058001518249512, + "learning_rate": 1.9706429546259592e-05, + "loss": 0.0485, + "step": 5400 + }, + { + "epoch": 0.46606529209621994, + "grad_norm": 0.7603501677513123, + "learning_rate": 1.9703718542947663e-05, + "loss": 0.0544, + "step": 5425 + }, + { + "epoch": 0.468213058419244, + "grad_norm": 0.05442309379577637, + "learning_rate": 1.9700995267802446e-05, + "loss": 0.0663, + "step": 5450 + }, + { + "epoch": 0.47036082474226804, + "grad_norm": 4.575985431671143, + "learning_rate": 1.969825972426793e-05, + "loss": 0.032, + "step": 5475 + }, + { + "epoch": 0.4725085910652921, + "grad_norm": 0.4870621860027313, + "learning_rate": 1.969551191580364e-05, + "loss": 0.0367, + "step": 5500 + }, + { + "epoch": 0.47465635738831613, + "grad_norm": 23.871408462524414, + "learning_rate": 1.969275184588459e-05, + "loss": 0.0157, + "step": 5525 + }, + { + "epoch": 0.47680412371134023, + "grad_norm": 0.08326155692338943, + "learning_rate": 1.968997951800131e-05, + "loss": 0.0372, + "step": 5550 + }, + { + "epoch": 0.4789518900343643, + "grad_norm": 7.102270603179932, + "learning_rate": 1.9687194935659835e-05, + "loss": 0.0633, + "step": 5575 + }, + { + "epoch": 0.48109965635738833, + "grad_norm": 27.40580177307129, + "learning_rate": 1.9684398102381694e-05, + "loss": 0.0778, + "step": 5600 + }, + { + "epoch": 0.4832474226804124, + "grad_norm": 0.7544988989830017, + "learning_rate": 1.968158902170391e-05, + "loss": 0.0633, + "step": 5625 + }, + { + "epoch": 0.4853951890034364, + "grad_norm": 2.4647064208984375, + "learning_rate": 1.967876769717899e-05, + "loss": 0.0494, + "step": 5650 + }, + { + "epoch": 0.48754295532646047, + "grad_norm": 5.285757064819336, + "learning_rate": 1.9675934132374938e-05, + "loss": 0.043, + "step": 5675 + }, + { + "epoch": 0.4896907216494845, + "grad_norm": 1.280023455619812, + "learning_rate": 1.967308833087522e-05, + "loss": 0.041, + "step": 5700 + }, + { + "epoch": 0.49183848797250856, + "grad_norm": 0.12720395624637604, + "learning_rate": 1.9670230296278788e-05, + "loss": 0.0392, + "step": 5725 + }, + { + "epoch": 0.49398625429553267, + "grad_norm": 5.303956031799316, + "learning_rate": 1.9667360032200067e-05, + "loss": 0.0503, + "step": 5750 + }, + { + "epoch": 0.4961340206185567, + "grad_norm": 5.097157955169678, + "learning_rate": 1.9664477542268946e-05, + "loss": 0.0383, + "step": 5775 + }, + { + "epoch": 0.49828178694158076, + "grad_norm": 1.7934011220932007, + "learning_rate": 1.9661582830130766e-05, + "loss": 0.0674, + "step": 5800 + }, + { + "epoch": 0.5004295532646048, + "grad_norm": 0.5759339928627014, + "learning_rate": 1.9658675899446337e-05, + "loss": 0.036, + "step": 5825 + }, + { + "epoch": 0.5025773195876289, + "grad_norm": 1.0998343229293823, + "learning_rate": 1.9655756753891916e-05, + "loss": 0.0418, + "step": 5850 + }, + { + "epoch": 0.5047250859106529, + "grad_norm": 0.997466504573822, + "learning_rate": 1.9652825397159208e-05, + "loss": 0.0682, + "step": 5875 + }, + { + "epoch": 0.506872852233677, + "grad_norm": 3.268512725830078, + "learning_rate": 1.9649881832955362e-05, + "loss": 0.0216, + "step": 5900 + }, + { + "epoch": 0.509020618556701, + "grad_norm": 0.757311999797821, + "learning_rate": 1.9646926065002963e-05, + "loss": 0.0526, + "step": 5925 + }, + { + "epoch": 0.511168384879725, + "grad_norm": 0.07271480560302734, + "learning_rate": 1.9643958097040027e-05, + "loss": 0.0448, + "step": 5950 + }, + { + "epoch": 0.5133161512027491, + "grad_norm": 7.053762435913086, + "learning_rate": 1.964097793282001e-05, + "loss": 0.0476, + "step": 5975 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 3.3734793663024902, + "learning_rate": 1.963798557611178e-05, + "loss": 0.0733, + "step": 6000 + }, + { + "epoch": 0.5176116838487973, + "grad_norm": 5.378251075744629, + "learning_rate": 1.963498103069963e-05, + "loss": 0.0673, + "step": 6025 + }, + { + "epoch": 0.5197594501718213, + "grad_norm": 5.600104331970215, + "learning_rate": 1.963196430038327e-05, + "loss": 0.0483, + "step": 6050 + }, + { + "epoch": 0.5219072164948454, + "grad_norm": 6.683028697967529, + "learning_rate": 1.9628935388977804e-05, + "loss": 0.0793, + "step": 6075 + }, + { + "epoch": 0.5240549828178694, + "grad_norm": 0.2220430076122284, + "learning_rate": 1.9625894300313768e-05, + "loss": 0.0584, + "step": 6100 + }, + { + "epoch": 0.5262027491408935, + "grad_norm": 87.20639038085938, + "learning_rate": 1.9622841038237076e-05, + "loss": 0.0382, + "step": 6125 + }, + { + "epoch": 0.5283505154639175, + "grad_norm": 0.24583739042282104, + "learning_rate": 1.9619775606609038e-05, + "loss": 0.0504, + "step": 6150 + }, + { + "epoch": 0.5304982817869416, + "grad_norm": 2.210832118988037, + "learning_rate": 1.961669800930637e-05, + "loss": 0.0697, + "step": 6175 + }, + { + "epoch": 0.5326460481099656, + "grad_norm": 7.439363956451416, + "learning_rate": 1.961360825022116e-05, + "loss": 0.0497, + "step": 6200 + }, + { + "epoch": 0.5347938144329897, + "grad_norm": 0.3508168160915375, + "learning_rate": 1.9610506333260876e-05, + "loss": 0.063, + "step": 6225 + }, + { + "epoch": 0.5369415807560137, + "grad_norm": 8.452553749084473, + "learning_rate": 1.960739226234837e-05, + "loss": 0.0483, + "step": 6250 + }, + { + "epoch": 0.5390893470790378, + "grad_norm": 0.20960940420627594, + "learning_rate": 1.960426604142186e-05, + "loss": 0.0517, + "step": 6275 + }, + { + "epoch": 0.5412371134020618, + "grad_norm": 0.7777512073516846, + "learning_rate": 1.960112767443493e-05, + "loss": 0.049, + "step": 6300 + }, + { + "epoch": 0.5433848797250859, + "grad_norm": 0.377209335565567, + "learning_rate": 1.959797716535652e-05, + "loss": 0.0297, + "step": 6325 + }, + { + "epoch": 0.5455326460481099, + "grad_norm": 0.26072120666503906, + "learning_rate": 1.959481451817093e-05, + "loss": 0.0614, + "step": 6350 + }, + { + "epoch": 0.5476804123711341, + "grad_norm": 5.090667247772217, + "learning_rate": 1.9591639736877817e-05, + "loss": 0.038, + "step": 6375 + }, + { + "epoch": 0.5498281786941581, + "grad_norm": 5.221283435821533, + "learning_rate": 1.958845282549217e-05, + "loss": 0.0488, + "step": 6400 + }, + { + "epoch": 0.5519759450171822, + "grad_norm": 3.866102457046509, + "learning_rate": 1.9585253788044328e-05, + "loss": 0.0447, + "step": 6425 + }, + { + "epoch": 0.5541237113402062, + "grad_norm": 0.2685663402080536, + "learning_rate": 1.9582042628579957e-05, + "loss": 0.045, + "step": 6450 + }, + { + "epoch": 0.5562714776632303, + "grad_norm": 28.368770599365234, + "learning_rate": 1.9578819351160065e-05, + "loss": 0.0625, + "step": 6475 + }, + { + "epoch": 0.5584192439862543, + "grad_norm": 7.145070552825928, + "learning_rate": 1.9575583959860978e-05, + "loss": 0.0757, + "step": 6500 + }, + { + "epoch": 0.5605670103092784, + "grad_norm": 0.5358865857124329, + "learning_rate": 1.9572336458774336e-05, + "loss": 0.0236, + "step": 6525 + }, + { + "epoch": 0.5627147766323024, + "grad_norm": 6.031324863433838, + "learning_rate": 1.9569076852007104e-05, + "loss": 0.0446, + "step": 6550 + }, + { + "epoch": 0.5648625429553265, + "grad_norm": 0.31238433718681335, + "learning_rate": 1.9565805143681557e-05, + "loss": 0.0291, + "step": 6575 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 6.528176307678223, + "learning_rate": 1.9562521337935255e-05, + "loss": 0.0661, + "step": 6600 + }, + { + "epoch": 0.5691580756013745, + "grad_norm": 0.12286274880170822, + "learning_rate": 1.9559225438921088e-05, + "loss": 0.0656, + "step": 6625 + }, + { + "epoch": 0.5713058419243986, + "grad_norm": 6.793941020965576, + "learning_rate": 1.9555917450807215e-05, + "loss": 0.0411, + "step": 6650 + }, + { + "epoch": 0.5734536082474226, + "grad_norm": 7.339749813079834, + "learning_rate": 1.9552597377777092e-05, + "loss": 0.0731, + "step": 6675 + }, + { + "epoch": 0.5756013745704467, + "grad_norm": 0.47486642003059387, + "learning_rate": 1.9549265224029457e-05, + "loss": 0.0298, + "step": 6700 + }, + { + "epoch": 0.5777491408934707, + "grad_norm": 0.02516503445804119, + "learning_rate": 1.9545920993778336e-05, + "loss": 0.0638, + "step": 6725 + }, + { + "epoch": 0.5798969072164949, + "grad_norm": 0.18045619130134583, + "learning_rate": 1.954256469125301e-05, + "loss": 0.0547, + "step": 6750 + }, + { + "epoch": 0.5820446735395189, + "grad_norm": 1.2854068279266357, + "learning_rate": 1.9539196320698034e-05, + "loss": 0.0385, + "step": 6775 + }, + { + "epoch": 0.584192439862543, + "grad_norm": 7.502423286437988, + "learning_rate": 1.953581588637323e-05, + "loss": 0.0449, + "step": 6800 + }, + { + "epoch": 0.586340206185567, + "grad_norm": 0.056159455329179764, + "learning_rate": 1.9532423392553685e-05, + "loss": 0.0456, + "step": 6825 + }, + { + "epoch": 0.5884879725085911, + "grad_norm": 1.01603364944458, + "learning_rate": 1.9529018843529702e-05, + "loss": 0.0673, + "step": 6850 + }, + { + "epoch": 0.5906357388316151, + "grad_norm": 0.6476568579673767, + "learning_rate": 1.9525602243606872e-05, + "loss": 0.0432, + "step": 6875 + }, + { + "epoch": 0.5927835051546392, + "grad_norm": 0.46902328729629517, + "learning_rate": 1.9522173597105997e-05, + "loss": 0.0524, + "step": 6900 + }, + { + "epoch": 0.5949312714776632, + "grad_norm": 0.24333925545215607, + "learning_rate": 1.9518732908363128e-05, + "loss": 0.0257, + "step": 6925 + }, + { + "epoch": 0.5970790378006873, + "grad_norm": 6.165106296539307, + "learning_rate": 1.9515280181729538e-05, + "loss": 0.0425, + "step": 6950 + }, + { + "epoch": 0.5992268041237113, + "grad_norm": 0.1131974533200264, + "learning_rate": 1.9511815421571733e-05, + "loss": 0.0734, + "step": 6975 + }, + { + "epoch": 0.6013745704467354, + "grad_norm": 0.4201565384864807, + "learning_rate": 1.9508338632271425e-05, + "loss": 0.037, + "step": 7000 + }, + { + "epoch": 0.6035223367697594, + "grad_norm": 15.548513412475586, + "learning_rate": 1.9504849818225544e-05, + "loss": 0.0449, + "step": 7025 + }, + { + "epoch": 0.6056701030927835, + "grad_norm": 4.770808219909668, + "learning_rate": 1.9501348983846234e-05, + "loss": 0.0258, + "step": 7050 + }, + { + "epoch": 0.6078178694158075, + "grad_norm": 0.4175913631916046, + "learning_rate": 1.9497836133560833e-05, + "loss": 0.0553, + "step": 7075 + }, + { + "epoch": 0.6099656357388317, + "grad_norm": 0.4637356400489807, + "learning_rate": 1.949431127181187e-05, + "loss": 0.0583, + "step": 7100 + }, + { + "epoch": 0.6121134020618557, + "grad_norm": 0.20159782469272614, + "learning_rate": 1.949077440305708e-05, + "loss": 0.0527, + "step": 7125 + }, + { + "epoch": 0.6142611683848798, + "grad_norm": 0.5328507423400879, + "learning_rate": 1.9487225531769365e-05, + "loss": 0.0548, + "step": 7150 + }, + { + "epoch": 0.6164089347079038, + "grad_norm": 0.2819543778896332, + "learning_rate": 1.948366466243682e-05, + "loss": 0.071, + "step": 7175 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 0.18138593435287476, + "learning_rate": 1.9480091799562706e-05, + "loss": 0.0663, + "step": 7200 + }, + { + "epoch": 0.6207044673539519, + "grad_norm": 0.10751151293516159, + "learning_rate": 1.947650694766545e-05, + "loss": 0.061, + "step": 7225 + }, + { + "epoch": 0.622852233676976, + "grad_norm": 0.4629723131656647, + "learning_rate": 1.9472910111278654e-05, + "loss": 0.0471, + "step": 7250 + }, + { + "epoch": 0.625, + "grad_norm": 0.6148477792739868, + "learning_rate": 1.946930129495106e-05, + "loss": 0.0437, + "step": 7275 + }, + { + "epoch": 0.627147766323024, + "grad_norm": 4.070817470550537, + "learning_rate": 1.946568050324656e-05, + "loss": 0.0697, + "step": 7300 + }, + { + "epoch": 0.6292955326460481, + "grad_norm": 5.666269779205322, + "learning_rate": 1.9462047740744213e-05, + "loss": 0.0526, + "step": 7325 + }, + { + "epoch": 0.6314432989690721, + "grad_norm": 1.8848096132278442, + "learning_rate": 1.9458403012038193e-05, + "loss": 0.0521, + "step": 7350 + }, + { + "epoch": 0.6335910652920962, + "grad_norm": 0.4152567684650421, + "learning_rate": 1.9454746321737816e-05, + "loss": 0.0391, + "step": 7375 + }, + { + "epoch": 0.6357388316151202, + "grad_norm": 5.874293327331543, + "learning_rate": 1.9451077674467526e-05, + "loss": 0.0401, + "step": 7400 + }, + { + "epoch": 0.6378865979381443, + "grad_norm": 0.23378446698188782, + "learning_rate": 1.944739707486689e-05, + "loss": 0.0787, + "step": 7425 + }, + { + "epoch": 0.6400343642611683, + "grad_norm": 5.4545392990112305, + "learning_rate": 1.944370452759058e-05, + "loss": 0.0465, + "step": 7450 + }, + { + "epoch": 0.6421821305841925, + "grad_norm": 0.21332301199436188, + "learning_rate": 1.9440000037308393e-05, + "loss": 0.0613, + "step": 7475 + }, + { + "epoch": 0.6443298969072165, + "grad_norm": 0.14508970081806183, + "learning_rate": 1.943628360870522e-05, + "loss": 0.0537, + "step": 7500 + }, + { + "epoch": 0.6464776632302406, + "grad_norm": 4.550618648529053, + "learning_rate": 1.9432555246481056e-05, + "loss": 0.0459, + "step": 7525 + }, + { + "epoch": 0.6486254295532646, + "grad_norm": 0.1952928751707077, + "learning_rate": 1.942881495535098e-05, + "loss": 0.0749, + "step": 7550 + }, + { + "epoch": 0.6507731958762887, + "grad_norm": 0.30271071195602417, + "learning_rate": 1.942506274004516e-05, + "loss": 0.0629, + "step": 7575 + }, + { + "epoch": 0.6529209621993127, + "grad_norm": 0.11940059065818787, + "learning_rate": 1.9421298605308847e-05, + "loss": 0.06, + "step": 7600 + }, + { + "epoch": 0.6550687285223368, + "grad_norm": 5.8459577560424805, + "learning_rate": 1.9417522555902365e-05, + "loss": 0.0384, + "step": 7625 + }, + { + "epoch": 0.6572164948453608, + "grad_norm": 0.060936424881219864, + "learning_rate": 1.9413734596601104e-05, + "loss": 0.0456, + "step": 7650 + }, + { + "epoch": 0.6593642611683849, + "grad_norm": 0.7595691084861755, + "learning_rate": 1.9409934732195515e-05, + "loss": 0.0504, + "step": 7675 + }, + { + "epoch": 0.6615120274914089, + "grad_norm": 3.2232072353363037, + "learning_rate": 1.940612296749111e-05, + "loss": 0.0612, + "step": 7700 + }, + { + "epoch": 0.663659793814433, + "grad_norm": 0.43697860836982727, + "learning_rate": 1.9402299307308445e-05, + "loss": 0.0496, + "step": 7725 + }, + { + "epoch": 0.665807560137457, + "grad_norm": 5.286158561706543, + "learning_rate": 1.9398463756483122e-05, + "loss": 0.0531, + "step": 7750 + }, + { + "epoch": 0.6679553264604811, + "grad_norm": 0.12369837611913681, + "learning_rate": 1.939461631986578e-05, + "loss": 0.053, + "step": 7775 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 7.27524471282959, + "learning_rate": 1.939075700232209e-05, + "loss": 0.08, + "step": 7800 + }, + { + "epoch": 0.6722508591065293, + "grad_norm": 0.17221558094024658, + "learning_rate": 1.9386885808732748e-05, + "loss": 0.0417, + "step": 7825 + }, + { + "epoch": 0.6743986254295533, + "grad_norm": 2.5155370235443115, + "learning_rate": 1.9383002743993476e-05, + "loss": 0.0366, + "step": 7850 + }, + { + "epoch": 0.6765463917525774, + "grad_norm": 0.27873021364212036, + "learning_rate": 1.937910781301499e-05, + "loss": 0.0504, + "step": 7875 + }, + { + "epoch": 0.6786941580756014, + "grad_norm": 9.276744842529297, + "learning_rate": 1.9375201020723034e-05, + "loss": 0.0451, + "step": 7900 + }, + { + "epoch": 0.6808419243986255, + "grad_norm": 2.118696689605713, + "learning_rate": 1.9371282372058337e-05, + "loss": 0.0707, + "step": 7925 + }, + { + "epoch": 0.6829896907216495, + "grad_norm": 1.7809380292892456, + "learning_rate": 1.9367351871976634e-05, + "loss": 0.0206, + "step": 7950 + }, + { + "epoch": 0.6851374570446735, + "grad_norm": 4.310935020446777, + "learning_rate": 1.936340952544864e-05, + "loss": 0.0657, + "step": 7975 + }, + { + "epoch": 0.6872852233676976, + "grad_norm": 4.755844593048096, + "learning_rate": 1.9359455337460054e-05, + "loss": 0.041, + "step": 8000 + }, + { + "epoch": 0.6894329896907216, + "grad_norm": 6.642769813537598, + "learning_rate": 1.9355489313011547e-05, + "loss": 0.0371, + "step": 8025 + }, + { + "epoch": 0.6915807560137457, + "grad_norm": 0.5431068539619446, + "learning_rate": 1.9351511457118768e-05, + "loss": 0.0407, + "step": 8050 + }, + { + "epoch": 0.6937285223367697, + "grad_norm": 0.17442908883094788, + "learning_rate": 1.934752177481232e-05, + "loss": 0.0683, + "step": 8075 + }, + { + "epoch": 0.6958762886597938, + "grad_norm": 0.1295899599790573, + "learning_rate": 1.9343520271137764e-05, + "loss": 0.0256, + "step": 8100 + }, + { + "epoch": 0.6980240549828178, + "grad_norm": 4.093729496002197, + "learning_rate": 1.9339506951155614e-05, + "loss": 0.0323, + "step": 8125 + }, + { + "epoch": 0.7001718213058419, + "grad_norm": 0.06741012632846832, + "learning_rate": 1.933548181994133e-05, + "loss": 0.0409, + "step": 8150 + }, + { + "epoch": 0.7023195876288659, + "grad_norm": 5.186906814575195, + "learning_rate": 1.933144488258529e-05, + "loss": 0.0298, + "step": 8175 + }, + { + "epoch": 0.7044673539518901, + "grad_norm": 4.369027137756348, + "learning_rate": 1.9327396144192837e-05, + "loss": 0.0753, + "step": 8200 + }, + { + "epoch": 0.7066151202749141, + "grad_norm": 0.5283697843551636, + "learning_rate": 1.9323335609884208e-05, + "loss": 0.0509, + "step": 8225 + }, + { + "epoch": 0.7087628865979382, + "grad_norm": 0.2651509940624237, + "learning_rate": 1.9319263284794566e-05, + "loss": 0.0313, + "step": 8250 + }, + { + "epoch": 0.7109106529209622, + "grad_norm": 0.14367155730724335, + "learning_rate": 1.9315179174073995e-05, + "loss": 0.0677, + "step": 8275 + }, + { + "epoch": 0.7130584192439863, + "grad_norm": 4.3503570556640625, + "learning_rate": 1.9311083282887472e-05, + "loss": 0.0681, + "step": 8300 + }, + { + "epoch": 0.7152061855670103, + "grad_norm": 6.368698596954346, + "learning_rate": 1.9306975616414876e-05, + "loss": 0.0163, + "step": 8325 + }, + { + "epoch": 0.7173539518900344, + "grad_norm": 3.404118537902832, + "learning_rate": 1.930285617985098e-05, + "loss": 0.0501, + "step": 8350 + }, + { + "epoch": 0.7195017182130584, + "grad_norm": 0.11590408533811569, + "learning_rate": 1.9298724978405444e-05, + "loss": 0.0602, + "step": 8375 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 0.09684989601373672, + "learning_rate": 1.9294582017302797e-05, + "loss": 0.0502, + "step": 8400 + }, + { + "epoch": 0.7237972508591065, + "grad_norm": 0.10448263585567474, + "learning_rate": 1.9290427301782445e-05, + "loss": 0.0608, + "step": 8425 + }, + { + "epoch": 0.7259450171821306, + "grad_norm": 0.41792720556259155, + "learning_rate": 1.9286260837098666e-05, + "loss": 0.0634, + "step": 8450 + }, + { + "epoch": 0.7280927835051546, + "grad_norm": 21.387121200561523, + "learning_rate": 1.9282082628520585e-05, + "loss": 0.0489, + "step": 8475 + }, + { + "epoch": 0.7302405498281787, + "grad_norm": 4.301572799682617, + "learning_rate": 1.9277892681332187e-05, + "loss": 0.0339, + "step": 8500 + }, + { + "epoch": 0.7323883161512027, + "grad_norm": 5.473195552825928, + "learning_rate": 1.9273691000832298e-05, + "loss": 0.0673, + "step": 8525 + }, + { + "epoch": 0.7345360824742269, + "grad_norm": 0.12455861270427704, + "learning_rate": 1.926947759233459e-05, + "loss": 0.0447, + "step": 8550 + }, + { + "epoch": 0.7366838487972509, + "grad_norm": 4.059422016143799, + "learning_rate": 1.9265252461167554e-05, + "loss": 0.0563, + "step": 8575 + }, + { + "epoch": 0.738831615120275, + "grad_norm": 9.709542274475098, + "learning_rate": 1.9261015612674518e-05, + "loss": 0.0422, + "step": 8600 + }, + { + "epoch": 0.740979381443299, + "grad_norm": 0.8353398442268372, + "learning_rate": 1.925676705221362e-05, + "loss": 0.0454, + "step": 8625 + }, + { + "epoch": 0.743127147766323, + "grad_norm": 0.14278650283813477, + "learning_rate": 1.925250678515782e-05, + "loss": 0.0407, + "step": 8650 + }, + { + "epoch": 0.7452749140893471, + "grad_norm": 0.7284332513809204, + "learning_rate": 1.9248234816894867e-05, + "loss": 0.0602, + "step": 8675 + }, + { + "epoch": 0.7474226804123711, + "grad_norm": 0.11121925711631775, + "learning_rate": 1.924395115282732e-05, + "loss": 0.0774, + "step": 8700 + }, + { + "epoch": 0.7495704467353952, + "grad_norm": 3.824934482574463, + "learning_rate": 1.923965579837253e-05, + "loss": 0.034, + "step": 8725 + }, + { + "epoch": 0.7517182130584192, + "grad_norm": 1.2591402530670166, + "learning_rate": 1.9235348758962625e-05, + "loss": 0.0476, + "step": 8750 + }, + { + "epoch": 0.7538659793814433, + "grad_norm": 0.8432162404060364, + "learning_rate": 1.9231030040044514e-05, + "loss": 0.0607, + "step": 8775 + }, + { + "epoch": 0.7560137457044673, + "grad_norm": 5.705496788024902, + "learning_rate": 1.9226699647079874e-05, + "loss": 0.0476, + "step": 8800 + }, + { + "epoch": 0.7581615120274914, + "grad_norm": 2.7912771701812744, + "learning_rate": 1.9222357585545155e-05, + "loss": 0.0493, + "step": 8825 + }, + { + "epoch": 0.7603092783505154, + "grad_norm": 0.06402193754911423, + "learning_rate": 1.9218003860931548e-05, + "loss": 0.0262, + "step": 8850 + }, + { + "epoch": 0.7624570446735395, + "grad_norm": 4.520138263702393, + "learning_rate": 1.921363847874501e-05, + "loss": 0.0379, + "step": 8875 + }, + { + "epoch": 0.7646048109965635, + "grad_norm": 12.417888641357422, + "learning_rate": 1.9209261444506227e-05, + "loss": 0.0343, + "step": 8900 + }, + { + "epoch": 0.7667525773195877, + "grad_norm": 0.11066804081201553, + "learning_rate": 1.920487276375063e-05, + "loss": 0.0398, + "step": 8925 + }, + { + "epoch": 0.7689003436426117, + "grad_norm": 0.14987987279891968, + "learning_rate": 1.9200472442028376e-05, + "loss": 0.0578, + "step": 8950 + }, + { + "epoch": 0.7710481099656358, + "grad_norm": 0.9843977093696594, + "learning_rate": 1.9196060484904344e-05, + "loss": 0.0688, + "step": 8975 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 12.880500793457031, + "learning_rate": 1.9191636897958123e-05, + "loss": 0.0219, + "step": 9000 + }, + { + "epoch": 0.7753436426116839, + "grad_norm": 7.195580005645752, + "learning_rate": 1.9187201686784018e-05, + "loss": 0.0704, + "step": 9025 + }, + { + "epoch": 0.7774914089347079, + "grad_norm": 0.05571509525179863, + "learning_rate": 1.9182754856991032e-05, + "loss": 0.0355, + "step": 9050 + }, + { + "epoch": 0.779639175257732, + "grad_norm": 3.004206657409668, + "learning_rate": 1.9178296414202853e-05, + "loss": 0.0725, + "step": 9075 + }, + { + "epoch": 0.781786941580756, + "grad_norm": 9.869146347045898, + "learning_rate": 1.9173826364057868e-05, + "loss": 0.0692, + "step": 9100 + }, + { + "epoch": 0.7839347079037801, + "grad_norm": 8.073410987854004, + "learning_rate": 1.9169344712209138e-05, + "loss": 0.0417, + "step": 9125 + }, + { + "epoch": 0.7860824742268041, + "grad_norm": 0.3765357434749603, + "learning_rate": 1.9164851464324396e-05, + "loss": 0.0237, + "step": 9150 + }, + { + "epoch": 0.7882302405498282, + "grad_norm": 0.10676300525665283, + "learning_rate": 1.9160346626086035e-05, + "loss": 0.0462, + "step": 9175 + }, + { + "epoch": 0.7903780068728522, + "grad_norm": 0.4272958040237427, + "learning_rate": 1.915583020319111e-05, + "loss": 0.0686, + "step": 9200 + }, + { + "epoch": 0.7925257731958762, + "grad_norm": 0.3102481961250305, + "learning_rate": 1.915130220135134e-05, + "loss": 0.0453, + "step": 9225 + }, + { + "epoch": 0.7946735395189003, + "grad_norm": 6.971241474151611, + "learning_rate": 1.914676262629306e-05, + "loss": 0.0622, + "step": 9250 + }, + { + "epoch": 0.7968213058419243, + "grad_norm": 0.24601082503795624, + "learning_rate": 1.914221148375726e-05, + "loss": 0.0283, + "step": 9275 + }, + { + "epoch": 0.7989690721649485, + "grad_norm": 8.912467956542969, + "learning_rate": 1.9137648779499562e-05, + "loss": 0.0395, + "step": 9300 + }, + { + "epoch": 0.8011168384879725, + "grad_norm": 10.853771209716797, + "learning_rate": 1.9133074519290188e-05, + "loss": 0.0437, + "step": 9325 + }, + { + "epoch": 0.8032646048109966, + "grad_norm": 0.059863246977329254, + "learning_rate": 1.9128488708914e-05, + "loss": 0.0623, + "step": 9350 + }, + { + "epoch": 0.8054123711340206, + "grad_norm": 0.17583651840686798, + "learning_rate": 1.912389135417045e-05, + "loss": 0.0783, + "step": 9375 + }, + { + "epoch": 0.8075601374570447, + "grad_norm": 0.15230855345726013, + "learning_rate": 1.9119282460873597e-05, + "loss": 0.0627, + "step": 9400 + }, + { + "epoch": 0.8097079037800687, + "grad_norm": 2.772639036178589, + "learning_rate": 1.9114662034852087e-05, + "loss": 0.0486, + "step": 9425 + }, + { + "epoch": 0.8118556701030928, + "grad_norm": 0.19025437533855438, + "learning_rate": 1.9110030081949157e-05, + "loss": 0.0318, + "step": 9450 + }, + { + "epoch": 0.8140034364261168, + "grad_norm": 0.06832994520664215, + "learning_rate": 1.9105386608022615e-05, + "loss": 0.069, + "step": 9475 + }, + { + "epoch": 0.8161512027491409, + "grad_norm": 14.231243133544922, + "learning_rate": 1.9100731618944847e-05, + "loss": 0.0793, + "step": 9500 + }, + { + "epoch": 0.8182989690721649, + "grad_norm": 9.425990104675293, + "learning_rate": 1.9096065120602793e-05, + "loss": 0.0585, + "step": 9525 + }, + { + "epoch": 0.820446735395189, + "grad_norm": 0.22641202807426453, + "learning_rate": 1.9091387118897957e-05, + "loss": 0.0648, + "step": 9550 + }, + { + "epoch": 0.822594501718213, + "grad_norm": 5.411764144897461, + "learning_rate": 1.908669761974638e-05, + "loss": 0.0419, + "step": 9575 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 15.377092361450195, + "learning_rate": 1.9081996629078655e-05, + "loss": 0.0493, + "step": 9600 + }, + { + "epoch": 0.8268900343642611, + "grad_norm": 4.105844974517822, + "learning_rate": 1.9077284152839903e-05, + "loss": 0.0362, + "step": 9625 + }, + { + "epoch": 0.8290378006872853, + "grad_norm": 0.07050717622041702, + "learning_rate": 1.907256019698977e-05, + "loss": 0.0293, + "step": 9650 + }, + { + "epoch": 0.8311855670103093, + "grad_norm": 0.25353294610977173, + "learning_rate": 1.9067824767502404e-05, + "loss": 0.0504, + "step": 9675 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.920162558555603, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.0528, + "step": 9700 + }, + { + "epoch": 0.8354810996563574, + "grad_norm": 3.8590941429138184, + "learning_rate": 1.9058319511585222e-05, + "loss": 0.0572, + "step": 9725 + }, + { + "epoch": 0.8376288659793815, + "grad_norm": 9.597905158996582, + "learning_rate": 1.9053549697176245e-05, + "loss": 0.037, + "step": 9750 + }, + { + "epoch": 0.8397766323024055, + "grad_norm": 0.08004238456487656, + "learning_rate": 1.904876843317173e-05, + "loss": 0.0515, + "step": 9775 + }, + { + "epoch": 0.8419243986254296, + "grad_norm": 6.970203876495361, + "learning_rate": 1.9043975725618304e-05, + "loss": 0.0715, + "step": 9800 + }, + { + "epoch": 0.8440721649484536, + "grad_norm": 5.193155288696289, + "learning_rate": 1.9039171580577087e-05, + "loss": 0.031, + "step": 9825 + }, + { + "epoch": 0.8462199312714777, + "grad_norm": 0.05196263641119003, + "learning_rate": 1.9034356004123653e-05, + "loss": 0.0329, + "step": 9850 + }, + { + "epoch": 0.8483676975945017, + "grad_norm": 5.069810390472412, + "learning_rate": 1.902952900234803e-05, + "loss": 0.0635, + "step": 9875 + }, + { + "epoch": 0.8505154639175257, + "grad_norm": 0.8148592114448547, + "learning_rate": 1.90246905813547e-05, + "loss": 0.064, + "step": 9900 + }, + { + "epoch": 0.8526632302405498, + "grad_norm": 39.451961517333984, + "learning_rate": 1.9019840747262586e-05, + "loss": 0.0338, + "step": 9925 + }, + { + "epoch": 0.8548109965635738, + "grad_norm": 0.6645063161849976, + "learning_rate": 1.9014979506205042e-05, + "loss": 0.0546, + "step": 9950 + }, + { + "epoch": 0.8569587628865979, + "grad_norm": 0.32298406958580017, + "learning_rate": 1.901010686432985e-05, + "loss": 0.0525, + "step": 9975 + }, + { + "epoch": 0.8591065292096219, + "grad_norm": 1.0112435817718506, + "learning_rate": 1.9005222827799213e-05, + "loss": 0.0586, + "step": 10000 + }, + { + "epoch": 0.8612542955326461, + "grad_norm": 8.311635971069336, + "learning_rate": 1.9000327402789732e-05, + "loss": 0.0473, + "step": 10025 + }, + { + "epoch": 0.8634020618556701, + "grad_norm": 4.45230770111084, + "learning_rate": 1.8995420595492428e-05, + "loss": 0.0577, + "step": 10050 + }, + { + "epoch": 0.8655498281786942, + "grad_norm": 0.18009991943836212, + "learning_rate": 1.8990502412112702e-05, + "loss": 0.0436, + "step": 10075 + }, + { + "epoch": 0.8676975945017182, + "grad_norm": 0.1316894292831421, + "learning_rate": 1.8985572858870348e-05, + "loss": 0.063, + "step": 10100 + }, + { + "epoch": 0.8698453608247423, + "grad_norm": 0.14470544457435608, + "learning_rate": 1.8980631941999544e-05, + "loss": 0.0585, + "step": 10125 + }, + { + "epoch": 0.8719931271477663, + "grad_norm": 0.1042383536696434, + "learning_rate": 1.897567966774883e-05, + "loss": 0.0264, + "step": 10150 + }, + { + "epoch": 0.8741408934707904, + "grad_norm": 0.5906662940979004, + "learning_rate": 1.897071604238111e-05, + "loss": 0.0362, + "step": 10175 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 0.286993145942688, + "learning_rate": 1.8965741072173647e-05, + "loss": 0.0518, + "step": 10200 + }, + { + "epoch": 0.8784364261168385, + "grad_norm": 23.263864517211914, + "learning_rate": 1.8960754763418053e-05, + "loss": 0.0454, + "step": 10225 + }, + { + "epoch": 0.8805841924398625, + "grad_norm": 0.16274809837341309, + "learning_rate": 1.8955757122420277e-05, + "loss": 0.0554, + "step": 10250 + }, + { + "epoch": 0.8827319587628866, + "grad_norm": 3.3463187217712402, + "learning_rate": 1.8950748155500598e-05, + "loss": 0.0475, + "step": 10275 + }, + { + "epoch": 0.8848797250859106, + "grad_norm": 20.643089294433594, + "learning_rate": 1.8945727868993616e-05, + "loss": 0.0415, + "step": 10300 + }, + { + "epoch": 0.8870274914089347, + "grad_norm": 0.4690730571746826, + "learning_rate": 1.8940696269248252e-05, + "loss": 0.0753, + "step": 10325 + }, + { + "epoch": 0.8891752577319587, + "grad_norm": 7.777348518371582, + "learning_rate": 1.893565336262773e-05, + "loss": 0.0836, + "step": 10350 + }, + { + "epoch": 0.8913230240549829, + "grad_norm": 3.6813340187072754, + "learning_rate": 1.8930599155509583e-05, + "loss": 0.0696, + "step": 10375 + }, + { + "epoch": 0.8934707903780069, + "grad_norm": 28.128311157226562, + "learning_rate": 1.8925533654285617e-05, + "loss": 0.051, + "step": 10400 + }, + { + "epoch": 0.895618556701031, + "grad_norm": 5.613421440124512, + "learning_rate": 1.8920456865361933e-05, + "loss": 0.0519, + "step": 10425 + }, + { + "epoch": 0.897766323024055, + "grad_norm": 2.2128212451934814, + "learning_rate": 1.8915368795158912e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.8999140893470791, + "grad_norm": 2.4711761474609375, + "learning_rate": 1.8910269450111193e-05, + "loss": 0.0312, + "step": 10475 + }, + { + "epoch": 0.9020618556701031, + "grad_norm": 0.20002958178520203, + "learning_rate": 1.8905158836667678e-05, + "loss": 0.0535, + "step": 10500 + }, + { + "epoch": 0.9042096219931272, + "grad_norm": 4.844693183898926, + "learning_rate": 1.890003696129151e-05, + "loss": 0.0554, + "step": 10525 + }, + { + "epoch": 0.9063573883161512, + "grad_norm": 12.150792121887207, + "learning_rate": 1.8894903830460086e-05, + "loss": 0.0543, + "step": 10550 + }, + { + "epoch": 0.9085051546391752, + "grad_norm": 5.957699775695801, + "learning_rate": 1.8889759450665036e-05, + "loss": 0.0503, + "step": 10575 + }, + { + "epoch": 0.9106529209621993, + "grad_norm": 0.49310606718063354, + "learning_rate": 1.8884603828412212e-05, + "loss": 0.0577, + "step": 10600 + }, + { + "epoch": 0.9128006872852233, + "grad_norm": 0.39462143182754517, + "learning_rate": 1.8879436970221685e-05, + "loss": 0.0428, + "step": 10625 + }, + { + "epoch": 0.9149484536082474, + "grad_norm": 1.347753643989563, + "learning_rate": 1.8874258882627737e-05, + "loss": 0.0493, + "step": 10650 + }, + { + "epoch": 0.9170962199312714, + "grad_norm": 0.4527444839477539, + "learning_rate": 1.8869069572178847e-05, + "loss": 0.0533, + "step": 10675 + }, + { + "epoch": 0.9192439862542955, + "grad_norm": 12.810412406921387, + "learning_rate": 1.8863869045437696e-05, + "loss": 0.067, + "step": 10700 + }, + { + "epoch": 0.9213917525773195, + "grad_norm": 0.21272066235542297, + "learning_rate": 1.885865730898114e-05, + "loss": 0.0715, + "step": 10725 + }, + { + "epoch": 0.9235395189003437, + "grad_norm": 0.40799570083618164, + "learning_rate": 1.8853434369400214e-05, + "loss": 0.0389, + "step": 10750 + }, + { + "epoch": 0.9256872852233677, + "grad_norm": 5.805741310119629, + "learning_rate": 1.8848200233300128e-05, + "loss": 0.0396, + "step": 10775 + }, + { + "epoch": 0.9278350515463918, + "grad_norm": 0.5805464386940002, + "learning_rate": 1.8842954907300236e-05, + "loss": 0.051, + "step": 10800 + }, + { + "epoch": 0.9299828178694158, + "grad_norm": 6.981866359710693, + "learning_rate": 1.8837698398034066e-05, + "loss": 0.0838, + "step": 10825 + }, + { + "epoch": 0.9321305841924399, + "grad_norm": 0.36789411306381226, + "learning_rate": 1.8832430712149264e-05, + "loss": 0.025, + "step": 10850 + }, + { + "epoch": 0.9342783505154639, + "grad_norm": 0.8260797262191772, + "learning_rate": 1.882715185630763e-05, + "loss": 0.0701, + "step": 10875 + }, + { + "epoch": 0.936426116838488, + "grad_norm": 0.05520935729146004, + "learning_rate": 1.8821861837185085e-05, + "loss": 0.0476, + "step": 10900 + }, + { + "epoch": 0.938573883161512, + "grad_norm": 3.1457221508026123, + "learning_rate": 1.8816560661471657e-05, + "loss": 0.0287, + "step": 10925 + }, + { + "epoch": 0.9407216494845361, + "grad_norm": 0.5083457231521606, + "learning_rate": 1.8811248335871503e-05, + "loss": 0.0337, + "step": 10950 + }, + { + "epoch": 0.9428694158075601, + "grad_norm": 1.0675407648086548, + "learning_rate": 1.880592486710286e-05, + "loss": 0.0438, + "step": 10975 + }, + { + "epoch": 0.9450171821305842, + "grad_norm": 0.30811360478401184, + "learning_rate": 1.880059026189807e-05, + "loss": 0.0531, + "step": 11000 + }, + { + "epoch": 0.9471649484536082, + "grad_norm": 4.159587860107422, + "learning_rate": 1.8795244527003557e-05, + "loss": 0.0422, + "step": 11025 + }, + { + "epoch": 0.9493127147766323, + "grad_norm": 9.493789672851562, + "learning_rate": 1.878988766917982e-05, + "loss": 0.0541, + "step": 11050 + }, + { + "epoch": 0.9514604810996563, + "grad_norm": 0.2072204351425171, + "learning_rate": 1.878451969520142e-05, + "loss": 0.055, + "step": 11075 + }, + { + "epoch": 0.9536082474226805, + "grad_norm": 5.616858005523682, + "learning_rate": 1.8779140611856977e-05, + "loss": 0.0637, + "step": 11100 + }, + { + "epoch": 0.9557560137457045, + "grad_norm": 0.3237551748752594, + "learning_rate": 1.8773750425949172e-05, + "loss": 0.0374, + "step": 11125 + }, + { + "epoch": 0.9579037800687286, + "grad_norm": 0.504916250705719, + "learning_rate": 1.876834914429471e-05, + "loss": 0.0495, + "step": 11150 + }, + { + "epoch": 0.9600515463917526, + "grad_norm": 6.353029727935791, + "learning_rate": 1.876293677372434e-05, + "loss": 0.0567, + "step": 11175 + }, + { + "epoch": 0.9621993127147767, + "grad_norm": 0.0656585767865181, + "learning_rate": 1.875751332108283e-05, + "loss": 0.0372, + "step": 11200 + }, + { + "epoch": 0.9643470790378007, + "grad_norm": 15.765392303466797, + "learning_rate": 1.875207879322896e-05, + "loss": 0.036, + "step": 11225 + }, + { + "epoch": 0.9664948453608248, + "grad_norm": 0.3891215920448303, + "learning_rate": 1.8746633197035525e-05, + "loss": 0.0493, + "step": 11250 + }, + { + "epoch": 0.9686426116838488, + "grad_norm": 0.9667115211486816, + "learning_rate": 1.874117653938931e-05, + "loss": 0.046, + "step": 11275 + }, + { + "epoch": 0.9707903780068728, + "grad_norm": 68.40766906738281, + "learning_rate": 1.8735708827191098e-05, + "loss": 0.0745, + "step": 11300 + }, + { + "epoch": 0.9729381443298969, + "grad_norm": 0.13170503079891205, + "learning_rate": 1.8730230067355634e-05, + "loss": 0.048, + "step": 11325 + }, + { + "epoch": 0.9750859106529209, + "grad_norm": 0.7202484607696533, + "learning_rate": 1.8724740266811655e-05, + "loss": 0.0631, + "step": 11350 + }, + { + "epoch": 0.977233676975945, + "grad_norm": 4.844538688659668, + "learning_rate": 1.8719239432501845e-05, + "loss": 0.0643, + "step": 11375 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 0.21506763994693756, + "learning_rate": 1.8713727571382857e-05, + "loss": 0.0266, + "step": 11400 + }, + { + "epoch": 0.9815292096219931, + "grad_norm": 67.31729125976562, + "learning_rate": 1.8708204690425272e-05, + "loss": 0.061, + "step": 11425 + }, + { + "epoch": 0.9836769759450171, + "grad_norm": 0.8202037215232849, + "learning_rate": 1.8702670796613624e-05, + "loss": 0.0487, + "step": 11450 + }, + { + "epoch": 0.9858247422680413, + "grad_norm": 0.15015943348407745, + "learning_rate": 1.869712589694636e-05, + "loss": 0.0505, + "step": 11475 + }, + { + "epoch": 0.9879725085910653, + "grad_norm": 0.5480268597602844, + "learning_rate": 1.8691569998435856e-05, + "loss": 0.0517, + "step": 11500 + }, + { + "epoch": 0.9901202749140894, + "grad_norm": 6.367101669311523, + "learning_rate": 1.8686003108108392e-05, + "loss": 0.0668, + "step": 11525 + }, + { + "epoch": 0.9922680412371134, + "grad_norm": 0.25604814291000366, + "learning_rate": 1.868042523300415e-05, + "loss": 0.0457, + "step": 11550 + }, + { + "epoch": 0.9944158075601375, + "grad_norm": 8.345518112182617, + "learning_rate": 1.8674836380177208e-05, + "loss": 0.0698, + "step": 11575 + }, + { + "epoch": 0.9965635738831615, + "grad_norm": 6.629955768585205, + "learning_rate": 1.8669236556695517e-05, + "loss": 0.0412, + "step": 11600 + }, + { + "epoch": 0.9987113402061856, + "grad_norm": 0.1669285148382187, + "learning_rate": 1.8663625769640913e-05, + "loss": 0.0398, + "step": 11625 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.978418635638994, + "eval_auc": 0.9950589707607411, + "eval_f1": 0.9855603139477497, + "eval_loss": 0.08961447328329086, + "eval_precision": 0.9766509954879626, + "eval_recall": 0.9946336760925449, + "eval_runtime": 4307.7178, + "eval_samples_per_second": 9.756, + "eval_steps_per_second": 0.153, + "step": 11640 + }, + { + "epoch": 1.0008591065292096, + "grad_norm": 3.1584737300872803, + "learning_rate": 1.8658004026109097e-05, + "loss": 0.0433, + "step": 11650 + }, + { + "epoch": 1.0030068728522337, + "grad_norm": 0.10521333664655685, + "learning_rate": 1.865237133320961e-05, + "loss": 0.0569, + "step": 11675 + }, + { + "epoch": 1.0051546391752577, + "grad_norm": 5.000732898712158, + "learning_rate": 1.8646727698065865e-05, + "loss": 0.0288, + "step": 11700 + }, + { + "epoch": 1.0073024054982818, + "grad_norm": 8.98947811126709, + "learning_rate": 1.864107312781509e-05, + "loss": 0.0277, + "step": 11725 + }, + { + "epoch": 1.0094501718213058, + "grad_norm": 8.189676284790039, + "learning_rate": 1.863540762960836e-05, + "loss": 0.0256, + "step": 11750 + }, + { + "epoch": 1.0115979381443299, + "grad_norm": 0.22616490721702576, + "learning_rate": 1.862973121061055e-05, + "loss": 0.0307, + "step": 11775 + }, + { + "epoch": 1.013745704467354, + "grad_norm": 0.07888036221265793, + "learning_rate": 1.8624043878000378e-05, + "loss": 0.0821, + "step": 11800 + }, + { + "epoch": 1.015893470790378, + "grad_norm": 0.05555611476302147, + "learning_rate": 1.8618345638970326e-05, + "loss": 0.036, + "step": 11825 + }, + { + "epoch": 1.018041237113402, + "grad_norm": 0.036044951528310776, + "learning_rate": 1.86126365007267e-05, + "loss": 0.046, + "step": 11850 + }, + { + "epoch": 1.020189003436426, + "grad_norm": 3.823352336883545, + "learning_rate": 1.8606916470489563e-05, + "loss": 0.068, + "step": 11875 + }, + { + "epoch": 1.02233676975945, + "grad_norm": 0.4989180266857147, + "learning_rate": 1.860118555549278e-05, + "loss": 0.0462, + "step": 11900 + }, + { + "epoch": 1.0244845360824741, + "grad_norm": 0.6881848573684692, + "learning_rate": 1.8595443762983958e-05, + "loss": 0.0498, + "step": 11925 + }, + { + "epoch": 1.0266323024054982, + "grad_norm": 0.4148818850517273, + "learning_rate": 1.8589691100224477e-05, + "loss": 0.0745, + "step": 11950 + }, + { + "epoch": 1.0287800687285222, + "grad_norm": 3.7787914276123047, + "learning_rate": 1.858392757448945e-05, + "loss": 0.0384, + "step": 11975 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 0.08168889582157135, + "learning_rate": 1.8578153193067746e-05, + "loss": 0.0549, + "step": 12000 + }, + { + "epoch": 1.0330756013745706, + "grad_norm": 6.255758285522461, + "learning_rate": 1.857236796326194e-05, + "loss": 0.0387, + "step": 12025 + }, + { + "epoch": 1.0352233676975946, + "grad_norm": 5.992846965789795, + "learning_rate": 1.8566571892388343e-05, + "loss": 0.0404, + "step": 12050 + }, + { + "epoch": 1.0373711340206186, + "grad_norm": 0.18908949196338654, + "learning_rate": 1.8560764987776974e-05, + "loss": 0.0477, + "step": 12075 + }, + { + "epoch": 1.0395189003436427, + "grad_norm": 1.0027879476547241, + "learning_rate": 1.8554947256771542e-05, + "loss": 0.0315, + "step": 12100 + }, + { + "epoch": 1.0416666666666667, + "grad_norm": 0.03971044719219208, + "learning_rate": 1.854911870672947e-05, + "loss": 0.0384, + "step": 12125 + }, + { + "epoch": 1.0438144329896908, + "grad_norm": 1.3332908153533936, + "learning_rate": 1.8543279345021834e-05, + "loss": 0.0376, + "step": 12150 + }, + { + "epoch": 1.0459621993127148, + "grad_norm": 0.10120372474193573, + "learning_rate": 1.8537429179033406e-05, + "loss": 0.047, + "step": 12175 + }, + { + "epoch": 1.0481099656357389, + "grad_norm": 7.517392158508301, + "learning_rate": 1.8531568216162618e-05, + "loss": 0.0559, + "step": 12200 + }, + { + "epoch": 1.050257731958763, + "grad_norm": 9.07286262512207, + "learning_rate": 1.852569646382154e-05, + "loss": 0.0476, + "step": 12225 + }, + { + "epoch": 1.052405498281787, + "grad_norm": 0.3558478057384491, + "learning_rate": 1.851981392943591e-05, + "loss": 0.0504, + "step": 12250 + }, + { + "epoch": 1.054553264604811, + "grad_norm": 0.03881434351205826, + "learning_rate": 1.8513920620445085e-05, + "loss": 0.0496, + "step": 12275 + }, + { + "epoch": 1.056701030927835, + "grad_norm": 1.248205542564392, + "learning_rate": 1.8508016544302057e-05, + "loss": 0.0493, + "step": 12300 + }, + { + "epoch": 1.0588487972508591, + "grad_norm": 0.06321801245212555, + "learning_rate": 1.8502101708473427e-05, + "loss": 0.055, + "step": 12325 + }, + { + "epoch": 1.0609965635738832, + "grad_norm": 16.840360641479492, + "learning_rate": 1.8496176120439417e-05, + "loss": 0.0731, + "step": 12350 + }, + { + "epoch": 1.0631443298969072, + "grad_norm": 0.1889524608850479, + "learning_rate": 1.8490239787693825e-05, + "loss": 0.0369, + "step": 12375 + }, + { + "epoch": 1.0652920962199313, + "grad_norm": 0.14490237832069397, + "learning_rate": 1.848429271774406e-05, + "loss": 0.0364, + "step": 12400 + }, + { + "epoch": 1.0674398625429553, + "grad_norm": 0.08228684216737747, + "learning_rate": 1.8478334918111092e-05, + "loss": 0.0198, + "step": 12425 + }, + { + "epoch": 1.0695876288659794, + "grad_norm": 5.142478942871094, + "learning_rate": 1.8472366396329477e-05, + "loss": 0.0267, + "step": 12450 + }, + { + "epoch": 1.0717353951890034, + "grad_norm": 17.46861457824707, + "learning_rate": 1.8466387159947316e-05, + "loss": 0.04, + "step": 12475 + }, + { + "epoch": 1.0738831615120275, + "grad_norm": 4.351114749908447, + "learning_rate": 1.8460397216526265e-05, + "loss": 0.0394, + "step": 12500 + }, + { + "epoch": 1.0760309278350515, + "grad_norm": 0.12745483219623566, + "learning_rate": 1.8454396573641523e-05, + "loss": 0.0374, + "step": 12525 + }, + { + "epoch": 1.0781786941580755, + "grad_norm": 0.12874144315719604, + "learning_rate": 1.844838523888182e-05, + "loss": 0.0609, + "step": 12550 + }, + { + "epoch": 1.0803264604810996, + "grad_norm": 1.1297818422317505, + "learning_rate": 1.844236321984941e-05, + "loss": 0.0383, + "step": 12575 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 0.08448002487421036, + "learning_rate": 1.8436330524160048e-05, + "loss": 0.0587, + "step": 12600 + }, + { + "epoch": 1.0846219931271477, + "grad_norm": 2.434951066970825, + "learning_rate": 1.8430287159443e-05, + "loss": 0.1032, + "step": 12625 + }, + { + "epoch": 1.0867697594501717, + "grad_norm": 0.1631106734275818, + "learning_rate": 1.8424233133341027e-05, + "loss": 0.0307, + "step": 12650 + }, + { + "epoch": 1.0889175257731958, + "grad_norm": 8.717646598815918, + "learning_rate": 1.8418168453510366e-05, + "loss": 0.0357, + "step": 12675 + }, + { + "epoch": 1.0910652920962198, + "grad_norm": 0.29639554023742676, + "learning_rate": 1.8412093127620733e-05, + "loss": 0.0533, + "step": 12700 + }, + { + "epoch": 1.093213058419244, + "grad_norm": 6.197211742401123, + "learning_rate": 1.8406007163355304e-05, + "loss": 0.0376, + "step": 12725 + }, + { + "epoch": 1.0953608247422681, + "grad_norm": 0.3147558271884918, + "learning_rate": 1.839991056841071e-05, + "loss": 0.0512, + "step": 12750 + }, + { + "epoch": 1.0975085910652922, + "grad_norm": 23.46294593811035, + "learning_rate": 1.839380335049702e-05, + "loss": 0.0287, + "step": 12775 + }, + { + "epoch": 1.0996563573883162, + "grad_norm": 0.1599414199590683, + "learning_rate": 1.8387685517337752e-05, + "loss": 0.0479, + "step": 12800 + }, + { + "epoch": 1.1018041237113403, + "grad_norm": 13.79839038848877, + "learning_rate": 1.8381557076669837e-05, + "loss": 0.0438, + "step": 12825 + }, + { + "epoch": 1.1039518900343643, + "grad_norm": 39.546512603759766, + "learning_rate": 1.837541803624362e-05, + "loss": 0.0526, + "step": 12850 + }, + { + "epoch": 1.1060996563573884, + "grad_norm": 15.752116203308105, + "learning_rate": 1.836926840382286e-05, + "loss": 0.0396, + "step": 12875 + }, + { + "epoch": 1.1082474226804124, + "grad_norm": 9.200237274169922, + "learning_rate": 1.8363108187184702e-05, + "loss": 0.0537, + "step": 12900 + }, + { + "epoch": 1.1103951890034365, + "grad_norm": 0.19225525856018066, + "learning_rate": 1.8356937394119682e-05, + "loss": 0.0272, + "step": 12925 + }, + { + "epoch": 1.1125429553264605, + "grad_norm": 66.53321838378906, + "learning_rate": 1.835075603243171e-05, + "loss": 0.0573, + "step": 12950 + }, + { + "epoch": 1.1146907216494846, + "grad_norm": 56.954647064208984, + "learning_rate": 1.8344564109938058e-05, + "loss": 0.0372, + "step": 12975 + }, + { + "epoch": 1.1168384879725086, + "grad_norm": 0.051831308752298355, + "learning_rate": 1.8338361634469362e-05, + "loss": 0.0335, + "step": 13000 + }, + { + "epoch": 1.1189862542955327, + "grad_norm": 7.303863525390625, + "learning_rate": 1.8332148613869596e-05, + "loss": 0.033, + "step": 13025 + }, + { + "epoch": 1.1211340206185567, + "grad_norm": 0.0853796899318695, + "learning_rate": 1.8325925055996076e-05, + "loss": 0.0398, + "step": 13050 + }, + { + "epoch": 1.1232817869415808, + "grad_norm": 0.5127688050270081, + "learning_rate": 1.8319690968719438e-05, + "loss": 0.0319, + "step": 13075 + }, + { + "epoch": 1.1254295532646048, + "grad_norm": 0.08459598571062088, + "learning_rate": 1.8313446359923638e-05, + "loss": 0.0496, + "step": 13100 + }, + { + "epoch": 1.1275773195876289, + "grad_norm": 7.318602085113525, + "learning_rate": 1.8307191237505936e-05, + "loss": 0.0814, + "step": 13125 + }, + { + "epoch": 1.129725085910653, + "grad_norm": 0.5823403000831604, + "learning_rate": 1.830092560937689e-05, + "loss": 0.0325, + "step": 13150 + }, + { + "epoch": 1.131872852233677, + "grad_norm": 7.017449855804443, + "learning_rate": 1.829464948346034e-05, + "loss": 0.0426, + "step": 13175 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 3.4388301372528076, + "learning_rate": 1.8288362867693414e-05, + "loss": 0.0482, + "step": 13200 + }, + { + "epoch": 1.136168384879725, + "grad_norm": 0.055725034326314926, + "learning_rate": 1.828206577002649e-05, + "loss": 0.0271, + "step": 13225 + }, + { + "epoch": 1.138316151202749, + "grad_norm": 8.620025634765625, + "learning_rate": 1.8275758198423208e-05, + "loss": 0.0436, + "step": 13250 + }, + { + "epoch": 1.1404639175257731, + "grad_norm": 0.11146937310695648, + "learning_rate": 1.826944016086046e-05, + "loss": 0.0165, + "step": 13275 + }, + { + "epoch": 1.1426116838487972, + "grad_norm": 0.06790604442358017, + "learning_rate": 1.8263111665328366e-05, + "loss": 0.0749, + "step": 13300 + }, + { + "epoch": 1.1447594501718212, + "grad_norm": 3.8895959854125977, + "learning_rate": 1.8256772719830274e-05, + "loss": 0.0441, + "step": 13325 + }, + { + "epoch": 1.1469072164948453, + "grad_norm": 4.305970191955566, + "learning_rate": 1.825042333238275e-05, + "loss": 0.0323, + "step": 13350 + }, + { + "epoch": 1.1490549828178693, + "grad_norm": 4.302104949951172, + "learning_rate": 1.8244063511015562e-05, + "loss": 0.0544, + "step": 13375 + }, + { + "epoch": 1.1512027491408934, + "grad_norm": 3.072025775909424, + "learning_rate": 1.8237693263771678e-05, + "loss": 0.0616, + "step": 13400 + }, + { + "epoch": 1.1533505154639174, + "grad_norm": 4.083505153656006, + "learning_rate": 1.8231312598707243e-05, + "loss": 0.0441, + "step": 13425 + }, + { + "epoch": 1.1554982817869415, + "grad_norm": 3.606635332107544, + "learning_rate": 1.822492152389159e-05, + "loss": 0.0375, + "step": 13450 + }, + { + "epoch": 1.1576460481099655, + "grad_norm": 10.878451347351074, + "learning_rate": 1.8218520047407197e-05, + "loss": 0.0455, + "step": 13475 + }, + { + "epoch": 1.1597938144329896, + "grad_norm": 0.23790162801742554, + "learning_rate": 1.8212108177349722e-05, + "loss": 0.0407, + "step": 13500 + }, + { + "epoch": 1.1619415807560138, + "grad_norm": 0.09610047191381454, + "learning_rate": 1.820568592182794e-05, + "loss": 0.0571, + "step": 13525 + }, + { + "epoch": 1.1640893470790379, + "grad_norm": 0.18398234248161316, + "learning_rate": 1.819925328896378e-05, + "loss": 0.0595, + "step": 13550 + }, + { + "epoch": 1.166237113402062, + "grad_norm": 0.09044495970010757, + "learning_rate": 1.819281028689229e-05, + "loss": 0.0399, + "step": 13575 + }, + { + "epoch": 1.168384879725086, + "grad_norm": 1.1893888711929321, + "learning_rate": 1.818635692376163e-05, + "loss": 0.0436, + "step": 13600 + }, + { + "epoch": 1.17053264604811, + "grad_norm": 3.802780866622925, + "learning_rate": 1.817989320773305e-05, + "loss": 0.0286, + "step": 13625 + }, + { + "epoch": 1.172680412371134, + "grad_norm": 0.18910564482212067, + "learning_rate": 1.8173419146980924e-05, + "loss": 0.0292, + "step": 13650 + }, + { + "epoch": 1.1748281786941581, + "grad_norm": 0.19052541255950928, + "learning_rate": 1.816693474969268e-05, + "loss": 0.0368, + "step": 13675 + }, + { + "epoch": 1.1769759450171822, + "grad_norm": 0.12135069072246552, + "learning_rate": 1.8160440024068827e-05, + "loss": 0.0403, + "step": 13700 + }, + { + "epoch": 1.1791237113402062, + "grad_norm": 0.1408136636018753, + "learning_rate": 1.815393497832294e-05, + "loss": 0.0264, + "step": 13725 + }, + { + "epoch": 1.1812714776632303, + "grad_norm": 0.027343835681676865, + "learning_rate": 1.8147419620681644e-05, + "loss": 0.0305, + "step": 13750 + }, + { + "epoch": 1.1834192439862543, + "grad_norm": 0.3214264512062073, + "learning_rate": 1.81408939593846e-05, + "loss": 0.0292, + "step": 13775 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 0.38665422797203064, + "learning_rate": 1.8134358002684504e-05, + "loss": 0.0421, + "step": 13800 + }, + { + "epoch": 1.1877147766323024, + "grad_norm": 0.9560924172401428, + "learning_rate": 1.8127811758847073e-05, + "loss": 0.025, + "step": 13825 + }, + { + "epoch": 1.1898625429553265, + "grad_norm": 0.06022180989384651, + "learning_rate": 1.8121255236151026e-05, + "loss": 0.0354, + "step": 13850 + }, + { + "epoch": 1.1920103092783505, + "grad_norm": 0.3031279742717743, + "learning_rate": 1.8114688442888092e-05, + "loss": 0.0139, + "step": 13875 + }, + { + "epoch": 1.1941580756013745, + "grad_norm": 0.07230107486248016, + "learning_rate": 1.810811138736298e-05, + "loss": 0.0291, + "step": 13900 + }, + { + "epoch": 1.1963058419243986, + "grad_norm": 0.264476478099823, + "learning_rate": 1.8101524077893385e-05, + "loss": 0.0436, + "step": 13925 + }, + { + "epoch": 1.1984536082474226, + "grad_norm": 4.398700714111328, + "learning_rate": 1.8094926522809958e-05, + "loss": 0.0706, + "step": 13950 + }, + { + "epoch": 1.2006013745704467, + "grad_norm": 2.1592957973480225, + "learning_rate": 1.8088318730456325e-05, + "loss": 0.0428, + "step": 13975 + }, + { + "epoch": 1.2027491408934707, + "grad_norm": 0.045114222913980484, + "learning_rate": 1.8081700709189038e-05, + "loss": 0.0378, + "step": 14000 + }, + { + "epoch": 1.2048969072164948, + "grad_norm": 0.2204490453004837, + "learning_rate": 1.80750724673776e-05, + "loss": 0.0474, + "step": 14025 + }, + { + "epoch": 1.2070446735395188, + "grad_norm": 0.06442329287528992, + "learning_rate": 1.8068434013404433e-05, + "loss": 0.0574, + "step": 14050 + }, + { + "epoch": 1.2091924398625429, + "grad_norm": 0.17818066477775574, + "learning_rate": 1.8061785355664875e-05, + "loss": 0.0263, + "step": 14075 + }, + { + "epoch": 1.211340206185567, + "grad_norm": 0.402577668428421, + "learning_rate": 1.8055126502567172e-05, + "loss": 0.0662, + "step": 14100 + }, + { + "epoch": 1.213487972508591, + "grad_norm": 0.0901382714509964, + "learning_rate": 1.804845746253246e-05, + "loss": 0.0314, + "step": 14125 + }, + { + "epoch": 1.2156357388316152, + "grad_norm": 7.231109619140625, + "learning_rate": 1.8041778243994753e-05, + "loss": 0.0399, + "step": 14150 + }, + { + "epoch": 1.2177835051546393, + "grad_norm": 1.6399953365325928, + "learning_rate": 1.803508885540094e-05, + "loss": 0.0709, + "step": 14175 + }, + { + "epoch": 1.2199312714776633, + "grad_norm": 6.085312366485596, + "learning_rate": 1.8028389305210787e-05, + "loss": 0.0528, + "step": 14200 + }, + { + "epoch": 1.2220790378006874, + "grad_norm": 9.0698881149292, + "learning_rate": 1.8021679601896886e-05, + "loss": 0.0409, + "step": 14225 + }, + { + "epoch": 1.2242268041237114, + "grad_norm": 0.28462520241737366, + "learning_rate": 1.8014959753944688e-05, + "loss": 0.049, + "step": 14250 + }, + { + "epoch": 1.2263745704467355, + "grad_norm": 0.19014780223369598, + "learning_rate": 1.800822976985246e-05, + "loss": 0.0386, + "step": 14275 + }, + { + "epoch": 1.2285223367697595, + "grad_norm": 1.030049443244934, + "learning_rate": 1.8001489658131302e-05, + "loss": 0.0432, + "step": 14300 + }, + { + "epoch": 1.2306701030927836, + "grad_norm": 4.112753868103027, + "learning_rate": 1.7994739427305105e-05, + "loss": 0.0255, + "step": 14325 + }, + { + "epoch": 1.2328178694158076, + "grad_norm": 0.23347465693950653, + "learning_rate": 1.7987979085910575e-05, + "loss": 0.0551, + "step": 14350 + }, + { + "epoch": 1.2349656357388317, + "grad_norm": 0.14936472475528717, + "learning_rate": 1.798120864249719e-05, + "loss": 0.0526, + "step": 14375 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 0.6420214772224426, + "learning_rate": 1.797442810562721e-05, + "loss": 0.0299, + "step": 14400 + }, + { + "epoch": 1.2392611683848798, + "grad_norm": 0.33108434081077576, + "learning_rate": 1.796763748387566e-05, + "loss": 0.0404, + "step": 14425 + }, + { + "epoch": 1.2414089347079038, + "grad_norm": 0.13329073786735535, + "learning_rate": 1.796083678583032e-05, + "loss": 0.0554, + "step": 14450 + }, + { + "epoch": 1.2435567010309279, + "grad_norm": 0.6672009229660034, + "learning_rate": 1.7954026020091705e-05, + "loss": 0.0408, + "step": 14475 + }, + { + "epoch": 1.245704467353952, + "grad_norm": 4.718705654144287, + "learning_rate": 1.794720519527307e-05, + "loss": 0.0527, + "step": 14500 + }, + { + "epoch": 1.247852233676976, + "grad_norm": 0.068761445581913, + "learning_rate": 1.794037432000039e-05, + "loss": 0.0418, + "step": 14525 + }, + { + "epoch": 1.25, + "grad_norm": 0.3524285852909088, + "learning_rate": 1.7933533402912354e-05, + "loss": 0.061, + "step": 14550 + }, + { + "epoch": 1.252147766323024, + "grad_norm": 0.0817950963973999, + "learning_rate": 1.792668245266034e-05, + "loss": 0.0364, + "step": 14575 + }, + { + "epoch": 1.254295532646048, + "grad_norm": 0.7396872043609619, + "learning_rate": 1.7919821477908418e-05, + "loss": 0.0395, + "step": 14600 + }, + { + "epoch": 1.2564432989690721, + "grad_norm": 2.933992385864258, + "learning_rate": 1.7912950487333345e-05, + "loss": 0.0626, + "step": 14625 + }, + { + "epoch": 1.2585910652920962, + "grad_norm": 0.09322584420442581, + "learning_rate": 1.7906069489624532e-05, + "loss": 0.0313, + "step": 14650 + }, + { + "epoch": 1.2607388316151202, + "grad_norm": 18.86374855041504, + "learning_rate": 1.7899178493484054e-05, + "loss": 0.0437, + "step": 14675 + }, + { + "epoch": 1.2628865979381443, + "grad_norm": 0.835211455821991, + "learning_rate": 1.7892277507626627e-05, + "loss": 0.0281, + "step": 14700 + }, + { + "epoch": 1.2650343642611683, + "grad_norm": 0.9107890129089355, + "learning_rate": 1.78853665407796e-05, + "loss": 0.0432, + "step": 14725 + }, + { + "epoch": 1.2671821305841924, + "grad_norm": 2.495563268661499, + "learning_rate": 1.7878445601682947e-05, + "loss": 0.0268, + "step": 14750 + }, + { + "epoch": 1.2693298969072164, + "grad_norm": 0.197845920920372, + "learning_rate": 1.787151469908925e-05, + "loss": 0.0683, + "step": 14775 + }, + { + "epoch": 1.2714776632302405, + "grad_norm": 0.25817155838012695, + "learning_rate": 1.7864573841763695e-05, + "loss": 0.0461, + "step": 14800 + }, + { + "epoch": 1.2736254295532645, + "grad_norm": 0.23237344622612, + "learning_rate": 1.7857623038484054e-05, + "loss": 0.0247, + "step": 14825 + }, + { + "epoch": 1.2757731958762886, + "grad_norm": 2.6980578899383545, + "learning_rate": 1.7850662298040676e-05, + "loss": 0.0449, + "step": 14850 + }, + { + "epoch": 1.2779209621993126, + "grad_norm": 0.10769737511873245, + "learning_rate": 1.784369162923649e-05, + "loss": 0.0294, + "step": 14875 + }, + { + "epoch": 1.2800687285223367, + "grad_norm": 0.11851859837770462, + "learning_rate": 1.7836711040886956e-05, + "loss": 0.0375, + "step": 14900 + }, + { + "epoch": 1.2822164948453607, + "grad_norm": 0.16257694363594055, + "learning_rate": 1.7829720541820106e-05, + "loss": 0.0469, + "step": 14925 + }, + { + "epoch": 1.2843642611683848, + "grad_norm": 0.23113368451595306, + "learning_rate": 1.782272014087649e-05, + "loss": 0.0445, + "step": 14950 + }, + { + "epoch": 1.2865120274914088, + "grad_norm": 0.04179134592413902, + "learning_rate": 1.7815709846909176e-05, + "loss": 0.0529, + "step": 14975 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 0.06446385383605957, + "learning_rate": 1.7808689668783762e-05, + "loss": 0.0354, + "step": 15000 + }, + { + "epoch": 1.2908075601374571, + "grad_norm": 0.0470893494784832, + "learning_rate": 1.7801659615378327e-05, + "loss": 0.0382, + "step": 15025 + }, + { + "epoch": 1.2929553264604812, + "grad_norm": 6.822221279144287, + "learning_rate": 1.7794619695583452e-05, + "loss": 0.0611, + "step": 15050 + }, + { + "epoch": 1.2951030927835052, + "grad_norm": 0.1641368269920349, + "learning_rate": 1.7787569918302185e-05, + "loss": 0.0189, + "step": 15075 + }, + { + "epoch": 1.2972508591065293, + "grad_norm": 0.24317267537117004, + "learning_rate": 1.778051029245005e-05, + "loss": 0.0425, + "step": 15100 + }, + { + "epoch": 1.2993986254295533, + "grad_norm": 0.25097760558128357, + "learning_rate": 1.7773440826955018e-05, + "loss": 0.0308, + "step": 15125 + }, + { + "epoch": 1.3015463917525774, + "grad_norm": 0.11992732435464859, + "learning_rate": 1.776636153075751e-05, + "loss": 0.052, + "step": 15150 + }, + { + "epoch": 1.3036941580756014, + "grad_norm": 0.05358404293656349, + "learning_rate": 1.7759272412810375e-05, + "loss": 0.0361, + "step": 15175 + }, + { + "epoch": 1.3058419243986255, + "grad_norm": 5.971042633056641, + "learning_rate": 1.775217348207888e-05, + "loss": 0.0509, + "step": 15200 + }, + { + "epoch": 1.3079896907216495, + "grad_norm": 23.76131820678711, + "learning_rate": 1.7745064747540714e-05, + "loss": 0.0302, + "step": 15225 + }, + { + "epoch": 1.3101374570446735, + "grad_norm": 50.58725357055664, + "learning_rate": 1.7737946218185957e-05, + "loss": 0.0599, + "step": 15250 + }, + { + "epoch": 1.3122852233676976, + "grad_norm": 0.26424476504325867, + "learning_rate": 1.773081790301707e-05, + "loss": 0.0265, + "step": 15275 + }, + { + "epoch": 1.3144329896907216, + "grad_norm": 0.04694714769721031, + "learning_rate": 1.7723679811048904e-05, + "loss": 0.0333, + "step": 15300 + }, + { + "epoch": 1.3165807560137457, + "grad_norm": 6.524735450744629, + "learning_rate": 1.7716531951308656e-05, + "loss": 0.0449, + "step": 15325 + }, + { + "epoch": 1.3187285223367697, + "grad_norm": 6.111476421356201, + "learning_rate": 1.7709374332835893e-05, + "loss": 0.0298, + "step": 15350 + }, + { + "epoch": 1.3208762886597938, + "grad_norm": 3.6359059810638428, + "learning_rate": 1.7702206964682515e-05, + "loss": 0.0648, + "step": 15375 + }, + { + "epoch": 1.3230240549828178, + "grad_norm": 0.038961756974458694, + "learning_rate": 1.7695029855912748e-05, + "loss": 0.0292, + "step": 15400 + }, + { + "epoch": 1.3251718213058419, + "grad_norm": 0.5314356684684753, + "learning_rate": 1.7687843015603144e-05, + "loss": 0.0229, + "step": 15425 + }, + { + "epoch": 1.327319587628866, + "grad_norm": 1.05496346950531, + "learning_rate": 1.7680646452842564e-05, + "loss": 0.0873, + "step": 15450 + }, + { + "epoch": 1.32946735395189, + "grad_norm": 0.5957635045051575, + "learning_rate": 1.7673440176732156e-05, + "loss": 0.0478, + "step": 15475 + }, + { + "epoch": 1.331615120274914, + "grad_norm": 8.844915390014648, + "learning_rate": 1.766622419638536e-05, + "loss": 0.0223, + "step": 15500 + }, + { + "epoch": 1.333762886597938, + "grad_norm": 0.20430457592010498, + "learning_rate": 1.7658998520927878e-05, + "loss": 0.0362, + "step": 15525 + }, + { + "epoch": 1.3359106529209621, + "grad_norm": 0.16939419507980347, + "learning_rate": 1.7651763159497684e-05, + "loss": 0.063, + "step": 15550 + }, + { + "epoch": 1.3380584192439864, + "grad_norm": 0.011635910719633102, + "learning_rate": 1.7644518121244997e-05, + "loss": 0.0412, + "step": 15575 + }, + { + "epoch": 1.3402061855670104, + "grad_norm": 0.2999529540538788, + "learning_rate": 1.7637263415332272e-05, + "loss": 0.0443, + "step": 15600 + }, + { + "epoch": 1.3423539518900345, + "grad_norm": 1.1540156602859497, + "learning_rate": 1.762999905093419e-05, + "loss": 0.0492, + "step": 15625 + }, + { + "epoch": 1.3445017182130585, + "grad_norm": 0.27002963423728943, + "learning_rate": 1.7622725037237657e-05, + "loss": 0.0499, + "step": 15650 + }, + { + "epoch": 1.3466494845360826, + "grad_norm": 0.03746243938803673, + "learning_rate": 1.7615441383441766e-05, + "loss": 0.0324, + "step": 15675 + }, + { + "epoch": 1.3487972508591066, + "grad_norm": 0.18822067975997925, + "learning_rate": 1.760814809875781e-05, + "loss": 0.0061, + "step": 15700 + }, + { + "epoch": 1.3509450171821307, + "grad_norm": 0.13383351266384125, + "learning_rate": 1.7600845192409262e-05, + "loss": 0.048, + "step": 15725 + }, + { + "epoch": 1.3530927835051547, + "grad_norm": 0.9389625787734985, + "learning_rate": 1.7593532673631765e-05, + "loss": 0.0351, + "step": 15750 + }, + { + "epoch": 1.3552405498281788, + "grad_norm": 0.07219479233026505, + "learning_rate": 1.758621055167311e-05, + "loss": 0.0168, + "step": 15775 + }, + { + "epoch": 1.3573883161512028, + "grad_norm": 0.30087390542030334, + "learning_rate": 1.757887883579324e-05, + "loss": 0.052, + "step": 15800 + }, + { + "epoch": 1.3595360824742269, + "grad_norm": 12.692648887634277, + "learning_rate": 1.757153753526423e-05, + "loss": 0.0777, + "step": 15825 + }, + { + "epoch": 1.361683848797251, + "grad_norm": 1.1169930696487427, + "learning_rate": 1.756418665937027e-05, + "loss": 0.0532, + "step": 15850 + }, + { + "epoch": 1.363831615120275, + "grad_norm": 0.08878464996814728, + "learning_rate": 1.755682621740767e-05, + "loss": 0.0334, + "step": 15875 + }, + { + "epoch": 1.365979381443299, + "grad_norm": 0.04953119903802872, + "learning_rate": 1.7549456218684833e-05, + "loss": 0.0471, + "step": 15900 + }, + { + "epoch": 1.368127147766323, + "grad_norm": 0.5105817914009094, + "learning_rate": 1.7542076672522243e-05, + "loss": 0.03, + "step": 15925 + }, + { + "epoch": 1.370274914089347, + "grad_norm": 0.04872726649045944, + "learning_rate": 1.7534687588252464e-05, + "loss": 0.0423, + "step": 15950 + }, + { + "epoch": 1.3724226804123711, + "grad_norm": 9.976511001586914, + "learning_rate": 1.7527288975220123e-05, + "loss": 0.0437, + "step": 15975 + }, + { + "epoch": 1.3745704467353952, + "grad_norm": 0.15280357003211975, + "learning_rate": 1.7519880842781892e-05, + "loss": 0.0457, + "step": 16000 + }, + { + "epoch": 1.3767182130584192, + "grad_norm": 0.49648553133010864, + "learning_rate": 1.751246320030649e-05, + "loss": 0.067, + "step": 16025 + }, + { + "epoch": 1.3788659793814433, + "grad_norm": 0.5740901231765747, + "learning_rate": 1.7505036057174654e-05, + "loss": 0.0488, + "step": 16050 + }, + { + "epoch": 1.3810137457044673, + "grad_norm": 21.149887084960938, + "learning_rate": 1.7497599422779142e-05, + "loss": 0.0647, + "step": 16075 + }, + { + "epoch": 1.3831615120274914, + "grad_norm": 7.864665508270264, + "learning_rate": 1.7490153306524712e-05, + "loss": 0.0335, + "step": 16100 + }, + { + "epoch": 1.3853092783505154, + "grad_norm": 0.06656129658222198, + "learning_rate": 1.748269771782812e-05, + "loss": 0.038, + "step": 16125 + }, + { + "epoch": 1.3874570446735395, + "grad_norm": 0.11410772055387497, + "learning_rate": 1.747523266611809e-05, + "loss": 0.0463, + "step": 16150 + }, + { + "epoch": 1.3896048109965635, + "grad_norm": 1.8307406902313232, + "learning_rate": 1.7467758160835318e-05, + "loss": 0.0361, + "step": 16175 + }, + { + "epoch": 1.3917525773195876, + "grad_norm": 0.04927309975028038, + "learning_rate": 1.7460274211432463e-05, + "loss": 0.0349, + "step": 16200 + }, + { + "epoch": 1.3939003436426116, + "grad_norm": 6.352517604827881, + "learning_rate": 1.7452780827374118e-05, + "loss": 0.0558, + "step": 16225 + }, + { + "epoch": 1.3960481099656357, + "grad_norm": 3.262645959854126, + "learning_rate": 1.7445278018136805e-05, + "loss": 0.0483, + "step": 16250 + }, + { + "epoch": 1.3981958762886597, + "grad_norm": 9.554914474487305, + "learning_rate": 1.7437765793208986e-05, + "loss": 0.037, + "step": 16275 + }, + { + "epoch": 1.4003436426116838, + "grad_norm": 0.3373095989227295, + "learning_rate": 1.7430244162091005e-05, + "loss": 0.0355, + "step": 16300 + }, + { + "epoch": 1.4024914089347078, + "grad_norm": 6.12929105758667, + "learning_rate": 1.7422713134295113e-05, + "loss": 0.062, + "step": 16325 + }, + { + "epoch": 1.4046391752577319, + "grad_norm": 4.86439323425293, + "learning_rate": 1.7415172719345447e-05, + "loss": 0.0397, + "step": 16350 + }, + { + "epoch": 1.406786941580756, + "grad_norm": 0.04560985416173935, + "learning_rate": 1.7407622926778015e-05, + "loss": 0.0327, + "step": 16375 + }, + { + "epoch": 1.40893470790378, + "grad_norm": 3.7440860271453857, + "learning_rate": 1.740006376614068e-05, + "loss": 0.0794, + "step": 16400 + }, + { + "epoch": 1.411082474226804, + "grad_norm": 4.484395980834961, + "learning_rate": 1.739249524699315e-05, + "loss": 0.0584, + "step": 16425 + }, + { + "epoch": 1.413230240549828, + "grad_norm": 1.3115192651748657, + "learning_rate": 1.738491737890698e-05, + "loss": 0.0524, + "step": 16450 + }, + { + "epoch": 1.4153780068728523, + "grad_norm": 0.22514896094799042, + "learning_rate": 1.7377330171465533e-05, + "loss": 0.0567, + "step": 16475 + }, + { + "epoch": 1.4175257731958764, + "grad_norm": 0.10586314648389816, + "learning_rate": 1.7369733634264e-05, + "loss": 0.0688, + "step": 16500 + }, + { + "epoch": 1.4196735395189004, + "grad_norm": 0.09370585530996323, + "learning_rate": 1.7362127776909354e-05, + "loss": 0.0254, + "step": 16525 + }, + { + "epoch": 1.4218213058419245, + "grad_norm": 0.050814494490623474, + "learning_rate": 1.735451260902037e-05, + "loss": 0.0231, + "step": 16550 + }, + { + "epoch": 1.4239690721649485, + "grad_norm": 8.047842979431152, + "learning_rate": 1.7346888140227584e-05, + "loss": 0.0945, + "step": 16575 + }, + { + "epoch": 1.4261168384879725, + "grad_norm": 0.24908557534217834, + "learning_rate": 1.7339254380173302e-05, + "loss": 0.0131, + "step": 16600 + }, + { + "epoch": 1.4282646048109966, + "grad_norm": 0.5017355680465698, + "learning_rate": 1.733161133851158e-05, + "loss": 0.0418, + "step": 16625 + }, + { + "epoch": 1.4304123711340206, + "grad_norm": 5.389187335968018, + "learning_rate": 1.732395902490821e-05, + "loss": 0.054, + "step": 16650 + }, + { + "epoch": 1.4325601374570447, + "grad_norm": 0.040262218564748764, + "learning_rate": 1.7316297449040714e-05, + "loss": 0.0462, + "step": 16675 + }, + { + "epoch": 1.4347079037800687, + "grad_norm": 0.16867591440677643, + "learning_rate": 1.7308626620598316e-05, + "loss": 0.0457, + "step": 16700 + }, + { + "epoch": 1.4368556701030928, + "grad_norm": 6.9571709632873535, + "learning_rate": 1.7300946549281955e-05, + "loss": 0.0259, + "step": 16725 + }, + { + "epoch": 1.4390034364261168, + "grad_norm": 0.26982614398002625, + "learning_rate": 1.7293257244804257e-05, + "loss": 0.0408, + "step": 16750 + }, + { + "epoch": 1.4411512027491409, + "grad_norm": 6.920740604400635, + "learning_rate": 1.7285558716889514e-05, + "loss": 0.0667, + "step": 16775 + }, + { + "epoch": 1.443298969072165, + "grad_norm": 0.039386555552482605, + "learning_rate": 1.7277850975273694e-05, + "loss": 0.043, + "step": 16800 + }, + { + "epoch": 1.445446735395189, + "grad_norm": 0.3090720772743225, + "learning_rate": 1.7270134029704414e-05, + "loss": 0.035, + "step": 16825 + }, + { + "epoch": 1.447594501718213, + "grad_norm": 0.5195348858833313, + "learning_rate": 1.7262407889940928e-05, + "loss": 0.037, + "step": 16850 + }, + { + "epoch": 1.449742268041237, + "grad_norm": 0.0756247341632843, + "learning_rate": 1.7254672565754115e-05, + "loss": 0.0367, + "step": 16875 + }, + { + "epoch": 1.4518900343642611, + "grad_norm": 0.09504783153533936, + "learning_rate": 1.7246928066926474e-05, + "loss": 0.0277, + "step": 16900 + }, + { + "epoch": 1.4540378006872852, + "grad_norm": 11.347119331359863, + "learning_rate": 1.7239174403252113e-05, + "loss": 0.0446, + "step": 16925 + }, + { + "epoch": 1.4561855670103092, + "grad_norm": 0.18968385457992554, + "learning_rate": 1.7231411584536718e-05, + "loss": 0.0458, + "step": 16950 + }, + { + "epoch": 1.4583333333333333, + "grad_norm": 12.026653289794922, + "learning_rate": 1.7223639620597556e-05, + "loss": 0.0376, + "step": 16975 + }, + { + "epoch": 1.4604810996563573, + "grad_norm": 34.08223342895508, + "learning_rate": 1.7215858521263467e-05, + "loss": 0.0641, + "step": 17000 + }, + { + "epoch": 1.4626288659793816, + "grad_norm": 0.09242331236600876, + "learning_rate": 1.7208068296374837e-05, + "loss": 0.0657, + "step": 17025 + }, + { + "epoch": 1.4647766323024056, + "grad_norm": 0.38555970788002014, + "learning_rate": 1.7200268955783593e-05, + "loss": 0.0303, + "step": 17050 + }, + { + "epoch": 1.4669243986254297, + "grad_norm": 10.925196647644043, + "learning_rate": 1.7192460509353192e-05, + "loss": 0.0465, + "step": 17075 + }, + { + "epoch": 1.4690721649484537, + "grad_norm": 0.05999082326889038, + "learning_rate": 1.718464296695861e-05, + "loss": 0.0557, + "step": 17100 + }, + { + "epoch": 1.4712199312714778, + "grad_norm": 3.1567437648773193, + "learning_rate": 1.7176816338486317e-05, + "loss": 0.0443, + "step": 17125 + }, + { + "epoch": 1.4733676975945018, + "grad_norm": 5.659643173217773, + "learning_rate": 1.7168980633834283e-05, + "loss": 0.0589, + "step": 17150 + }, + { + "epoch": 1.4755154639175259, + "grad_norm": 0.30447596311569214, + "learning_rate": 1.7161135862911952e-05, + "loss": 0.023, + "step": 17175 + }, + { + "epoch": 1.47766323024055, + "grad_norm": 0.21890893578529358, + "learning_rate": 1.715328203564023e-05, + "loss": 0.0383, + "step": 17200 + }, + { + "epoch": 1.479810996563574, + "grad_norm": 3.6251766681671143, + "learning_rate": 1.714541916195149e-05, + "loss": 0.0295, + "step": 17225 + }, + { + "epoch": 1.481958762886598, + "grad_norm": 0.10181698948144913, + "learning_rate": 1.7137547251789527e-05, + "loss": 0.0562, + "step": 17250 + }, + { + "epoch": 1.484106529209622, + "grad_norm": 37.291168212890625, + "learning_rate": 1.7129666315109575e-05, + "loss": 0.0726, + "step": 17275 + }, + { + "epoch": 1.486254295532646, + "grad_norm": 0.5083115100860596, + "learning_rate": 1.712177636187829e-05, + "loss": 0.0252, + "step": 17300 + }, + { + "epoch": 1.4884020618556701, + "grad_norm": 11.082416534423828, + "learning_rate": 1.711387740207371e-05, + "loss": 0.0281, + "step": 17325 + }, + { + "epoch": 1.4905498281786942, + "grad_norm": 1.0045373439788818, + "learning_rate": 1.7105969445685278e-05, + "loss": 0.0342, + "step": 17350 + }, + { + "epoch": 1.4926975945017182, + "grad_norm": 4.263568878173828, + "learning_rate": 1.7098052502713817e-05, + "loss": 0.0598, + "step": 17375 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 0.11571075022220612, + "learning_rate": 1.7090126583171503e-05, + "loss": 0.0291, + "step": 17400 + }, + { + "epoch": 1.4969931271477663, + "grad_norm": 0.1762894093990326, + "learning_rate": 1.708219169708188e-05, + "loss": 0.0822, + "step": 17425 + }, + { + "epoch": 1.4991408934707904, + "grad_norm": 4.626094341278076, + "learning_rate": 1.7074247854479815e-05, + "loss": 0.0373, + "step": 17450 + }, + { + "epoch": 1.5012886597938144, + "grad_norm": 0.20573559403419495, + "learning_rate": 1.7066295065411512e-05, + "loss": 0.0249, + "step": 17475 + }, + { + "epoch": 1.5034364261168385, + "grad_norm": 6.59017276763916, + "learning_rate": 1.705833333993449e-05, + "loss": 0.0406, + "step": 17500 + }, + { + "epoch": 1.5055841924398625, + "grad_norm": 5.708477973937988, + "learning_rate": 1.7050362688117564e-05, + "loss": 0.0424, + "step": 17525 + }, + { + "epoch": 1.5077319587628866, + "grad_norm": 0.7654080986976624, + "learning_rate": 1.7042383120040837e-05, + "loss": 0.0534, + "step": 17550 + }, + { + "epoch": 1.5098797250859106, + "grad_norm": 1.313754677772522, + "learning_rate": 1.703439464579569e-05, + "loss": 0.0511, + "step": 17575 + }, + { + "epoch": 1.5120274914089347, + "grad_norm": 0.5071334838867188, + "learning_rate": 1.7026397275484773e-05, + "loss": 0.0529, + "step": 17600 + }, + { + "epoch": 1.5141752577319587, + "grad_norm": 0.297961950302124, + "learning_rate": 1.701839101922198e-05, + "loss": 0.0529, + "step": 17625 + }, + { + "epoch": 1.5163230240549828, + "grad_norm": 0.4158160090446472, + "learning_rate": 1.7010375887132443e-05, + "loss": 0.0292, + "step": 17650 + }, + { + "epoch": 1.5184707903780068, + "grad_norm": 4.187367916107178, + "learning_rate": 1.7002351889352524e-05, + "loss": 0.0369, + "step": 17675 + }, + { + "epoch": 1.5206185567010309, + "grad_norm": 0.31822288036346436, + "learning_rate": 1.6994319036029786e-05, + "loss": 0.0508, + "step": 17700 + }, + { + "epoch": 1.522766323024055, + "grad_norm": 7.984460830688477, + "learning_rate": 1.6986277337323e-05, + "loss": 0.0441, + "step": 17725 + }, + { + "epoch": 1.524914089347079, + "grad_norm": 13.718408584594727, + "learning_rate": 1.697822680340213e-05, + "loss": 0.0541, + "step": 17750 + }, + { + "epoch": 1.527061855670103, + "grad_norm": 0.7653276920318604, + "learning_rate": 1.697016744444829e-05, + "loss": 0.0437, + "step": 17775 + }, + { + "epoch": 1.529209621993127, + "grad_norm": 4.531834125518799, + "learning_rate": 1.696209927065378e-05, + "loss": 0.0498, + "step": 17800 + }, + { + "epoch": 1.531357388316151, + "grad_norm": 2.5493171215057373, + "learning_rate": 1.695402229222204e-05, + "loss": 0.0379, + "step": 17825 + }, + { + "epoch": 1.5335051546391751, + "grad_norm": 0.06949194520711899, + "learning_rate": 1.694593651936763e-05, + "loss": 0.0823, + "step": 17850 + }, + { + "epoch": 1.5356529209621992, + "grad_norm": 6.198754787445068, + "learning_rate": 1.6937841962316257e-05, + "loss": 0.0413, + "step": 17875 + }, + { + "epoch": 1.5378006872852232, + "grad_norm": 3.8461148738861084, + "learning_rate": 1.6929738631304717e-05, + "loss": 0.0501, + "step": 17900 + }, + { + "epoch": 1.5399484536082473, + "grad_norm": 0.417136549949646, + "learning_rate": 1.692162653658091e-05, + "loss": 0.0199, + "step": 17925 + }, + { + "epoch": 1.5420962199312713, + "grad_norm": 9.54780387878418, + "learning_rate": 1.691350568840382e-05, + "loss": 0.0342, + "step": 17950 + }, + { + "epoch": 1.5442439862542954, + "grad_norm": 0.15378040075302124, + "learning_rate": 1.6905376097043494e-05, + "loss": 0.0838, + "step": 17975 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 0.39857593178749084, + "learning_rate": 1.6897237772781046e-05, + "loss": 0.0387, + "step": 18000 + }, + { + "epoch": 1.5485395189003437, + "grad_norm": 0.8702962398529053, + "learning_rate": 1.6889090725908625e-05, + "loss": 0.0348, + "step": 18025 + }, + { + "epoch": 1.5506872852233677, + "grad_norm": 33.92485809326172, + "learning_rate": 1.688093496672942e-05, + "loss": 0.0733, + "step": 18050 + }, + { + "epoch": 1.5528350515463918, + "grad_norm": 0.14279040694236755, + "learning_rate": 1.6872770505557632e-05, + "loss": 0.0363, + "step": 18075 + }, + { + "epoch": 1.5549828178694158, + "grad_norm": 0.25937598943710327, + "learning_rate": 1.6864597352718468e-05, + "loss": 0.0365, + "step": 18100 + }, + { + "epoch": 1.5571305841924399, + "grad_norm": 0.5015255808830261, + "learning_rate": 1.6856415518548127e-05, + "loss": 0.0352, + "step": 18125 + }, + { + "epoch": 1.559278350515464, + "grad_norm": 0.23217765986919403, + "learning_rate": 1.6848225013393787e-05, + "loss": 0.0364, + "step": 18150 + }, + { + "epoch": 1.561426116838488, + "grad_norm": 0.7769215703010559, + "learning_rate": 1.684002584761359e-05, + "loss": 0.0246, + "step": 18175 + }, + { + "epoch": 1.563573883161512, + "grad_norm": 36.113651275634766, + "learning_rate": 1.683181803157664e-05, + "loss": 0.0503, + "step": 18200 + }, + { + "epoch": 1.565721649484536, + "grad_norm": 0.12159606069326401, + "learning_rate": 1.6823601575662963e-05, + "loss": 0.0269, + "step": 18225 + }, + { + "epoch": 1.5678694158075601, + "grad_norm": 0.017954021692276, + "learning_rate": 1.6815376490263533e-05, + "loss": 0.038, + "step": 18250 + }, + { + "epoch": 1.5700171821305842, + "grad_norm": 0.13318189978599548, + "learning_rate": 1.6807142785780216e-05, + "loss": 0.0433, + "step": 18275 + }, + { + "epoch": 1.5721649484536082, + "grad_norm": 0.818631649017334, + "learning_rate": 1.6798900472625793e-05, + "loss": 0.0398, + "step": 18300 + }, + { + "epoch": 1.5743127147766323, + "grad_norm": 0.6064812541007996, + "learning_rate": 1.6790649561223925e-05, + "loss": 0.0662, + "step": 18325 + }, + { + "epoch": 1.5764604810996563, + "grad_norm": 1.978344202041626, + "learning_rate": 1.678239006200915e-05, + "loss": 0.0272, + "step": 18350 + }, + { + "epoch": 1.5786082474226806, + "grad_norm": 0.23546159267425537, + "learning_rate": 1.677412198542687e-05, + "loss": 0.0403, + "step": 18375 + }, + { + "epoch": 1.5807560137457046, + "grad_norm": 0.18896391987800598, + "learning_rate": 1.676584534193332e-05, + "loss": 0.0198, + "step": 18400 + }, + { + "epoch": 1.5829037800687287, + "grad_norm": 0.06539963185787201, + "learning_rate": 1.6757560141995588e-05, + "loss": 0.0517, + "step": 18425 + }, + { + "epoch": 1.5850515463917527, + "grad_norm": 9.31898021697998, + "learning_rate": 1.674926639609157e-05, + "loss": 0.048, + "step": 18450 + }, + { + "epoch": 1.5871993127147768, + "grad_norm": 0.06868600100278854, + "learning_rate": 1.674096411470997e-05, + "loss": 0.0339, + "step": 18475 + }, + { + "epoch": 1.5893470790378008, + "grad_norm": 0.5289402008056641, + "learning_rate": 1.67326533083503e-05, + "loss": 0.0261, + "step": 18500 + }, + { + "epoch": 1.5914948453608249, + "grad_norm": 0.14737766981124878, + "learning_rate": 1.6724333987522837e-05, + "loss": 0.0328, + "step": 18525 + }, + { + "epoch": 1.593642611683849, + "grad_norm": 0.5424079298973083, + "learning_rate": 1.671600616274863e-05, + "loss": 0.0507, + "step": 18550 + }, + { + "epoch": 1.595790378006873, + "grad_norm": 9.46137523651123, + "learning_rate": 1.670766984455949e-05, + "loss": 0.0456, + "step": 18575 + }, + { + "epoch": 1.597938144329897, + "grad_norm": 6.523070335388184, + "learning_rate": 1.6699325043497957e-05, + "loss": 0.0756, + "step": 18600 + }, + { + "epoch": 1.600085910652921, + "grad_norm": 0.17204754054546356, + "learning_rate": 1.669097177011731e-05, + "loss": 0.0425, + "step": 18625 + }, + { + "epoch": 1.602233676975945, + "grad_norm": 5.918762683868408, + "learning_rate": 1.6682610034981542e-05, + "loss": 0.0452, + "step": 18650 + }, + { + "epoch": 1.6043814432989691, + "grad_norm": 0.6920080184936523, + "learning_rate": 1.6674239848665336e-05, + "loss": 0.0461, + "step": 18675 + }, + { + "epoch": 1.6065292096219932, + "grad_norm": 0.13812555372714996, + "learning_rate": 1.6665861221754075e-05, + "loss": 0.0413, + "step": 18700 + }, + { + "epoch": 1.6086769759450172, + "grad_norm": 0.32908719778060913, + "learning_rate": 1.665747416484381e-05, + "loss": 0.0306, + "step": 18725 + }, + { + "epoch": 1.6108247422680413, + "grad_norm": 0.2205660492181778, + "learning_rate": 1.664907868854125e-05, + "loss": 0.04, + "step": 18750 + }, + { + "epoch": 1.6129725085910653, + "grad_norm": 0.24674585461616516, + "learning_rate": 1.6640674803463768e-05, + "loss": 0.0431, + "step": 18775 + }, + { + "epoch": 1.6151202749140894, + "grad_norm": 0.2122160643339157, + "learning_rate": 1.663226252023935e-05, + "loss": 0.0424, + "step": 18800 + }, + { + "epoch": 1.6172680412371134, + "grad_norm": 8.551375389099121, + "learning_rate": 1.6623841849506616e-05, + "loss": 0.0404, + "step": 18825 + }, + { + "epoch": 1.6194158075601375, + "grad_norm": 0.9400714039802551, + "learning_rate": 1.661541280191479e-05, + "loss": 0.028, + "step": 18850 + }, + { + "epoch": 1.6215635738831615, + "grad_norm": 0.04252336546778679, + "learning_rate": 1.6606975388123683e-05, + "loss": 0.0661, + "step": 18875 + }, + { + "epoch": 1.6237113402061856, + "grad_norm": 0.1788533329963684, + "learning_rate": 1.65985296188037e-05, + "loss": 0.0413, + "step": 18900 + }, + { + "epoch": 1.6258591065292096, + "grad_norm": 6.2799224853515625, + "learning_rate": 1.6590075504635803e-05, + "loss": 0.0441, + "step": 18925 + }, + { + "epoch": 1.6280068728522337, + "grad_norm": 1.373415231704712, + "learning_rate": 1.658161305631151e-05, + "loss": 0.0544, + "step": 18950 + }, + { + "epoch": 1.6301546391752577, + "grad_norm": 0.43312501907348633, + "learning_rate": 1.6573142284532875e-05, + "loss": 0.0531, + "step": 18975 + }, + { + "epoch": 1.6323024054982818, + "grad_norm": 5.94455099105835, + "learning_rate": 1.6564663200012488e-05, + "loss": 0.0466, + "step": 19000 + }, + { + "epoch": 1.6344501718213058, + "grad_norm": 0.08782713860273361, + "learning_rate": 1.6556175813473445e-05, + "loss": 0.0457, + "step": 19025 + }, + { + "epoch": 1.6365979381443299, + "grad_norm": 0.2059842348098755, + "learning_rate": 1.6547680135649334e-05, + "loss": 0.0315, + "step": 19050 + }, + { + "epoch": 1.638745704467354, + "grad_norm": 0.02548149786889553, + "learning_rate": 1.6539176177284247e-05, + "loss": 0.0335, + "step": 19075 + }, + { + "epoch": 1.640893470790378, + "grad_norm": 0.016313333064317703, + "learning_rate": 1.653066394913273e-05, + "loss": 0.0362, + "step": 19100 + }, + { + "epoch": 1.643041237113402, + "grad_norm": 0.7065303921699524, + "learning_rate": 1.6522143461959796e-05, + "loss": 0.0414, + "step": 19125 + }, + { + "epoch": 1.645189003436426, + "grad_norm": 0.2705369293689728, + "learning_rate": 1.6513614726540903e-05, + "loss": 0.0272, + "step": 19150 + }, + { + "epoch": 1.64733676975945, + "grad_norm": 0.05961885303258896, + "learning_rate": 1.6505077753661943e-05, + "loss": 0.0286, + "step": 19175 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 0.23002365231513977, + "learning_rate": 1.6496532554119214e-05, + "loss": 0.0409, + "step": 19200 + }, + { + "epoch": 1.6516323024054982, + "grad_norm": 0.38117915391921997, + "learning_rate": 1.6487979138719424e-05, + "loss": 0.0466, + "step": 19225 + }, + { + "epoch": 1.6537800687285222, + "grad_norm": 11.877320289611816, + "learning_rate": 1.6479417518279684e-05, + "loss": 0.0403, + "step": 19250 + }, + { + "epoch": 1.6559278350515463, + "grad_norm": 0.04415282607078552, + "learning_rate": 1.6470847703627457e-05, + "loss": 0.025, + "step": 19275 + }, + { + "epoch": 1.6580756013745703, + "grad_norm": 0.7436922192573547, + "learning_rate": 1.6462269705600592e-05, + "loss": 0.0803, + "step": 19300 + }, + { + "epoch": 1.6602233676975944, + "grad_norm": 0.17797823250293732, + "learning_rate": 1.645368353504727e-05, + "loss": 0.0363, + "step": 19325 + }, + { + "epoch": 1.6623711340206184, + "grad_norm": 0.617654025554657, + "learning_rate": 1.644508920282601e-05, + "loss": 0.0529, + "step": 19350 + }, + { + "epoch": 1.6645189003436425, + "grad_norm": 4.261432647705078, + "learning_rate": 1.643648671980567e-05, + "loss": 0.0192, + "step": 19375 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 8.34128189086914, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.0252, + "step": 19400 + }, + { + "epoch": 1.6688144329896906, + "grad_norm": 0.05478670448064804, + "learning_rate": 1.641925734489463e-05, + "loss": 0.0541, + "step": 19425 + }, + { + "epoch": 1.6709621993127146, + "grad_norm": 6.241009712219238, + "learning_rate": 1.641063047479311e-05, + "loss": 0.0431, + "step": 19450 + }, + { + "epoch": 1.6731099656357389, + "grad_norm": 3.7993428707122803, + "learning_rate": 1.6401995497470825e-05, + "loss": 0.0635, + "step": 19475 + }, + { + "epoch": 1.675257731958763, + "grad_norm": 0.05354524031281471, + "learning_rate": 1.6393352423848016e-05, + "loss": 0.0191, + "step": 19500 + }, + { + "epoch": 1.677405498281787, + "grad_norm": 11.136503219604492, + "learning_rate": 1.638470126485518e-05, + "loss": 0.0604, + "step": 19525 + }, + { + "epoch": 1.679553264604811, + "grad_norm": 0.43096381425857544, + "learning_rate": 1.637604203143302e-05, + "loss": 0.0532, + "step": 19550 + }, + { + "epoch": 1.681701030927835, + "grad_norm": 0.200944185256958, + "learning_rate": 1.636737473453246e-05, + "loss": 0.0328, + "step": 19575 + }, + { + "epoch": 1.6838487972508591, + "grad_norm": 0.08211785554885864, + "learning_rate": 1.6358699385114626e-05, + "loss": 0.046, + "step": 19600 + }, + { + "epoch": 1.6859965635738832, + "grad_norm": 0.515724778175354, + "learning_rate": 1.6350015994150812e-05, + "loss": 0.0308, + "step": 19625 + }, + { + "epoch": 1.6881443298969072, + "grad_norm": 0.023973003029823303, + "learning_rate": 1.6341324572622497e-05, + "loss": 0.0216, + "step": 19650 + }, + { + "epoch": 1.6902920962199313, + "grad_norm": 0.026632381603121758, + "learning_rate": 1.6332625131521308e-05, + "loss": 0.0589, + "step": 19675 + }, + { + "epoch": 1.6924398625429553, + "grad_norm": 0.031033935025334358, + "learning_rate": 1.6323917681849015e-05, + "loss": 0.055, + "step": 19700 + }, + { + "epoch": 1.6945876288659794, + "grad_norm": 5.675904750823975, + "learning_rate": 1.6315202234617523e-05, + "loss": 0.0338, + "step": 19725 + }, + { + "epoch": 1.6967353951890034, + "grad_norm": 28.125383377075195, + "learning_rate": 1.630647880084884e-05, + "loss": 0.0422, + "step": 19750 + }, + { + "epoch": 1.6988831615120275, + "grad_norm": 0.15432217717170715, + "learning_rate": 1.629774739157508e-05, + "loss": 0.035, + "step": 19775 + }, + { + "epoch": 1.7010309278350515, + "grad_norm": 5.479340076446533, + "learning_rate": 1.6289008017838447e-05, + "loss": 0.0551, + "step": 19800 + }, + { + "epoch": 1.7031786941580758, + "grad_norm": 1.5156309604644775, + "learning_rate": 1.628026069069121e-05, + "loss": 0.0601, + "step": 19825 + }, + { + "epoch": 1.7053264604810998, + "grad_norm": 4.51410436630249, + "learning_rate": 1.6271505421195697e-05, + "loss": 0.0462, + "step": 19850 + }, + { + "epoch": 1.7074742268041239, + "grad_norm": 0.8334910273551941, + "learning_rate": 1.6262742220424286e-05, + "loss": 0.0525, + "step": 19875 + }, + { + "epoch": 1.709621993127148, + "grad_norm": 8.881254196166992, + "learning_rate": 1.6253971099459382e-05, + "loss": 0.0481, + "step": 19900 + }, + { + "epoch": 1.711769759450172, + "grad_norm": 7.049525737762451, + "learning_rate": 1.6245192069393404e-05, + "loss": 0.068, + "step": 19925 + }, + { + "epoch": 1.713917525773196, + "grad_norm": 0.26233991980552673, + "learning_rate": 1.623640514132878e-05, + "loss": 0.0435, + "step": 19950 + }, + { + "epoch": 1.71606529209622, + "grad_norm": 0.412545770406723, + "learning_rate": 1.622761032637792e-05, + "loss": 0.0654, + "step": 19975 + }, + { + "epoch": 1.718213058419244, + "grad_norm": 0.2939416170120239, + "learning_rate": 1.6218807635663204e-05, + "loss": 0.0329, + "step": 20000 + }, + { + "epoch": 1.7203608247422681, + "grad_norm": 0.07821674644947052, + "learning_rate": 1.6209997080316983e-05, + "loss": 0.0441, + "step": 20025 + }, + { + "epoch": 1.7225085910652922, + "grad_norm": 0.34942519664764404, + "learning_rate": 1.6201178671481554e-05, + "loss": 0.0178, + "step": 20050 + }, + { + "epoch": 1.7246563573883162, + "grad_norm": 9.938037872314453, + "learning_rate": 1.619235242030913e-05, + "loss": 0.0534, + "step": 20075 + }, + { + "epoch": 1.7268041237113403, + "grad_norm": 0.07277333736419678, + "learning_rate": 1.6183518337961864e-05, + "loss": 0.0326, + "step": 20100 + }, + { + "epoch": 1.7289518900343643, + "grad_norm": 0.07598915696144104, + "learning_rate": 1.6174676435611793e-05, + "loss": 0.0634, + "step": 20125 + }, + { + "epoch": 1.7310996563573884, + "grad_norm": 11.39462661743164, + "learning_rate": 1.6165826724440857e-05, + "loss": 0.0488, + "step": 20150 + }, + { + "epoch": 1.7332474226804124, + "grad_norm": 0.1691262125968933, + "learning_rate": 1.6156969215640866e-05, + "loss": 0.06, + "step": 20175 + }, + { + "epoch": 1.7353951890034365, + "grad_norm": 0.3956061601638794, + "learning_rate": 1.614810392041349e-05, + "loss": 0.0742, + "step": 20200 + }, + { + "epoch": 1.7375429553264605, + "grad_norm": 0.2363876849412918, + "learning_rate": 1.613923084997025e-05, + "loss": 0.0745, + "step": 20225 + }, + { + "epoch": 1.7396907216494846, + "grad_norm": 11.347530364990234, + "learning_rate": 1.6130350015532498e-05, + "loss": 0.0528, + "step": 20250 + }, + { + "epoch": 1.7418384879725086, + "grad_norm": 30.7435302734375, + "learning_rate": 1.6121461428331403e-05, + "loss": 0.0689, + "step": 20275 + }, + { + "epoch": 1.7439862542955327, + "grad_norm": 0.35632649064064026, + "learning_rate": 1.6112565099607937e-05, + "loss": 0.0473, + "step": 20300 + }, + { + "epoch": 1.7461340206185567, + "grad_norm": 0.29202038049697876, + "learning_rate": 1.6103661040612878e-05, + "loss": 0.0343, + "step": 20325 + }, + { + "epoch": 1.7482817869415808, + "grad_norm": 0.1018579751253128, + "learning_rate": 1.6094749262606754e-05, + "loss": 0.035, + "step": 20350 + }, + { + "epoch": 1.7504295532646048, + "grad_norm": 0.4335426092147827, + "learning_rate": 1.6085829776859873e-05, + "loss": 0.0481, + "step": 20375 + }, + { + "epoch": 1.7525773195876289, + "grad_norm": 0.14077405631542206, + "learning_rate": 1.607690259465229e-05, + "loss": 0.023, + "step": 20400 + }, + { + "epoch": 1.754725085910653, + "grad_norm": 7.874173164367676, + "learning_rate": 1.6067967727273787e-05, + "loss": 0.0409, + "step": 20425 + }, + { + "epoch": 1.756872852233677, + "grad_norm": 0.09678610414266586, + "learning_rate": 1.6059025186023866e-05, + "loss": 0.0379, + "step": 20450 + }, + { + "epoch": 1.759020618556701, + "grad_norm": 0.04254009574651718, + "learning_rate": 1.6050074982211738e-05, + "loss": 0.018, + "step": 20475 + }, + { + "epoch": 1.761168384879725, + "grad_norm": 0.8822915554046631, + "learning_rate": 1.6041117127156303e-05, + "loss": 0.0466, + "step": 20500 + }, + { + "epoch": 1.763316151202749, + "grad_norm": 4.516848087310791, + "learning_rate": 1.603215163218613e-05, + "loss": 0.0666, + "step": 20525 + }, + { + "epoch": 1.7654639175257731, + "grad_norm": 0.1640489548444748, + "learning_rate": 1.6023178508639462e-05, + "loss": 0.0471, + "step": 20550 + }, + { + "epoch": 1.7676116838487972, + "grad_norm": 0.5044551491737366, + "learning_rate": 1.601419776786418e-05, + "loss": 0.0465, + "step": 20575 + }, + { + "epoch": 1.7697594501718212, + "grad_norm": 0.5579286217689514, + "learning_rate": 1.6005209421217804e-05, + "loss": 0.0275, + "step": 20600 + }, + { + "epoch": 1.7719072164948453, + "grad_norm": 9.701484680175781, + "learning_rate": 1.599621348006747e-05, + "loss": 0.0293, + "step": 20625 + }, + { + "epoch": 1.7740549828178693, + "grad_norm": 0.31440287828445435, + "learning_rate": 1.5987209955789914e-05, + "loss": 0.0187, + "step": 20650 + }, + { + "epoch": 1.7762027491408934, + "grad_norm": 6.272521018981934, + "learning_rate": 1.5978198859771476e-05, + "loss": 0.0465, + "step": 20675 + }, + { + "epoch": 1.7783505154639174, + "grad_norm": 0.052252426743507385, + "learning_rate": 1.5969180203408052e-05, + "loss": 0.0384, + "step": 20700 + }, + { + "epoch": 1.7804982817869415, + "grad_norm": 4.904228210449219, + "learning_rate": 1.596015399810512e-05, + "loss": 0.0526, + "step": 20725 + }, + { + "epoch": 1.7826460481099655, + "grad_norm": 0.2864196300506592, + "learning_rate": 1.5951120255277684e-05, + "loss": 0.0446, + "step": 20750 + }, + { + "epoch": 1.7847938144329896, + "grad_norm": 15.869146347045898, + "learning_rate": 1.5942078986350295e-05, + "loss": 0.0572, + "step": 20775 + }, + { + "epoch": 1.7869415807560136, + "grad_norm": 0.1265736073255539, + "learning_rate": 1.593303020275702e-05, + "loss": 0.0368, + "step": 20800 + }, + { + "epoch": 1.7890893470790377, + "grad_norm": 0.04614259675145149, + "learning_rate": 1.5923973915941427e-05, + "loss": 0.0592, + "step": 20825 + }, + { + "epoch": 1.7912371134020617, + "grad_norm": 1.1712764501571655, + "learning_rate": 1.591491013735657e-05, + "loss": 0.0403, + "step": 20850 + }, + { + "epoch": 1.7933848797250858, + "grad_norm": 6.788266181945801, + "learning_rate": 1.5905838878464985e-05, + "loss": 0.0625, + "step": 20875 + }, + { + "epoch": 1.7955326460481098, + "grad_norm": 0.09173892438411713, + "learning_rate": 1.5896760150738658e-05, + "loss": 0.0162, + "step": 20900 + }, + { + "epoch": 1.797680412371134, + "grad_norm": 0.041052963584661484, + "learning_rate": 1.5887673965659027e-05, + "loss": 0.0358, + "step": 20925 + }, + { + "epoch": 1.7998281786941581, + "grad_norm": 0.0621948279440403, + "learning_rate": 1.5878580334716964e-05, + "loss": 0.0225, + "step": 20950 + }, + { + "epoch": 1.8019759450171822, + "grad_norm": 0.16536907851696014, + "learning_rate": 1.5869479269412748e-05, + "loss": 0.0419, + "step": 20975 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 0.03967837989330292, + "learning_rate": 1.586037078125607e-05, + "loss": 0.0235, + "step": 21000 + }, + { + "epoch": 1.8062714776632303, + "grad_norm": 0.1563262641429901, + "learning_rate": 1.5851254881766004e-05, + "loss": 0.0454, + "step": 21025 + }, + { + "epoch": 1.8084192439862543, + "grad_norm": 0.04460283741354942, + "learning_rate": 1.5842131582470992e-05, + "loss": 0.0342, + "step": 21050 + }, + { + "epoch": 1.8105670103092784, + "grad_norm": 0.900725781917572, + "learning_rate": 1.5833000894908837e-05, + "loss": 0.0373, + "step": 21075 + }, + { + "epoch": 1.8127147766323024, + "grad_norm": 18.849639892578125, + "learning_rate": 1.5823862830626694e-05, + "loss": 0.0424, + "step": 21100 + }, + { + "epoch": 1.8148625429553265, + "grad_norm": 3.786435127258301, + "learning_rate": 1.5814717401181038e-05, + "loss": 0.0314, + "step": 21125 + }, + { + "epoch": 1.8170103092783505, + "grad_norm": 0.5006939768791199, + "learning_rate": 1.580556461813766e-05, + "loss": 0.0667, + "step": 21150 + }, + { + "epoch": 1.8191580756013745, + "grad_norm": 0.07261355221271515, + "learning_rate": 1.579640449307165e-05, + "loss": 0.0353, + "step": 21175 + }, + { + "epoch": 1.8213058419243986, + "grad_norm": 0.33159881830215454, + "learning_rate": 1.5787237037567383e-05, + "loss": 0.0445, + "step": 21200 + }, + { + "epoch": 1.8234536082474226, + "grad_norm": 0.0883435532450676, + "learning_rate": 1.577806226321851e-05, + "loss": 0.0225, + "step": 21225 + }, + { + "epoch": 1.8256013745704467, + "grad_norm": 2.438499689102173, + "learning_rate": 1.576888018162793e-05, + "loss": 0.0509, + "step": 21250 + }, + { + "epoch": 1.8277491408934707, + "grad_norm": 0.22667260468006134, + "learning_rate": 1.575969080440779e-05, + "loss": 0.0117, + "step": 21275 + }, + { + "epoch": 1.829896907216495, + "grad_norm": 0.066948302090168, + "learning_rate": 1.5750494143179456e-05, + "loss": 0.0444, + "step": 21300 + }, + { + "epoch": 1.832044673539519, + "grad_norm": 9.642599105834961, + "learning_rate": 1.5741290209573512e-05, + "loss": 0.0328, + "step": 21325 + }, + { + "epoch": 1.834192439862543, + "grad_norm": 0.8451420068740845, + "learning_rate": 1.573207901522974e-05, + "loss": 0.0413, + "step": 21350 + }, + { + "epoch": 1.8363402061855671, + "grad_norm": 0.2618654668331146, + "learning_rate": 1.5722860571797098e-05, + "loss": 0.0234, + "step": 21375 + }, + { + "epoch": 1.8384879725085912, + "grad_norm": 5.036684989929199, + "learning_rate": 1.5713634890933714e-05, + "loss": 0.028, + "step": 21400 + }, + { + "epoch": 1.8406357388316152, + "grad_norm": 57.75002670288086, + "learning_rate": 1.5704401984306875e-05, + "loss": 0.0399, + "step": 21425 + }, + { + "epoch": 1.8427835051546393, + "grad_norm": 0.12449263036251068, + "learning_rate": 1.5695161863592992e-05, + "loss": 0.0281, + "step": 21450 + }, + { + "epoch": 1.8449312714776633, + "grad_norm": 0.03235558047890663, + "learning_rate": 1.5685914540477615e-05, + "loss": 0.049, + "step": 21475 + }, + { + "epoch": 1.8470790378006874, + "grad_norm": 0.17300717532634735, + "learning_rate": 1.5676660026655394e-05, + "loss": 0.0597, + "step": 21500 + }, + { + "epoch": 1.8492268041237114, + "grad_norm": 4.741662979125977, + "learning_rate": 1.5667398333830074e-05, + "loss": 0.0546, + "step": 21525 + }, + { + "epoch": 1.8513745704467355, + "grad_norm": 9.109816551208496, + "learning_rate": 1.565812947371448e-05, + "loss": 0.0508, + "step": 21550 + }, + { + "epoch": 1.8535223367697595, + "grad_norm": 0.03804530203342438, + "learning_rate": 1.5648853458030498e-05, + "loss": 0.0285, + "step": 21575 + }, + { + "epoch": 1.8556701030927836, + "grad_norm": 15.225668907165527, + "learning_rate": 1.5639570298509067e-05, + "loss": 0.0453, + "step": 21600 + }, + { + "epoch": 1.8578178694158076, + "grad_norm": 0.12421774864196777, + "learning_rate": 1.563028000689016e-05, + "loss": 0.0251, + "step": 21625 + }, + { + "epoch": 1.8599656357388317, + "grad_norm": 0.9082395434379578, + "learning_rate": 1.5620982594922768e-05, + "loss": 0.0463, + "step": 21650 + }, + { + "epoch": 1.8621134020618557, + "grad_norm": 12.958740234375, + "learning_rate": 1.5611678074364886e-05, + "loss": 0.0399, + "step": 21675 + }, + { + "epoch": 1.8642611683848798, + "grad_norm": 0.21332372725009918, + "learning_rate": 1.5602366456983506e-05, + "loss": 0.0495, + "step": 21700 + }, + { + "epoch": 1.8664089347079038, + "grad_norm": 0.6246588826179504, + "learning_rate": 1.5593047754554588e-05, + "loss": 0.0369, + "step": 21725 + }, + { + "epoch": 1.8685567010309279, + "grad_norm": 0.44248655438423157, + "learning_rate": 1.5583721978863046e-05, + "loss": 0.0822, + "step": 21750 + }, + { + "epoch": 1.870704467353952, + "grad_norm": 5.654780387878418, + "learning_rate": 1.5574389141702755e-05, + "loss": 0.0344, + "step": 21775 + }, + { + "epoch": 1.872852233676976, + "grad_norm": 0.033251844346523285, + "learning_rate": 1.5565049254876508e-05, + "loss": 0.0283, + "step": 21800 + }, + { + "epoch": 1.875, + "grad_norm": 0.15938566625118256, + "learning_rate": 1.5555702330196024e-05, + "loss": 0.0637, + "step": 21825 + }, + { + "epoch": 1.877147766323024, + "grad_norm": 0.11084859818220139, + "learning_rate": 1.554634837948191e-05, + "loss": 0.0587, + "step": 21850 + }, + { + "epoch": 1.879295532646048, + "grad_norm": 0.15624207258224487, + "learning_rate": 1.5536987414563662e-05, + "loss": 0.0324, + "step": 21875 + }, + { + "epoch": 1.8814432989690721, + "grad_norm": 3.5152292251586914, + "learning_rate": 1.5527619447279657e-05, + "loss": 0.0537, + "step": 21900 + }, + { + "epoch": 1.8835910652920962, + "grad_norm": 3.467630386352539, + "learning_rate": 1.551824448947711e-05, + "loss": 0.0794, + "step": 21925 + }, + { + "epoch": 1.8857388316151202, + "grad_norm": 3.7443084716796875, + "learning_rate": 1.5508862553012096e-05, + "loss": 0.034, + "step": 21950 + }, + { + "epoch": 1.8878865979381443, + "grad_norm": 0.14582639932632446, + "learning_rate": 1.54994736497495e-05, + "loss": 0.0497, + "step": 21975 + }, + { + "epoch": 1.8900343642611683, + "grad_norm": 14.627659797668457, + "learning_rate": 1.5490077791563023e-05, + "loss": 0.0244, + "step": 22000 + }, + { + "epoch": 1.8921821305841924, + "grad_norm": 0.6119852662086487, + "learning_rate": 1.5480674990335165e-05, + "loss": 0.0299, + "step": 22025 + }, + { + "epoch": 1.8943298969072164, + "grad_norm": 0.7965523600578308, + "learning_rate": 1.5471265257957202e-05, + "loss": 0.0769, + "step": 22050 + }, + { + "epoch": 1.8964776632302405, + "grad_norm": 12.196985244750977, + "learning_rate": 1.546184860632918e-05, + "loss": 0.062, + "step": 22075 + }, + { + "epoch": 1.8986254295532645, + "grad_norm": 6.000990390777588, + "learning_rate": 1.5452425047359882e-05, + "loss": 0.035, + "step": 22100 + }, + { + "epoch": 1.9007731958762886, + "grad_norm": 0.3372439444065094, + "learning_rate": 1.5442994592966846e-05, + "loss": 0.0366, + "step": 22125 + }, + { + "epoch": 1.9029209621993126, + "grad_norm": 3.572455883026123, + "learning_rate": 1.5433557255076318e-05, + "loss": 0.0565, + "step": 22150 + }, + { + "epoch": 1.9050687285223367, + "grad_norm": 5.208206653594971, + "learning_rate": 1.5424113045623257e-05, + "loss": 0.0531, + "step": 22175 + }, + { + "epoch": 1.9072164948453607, + "grad_norm": 0.2095174789428711, + "learning_rate": 1.54146619765513e-05, + "loss": 0.0516, + "step": 22200 + }, + { + "epoch": 1.9093642611683848, + "grad_norm": 0.8063966631889343, + "learning_rate": 1.5405204059812775e-05, + "loss": 0.0773, + "step": 22225 + }, + { + "epoch": 1.9115120274914088, + "grad_norm": 2.273289203643799, + "learning_rate": 1.5395739307368652e-05, + "loss": 0.0295, + "step": 22250 + }, + { + "epoch": 1.9136597938144329, + "grad_norm": 2.8430917263031006, + "learning_rate": 1.538626773118856e-05, + "loss": 0.0347, + "step": 22275 + }, + { + "epoch": 1.915807560137457, + "grad_norm": 7.238885879516602, + "learning_rate": 1.5376789343250752e-05, + "loss": 0.0482, + "step": 22300 + }, + { + "epoch": 1.917955326460481, + "grad_norm": 58.79974365234375, + "learning_rate": 1.5367304155542102e-05, + "loss": 0.0388, + "step": 22325 + }, + { + "epoch": 1.920103092783505, + "grad_norm": 30.069286346435547, + "learning_rate": 1.5357812180058066e-05, + "loss": 0.0405, + "step": 22350 + }, + { + "epoch": 1.9222508591065293, + "grad_norm": 0.07702163606882095, + "learning_rate": 1.5348313428802707e-05, + "loss": 0.0264, + "step": 22375 + }, + { + "epoch": 1.9243986254295533, + "grad_norm": 0.051251109689474106, + "learning_rate": 1.533880791378864e-05, + "loss": 0.0512, + "step": 22400 + }, + { + "epoch": 1.9265463917525774, + "grad_norm": 8.51008129119873, + "learning_rate": 1.5329295647037036e-05, + "loss": 0.0507, + "step": 22425 + }, + { + "epoch": 1.9286941580756014, + "grad_norm": 20.708614349365234, + "learning_rate": 1.5319776640577617e-05, + "loss": 0.0617, + "step": 22450 + }, + { + "epoch": 1.9308419243986255, + "grad_norm": 0.23668460547924042, + "learning_rate": 1.5310250906448617e-05, + "loss": 0.0508, + "step": 22475 + }, + { + "epoch": 1.9329896907216495, + "grad_norm": 0.07346130162477493, + "learning_rate": 1.530071845669678e-05, + "loss": 0.0345, + "step": 22500 + }, + { + "epoch": 1.9351374570446735, + "grad_norm": 0.23996694386005402, + "learning_rate": 1.5291179303377345e-05, + "loss": 0.0355, + "step": 22525 + }, + { + "epoch": 1.9372852233676976, + "grad_norm": 0.053811319172382355, + "learning_rate": 1.5281633458554022e-05, + "loss": 0.0669, + "step": 22550 + }, + { + "epoch": 1.9394329896907216, + "grad_norm": 0.6679021120071411, + "learning_rate": 1.5272080934298995e-05, + "loss": 0.0292, + "step": 22575 + }, + { + "epoch": 1.9415807560137457, + "grad_norm": 0.46241459250450134, + "learning_rate": 1.5262521742692883e-05, + "loss": 0.0364, + "step": 22600 + }, + { + "epoch": 1.9437285223367697, + "grad_norm": 0.049090396612882614, + "learning_rate": 1.5252955895824752e-05, + "loss": 0.0624, + "step": 22625 + }, + { + "epoch": 1.9458762886597938, + "grad_norm": 5.0672454833984375, + "learning_rate": 1.5243383405792068e-05, + "loss": 0.0453, + "step": 22650 + }, + { + "epoch": 1.9480240549828178, + "grad_norm": 62.40793228149414, + "learning_rate": 1.5233804284700707e-05, + "loss": 0.0326, + "step": 22675 + }, + { + "epoch": 1.9501718213058419, + "grad_norm": 1.795426845550537, + "learning_rate": 1.5224218544664933e-05, + "loss": 0.0471, + "step": 22700 + }, + { + "epoch": 1.952319587628866, + "grad_norm": 10.246925354003906, + "learning_rate": 1.5214626197807373e-05, + "loss": 0.0562, + "step": 22725 + }, + { + "epoch": 1.9544673539518902, + "grad_norm": 0.1521003544330597, + "learning_rate": 1.5205027256259017e-05, + "loss": 0.0419, + "step": 22750 + }, + { + "epoch": 1.9566151202749142, + "grad_norm": 19.455888748168945, + "learning_rate": 1.5195421732159192e-05, + "loss": 0.0566, + "step": 22775 + }, + { + "epoch": 1.9587628865979383, + "grad_norm": 0.8177959322929382, + "learning_rate": 1.5185809637655548e-05, + "loss": 0.0248, + "step": 22800 + }, + { + "epoch": 1.9609106529209623, + "grad_norm": 0.8234220147132874, + "learning_rate": 1.5176190984904048e-05, + "loss": 0.0302, + "step": 22825 + }, + { + "epoch": 1.9630584192439864, + "grad_norm": 1.4468770027160645, + "learning_rate": 1.516656578606894e-05, + "loss": 0.0425, + "step": 22850 + }, + { + "epoch": 1.9652061855670104, + "grad_norm": 0.09610499441623688, + "learning_rate": 1.5156934053322766e-05, + "loss": 0.0426, + "step": 22875 + }, + { + "epoch": 1.9673539518900345, + "grad_norm": 5.533576011657715, + "learning_rate": 1.5147295798846315e-05, + "loss": 0.0434, + "step": 22900 + }, + { + "epoch": 1.9695017182130585, + "grad_norm": 0.628807008266449, + "learning_rate": 1.5137651034828636e-05, + "loss": 0.0299, + "step": 22925 + }, + { + "epoch": 1.9716494845360826, + "grad_norm": 12.579349517822266, + "learning_rate": 1.5127999773467001e-05, + "loss": 0.0265, + "step": 22950 + }, + { + "epoch": 1.9737972508591066, + "grad_norm": 0.10457532107830048, + "learning_rate": 1.5118342026966905e-05, + "loss": 0.044, + "step": 22975 + }, + { + "epoch": 1.9759450171821307, + "grad_norm": 0.2482430636882782, + "learning_rate": 1.5108677807542047e-05, + "loss": 0.0706, + "step": 23000 + }, + { + "epoch": 1.9780927835051547, + "grad_norm": 0.061811573803424835, + "learning_rate": 1.5099007127414302e-05, + "loss": 0.0383, + "step": 23025 + }, + { + "epoch": 1.9802405498281788, + "grad_norm": 8.656332015991211, + "learning_rate": 1.5089329998813722e-05, + "loss": 0.0173, + "step": 23050 + }, + { + "epoch": 1.9823883161512028, + "grad_norm": 0.21635934710502625, + "learning_rate": 1.5079646433978514e-05, + "loss": 0.0284, + "step": 23075 + }, + { + "epoch": 1.9845360824742269, + "grad_norm": 0.1174037978053093, + "learning_rate": 1.5069956445155027e-05, + "loss": 0.03, + "step": 23100 + }, + { + "epoch": 1.986683848797251, + "grad_norm": 13.901311874389648, + "learning_rate": 1.5060260044597726e-05, + "loss": 0.067, + "step": 23125 + }, + { + "epoch": 1.988831615120275, + "grad_norm": 8.303023338317871, + "learning_rate": 1.505055724456919e-05, + "loss": 0.0443, + "step": 23150 + }, + { + "epoch": 1.990979381443299, + "grad_norm": 1.8508024215698242, + "learning_rate": 1.5040848057340097e-05, + "loss": 0.0337, + "step": 23175 + }, + { + "epoch": 1.993127147766323, + "grad_norm": 0.05789276957511902, + "learning_rate": 1.5031132495189186e-05, + "loss": 0.0299, + "step": 23200 + }, + { + "epoch": 1.995274914089347, + "grad_norm": 0.16750355064868927, + "learning_rate": 1.5021410570403277e-05, + "loss": 0.035, + "step": 23225 + }, + { + "epoch": 1.9974226804123711, + "grad_norm": 0.0412740558385849, + "learning_rate": 1.5011682295277223e-05, + "loss": 0.0544, + "step": 23250 + }, + { + "epoch": 1.9995704467353952, + "grad_norm": 10.323492050170898, + "learning_rate": 1.5001947682113916e-05, + "loss": 0.0266, + "step": 23275 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.981488090989126, + "eval_auc": 0.9953790023764837, + "eval_f1": 0.9875742669136907, + "eval_loss": 0.08581268042325974, + "eval_precision": 0.9817413946399085, + "eval_recall": 0.9934768637532133, + "eval_runtime": 4444.4296, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 0.148, + "step": 23280 + } + ], + "logging_steps": 25, + "max_steps": 69840, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.01 + }, + "attributes": { + "early_stopping_patience_counter": 1 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.715332238091265e+20, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}