{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 399837, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007503057495929591, "grad_norm": 10.402915000915527, "learning_rate": 4.998812015896478e-05, "loss": 3.3178, "step": 100 }, { "epoch": 0.0015006114991859183, "grad_norm": 7.5047783851623535, "learning_rate": 4.997561506313823e-05, "loss": 3.111, "step": 200 }, { "epoch": 0.0022509172487788776, "grad_norm": 4.5860748291015625, "learning_rate": 4.996310996731168e-05, "loss": 3.3194, "step": 300 }, { "epoch": 0.0030012229983718366, "grad_norm": 3.7194831371307373, "learning_rate": 4.9950604871485135e-05, "loss": 3.2496, "step": 400 }, { "epoch": 0.0037515287479647956, "grad_norm": 2.8906636238098145, "learning_rate": 4.993809977565858e-05, "loss": 3.2479, "step": 500 }, { "epoch": 0.004501834497557755, "grad_norm": 4.07413911819458, "learning_rate": 4.992559467983203e-05, "loss": 3.2026, "step": 600 }, { "epoch": 0.005252140247150714, "grad_norm": 4.367282390594482, "learning_rate": 4.9913089584005485e-05, "loss": 3.1183, "step": 700 }, { "epoch": 0.006002445996743673, "grad_norm": 3.052788019180298, "learning_rate": 4.990058448817894e-05, "loss": 3.1302, "step": 800 }, { "epoch": 0.006752751746336632, "grad_norm": 2.675797700881958, "learning_rate": 4.988807939235239e-05, "loss": 3.0736, "step": 900 }, { "epoch": 0.007503057495929591, "grad_norm": 3.203097343444824, "learning_rate": 4.9875574296525834e-05, "loss": 3.227, "step": 1000 }, { "epoch": 0.00825336324552255, "grad_norm": 2.8115711212158203, "learning_rate": 4.986306920069929e-05, "loss": 3.1167, "step": 1100 }, { "epoch": 0.00900366899511551, "grad_norm": 2.5919222831726074, "learning_rate": 4.9850564104872736e-05, "loss": 3.2433, "step": 1200 }, { "epoch": 0.009753974744708469, "grad_norm": 6.2064290046691895, "learning_rate": 4.983805900904619e-05, "loss": 3.1953, "step": 1300 }, { "epoch": 0.010504280494301427, "grad_norm": 2.5879666805267334, "learning_rate": 4.982555391321964e-05, "loss": 3.1496, "step": 1400 }, { "epoch": 0.011254586243894388, "grad_norm": 3.7193636894226074, "learning_rate": 4.9813048817393086e-05, "loss": 3.2032, "step": 1500 }, { "epoch": 0.012004891993487346, "grad_norm": 5.175328254699707, "learning_rate": 4.980054372156654e-05, "loss": 3.2157, "step": 1600 }, { "epoch": 0.012755197743080305, "grad_norm": 2.189391613006592, "learning_rate": 4.978803862573999e-05, "loss": 3.196, "step": 1700 }, { "epoch": 0.013505503492673264, "grad_norm": 2.800657272338867, "learning_rate": 4.977553352991344e-05, "loss": 3.1016, "step": 1800 }, { "epoch": 0.014255809242266224, "grad_norm": 3.537086009979248, "learning_rate": 4.97630284340869e-05, "loss": 3.1066, "step": 1900 }, { "epoch": 0.015006114991859183, "grad_norm": 2.8318355083465576, "learning_rate": 4.975052333826034e-05, "loss": 3.0605, "step": 2000 }, { "epoch": 0.015756420741452143, "grad_norm": 2.020409107208252, "learning_rate": 4.973801824243379e-05, "loss": 3.1811, "step": 2100 }, { "epoch": 0.0165067264910451, "grad_norm": 4.791799545288086, "learning_rate": 4.972551314660725e-05, "loss": 3.0946, "step": 2200 }, { "epoch": 0.01725703224063806, "grad_norm": 2.4155328273773193, "learning_rate": 4.9713008050780694e-05, "loss": 3.0876, "step": 2300 }, { "epoch": 0.01800733799023102, "grad_norm": 5.003592014312744, "learning_rate": 4.970050295495415e-05, "loss": 3.0397, "step": 2400 }, { "epoch": 0.018757643739823977, "grad_norm": 2.605706214904785, "learning_rate": 4.9687997859127596e-05, "loss": 2.8665, "step": 2500 }, { "epoch": 0.019507949489416938, "grad_norm": 2.2089078426361084, "learning_rate": 4.9675492763301044e-05, "loss": 3.1985, "step": 2600 }, { "epoch": 0.020258255239009898, "grad_norm": 2.4101600646972656, "learning_rate": 4.96629876674745e-05, "loss": 3.1812, "step": 2700 }, { "epoch": 0.021008560988602855, "grad_norm": 2.5130929946899414, "learning_rate": 4.965048257164795e-05, "loss": 3.043, "step": 2800 }, { "epoch": 0.021758866738195815, "grad_norm": 6.818473815917969, "learning_rate": 4.96379774758214e-05, "loss": 3.011, "step": 2900 }, { "epoch": 0.022509172487788776, "grad_norm": 2.525233507156372, "learning_rate": 4.9625472379994855e-05, "loss": 3.0814, "step": 3000 }, { "epoch": 0.023259478237381732, "grad_norm": 2.93475079536438, "learning_rate": 4.96129672841683e-05, "loss": 3.1275, "step": 3100 }, { "epoch": 0.024009783986974693, "grad_norm": 2.83451771736145, "learning_rate": 4.960046218834175e-05, "loss": 3.1759, "step": 3200 }, { "epoch": 0.02476008973656765, "grad_norm": 3.296314001083374, "learning_rate": 4.9587957092515204e-05, "loss": 2.9299, "step": 3300 }, { "epoch": 0.02551039548616061, "grad_norm": 2.238349199295044, "learning_rate": 4.957545199668865e-05, "loss": 3.2333, "step": 3400 }, { "epoch": 0.02626070123575357, "grad_norm": 2.2563164234161377, "learning_rate": 4.9562946900862106e-05, "loss": 3.1259, "step": 3500 }, { "epoch": 0.027011006985346527, "grad_norm": 1.969322919845581, "learning_rate": 4.9550441805035554e-05, "loss": 3.0813, "step": 3600 }, { "epoch": 0.027761312734939488, "grad_norm": 1.8468555212020874, "learning_rate": 4.9537936709209e-05, "loss": 3.0887, "step": 3700 }, { "epoch": 0.028511618484532448, "grad_norm": 2.5109381675720215, "learning_rate": 4.9525431613382456e-05, "loss": 3.0722, "step": 3800 }, { "epoch": 0.029261924234125405, "grad_norm": 2.1091690063476562, "learning_rate": 4.951292651755591e-05, "loss": 2.9382, "step": 3900 }, { "epoch": 0.030012229983718365, "grad_norm": 1.8125531673431396, "learning_rate": 4.950042142172936e-05, "loss": 2.9795, "step": 4000 }, { "epoch": 0.030762535733311325, "grad_norm": 6.980484485626221, "learning_rate": 4.9487916325902806e-05, "loss": 3.0245, "step": 4100 }, { "epoch": 0.031512841482904286, "grad_norm": 2.771277904510498, "learning_rate": 4.947541123007626e-05, "loss": 2.8941, "step": 4200 }, { "epoch": 0.03226314723249724, "grad_norm": 2.3706371784210205, "learning_rate": 4.946290613424971e-05, "loss": 3.0453, "step": 4300 }, { "epoch": 0.0330134529820902, "grad_norm": 2.156803846359253, "learning_rate": 4.945040103842316e-05, "loss": 2.9344, "step": 4400 }, { "epoch": 0.03376375873168316, "grad_norm": 2.5668785572052, "learning_rate": 4.943789594259661e-05, "loss": 2.9789, "step": 4500 }, { "epoch": 0.03451406448127612, "grad_norm": 2.4201319217681885, "learning_rate": 4.942539084677006e-05, "loss": 3.1641, "step": 4600 }, { "epoch": 0.03526437023086908, "grad_norm": 3.477221965789795, "learning_rate": 4.941288575094351e-05, "loss": 2.9907, "step": 4700 }, { "epoch": 0.03601467598046204, "grad_norm": 2.9683077335357666, "learning_rate": 4.940038065511696e-05, "loss": 3.1362, "step": 4800 }, { "epoch": 0.036764981730055, "grad_norm": 2.132108688354492, "learning_rate": 4.9387875559290414e-05, "loss": 3.0652, "step": 4900 }, { "epoch": 0.037515287479647955, "grad_norm": 3.713635206222534, "learning_rate": 4.937537046346387e-05, "loss": 2.9184, "step": 5000 }, { "epoch": 0.03826559322924092, "grad_norm": 3.3507375717163086, "learning_rate": 4.936286536763731e-05, "loss": 3.0213, "step": 5100 }, { "epoch": 0.039015898978833875, "grad_norm": 3.6174747943878174, "learning_rate": 4.9350360271810764e-05, "loss": 2.9949, "step": 5200 }, { "epoch": 0.03976620472842683, "grad_norm": 1.8211045265197754, "learning_rate": 4.933785517598422e-05, "loss": 3.0683, "step": 5300 }, { "epoch": 0.040516510478019796, "grad_norm": 2.4387869834899902, "learning_rate": 4.9325350080157666e-05, "loss": 3.0203, "step": 5400 }, { "epoch": 0.04126681622761275, "grad_norm": 2.127376079559326, "learning_rate": 4.931284498433112e-05, "loss": 2.9752, "step": 5500 }, { "epoch": 0.04201712197720571, "grad_norm": 2.8264763355255127, "learning_rate": 4.930033988850457e-05, "loss": 3.027, "step": 5600 }, { "epoch": 0.042767427726798674, "grad_norm": 2.228292465209961, "learning_rate": 4.9287834792678015e-05, "loss": 3.138, "step": 5700 }, { "epoch": 0.04351773347639163, "grad_norm": 2.9660837650299072, "learning_rate": 4.927532969685147e-05, "loss": 2.9266, "step": 5800 }, { "epoch": 0.04426803922598459, "grad_norm": 6.914978504180908, "learning_rate": 4.926282460102492e-05, "loss": 2.9647, "step": 5900 }, { "epoch": 0.04501834497557755, "grad_norm": 2.5028915405273438, "learning_rate": 4.925031950519837e-05, "loss": 2.9895, "step": 6000 }, { "epoch": 0.04576865072517051, "grad_norm": 3.3082127571105957, "learning_rate": 4.9237814409371826e-05, "loss": 2.8908, "step": 6100 }, { "epoch": 0.046518956474763465, "grad_norm": 2.9629955291748047, "learning_rate": 4.922530931354527e-05, "loss": 3.0651, "step": 6200 }, { "epoch": 0.04726926222435643, "grad_norm": 1.8667373657226562, "learning_rate": 4.921292926867699e-05, "loss": 2.9494, "step": 6300 }, { "epoch": 0.048019567973949386, "grad_norm": 2.952314853668213, "learning_rate": 4.920042417285044e-05, "loss": 2.9516, "step": 6400 }, { "epoch": 0.04876987372354234, "grad_norm": 1.977097749710083, "learning_rate": 4.918791907702389e-05, "loss": 2.8398, "step": 6500 }, { "epoch": 0.0495201794731353, "grad_norm": 4.394473075866699, "learning_rate": 4.917541398119734e-05, "loss": 2.942, "step": 6600 }, { "epoch": 0.05027048522272826, "grad_norm": 1.7400914430618286, "learning_rate": 4.916290888537079e-05, "loss": 2.8982, "step": 6700 }, { "epoch": 0.05102079097232122, "grad_norm": 2.0394186973571777, "learning_rate": 4.915040378954424e-05, "loss": 3.1078, "step": 6800 }, { "epoch": 0.05177109672191418, "grad_norm": 3.9248790740966797, "learning_rate": 4.913789869371769e-05, "loss": 2.9153, "step": 6900 }, { "epoch": 0.05252140247150714, "grad_norm": 7.07273530960083, "learning_rate": 4.9125393597891145e-05, "loss": 2.9634, "step": 7000 }, { "epoch": 0.0532717082211001, "grad_norm": 2.593172788619995, "learning_rate": 4.911288850206459e-05, "loss": 3.1605, "step": 7100 }, { "epoch": 0.054022013970693054, "grad_norm": 2.8512721061706543, "learning_rate": 4.910038340623804e-05, "loss": 3.032, "step": 7200 }, { "epoch": 0.05477231972028602, "grad_norm": 2.6492693424224854, "learning_rate": 4.9087878310411495e-05, "loss": 2.9714, "step": 7300 }, { "epoch": 0.055522625469878975, "grad_norm": 2.5775938034057617, "learning_rate": 4.907537321458495e-05, "loss": 3.0607, "step": 7400 }, { "epoch": 0.05627293121947193, "grad_norm": 3.0690536499023438, "learning_rate": 4.9062868118758397e-05, "loss": 3.0345, "step": 7500 }, { "epoch": 0.057023236969064896, "grad_norm": 3.9141719341278076, "learning_rate": 4.9050363022931844e-05, "loss": 2.9969, "step": 7600 }, { "epoch": 0.05777354271865785, "grad_norm": 1.906204342842102, "learning_rate": 4.90378579271053e-05, "loss": 2.8301, "step": 7700 }, { "epoch": 0.05852384846825081, "grad_norm": 2.5034420490264893, "learning_rate": 4.9025352831278746e-05, "loss": 2.8877, "step": 7800 }, { "epoch": 0.05927415421784377, "grad_norm": 2.7920846939086914, "learning_rate": 4.90128477354522e-05, "loss": 2.9891, "step": 7900 }, { "epoch": 0.06002445996743673, "grad_norm": 2.8211658000946045, "learning_rate": 4.900034263962565e-05, "loss": 2.9118, "step": 8000 }, { "epoch": 0.06077476571702969, "grad_norm": 2.4786484241485596, "learning_rate": 4.8987837543799096e-05, "loss": 2.9618, "step": 8100 }, { "epoch": 0.06152507146662265, "grad_norm": 2.273275375366211, "learning_rate": 4.897533244797255e-05, "loss": 2.9389, "step": 8200 }, { "epoch": 0.06227537721621561, "grad_norm": 2.394374370574951, "learning_rate": 4.8962827352146e-05, "loss": 2.9563, "step": 8300 }, { "epoch": 0.06302568296580857, "grad_norm": 2.303840160369873, "learning_rate": 4.895032225631945e-05, "loss": 2.992, "step": 8400 }, { "epoch": 0.06377598871540152, "grad_norm": 2.67393159866333, "learning_rate": 4.893781716049291e-05, "loss": 3.1233, "step": 8500 }, { "epoch": 0.06452629446499449, "grad_norm": 2.979414939880371, "learning_rate": 4.8925312064666354e-05, "loss": 3.0034, "step": 8600 }, { "epoch": 0.06527660021458745, "grad_norm": 2.2907655239105225, "learning_rate": 4.89128069688398e-05, "loss": 2.9943, "step": 8700 }, { "epoch": 0.0660269059641804, "grad_norm": 2.034583806991577, "learning_rate": 4.8900301873013256e-05, "loss": 2.91, "step": 8800 }, { "epoch": 0.06677721171377336, "grad_norm": 1.8067829608917236, "learning_rate": 4.8887796777186704e-05, "loss": 3.0586, "step": 8900 }, { "epoch": 0.06752751746336633, "grad_norm": 2.7633233070373535, "learning_rate": 4.887529168136016e-05, "loss": 3.0529, "step": 9000 }, { "epoch": 0.06827782321295928, "grad_norm": 2.3206777572631836, "learning_rate": 4.886278658553361e-05, "loss": 3.0264, "step": 9100 }, { "epoch": 0.06902812896255224, "grad_norm": 3.205404281616211, "learning_rate": 4.8850281489707054e-05, "loss": 2.8279, "step": 9200 }, { "epoch": 0.0697784347121452, "grad_norm": 2.3460378646850586, "learning_rate": 4.883777639388051e-05, "loss": 2.9058, "step": 9300 }, { "epoch": 0.07052874046173815, "grad_norm": 2.8569111824035645, "learning_rate": 4.882527129805396e-05, "loss": 2.907, "step": 9400 }, { "epoch": 0.07127904621133112, "grad_norm": 2.6105356216430664, "learning_rate": 4.881289125318568e-05, "loss": 3.0738, "step": 9500 }, { "epoch": 0.07202935196092408, "grad_norm": 2.409979820251465, "learning_rate": 4.880038615735912e-05, "loss": 2.9836, "step": 9600 }, { "epoch": 0.07277965771051703, "grad_norm": 3.1170191764831543, "learning_rate": 4.8787881061532575e-05, "loss": 2.8182, "step": 9700 }, { "epoch": 0.07352996346011, "grad_norm": 2.849836587905884, "learning_rate": 4.877537596570603e-05, "loss": 2.9784, "step": 9800 }, { "epoch": 0.07428026920970296, "grad_norm": 2.1785635948181152, "learning_rate": 4.876287086987948e-05, "loss": 2.9754, "step": 9900 }, { "epoch": 0.07503057495929591, "grad_norm": 1.8046019077301025, "learning_rate": 4.875036577405293e-05, "loss": 2.9997, "step": 10000 }, { "epoch": 0.07578088070888887, "grad_norm": 2.655799627304077, "learning_rate": 4.873786067822638e-05, "loss": 2.9235, "step": 10100 }, { "epoch": 0.07653118645848184, "grad_norm": 2.076716661453247, "learning_rate": 4.872535558239983e-05, "loss": 2.9307, "step": 10200 }, { "epoch": 0.07728149220807479, "grad_norm": 2.1874351501464844, "learning_rate": 4.871285048657328e-05, "loss": 2.9553, "step": 10300 }, { "epoch": 0.07803179795766775, "grad_norm": 1.9707907438278198, "learning_rate": 4.870034539074673e-05, "loss": 2.8947, "step": 10400 }, { "epoch": 0.07878210370726071, "grad_norm": 2.359984874725342, "learning_rate": 4.868784029492018e-05, "loss": 2.9735, "step": 10500 }, { "epoch": 0.07953240945685366, "grad_norm": 2.6311581134796143, "learning_rate": 4.867533519909364e-05, "loss": 2.8847, "step": 10600 }, { "epoch": 0.08028271520644663, "grad_norm": 1.9136499166488647, "learning_rate": 4.866283010326708e-05, "loss": 2.9145, "step": 10700 }, { "epoch": 0.08103302095603959, "grad_norm": 2.418715715408325, "learning_rate": 4.865032500744053e-05, "loss": 2.9215, "step": 10800 }, { "epoch": 0.08178332670563254, "grad_norm": 2.053532838821411, "learning_rate": 4.863781991161399e-05, "loss": 2.9606, "step": 10900 }, { "epoch": 0.0825336324552255, "grad_norm": 2.552037000656128, "learning_rate": 4.8625314815787435e-05, "loss": 2.8001, "step": 11000 }, { "epoch": 0.08328393820481847, "grad_norm": 1.680416226387024, "learning_rate": 4.861280971996089e-05, "loss": 2.8042, "step": 11100 }, { "epoch": 0.08403424395441142, "grad_norm": 1.8076189756393433, "learning_rate": 4.860030462413434e-05, "loss": 3.0132, "step": 11200 }, { "epoch": 0.08478454970400438, "grad_norm": 3.547727346420288, "learning_rate": 4.8587799528307785e-05, "loss": 3.0526, "step": 11300 }, { "epoch": 0.08553485545359735, "grad_norm": 2.7237260341644287, "learning_rate": 4.857529443248124e-05, "loss": 2.948, "step": 11400 }, { "epoch": 0.0862851612031903, "grad_norm": 2.0759477615356445, "learning_rate": 4.8562789336654694e-05, "loss": 3.0402, "step": 11500 }, { "epoch": 0.08703546695278326, "grad_norm": 2.7819414138793945, "learning_rate": 4.855028424082814e-05, "loss": 2.9766, "step": 11600 }, { "epoch": 0.08778577270237622, "grad_norm": 3.2905097007751465, "learning_rate": 4.853777914500159e-05, "loss": 2.9428, "step": 11700 }, { "epoch": 0.08853607845196917, "grad_norm": 3.616520643234253, "learning_rate": 4.852527404917504e-05, "loss": 2.9433, "step": 11800 }, { "epoch": 0.08928638420156214, "grad_norm": 2.620955228805542, "learning_rate": 4.851276895334849e-05, "loss": 3.1618, "step": 11900 }, { "epoch": 0.0900366899511551, "grad_norm": 2.2513437271118164, "learning_rate": 4.850038890848021e-05, "loss": 2.9067, "step": 12000 }, { "epoch": 0.09078699570074805, "grad_norm": 1.9781179428100586, "learning_rate": 4.8487883812653656e-05, "loss": 2.9172, "step": 12100 }, { "epoch": 0.09153730145034102, "grad_norm": 2.5822603702545166, "learning_rate": 4.847537871682711e-05, "loss": 2.9811, "step": 12200 }, { "epoch": 0.09228760719993398, "grad_norm": 4.067698001861572, "learning_rate": 4.846287362100056e-05, "loss": 2.9539, "step": 12300 }, { "epoch": 0.09303791294952693, "grad_norm": 2.351558208465576, "learning_rate": 4.845036852517401e-05, "loss": 3.1162, "step": 12400 }, { "epoch": 0.0937882186991199, "grad_norm": 1.6981570720672607, "learning_rate": 4.843786342934746e-05, "loss": 2.9616, "step": 12500 }, { "epoch": 0.09453852444871286, "grad_norm": 2.2095375061035156, "learning_rate": 4.842535833352091e-05, "loss": 2.9713, "step": 12600 }, { "epoch": 0.09528883019830581, "grad_norm": 2.1840851306915283, "learning_rate": 4.841285323769436e-05, "loss": 2.9943, "step": 12700 }, { "epoch": 0.09603913594789877, "grad_norm": 2.306711435317993, "learning_rate": 4.840034814186781e-05, "loss": 2.9587, "step": 12800 }, { "epoch": 0.09678944169749172, "grad_norm": 1.5903010368347168, "learning_rate": 4.8387843046041264e-05, "loss": 2.9736, "step": 12900 }, { "epoch": 0.09753974744708468, "grad_norm": 2.0420937538146973, "learning_rate": 4.837533795021472e-05, "loss": 2.8452, "step": 13000 }, { "epoch": 0.09829005319667765, "grad_norm": 1.504791259765625, "learning_rate": 4.8362832854388166e-05, "loss": 2.945, "step": 13100 }, { "epoch": 0.0990403589462706, "grad_norm": 1.6954783201217651, "learning_rate": 4.8350327758561614e-05, "loss": 2.9572, "step": 13200 }, { "epoch": 0.09979066469586356, "grad_norm": 2.0318634510040283, "learning_rate": 4.833782266273507e-05, "loss": 2.938, "step": 13300 }, { "epoch": 0.10054097044545653, "grad_norm": 2.6904966831207275, "learning_rate": 4.8325317566908516e-05, "loss": 3.0028, "step": 13400 }, { "epoch": 0.10129127619504948, "grad_norm": 2.238069534301758, "learning_rate": 4.831281247108197e-05, "loss": 2.9184, "step": 13500 }, { "epoch": 0.10204158194464244, "grad_norm": 1.7648524045944214, "learning_rate": 4.8300307375255425e-05, "loss": 2.993, "step": 13600 }, { "epoch": 0.1027918876942354, "grad_norm": 3.7921173572540283, "learning_rate": 4.8287802279428865e-05, "loss": 2.9377, "step": 13700 }, { "epoch": 0.10354219344382835, "grad_norm": 2.157898426055908, "learning_rate": 4.827529718360232e-05, "loss": 3.1192, "step": 13800 }, { "epoch": 0.10429249919342132, "grad_norm": 2.2195377349853516, "learning_rate": 4.8262792087775774e-05, "loss": 2.944, "step": 13900 }, { "epoch": 0.10504280494301428, "grad_norm": 1.9699676036834717, "learning_rate": 4.825028699194922e-05, "loss": 2.7591, "step": 14000 }, { "epoch": 0.10579311069260723, "grad_norm": 1.8396100997924805, "learning_rate": 4.8237781896122676e-05, "loss": 2.9027, "step": 14100 }, { "epoch": 0.1065434164422002, "grad_norm": 2.6320276260375977, "learning_rate": 4.8225276800296124e-05, "loss": 2.9229, "step": 14200 }, { "epoch": 0.10729372219179316, "grad_norm": 2.0953328609466553, "learning_rate": 4.821277170446957e-05, "loss": 2.7714, "step": 14300 }, { "epoch": 0.10804402794138611, "grad_norm": 1.947546124458313, "learning_rate": 4.8200266608643026e-05, "loss": 2.8746, "step": 14400 }, { "epoch": 0.10879433369097907, "grad_norm": 2.762054204940796, "learning_rate": 4.8187761512816473e-05, "loss": 2.9237, "step": 14500 }, { "epoch": 0.10954463944057204, "grad_norm": 2.008850336074829, "learning_rate": 4.817525641698993e-05, "loss": 2.9179, "step": 14600 }, { "epoch": 0.11029494519016499, "grad_norm": 3.4574875831604004, "learning_rate": 4.8162751321163376e-05, "loss": 2.8509, "step": 14700 }, { "epoch": 0.11104525093975795, "grad_norm": 1.6569924354553223, "learning_rate": 4.815024622533682e-05, "loss": 3.0249, "step": 14800 }, { "epoch": 0.11179555668935091, "grad_norm": 2.92584490776062, "learning_rate": 4.813774112951028e-05, "loss": 2.9016, "step": 14900 }, { "epoch": 0.11254586243894386, "grad_norm": 2.0413055419921875, "learning_rate": 4.812523603368373e-05, "loss": 2.8893, "step": 15000 }, { "epoch": 0.11329616818853683, "grad_norm": 2.127453327178955, "learning_rate": 4.811273093785718e-05, "loss": 2.9288, "step": 15100 }, { "epoch": 0.11404647393812979, "grad_norm": 3.2836432456970215, "learning_rate": 4.810022584203063e-05, "loss": 2.8565, "step": 15200 }, { "epoch": 0.11479677968772274, "grad_norm": 3.2104673385620117, "learning_rate": 4.808772074620408e-05, "loss": 2.9399, "step": 15300 }, { "epoch": 0.1155470854373157, "grad_norm": 1.707024097442627, "learning_rate": 4.807521565037753e-05, "loss": 3.0946, "step": 15400 }, { "epoch": 0.11629739118690867, "grad_norm": 2.1351993083953857, "learning_rate": 4.8062710554550984e-05, "loss": 3.0532, "step": 15500 }, { "epoch": 0.11704769693650162, "grad_norm": 2.885483741760254, "learning_rate": 4.805020545872443e-05, "loss": 2.934, "step": 15600 }, { "epoch": 0.11779800268609458, "grad_norm": 1.8311530351638794, "learning_rate": 4.803770036289788e-05, "loss": 2.9186, "step": 15700 }, { "epoch": 0.11854830843568755, "grad_norm": 1.795498013496399, "learning_rate": 4.802519526707133e-05, "loss": 2.8936, "step": 15800 }, { "epoch": 0.1192986141852805, "grad_norm": 1.869397521018982, "learning_rate": 4.801269017124478e-05, "loss": 2.9398, "step": 15900 }, { "epoch": 0.12004891993487346, "grad_norm": 1.8565394878387451, "learning_rate": 4.8000185075418235e-05, "loss": 2.9616, "step": 16000 }, { "epoch": 0.12079922568446642, "grad_norm": 1.9668381214141846, "learning_rate": 4.798780503054995e-05, "loss": 3.1369, "step": 16100 }, { "epoch": 0.12154953143405937, "grad_norm": 2.776435375213623, "learning_rate": 4.79752999347234e-05, "loss": 2.8616, "step": 16200 }, { "epoch": 0.12229983718365234, "grad_norm": 1.9244012832641602, "learning_rate": 4.7962794838896855e-05, "loss": 2.8287, "step": 16300 }, { "epoch": 0.1230501429332453, "grad_norm": 2.56720232963562, "learning_rate": 4.79502897430703e-05, "loss": 3.03, "step": 16400 }, { "epoch": 0.12380044868283825, "grad_norm": 2.6655514240264893, "learning_rate": 4.793778464724376e-05, "loss": 2.8123, "step": 16500 }, { "epoch": 0.12455075443243122, "grad_norm": 2.577895164489746, "learning_rate": 4.7925279551417204e-05, "loss": 3.0146, "step": 16600 }, { "epoch": 0.12530106018202417, "grad_norm": 2.3880739212036133, "learning_rate": 4.791277445559065e-05, "loss": 3.0778, "step": 16700 }, { "epoch": 0.12605136593161714, "grad_norm": 2.1627469062805176, "learning_rate": 4.7900269359764107e-05, "loss": 2.9351, "step": 16800 }, { "epoch": 0.1268016716812101, "grad_norm": 1.618640661239624, "learning_rate": 4.7887764263937554e-05, "loss": 2.8786, "step": 16900 }, { "epoch": 0.12755197743080304, "grad_norm": 2.2393202781677246, "learning_rate": 4.787525916811101e-05, "loss": 2.8924, "step": 17000 }, { "epoch": 0.12830228318039602, "grad_norm": 2.795625686645508, "learning_rate": 4.786275407228446e-05, "loss": 2.8132, "step": 17100 }, { "epoch": 0.12905258892998897, "grad_norm": 3.9374496936798096, "learning_rate": 4.7850248976457904e-05, "loss": 2.8797, "step": 17200 }, { "epoch": 0.12980289467958192, "grad_norm": 2.030653238296509, "learning_rate": 4.783774388063136e-05, "loss": 2.8146, "step": 17300 }, { "epoch": 0.1305532004291749, "grad_norm": 1.8862794637680054, "learning_rate": 4.782523878480481e-05, "loss": 2.8943, "step": 17400 }, { "epoch": 0.13130350617876785, "grad_norm": 2.7834320068359375, "learning_rate": 4.781273368897826e-05, "loss": 2.9369, "step": 17500 }, { "epoch": 0.1320538119283608, "grad_norm": 3.0001308917999268, "learning_rate": 4.7800228593151715e-05, "loss": 2.87, "step": 17600 }, { "epoch": 0.13280411767795378, "grad_norm": 2.348259687423706, "learning_rate": 4.778772349732516e-05, "loss": 2.9043, "step": 17700 }, { "epoch": 0.13355442342754673, "grad_norm": 1.4746006727218628, "learning_rate": 4.777521840149861e-05, "loss": 2.9091, "step": 17800 }, { "epoch": 0.13430472917713968, "grad_norm": 1.814907431602478, "learning_rate": 4.7762713305672064e-05, "loss": 3.0263, "step": 17900 }, { "epoch": 0.13505503492673265, "grad_norm": 4.144092082977295, "learning_rate": 4.775020820984551e-05, "loss": 3.087, "step": 18000 }, { "epoch": 0.1358053406763256, "grad_norm": 1.940637230873108, "learning_rate": 4.7737828164977236e-05, "loss": 2.8241, "step": 18100 }, { "epoch": 0.13655564642591855, "grad_norm": 1.9452474117279053, "learning_rate": 4.772544812010895e-05, "loss": 2.8158, "step": 18200 }, { "epoch": 0.13730595217551153, "grad_norm": 1.7403905391693115, "learning_rate": 4.7712943024282394e-05, "loss": 2.9527, "step": 18300 }, { "epoch": 0.13805625792510448, "grad_norm": 2.2030928134918213, "learning_rate": 4.770043792845585e-05, "loss": 2.8422, "step": 18400 }, { "epoch": 0.13880656367469743, "grad_norm": 2.067284345626831, "learning_rate": 4.76879328326293e-05, "loss": 2.9605, "step": 18500 }, { "epoch": 0.1395568694242904, "grad_norm": 1.865280270576477, "learning_rate": 4.7675427736802744e-05, "loss": 2.8107, "step": 18600 }, { "epoch": 0.14030717517388336, "grad_norm": 2.0328869819641113, "learning_rate": 4.76629226409762e-05, "loss": 2.7994, "step": 18700 }, { "epoch": 0.1410574809234763, "grad_norm": 2.0293164253234863, "learning_rate": 4.765041754514965e-05, "loss": 2.8683, "step": 18800 }, { "epoch": 0.14180778667306929, "grad_norm": 2.827681541442871, "learning_rate": 4.76379124493231e-05, "loss": 2.8813, "step": 18900 }, { "epoch": 0.14255809242266224, "grad_norm": 1.6506294012069702, "learning_rate": 4.7625407353496555e-05, "loss": 2.9393, "step": 19000 }, { "epoch": 0.14330839817225519, "grad_norm": 1.4857709407806396, "learning_rate": 4.761290225767e-05, "loss": 2.8937, "step": 19100 }, { "epoch": 0.14405870392184816, "grad_norm": 1.9910835027694702, "learning_rate": 4.760039716184345e-05, "loss": 2.9664, "step": 19200 }, { "epoch": 0.1448090096714411, "grad_norm": 2.6099064350128174, "learning_rate": 4.7587892066016905e-05, "loss": 2.9137, "step": 19300 }, { "epoch": 0.14555931542103406, "grad_norm": 1.7662348747253418, "learning_rate": 4.757538697019035e-05, "loss": 2.8989, "step": 19400 }, { "epoch": 0.14630962117062704, "grad_norm": 1.9270617961883545, "learning_rate": 4.756288187436381e-05, "loss": 2.8567, "step": 19500 }, { "epoch": 0.14705992692022, "grad_norm": 1.6797236204147339, "learning_rate": 4.755037677853726e-05, "loss": 2.8707, "step": 19600 }, { "epoch": 0.14781023266981294, "grad_norm": 2.7501676082611084, "learning_rate": 4.75378716827107e-05, "loss": 2.8353, "step": 19700 }, { "epoch": 0.14856053841940592, "grad_norm": 2.1067864894866943, "learning_rate": 4.7525366586884156e-05, "loss": 2.8146, "step": 19800 }, { "epoch": 0.14931084416899887, "grad_norm": 1.9475054740905762, "learning_rate": 4.751286149105761e-05, "loss": 2.8846, "step": 19900 }, { "epoch": 0.15006114991859182, "grad_norm": 2.428818464279175, "learning_rate": 4.750035639523106e-05, "loss": 2.9329, "step": 20000 }, { "epoch": 0.1508114556681848, "grad_norm": 2.052727699279785, "learning_rate": 4.748785129940451e-05, "loss": 2.9008, "step": 20100 }, { "epoch": 0.15156176141777775, "grad_norm": 2.2188313007354736, "learning_rate": 4.747534620357796e-05, "loss": 2.8792, "step": 20200 }, { "epoch": 0.1523120671673707, "grad_norm": 2.7784273624420166, "learning_rate": 4.746284110775141e-05, "loss": 2.9892, "step": 20300 }, { "epoch": 0.15306237291696367, "grad_norm": 1.977217674255371, "learning_rate": 4.745033601192486e-05, "loss": 2.7966, "step": 20400 }, { "epoch": 0.15381267866655662, "grad_norm": 2.821099281311035, "learning_rate": 4.743783091609832e-05, "loss": 2.7967, "step": 20500 }, { "epoch": 0.15456298441614957, "grad_norm": 2.047558307647705, "learning_rate": 4.7425325820271764e-05, "loss": 2.8648, "step": 20600 }, { "epoch": 0.15531329016574255, "grad_norm": 1.8740437030792236, "learning_rate": 4.7412945775403475e-05, "loss": 2.8751, "step": 20700 }, { "epoch": 0.1560635959153355, "grad_norm": 1.9697158336639404, "learning_rate": 4.740044067957693e-05, "loss": 3.0296, "step": 20800 }, { "epoch": 0.15681390166492845, "grad_norm": 1.912837028503418, "learning_rate": 4.7387935583750384e-05, "loss": 2.8416, "step": 20900 }, { "epoch": 0.15756420741452143, "grad_norm": 2.2176849842071533, "learning_rate": 4.737543048792383e-05, "loss": 3.0013, "step": 21000 }, { "epoch": 0.15831451316411438, "grad_norm": 3.4695799350738525, "learning_rate": 4.736292539209728e-05, "loss": 2.8694, "step": 21100 }, { "epoch": 0.15906481891370733, "grad_norm": 1.7114709615707397, "learning_rate": 4.7350420296270734e-05, "loss": 2.811, "step": 21200 }, { "epoch": 0.1598151246633003, "grad_norm": 1.8398678302764893, "learning_rate": 4.733791520044418e-05, "loss": 2.9296, "step": 21300 }, { "epoch": 0.16056543041289326, "grad_norm": 2.5613338947296143, "learning_rate": 4.7325410104617636e-05, "loss": 2.8155, "step": 21400 }, { "epoch": 0.1613157361624862, "grad_norm": 2.0037996768951416, "learning_rate": 4.731290500879108e-05, "loss": 2.8136, "step": 21500 }, { "epoch": 0.16206604191207918, "grad_norm": 1.7177367210388184, "learning_rate": 4.730039991296453e-05, "loss": 2.8415, "step": 21600 }, { "epoch": 0.16281634766167213, "grad_norm": 1.8246324062347412, "learning_rate": 4.7287894817137985e-05, "loss": 2.8983, "step": 21700 }, { "epoch": 0.16356665341126508, "grad_norm": 2.7106547355651855, "learning_rate": 4.727538972131143e-05, "loss": 2.9545, "step": 21800 }, { "epoch": 0.16431695916085806, "grad_norm": 1.869311809539795, "learning_rate": 4.726288462548489e-05, "loss": 2.9326, "step": 21900 }, { "epoch": 0.165067264910451, "grad_norm": 1.6481024026870728, "learning_rate": 4.725037952965834e-05, "loss": 2.9414, "step": 22000 }, { "epoch": 0.16581757066004396, "grad_norm": 2.3044722080230713, "learning_rate": 4.723787443383179e-05, "loss": 2.9919, "step": 22100 }, { "epoch": 0.16656787640963694, "grad_norm": 1.7658532857894897, "learning_rate": 4.722536933800524e-05, "loss": 2.8895, "step": 22200 }, { "epoch": 0.1673181821592299, "grad_norm": 1.7441297769546509, "learning_rate": 4.721286424217869e-05, "loss": 2.8519, "step": 22300 }, { "epoch": 0.16806848790882284, "grad_norm": 2.1378276348114014, "learning_rate": 4.720035914635214e-05, "loss": 2.9571, "step": 22400 }, { "epoch": 0.16881879365841582, "grad_norm": 2.9510457515716553, "learning_rate": 4.718785405052559e-05, "loss": 2.8335, "step": 22500 }, { "epoch": 0.16956909940800877, "grad_norm": 1.5277706384658813, "learning_rate": 4.717534895469905e-05, "loss": 2.9944, "step": 22600 }, { "epoch": 0.17031940515760172, "grad_norm": 2.2335853576660156, "learning_rate": 4.716284385887249e-05, "loss": 2.9252, "step": 22700 }, { "epoch": 0.1710697109071947, "grad_norm": 1.9842278957366943, "learning_rate": 4.715033876304594e-05, "loss": 2.8027, "step": 22800 }, { "epoch": 0.17182001665678764, "grad_norm": 3.873886823654175, "learning_rate": 4.71378336672194e-05, "loss": 3.0052, "step": 22900 }, { "epoch": 0.1725703224063806, "grad_norm": 2.1800994873046875, "learning_rate": 4.7125328571392845e-05, "loss": 2.8643, "step": 23000 }, { "epoch": 0.17332062815597357, "grad_norm": 1.4727580547332764, "learning_rate": 4.71128234755663e-05, "loss": 2.8298, "step": 23100 }, { "epoch": 0.17407093390556652, "grad_norm": 2.2689743041992188, "learning_rate": 4.710031837973975e-05, "loss": 2.8371, "step": 23200 }, { "epoch": 0.17482123965515947, "grad_norm": 3.0331273078918457, "learning_rate": 4.7087938334871465e-05, "loss": 2.8541, "step": 23300 }, { "epoch": 0.17557154540475245, "grad_norm": 1.6711773872375488, "learning_rate": 4.707543323904491e-05, "loss": 2.9312, "step": 23400 }, { "epoch": 0.1763218511543454, "grad_norm": 2.2278354167938232, "learning_rate": 4.7062928143218367e-05, "loss": 2.7552, "step": 23500 }, { "epoch": 0.17707215690393835, "grad_norm": 1.850535273551941, "learning_rate": 4.7050423047391814e-05, "loss": 2.9125, "step": 23600 }, { "epoch": 0.17782246265353133, "grad_norm": 2.3295156955718994, "learning_rate": 4.703791795156526e-05, "loss": 2.9001, "step": 23700 }, { "epoch": 0.17857276840312428, "grad_norm": 2.589034080505371, "learning_rate": 4.7025412855738716e-05, "loss": 2.9435, "step": 23800 }, { "epoch": 0.17932307415271723, "grad_norm": 2.0270025730133057, "learning_rate": 4.7012907759912164e-05, "loss": 2.8136, "step": 23900 }, { "epoch": 0.1800733799023102, "grad_norm": 2.8529727458953857, "learning_rate": 4.700040266408562e-05, "loss": 2.9261, "step": 24000 }, { "epoch": 0.18082368565190315, "grad_norm": 2.380997896194458, "learning_rate": 4.698789756825907e-05, "loss": 2.8571, "step": 24100 }, { "epoch": 0.1815739914014961, "grad_norm": 1.9498546123504639, "learning_rate": 4.6975392472432514e-05, "loss": 2.7322, "step": 24200 }, { "epoch": 0.18232429715108908, "grad_norm": 1.8202521800994873, "learning_rate": 4.696288737660597e-05, "loss": 2.9514, "step": 24300 }, { "epoch": 0.18307460290068203, "grad_norm": 1.8947182893753052, "learning_rate": 4.695038228077942e-05, "loss": 2.9362, "step": 24400 }, { "epoch": 0.18382490865027498, "grad_norm": 1.7599045038223267, "learning_rate": 4.693787718495287e-05, "loss": 2.8014, "step": 24500 }, { "epoch": 0.18457521439986796, "grad_norm": 1.9206533432006836, "learning_rate": 4.6925372089126324e-05, "loss": 2.9764, "step": 24600 }, { "epoch": 0.1853255201494609, "grad_norm": 2.7007830142974854, "learning_rate": 4.691286699329977e-05, "loss": 2.8747, "step": 24700 }, { "epoch": 0.18607582589905386, "grad_norm": 2.558793783187866, "learning_rate": 4.690036189747322e-05, "loss": 2.9334, "step": 24800 }, { "epoch": 0.18682613164864684, "grad_norm": 1.6337898969650269, "learning_rate": 4.6887856801646674e-05, "loss": 2.8744, "step": 24900 }, { "epoch": 0.1875764373982398, "grad_norm": 2.137362480163574, "learning_rate": 4.687535170582013e-05, "loss": 2.9427, "step": 25000 }, { "epoch": 0.18832674314783274, "grad_norm": 1.8304232358932495, "learning_rate": 4.6862846609993576e-05, "loss": 2.8002, "step": 25100 }, { "epoch": 0.18907704889742571, "grad_norm": 2.3082103729248047, "learning_rate": 4.6850341514167024e-05, "loss": 2.7754, "step": 25200 }, { "epoch": 0.18982735464701866, "grad_norm": 2.9110615253448486, "learning_rate": 4.683783641834048e-05, "loss": 2.8743, "step": 25300 }, { "epoch": 0.19057766039661161, "grad_norm": 1.6203787326812744, "learning_rate": 4.6825331322513926e-05, "loss": 2.9024, "step": 25400 }, { "epoch": 0.1913279661462046, "grad_norm": 2.103111743927002, "learning_rate": 4.681282622668738e-05, "loss": 2.976, "step": 25500 }, { "epoch": 0.19207827189579754, "grad_norm": 3.412576198577881, "learning_rate": 4.680032113086083e-05, "loss": 2.7297, "step": 25600 }, { "epoch": 0.1928285776453905, "grad_norm": 1.8204371929168701, "learning_rate": 4.6787816035034275e-05, "loss": 2.7974, "step": 25700 }, { "epoch": 0.19357888339498344, "grad_norm": 2.086120367050171, "learning_rate": 4.677531093920773e-05, "loss": 2.8199, "step": 25800 }, { "epoch": 0.19432918914457642, "grad_norm": 1.8117625713348389, "learning_rate": 4.676280584338118e-05, "loss": 2.8723, "step": 25900 }, { "epoch": 0.19507949489416937, "grad_norm": 1.6242092847824097, "learning_rate": 4.675030074755463e-05, "loss": 2.7067, "step": 26000 }, { "epoch": 0.19582980064376232, "grad_norm": 2.5395679473876953, "learning_rate": 4.6737795651728086e-05, "loss": 2.8167, "step": 26100 }, { "epoch": 0.1965801063933553, "grad_norm": 2.496157646179199, "learning_rate": 4.67254156068598e-05, "loss": 2.8476, "step": 26200 }, { "epoch": 0.19733041214294825, "grad_norm": 2.3229947090148926, "learning_rate": 4.6712910511033244e-05, "loss": 2.8707, "step": 26300 }, { "epoch": 0.1980807178925412, "grad_norm": 1.7415177822113037, "learning_rate": 4.67004054152067e-05, "loss": 2.8748, "step": 26400 }, { "epoch": 0.19883102364213417, "grad_norm": 2.0542354583740234, "learning_rate": 4.668790031938015e-05, "loss": 2.8944, "step": 26500 }, { "epoch": 0.19958132939172712, "grad_norm": 2.5459585189819336, "learning_rate": 4.66753952235536e-05, "loss": 2.8601, "step": 26600 }, { "epoch": 0.20033163514132007, "grad_norm": 1.6492071151733398, "learning_rate": 4.666289012772705e-05, "loss": 2.7641, "step": 26700 }, { "epoch": 0.20108194089091305, "grad_norm": 1.7737778425216675, "learning_rate": 4.66503850319005e-05, "loss": 2.8861, "step": 26800 }, { "epoch": 0.201832246640506, "grad_norm": 1.7727570533752441, "learning_rate": 4.663787993607395e-05, "loss": 2.8884, "step": 26900 }, { "epoch": 0.20258255239009895, "grad_norm": 3.773817300796509, "learning_rate": 4.6625374840247405e-05, "loss": 2.843, "step": 27000 }, { "epoch": 0.20333285813969193, "grad_norm": 1.8134489059448242, "learning_rate": 4.661286974442086e-05, "loss": 2.8451, "step": 27100 }, { "epoch": 0.20408316388928488, "grad_norm": 1.4264103174209595, "learning_rate": 4.66003646485943e-05, "loss": 2.9172, "step": 27200 }, { "epoch": 0.20483346963887783, "grad_norm": 1.6618876457214355, "learning_rate": 4.6587859552767755e-05, "loss": 2.9544, "step": 27300 }, { "epoch": 0.2055837753884708, "grad_norm": 2.221259832382202, "learning_rate": 4.657535445694121e-05, "loss": 2.837, "step": 27400 }, { "epoch": 0.20633408113806376, "grad_norm": 1.7572516202926636, "learning_rate": 4.656284936111466e-05, "loss": 2.9879, "step": 27500 }, { "epoch": 0.2070843868876567, "grad_norm": 1.9788168668746948, "learning_rate": 4.655034426528811e-05, "loss": 2.8065, "step": 27600 }, { "epoch": 0.20783469263724969, "grad_norm": 1.7436387538909912, "learning_rate": 4.653783916946156e-05, "loss": 2.9405, "step": 27700 }, { "epoch": 0.20858499838684263, "grad_norm": 1.6879349946975708, "learning_rate": 4.6525334073635006e-05, "loss": 2.832, "step": 27800 }, { "epoch": 0.20933530413643558, "grad_norm": 2.2082908153533936, "learning_rate": 4.651282897780846e-05, "loss": 2.8082, "step": 27900 }, { "epoch": 0.21008560988602856, "grad_norm": 2.55362606048584, "learning_rate": 4.650032388198191e-05, "loss": 2.9511, "step": 28000 }, { "epoch": 0.2108359156356215, "grad_norm": 1.6244703531265259, "learning_rate": 4.648781878615536e-05, "loss": 3.0384, "step": 28100 }, { "epoch": 0.21158622138521446, "grad_norm": 2.76960825920105, "learning_rate": 4.647531369032881e-05, "loss": 2.8923, "step": 28200 }, { "epoch": 0.21233652713480744, "grad_norm": 2.2289013862609863, "learning_rate": 4.646280859450226e-05, "loss": 2.8817, "step": 28300 }, { "epoch": 0.2130868328844004, "grad_norm": 1.4995954036712646, "learning_rate": 4.645030349867571e-05, "loss": 2.979, "step": 28400 }, { "epoch": 0.21383713863399334, "grad_norm": 2.7849888801574707, "learning_rate": 4.643779840284917e-05, "loss": 2.9362, "step": 28500 }, { "epoch": 0.21458744438358632, "grad_norm": 1.9253735542297363, "learning_rate": 4.6425293307022614e-05, "loss": 2.8909, "step": 28600 }, { "epoch": 0.21533775013317927, "grad_norm": 2.2158021926879883, "learning_rate": 4.641278821119606e-05, "loss": 2.7415, "step": 28700 }, { "epoch": 0.21608805588277222, "grad_norm": 2.479095458984375, "learning_rate": 4.6400283115369517e-05, "loss": 2.8839, "step": 28800 }, { "epoch": 0.2168383616323652, "grad_norm": 2.2623541355133057, "learning_rate": 4.6387778019542964e-05, "loss": 2.8067, "step": 28900 }, { "epoch": 0.21758866738195815, "grad_norm": 2.219982385635376, "learning_rate": 4.637527292371642e-05, "loss": 2.8221, "step": 29000 }, { "epoch": 0.2183389731315511, "grad_norm": 2.3911614418029785, "learning_rate": 4.6362767827889866e-05, "loss": 2.9655, "step": 29100 }, { "epoch": 0.21908927888114407, "grad_norm": 1.617890477180481, "learning_rate": 4.6350262732063314e-05, "loss": 2.9027, "step": 29200 }, { "epoch": 0.21983958463073702, "grad_norm": 1.7603670358657837, "learning_rate": 4.633775763623677e-05, "loss": 2.9389, "step": 29300 }, { "epoch": 0.22058989038032997, "grad_norm": 1.6157697439193726, "learning_rate": 4.6325252540410216e-05, "loss": 2.9397, "step": 29400 }, { "epoch": 0.22134019612992295, "grad_norm": 3.4939823150634766, "learning_rate": 4.631274744458367e-05, "loss": 2.7113, "step": 29500 }, { "epoch": 0.2220905018795159, "grad_norm": 3.512683153152466, "learning_rate": 4.6300242348757125e-05, "loss": 2.6696, "step": 29600 }, { "epoch": 0.22284080762910885, "grad_norm": 1.6698945760726929, "learning_rate": 4.628773725293057e-05, "loss": 2.8901, "step": 29700 }, { "epoch": 0.22359111337870183, "grad_norm": 1.918846607208252, "learning_rate": 4.627523215710402e-05, "loss": 2.9954, "step": 29800 }, { "epoch": 0.22434141912829478, "grad_norm": 1.8170021772384644, "learning_rate": 4.6262727061277474e-05, "loss": 2.8523, "step": 29900 }, { "epoch": 0.22509172487788773, "grad_norm": 2.4521665573120117, "learning_rate": 4.625022196545092e-05, "loss": 2.7861, "step": 30000 }, { "epoch": 0.2258420306274807, "grad_norm": 2.251901388168335, "learning_rate": 4.6237716869624376e-05, "loss": 2.7739, "step": 30100 }, { "epoch": 0.22659233637707366, "grad_norm": 1.872636318206787, "learning_rate": 4.622521177379783e-05, "loss": 2.8905, "step": 30200 }, { "epoch": 0.2273426421266666, "grad_norm": 3.3268468379974365, "learning_rate": 4.621270667797127e-05, "loss": 2.7238, "step": 30300 }, { "epoch": 0.22809294787625958, "grad_norm": 1.7411202192306519, "learning_rate": 4.6200201582144726e-05, "loss": 2.9157, "step": 30400 }, { "epoch": 0.22884325362585253, "grad_norm": 1.6274887323379517, "learning_rate": 4.6187821537276443e-05, "loss": 2.813, "step": 30500 }, { "epoch": 0.22959355937544548, "grad_norm": 1.4126046895980835, "learning_rate": 4.61753164414499e-05, "loss": 2.902, "step": 30600 }, { "epoch": 0.23034386512503846, "grad_norm": 2.386720657348633, "learning_rate": 4.616281134562334e-05, "loss": 2.9945, "step": 30700 }, { "epoch": 0.2310941708746314, "grad_norm": 1.8285157680511475, "learning_rate": 4.615030624979679e-05, "loss": 2.9421, "step": 30800 }, { "epoch": 0.23184447662422436, "grad_norm": 2.434225082397461, "learning_rate": 4.613780115397025e-05, "loss": 2.7716, "step": 30900 }, { "epoch": 0.23259478237381734, "grad_norm": 1.7729718685150146, "learning_rate": 4.6125296058143695e-05, "loss": 2.8518, "step": 31000 }, { "epoch": 0.2333450881234103, "grad_norm": 2.3923394680023193, "learning_rate": 4.611279096231715e-05, "loss": 2.9505, "step": 31100 }, { "epoch": 0.23409539387300324, "grad_norm": 2.025233745574951, "learning_rate": 4.61002858664906e-05, "loss": 2.7759, "step": 31200 }, { "epoch": 0.23484569962259622, "grad_norm": 3.459766387939453, "learning_rate": 4.6087780770664045e-05, "loss": 2.9464, "step": 31300 }, { "epoch": 0.23559600537218917, "grad_norm": 1.8198186159133911, "learning_rate": 4.60752756748375e-05, "loss": 2.8252, "step": 31400 }, { "epoch": 0.23634631112178212, "grad_norm": 1.5480468273162842, "learning_rate": 4.606277057901095e-05, "loss": 2.8288, "step": 31500 }, { "epoch": 0.2370966168713751, "grad_norm": 1.636293649673462, "learning_rate": 4.60502654831844e-05, "loss": 2.802, "step": 31600 }, { "epoch": 0.23784692262096804, "grad_norm": 1.7334058284759521, "learning_rate": 4.603776038735785e-05, "loss": 2.7396, "step": 31700 }, { "epoch": 0.238597228370561, "grad_norm": 1.6925369501113892, "learning_rate": 4.6025255291531296e-05, "loss": 2.872, "step": 31800 }, { "epoch": 0.23934753412015397, "grad_norm": 2.6365368366241455, "learning_rate": 4.601275019570475e-05, "loss": 2.8457, "step": 31900 }, { "epoch": 0.24009783986974692, "grad_norm": 2.5411133766174316, "learning_rate": 4.6000245099878205e-05, "loss": 2.8699, "step": 32000 }, { "epoch": 0.24084814561933987, "grad_norm": 2.671757698059082, "learning_rate": 4.598774000405165e-05, "loss": 2.7072, "step": 32100 }, { "epoch": 0.24159845136893285, "grad_norm": 2.2559661865234375, "learning_rate": 4.597523490822511e-05, "loss": 2.8726, "step": 32200 }, { "epoch": 0.2423487571185258, "grad_norm": 1.8503243923187256, "learning_rate": 4.5962729812398555e-05, "loss": 2.7666, "step": 32300 }, { "epoch": 0.24309906286811875, "grad_norm": 2.6883933544158936, "learning_rate": 4.5950224716572e-05, "loss": 2.884, "step": 32400 }, { "epoch": 0.24384936861771173, "grad_norm": 2.1666815280914307, "learning_rate": 4.593784467170372e-05, "loss": 2.7732, "step": 32500 }, { "epoch": 0.24459967436730468, "grad_norm": 2.240140199661255, "learning_rate": 4.5925339575877174e-05, "loss": 2.8844, "step": 32600 }, { "epoch": 0.24534998011689763, "grad_norm": 1.7328667640686035, "learning_rate": 4.591283448005062e-05, "loss": 3.0013, "step": 32700 }, { "epoch": 0.2461002858664906, "grad_norm": 1.854824185371399, "learning_rate": 4.590032938422407e-05, "loss": 2.7841, "step": 32800 }, { "epoch": 0.24685059161608355, "grad_norm": 8.934103012084961, "learning_rate": 4.5887824288397524e-05, "loss": 2.8199, "step": 32900 }, { "epoch": 0.2476008973656765, "grad_norm": 4.233974933624268, "learning_rate": 4.587531919257098e-05, "loss": 2.9336, "step": 33000 }, { "epoch": 0.24835120311526948, "grad_norm": 2.098357915878296, "learning_rate": 4.5862814096744426e-05, "loss": 2.8782, "step": 33100 }, { "epoch": 0.24910150886486243, "grad_norm": 1.7473543882369995, "learning_rate": 4.5850309000917874e-05, "loss": 2.6892, "step": 33200 }, { "epoch": 0.24985181461445538, "grad_norm": 2.3077099323272705, "learning_rate": 4.583780390509133e-05, "loss": 2.9472, "step": 33300 }, { "epoch": 0.25060212036404833, "grad_norm": 1.4332689046859741, "learning_rate": 4.5825298809264776e-05, "loss": 2.8722, "step": 33400 }, { "epoch": 0.25135242611364134, "grad_norm": 2.5423827171325684, "learning_rate": 4.581279371343823e-05, "loss": 2.8068, "step": 33500 }, { "epoch": 0.2521027318632343, "grad_norm": 1.6381186246871948, "learning_rate": 4.580028861761168e-05, "loss": 2.9112, "step": 33600 }, { "epoch": 0.25285303761282724, "grad_norm": 2.4361109733581543, "learning_rate": 4.5787783521785125e-05, "loss": 2.8379, "step": 33700 }, { "epoch": 0.2536033433624202, "grad_norm": 1.5494210720062256, "learning_rate": 4.577527842595858e-05, "loss": 2.8552, "step": 33800 }, { "epoch": 0.25435364911201314, "grad_norm": 1.8094992637634277, "learning_rate": 4.576277333013203e-05, "loss": 2.9715, "step": 33900 }, { "epoch": 0.2551039548616061, "grad_norm": 2.157790422439575, "learning_rate": 4.575026823430548e-05, "loss": 2.9193, "step": 34000 }, { "epoch": 0.2558542606111991, "grad_norm": 1.8729925155639648, "learning_rate": 4.5737763138478936e-05, "loss": 2.8768, "step": 34100 }, { "epoch": 0.25660456636079204, "grad_norm": 1.7797375917434692, "learning_rate": 4.5725258042652384e-05, "loss": 2.884, "step": 34200 }, { "epoch": 0.257354872110385, "grad_norm": 2.2686097621917725, "learning_rate": 4.571275294682583e-05, "loss": 2.9895, "step": 34300 }, { "epoch": 0.25810517785997794, "grad_norm": 1.994732141494751, "learning_rate": 4.5700247850999286e-05, "loss": 2.882, "step": 34400 }, { "epoch": 0.2588554836095709, "grad_norm": 3.4668467044830322, "learning_rate": 4.5687742755172734e-05, "loss": 2.8839, "step": 34500 }, { "epoch": 0.25960578935916384, "grad_norm": 1.6819559335708618, "learning_rate": 4.567523765934619e-05, "loss": 2.9339, "step": 34600 }, { "epoch": 0.2603560951087568, "grad_norm": Infinity, "learning_rate": 4.566273256351964e-05, "loss": 2.8804, "step": 34700 }, { "epoch": 0.2611064008583498, "grad_norm": 1.8010095357894897, "learning_rate": 4.565035251865135e-05, "loss": 2.9806, "step": 34800 }, { "epoch": 0.26185670660794275, "grad_norm": 1.8052035570144653, "learning_rate": 4.56378474228248e-05, "loss": 2.9537, "step": 34900 }, { "epoch": 0.2626070123575357, "grad_norm": 1.8503724336624146, "learning_rate": 4.5625342326998255e-05, "loss": 2.823, "step": 35000 }, { "epoch": 0.26335731810712865, "grad_norm": 1.831809639930725, "learning_rate": 4.561283723117171e-05, "loss": 2.8459, "step": 35100 }, { "epoch": 0.2641076238567216, "grad_norm": 1.344551920890808, "learning_rate": 4.560033213534515e-05, "loss": 2.8658, "step": 35200 }, { "epoch": 0.26485792960631455, "grad_norm": 1.3670053482055664, "learning_rate": 4.5587827039518605e-05, "loss": 2.8737, "step": 35300 }, { "epoch": 0.26560823535590755, "grad_norm": 1.889812707901001, "learning_rate": 4.557532194369206e-05, "loss": 2.8129, "step": 35400 }, { "epoch": 0.2663585411055005, "grad_norm": 1.8077471256256104, "learning_rate": 4.556281684786551e-05, "loss": 2.8756, "step": 35500 }, { "epoch": 0.26710884685509345, "grad_norm": 3.251093864440918, "learning_rate": 4.555031175203896e-05, "loss": 2.8654, "step": 35600 }, { "epoch": 0.2678591526046864, "grad_norm": 1.6672202348709106, "learning_rate": 4.553780665621241e-05, "loss": 2.7662, "step": 35700 }, { "epoch": 0.26860945835427935, "grad_norm": 1.8247350454330444, "learning_rate": 4.5525301560385856e-05, "loss": 2.8146, "step": 35800 }, { "epoch": 0.2693597641038723, "grad_norm": 1.8367512226104736, "learning_rate": 4.551279646455931e-05, "loss": 2.8806, "step": 35900 }, { "epoch": 0.2701100698534653, "grad_norm": 1.7697057723999023, "learning_rate": 4.550029136873276e-05, "loss": 2.8594, "step": 36000 }, { "epoch": 0.27086037560305826, "grad_norm": 2.52526593208313, "learning_rate": 4.548778627290621e-05, "loss": 2.8896, "step": 36100 }, { "epoch": 0.2716106813526512, "grad_norm": 1.7531195878982544, "learning_rate": 4.547528117707966e-05, "loss": 2.9131, "step": 36200 }, { "epoch": 0.27236098710224416, "grad_norm": 1.9659440517425537, "learning_rate": 4.546277608125311e-05, "loss": 2.8227, "step": 36300 }, { "epoch": 0.2731112928518371, "grad_norm": 2.0619258880615234, "learning_rate": 4.545027098542656e-05, "loss": 2.8633, "step": 36400 }, { "epoch": 0.27386159860143006, "grad_norm": 2.6663386821746826, "learning_rate": 4.543776588960002e-05, "loss": 2.7199, "step": 36500 }, { "epoch": 0.27461190435102306, "grad_norm": 2.9514212608337402, "learning_rate": 4.5425385844731734e-05, "loss": 2.7902, "step": 36600 }, { "epoch": 0.275362210100616, "grad_norm": 2.449620485305786, "learning_rate": 4.541288074890518e-05, "loss": 2.8375, "step": 36700 }, { "epoch": 0.27611251585020896, "grad_norm": 1.971863865852356, "learning_rate": 4.540037565307863e-05, "loss": 2.8139, "step": 36800 }, { "epoch": 0.2768628215998019, "grad_norm": 1.809282898902893, "learning_rate": 4.5387870557252084e-05, "loss": 2.8511, "step": 36900 }, { "epoch": 0.27761312734939486, "grad_norm": 1.910248041152954, "learning_rate": 4.537536546142553e-05, "loss": 2.7814, "step": 37000 }, { "epoch": 0.2783634330989878, "grad_norm": 2.7395217418670654, "learning_rate": 4.5362860365598986e-05, "loss": 2.789, "step": 37100 }, { "epoch": 0.2791137388485808, "grad_norm": 1.7924550771713257, "learning_rate": 4.5350355269772434e-05, "loss": 2.7683, "step": 37200 }, { "epoch": 0.27986404459817377, "grad_norm": 2.2411303520202637, "learning_rate": 4.533785017394588e-05, "loss": 2.772, "step": 37300 }, { "epoch": 0.2806143503477667, "grad_norm": 1.8603242635726929, "learning_rate": 4.5325345078119336e-05, "loss": 2.9647, "step": 37400 }, { "epoch": 0.28136465609735967, "grad_norm": 2.5040013790130615, "learning_rate": 4.531283998229279e-05, "loss": 2.7565, "step": 37500 }, { "epoch": 0.2821149618469526, "grad_norm": 1.8621312379837036, "learning_rate": 4.530033488646624e-05, "loss": 2.9226, "step": 37600 }, { "epoch": 0.28286526759654557, "grad_norm": 1.635377049446106, "learning_rate": 4.5287829790639685e-05, "loss": 2.6693, "step": 37700 }, { "epoch": 0.28361557334613857, "grad_norm": 2.539424419403076, "learning_rate": 4.527532469481314e-05, "loss": 2.8569, "step": 37800 }, { "epoch": 0.2843658790957315, "grad_norm": 2.316051959991455, "learning_rate": 4.526281959898659e-05, "loss": 2.8723, "step": 37900 }, { "epoch": 0.28511618484532447, "grad_norm": 1.8506659269332886, "learning_rate": 4.525031450316004e-05, "loss": 2.8183, "step": 38000 }, { "epoch": 0.2858664905949174, "grad_norm": 3.3667447566986084, "learning_rate": 4.523780940733349e-05, "loss": 2.8288, "step": 38100 }, { "epoch": 0.28661679634451037, "grad_norm": 1.620820164680481, "learning_rate": 4.522530431150694e-05, "loss": 2.7859, "step": 38200 }, { "epoch": 0.2873671020941033, "grad_norm": 1.6306781768798828, "learning_rate": 4.521279921568039e-05, "loss": 2.9228, "step": 38300 }, { "epoch": 0.2881174078436963, "grad_norm": 2.2430574893951416, "learning_rate": 4.520029411985384e-05, "loss": 2.8626, "step": 38400 }, { "epoch": 0.2888677135932893, "grad_norm": 3.015291213989258, "learning_rate": 4.5187789024027294e-05, "loss": 2.7297, "step": 38500 }, { "epoch": 0.2896180193428822, "grad_norm": 2.4550390243530273, "learning_rate": 4.517528392820075e-05, "loss": 2.8743, "step": 38600 }, { "epoch": 0.2903683250924752, "grad_norm": 2.793724298477173, "learning_rate": 4.5162778832374196e-05, "loss": 2.8568, "step": 38700 }, { "epoch": 0.2911186308420681, "grad_norm": 1.3163261413574219, "learning_rate": 4.515027373654764e-05, "loss": 2.7001, "step": 38800 }, { "epoch": 0.2918689365916611, "grad_norm": 1.5911504030227661, "learning_rate": 4.51377686407211e-05, "loss": 2.7606, "step": 38900 }, { "epoch": 0.2926192423412541, "grad_norm": 2.4693686962127686, "learning_rate": 4.5125263544894545e-05, "loss": 2.8602, "step": 39000 }, { "epoch": 0.29336954809084703, "grad_norm": 1.3663588762283325, "learning_rate": 4.5112758449068e-05, "loss": 2.8373, "step": 39100 }, { "epoch": 0.29411985384044, "grad_norm": 2.113858938217163, "learning_rate": 4.510025335324145e-05, "loss": 2.8582, "step": 39200 }, { "epoch": 0.29487015959003293, "grad_norm": 1.7103592157363892, "learning_rate": 4.5087748257414895e-05, "loss": 2.8496, "step": 39300 }, { "epoch": 0.2956204653396259, "grad_norm": 1.6922231912612915, "learning_rate": 4.507524316158835e-05, "loss": 2.6418, "step": 39400 }, { "epoch": 0.29637077108921883, "grad_norm": 2.112529993057251, "learning_rate": 4.5062738065761804e-05, "loss": 2.9207, "step": 39500 }, { "epoch": 0.29712107683881184, "grad_norm": 1.927659034729004, "learning_rate": 4.505023296993525e-05, "loss": 2.9313, "step": 39600 }, { "epoch": 0.2978713825884048, "grad_norm": 2.240842819213867, "learning_rate": 4.5037727874108706e-05, "loss": 2.8331, "step": 39700 }, { "epoch": 0.29862168833799774, "grad_norm": 1.5624395608901978, "learning_rate": 4.502522277828215e-05, "loss": 3.023, "step": 39800 }, { "epoch": 0.2993719940875907, "grad_norm": 3.614797830581665, "learning_rate": 4.50127176824556e-05, "loss": 2.9246, "step": 39900 }, { "epoch": 0.30012229983718364, "grad_norm": 1.6125410795211792, "learning_rate": 4.5000212586629055e-05, "loss": 2.8842, "step": 40000 }, { "epoch": 0.3008726055867766, "grad_norm": 1.9498493671417236, "learning_rate": 4.49877074908025e-05, "loss": 2.8521, "step": 40100 }, { "epoch": 0.3016229113363696, "grad_norm": 1.6664319038391113, "learning_rate": 4.497520239497596e-05, "loss": 2.8421, "step": 40200 }, { "epoch": 0.30237321708596254, "grad_norm": 1.7568260431289673, "learning_rate": 4.4962697299149405e-05, "loss": 2.9177, "step": 40300 }, { "epoch": 0.3031235228355555, "grad_norm": 1.5786614418029785, "learning_rate": 4.495019220332285e-05, "loss": 2.8326, "step": 40400 }, { "epoch": 0.30387382858514844, "grad_norm": 1.1926639080047607, "learning_rate": 4.493768710749631e-05, "loss": 2.7679, "step": 40500 }, { "epoch": 0.3046241343347414, "grad_norm": 2.6250245571136475, "learning_rate": 4.4925307062628025e-05, "loss": 2.8247, "step": 40600 }, { "epoch": 0.30537444008433434, "grad_norm": 2.1046621799468994, "learning_rate": 4.491280196680147e-05, "loss": 2.8179, "step": 40700 }, { "epoch": 0.30612474583392735, "grad_norm": 2.8330626487731934, "learning_rate": 4.490029687097492e-05, "loss": 2.8653, "step": 40800 }, { "epoch": 0.3068750515835203, "grad_norm": 1.8597359657287598, "learning_rate": 4.4887791775148374e-05, "loss": 2.7716, "step": 40900 }, { "epoch": 0.30762535733311325, "grad_norm": 2.0088772773742676, "learning_rate": 4.487541173028009e-05, "loss": 2.7826, "step": 41000 }, { "epoch": 0.3083756630827062, "grad_norm": 1.986589789390564, "learning_rate": 4.4862906634453546e-05, "loss": 2.7858, "step": 41100 }, { "epoch": 0.30912596883229915, "grad_norm": 2.4903311729431152, "learning_rate": 4.4850401538626994e-05, "loss": 2.9293, "step": 41200 }, { "epoch": 0.3098762745818921, "grad_norm": 1.754644513130188, "learning_rate": 4.483789644280044e-05, "loss": 2.8296, "step": 41300 }, { "epoch": 0.3106265803314851, "grad_norm": 2.1615939140319824, "learning_rate": 4.4825391346973896e-05, "loss": 2.7605, "step": 41400 }, { "epoch": 0.31137688608107805, "grad_norm": 1.6017870903015137, "learning_rate": 4.481288625114734e-05, "loss": 2.8869, "step": 41500 }, { "epoch": 0.312127191830671, "grad_norm": 2.1008832454681396, "learning_rate": 4.48003811553208e-05, "loss": 2.8543, "step": 41600 }, { "epoch": 0.31287749758026395, "grad_norm": 2.6504814624786377, "learning_rate": 4.4787876059494245e-05, "loss": 2.8503, "step": 41700 }, { "epoch": 0.3136278033298569, "grad_norm": 1.937817096710205, "learning_rate": 4.477537096366769e-05, "loss": 2.8165, "step": 41800 }, { "epoch": 0.31437810907944985, "grad_norm": 1.2199453115463257, "learning_rate": 4.476286586784115e-05, "loss": 2.8309, "step": 41900 }, { "epoch": 0.31512841482904286, "grad_norm": 1.6390959024429321, "learning_rate": 4.47503607720146e-05, "loss": 2.7877, "step": 42000 }, { "epoch": 0.3158787205786358, "grad_norm": 2.4983131885528564, "learning_rate": 4.473785567618805e-05, "loss": 2.7596, "step": 42100 }, { "epoch": 0.31662902632822876, "grad_norm": 2.409923791885376, "learning_rate": 4.47253505803615e-05, "loss": 2.7885, "step": 42200 }, { "epoch": 0.3173793320778217, "grad_norm": 1.4507412910461426, "learning_rate": 4.471284548453495e-05, "loss": 2.8008, "step": 42300 }, { "epoch": 0.31812963782741466, "grad_norm": 1.5993571281433105, "learning_rate": 4.47003403887084e-05, "loss": 2.8562, "step": 42400 }, { "epoch": 0.3188799435770076, "grad_norm": 1.7244031429290771, "learning_rate": 4.4687835292881853e-05, "loss": 2.8748, "step": 42500 }, { "epoch": 0.3196302493266006, "grad_norm": 1.636936902999878, "learning_rate": 4.46753301970553e-05, "loss": 2.9297, "step": 42600 }, { "epoch": 0.32038055507619356, "grad_norm": 2.508568525314331, "learning_rate": 4.466282510122875e-05, "loss": 2.8105, "step": 42700 }, { "epoch": 0.3211308608257865, "grad_norm": 2.2212836742401123, "learning_rate": 4.46503200054022e-05, "loss": 2.8567, "step": 42800 }, { "epoch": 0.32188116657537946, "grad_norm": 2.363603353500366, "learning_rate": 4.463781490957565e-05, "loss": 2.9173, "step": 42900 }, { "epoch": 0.3226314723249724, "grad_norm": 2.3174736499786377, "learning_rate": 4.4625309813749105e-05, "loss": 2.8854, "step": 43000 }, { "epoch": 0.32338177807456536, "grad_norm": 1.7921342849731445, "learning_rate": 4.461280471792256e-05, "loss": 2.9787, "step": 43100 }, { "epoch": 0.32413208382415837, "grad_norm": 2.579317808151245, "learning_rate": 4.460029962209601e-05, "loss": 2.8547, "step": 43200 }, { "epoch": 0.3248823895737513, "grad_norm": 1.533466100692749, "learning_rate": 4.4587794526269455e-05, "loss": 2.8526, "step": 43300 }, { "epoch": 0.32563269532334427, "grad_norm": 2.116546630859375, "learning_rate": 4.457528943044291e-05, "loss": 2.8872, "step": 43400 }, { "epoch": 0.3263830010729372, "grad_norm": 1.6804090738296509, "learning_rate": 4.456278433461636e-05, "loss": 2.8718, "step": 43500 }, { "epoch": 0.32713330682253017, "grad_norm": 2.4197890758514404, "learning_rate": 4.455027923878981e-05, "loss": 2.8172, "step": 43600 }, { "epoch": 0.3278836125721231, "grad_norm": 2.6898787021636963, "learning_rate": 4.453777414296326e-05, "loss": 2.8848, "step": 43700 }, { "epoch": 0.3286339183217161, "grad_norm": 1.817765474319458, "learning_rate": 4.4525269047136707e-05, "loss": 2.7858, "step": 43800 }, { "epoch": 0.3293842240713091, "grad_norm": 2.005964517593384, "learning_rate": 4.451276395131016e-05, "loss": 2.8433, "step": 43900 }, { "epoch": 0.330134529820902, "grad_norm": 1.3906500339508057, "learning_rate": 4.4500258855483615e-05, "loss": 2.7717, "step": 44000 }, { "epoch": 0.330884835570495, "grad_norm": 1.5706548690795898, "learning_rate": 4.448775375965706e-05, "loss": 2.9197, "step": 44100 }, { "epoch": 0.3316351413200879, "grad_norm": 2.607734203338623, "learning_rate": 4.447524866383052e-05, "loss": 2.8887, "step": 44200 }, { "epoch": 0.3323854470696809, "grad_norm": 2.8906116485595703, "learning_rate": 4.4462743568003965e-05, "loss": 2.6077, "step": 44300 }, { "epoch": 0.3331357528192739, "grad_norm": 1.6617568731307983, "learning_rate": 4.445023847217741e-05, "loss": 2.7842, "step": 44400 }, { "epoch": 0.33388605856886683, "grad_norm": 1.8865357637405396, "learning_rate": 4.443773337635087e-05, "loss": 2.7807, "step": 44500 }, { "epoch": 0.3346363643184598, "grad_norm": 1.8138513565063477, "learning_rate": 4.4425228280524315e-05, "loss": 2.8995, "step": 44600 }, { "epoch": 0.33538667006805273, "grad_norm": 1.6539524793624878, "learning_rate": 4.441272318469777e-05, "loss": 2.9163, "step": 44700 }, { "epoch": 0.3361369758176457, "grad_norm": 1.515578031539917, "learning_rate": 4.440021808887122e-05, "loss": 2.7932, "step": 44800 }, { "epoch": 0.33688728156723863, "grad_norm": 4.048199653625488, "learning_rate": 4.4387712993044664e-05, "loss": 2.8283, "step": 44900 }, { "epoch": 0.33763758731683163, "grad_norm": 2.395970106124878, "learning_rate": 4.437520789721812e-05, "loss": 2.7467, "step": 45000 }, { "epoch": 0.3383878930664246, "grad_norm": 2.4168624877929688, "learning_rate": 4.436270280139157e-05, "loss": 2.8039, "step": 45100 }, { "epoch": 0.33913819881601753, "grad_norm": 1.849647879600525, "learning_rate": 4.435019770556502e-05, "loss": 2.9922, "step": 45200 }, { "epoch": 0.3398885045656105, "grad_norm": 2.4974565505981445, "learning_rate": 4.433769260973847e-05, "loss": 2.7458, "step": 45300 }, { "epoch": 0.34063881031520343, "grad_norm": 2.6197926998138428, "learning_rate": 4.4325312564870186e-05, "loss": 2.805, "step": 45400 }, { "epoch": 0.3413891160647964, "grad_norm": 2.8987743854522705, "learning_rate": 4.431280746904364e-05, "loss": 2.7978, "step": 45500 }, { "epoch": 0.3421394218143894, "grad_norm": 1.8218692541122437, "learning_rate": 4.430030237321709e-05, "loss": 2.8323, "step": 45600 }, { "epoch": 0.34288972756398234, "grad_norm": 1.933688759803772, "learning_rate": 4.428779727739054e-05, "loss": 2.8545, "step": 45700 }, { "epoch": 0.3436400333135753, "grad_norm": 1.630439043045044, "learning_rate": 4.427529218156399e-05, "loss": 2.8124, "step": 45800 }, { "epoch": 0.34439033906316824, "grad_norm": 2.0237631797790527, "learning_rate": 4.426278708573744e-05, "loss": 2.9819, "step": 45900 }, { "epoch": 0.3451406448127612, "grad_norm": 1.995030403137207, "learning_rate": 4.425028198991089e-05, "loss": 2.9124, "step": 46000 }, { "epoch": 0.34589095056235414, "grad_norm": 2.1082022190093994, "learning_rate": 4.423777689408434e-05, "loss": 2.8416, "step": 46100 }, { "epoch": 0.34664125631194714, "grad_norm": 2.062178611755371, "learning_rate": 4.4225271798257794e-05, "loss": 2.7927, "step": 46200 }, { "epoch": 0.3473915620615401, "grad_norm": 2.0528290271759033, "learning_rate": 4.421276670243124e-05, "loss": 2.8274, "step": 46300 }, { "epoch": 0.34814186781113304, "grad_norm": 2.0626959800720215, "learning_rate": 4.4200261606604696e-05, "loss": 2.682, "step": 46400 }, { "epoch": 0.348892173560726, "grad_norm": 1.63450288772583, "learning_rate": 4.4187756510778144e-05, "loss": 2.7497, "step": 46500 }, { "epoch": 0.34964247931031894, "grad_norm": 2.473581314086914, "learning_rate": 4.41752514149516e-05, "loss": 2.8175, "step": 46600 }, { "epoch": 0.3503927850599119, "grad_norm": 2.0072274208068848, "learning_rate": 4.4162746319125046e-05, "loss": 2.947, "step": 46700 }, { "epoch": 0.3511430908095049, "grad_norm": 2.3183093070983887, "learning_rate": 4.415024122329849e-05, "loss": 2.9102, "step": 46800 }, { "epoch": 0.35189339655909785, "grad_norm": 2.4778735637664795, "learning_rate": 4.413773612747195e-05, "loss": 3.0111, "step": 46900 }, { "epoch": 0.3526437023086908, "grad_norm": 4.429402828216553, "learning_rate": 4.4125231031645395e-05, "loss": 2.6742, "step": 47000 }, { "epoch": 0.35339400805828375, "grad_norm": 1.81304931640625, "learning_rate": 4.411272593581885e-05, "loss": 2.7186, "step": 47100 }, { "epoch": 0.3541443138078767, "grad_norm": 2.0577824115753174, "learning_rate": 4.4100220839992304e-05, "loss": 2.8293, "step": 47200 }, { "epoch": 0.35489461955746965, "grad_norm": 2.465747594833374, "learning_rate": 4.4087715744165745e-05, "loss": 2.7808, "step": 47300 }, { "epoch": 0.35564492530706265, "grad_norm": 2.4417009353637695, "learning_rate": 4.40752106483392e-05, "loss": 2.806, "step": 47400 }, { "epoch": 0.3563952310566556, "grad_norm": 2.524198055267334, "learning_rate": 4.4062705552512654e-05, "loss": 2.8359, "step": 47500 }, { "epoch": 0.35714553680624855, "grad_norm": 2.796947717666626, "learning_rate": 4.40502004566861e-05, "loss": 2.7393, "step": 47600 }, { "epoch": 0.3578958425558415, "grad_norm": 1.6126669645309448, "learning_rate": 4.4037695360859556e-05, "loss": 2.9543, "step": 47700 }, { "epoch": 0.35864614830543445, "grad_norm": 2.696415662765503, "learning_rate": 4.4025190265033003e-05, "loss": 2.966, "step": 47800 }, { "epoch": 0.3593964540550274, "grad_norm": 2.836728811264038, "learning_rate": 4.401268516920645e-05, "loss": 2.888, "step": 47900 }, { "epoch": 0.3601467598046204, "grad_norm": 2.075273275375366, "learning_rate": 4.400030512433817e-05, "loss": 2.8086, "step": 48000 }, { "epoch": 0.36089706555421336, "grad_norm": 2.12568998336792, "learning_rate": 4.398780002851162e-05, "loss": 2.6885, "step": 48100 }, { "epoch": 0.3616473713038063, "grad_norm": 1.5556890964508057, "learning_rate": 4.397529493268507e-05, "loss": 2.8604, "step": 48200 }, { "epoch": 0.36239767705339926, "grad_norm": 2.11881422996521, "learning_rate": 4.396278983685852e-05, "loss": 2.9607, "step": 48300 }, { "epoch": 0.3631479828029922, "grad_norm": 1.502994179725647, "learning_rate": 4.395028474103197e-05, "loss": 2.7821, "step": 48400 }, { "epoch": 0.36389828855258516, "grad_norm": 2.619678497314453, "learning_rate": 4.393777964520542e-05, "loss": 2.6069, "step": 48500 }, { "epoch": 0.36464859430217816, "grad_norm": 1.6943306922912598, "learning_rate": 4.3925274549378875e-05, "loss": 2.9707, "step": 48600 }, { "epoch": 0.3653989000517711, "grad_norm": 1.965561032295227, "learning_rate": 4.391276945355233e-05, "loss": 2.9079, "step": 48700 }, { "epoch": 0.36614920580136406, "grad_norm": 1.4873372316360474, "learning_rate": 4.390026435772578e-05, "loss": 2.8827, "step": 48800 }, { "epoch": 0.366899511550957, "grad_norm": 2.244382858276367, "learning_rate": 4.3887759261899224e-05, "loss": 2.7865, "step": 48900 }, { "epoch": 0.36764981730054996, "grad_norm": 1.9347397089004517, "learning_rate": 4.387525416607268e-05, "loss": 2.8804, "step": 49000 }, { "epoch": 0.3684001230501429, "grad_norm": 1.6371665000915527, "learning_rate": 4.3862749070246126e-05, "loss": 2.7113, "step": 49100 }, { "epoch": 0.3691504287997359, "grad_norm": 2.333958625793457, "learning_rate": 4.385024397441958e-05, "loss": 2.7787, "step": 49200 }, { "epoch": 0.36990073454932887, "grad_norm": 1.5161813497543335, "learning_rate": 4.383773887859303e-05, "loss": 2.8244, "step": 49300 }, { "epoch": 0.3706510402989218, "grad_norm": 1.629801869392395, "learning_rate": 4.3825233782766476e-05, "loss": 2.8491, "step": 49400 }, { "epoch": 0.37140134604851477, "grad_norm": 2.237844228744507, "learning_rate": 4.381272868693993e-05, "loss": 2.9458, "step": 49500 }, { "epoch": 0.3721516517981077, "grad_norm": 1.756246566772461, "learning_rate": 4.3800223591113385e-05, "loss": 2.7455, "step": 49600 }, { "epoch": 0.37290195754770067, "grad_norm": 1.8149827718734741, "learning_rate": 4.378771849528683e-05, "loss": 2.8341, "step": 49700 }, { "epoch": 0.3736522632972937, "grad_norm": 2.3598105907440186, "learning_rate": 4.377521339946028e-05, "loss": 2.7944, "step": 49800 }, { "epoch": 0.3744025690468866, "grad_norm": 1.2341505289077759, "learning_rate": 4.3762708303633734e-05, "loss": 2.7783, "step": 49900 }, { "epoch": 0.3751528747964796, "grad_norm": 1.7377938032150269, "learning_rate": 4.375032825876545e-05, "loss": 2.6982, "step": 50000 }, { "epoch": 0.3759031805460725, "grad_norm": 2.384646415710449, "learning_rate": 4.37378231629389e-05, "loss": 2.8248, "step": 50100 }, { "epoch": 0.3766534862956655, "grad_norm": 1.7431467771530151, "learning_rate": 4.3725318067112354e-05, "loss": 2.8266, "step": 50200 }, { "epoch": 0.3774037920452584, "grad_norm": 2.6095237731933594, "learning_rate": 4.37128129712858e-05, "loss": 2.776, "step": 50300 }, { "epoch": 0.37815409779485143, "grad_norm": 2.369279623031616, "learning_rate": 4.370030787545925e-05, "loss": 2.8196, "step": 50400 }, { "epoch": 0.3789044035444444, "grad_norm": 1.9633548259735107, "learning_rate": 4.3687802779632704e-05, "loss": 2.9235, "step": 50500 }, { "epoch": 0.37965470929403733, "grad_norm": 2.3741753101348877, "learning_rate": 4.367529768380615e-05, "loss": 2.8319, "step": 50600 }, { "epoch": 0.3804050150436303, "grad_norm": 1.7014636993408203, "learning_rate": 4.3662792587979606e-05, "loss": 2.7253, "step": 50700 }, { "epoch": 0.38115532079322323, "grad_norm": 2.0373523235321045, "learning_rate": 4.365028749215305e-05, "loss": 2.8011, "step": 50800 }, { "epoch": 0.3819056265428162, "grad_norm": 2.009366273880005, "learning_rate": 4.363778239632651e-05, "loss": 2.9093, "step": 50900 }, { "epoch": 0.3826559322924092, "grad_norm": 1.730726957321167, "learning_rate": 4.3625277300499955e-05, "loss": 2.8339, "step": 51000 }, { "epoch": 0.38340623804200213, "grad_norm": 2.2947230339050293, "learning_rate": 4.361277220467341e-05, "loss": 2.815, "step": 51100 }, { "epoch": 0.3841565437915951, "grad_norm": 2.6862101554870605, "learning_rate": 4.360026710884686e-05, "loss": 2.9077, "step": 51200 }, { "epoch": 0.38490684954118803, "grad_norm": 1.8218668699264526, "learning_rate": 4.3587762013020305e-05, "loss": 2.9121, "step": 51300 }, { "epoch": 0.385657155290781, "grad_norm": 1.977739691734314, "learning_rate": 4.357525691719376e-05, "loss": 2.8922, "step": 51400 }, { "epoch": 0.38640746104037393, "grad_norm": 2.107344150543213, "learning_rate": 4.356275182136721e-05, "loss": 2.8876, "step": 51500 }, { "epoch": 0.3871577667899669, "grad_norm": 1.5303059816360474, "learning_rate": 4.355024672554066e-05, "loss": 2.9375, "step": 51600 }, { "epoch": 0.3879080725395599, "grad_norm": 2.290081262588501, "learning_rate": 4.3537741629714116e-05, "loss": 2.9004, "step": 51700 }, { "epoch": 0.38865837828915284, "grad_norm": 2.3764514923095703, "learning_rate": 4.3525236533887557e-05, "loss": 2.9033, "step": 51800 }, { "epoch": 0.3894086840387458, "grad_norm": 2.1455438137054443, "learning_rate": 4.351273143806101e-05, "loss": 2.7891, "step": 51900 }, { "epoch": 0.39015898978833874, "grad_norm": 1.5297259092330933, "learning_rate": 4.3500226342234465e-05, "loss": 2.9429, "step": 52000 }, { "epoch": 0.3909092955379317, "grad_norm": 1.6705195903778076, "learning_rate": 4.348784629736618e-05, "loss": 2.8681, "step": 52100 }, { "epoch": 0.39165960128752464, "grad_norm": 2.1914303302764893, "learning_rate": 4.347534120153963e-05, "loss": 2.7149, "step": 52200 }, { "epoch": 0.39240990703711764, "grad_norm": 2.668250322341919, "learning_rate": 4.346283610571308e-05, "loss": 2.8088, "step": 52300 }, { "epoch": 0.3931602127867106, "grad_norm": 2.426055669784546, "learning_rate": 4.345033100988653e-05, "loss": 2.6828, "step": 52400 }, { "epoch": 0.39391051853630354, "grad_norm": 1.4846216440200806, "learning_rate": 4.343782591405998e-05, "loss": 2.9168, "step": 52500 }, { "epoch": 0.3946608242858965, "grad_norm": 1.891160488128662, "learning_rate": 4.3425320818233435e-05, "loss": 2.9026, "step": 52600 }, { "epoch": 0.39541113003548944, "grad_norm": 2.6373863220214844, "learning_rate": 4.341281572240688e-05, "loss": 2.9798, "step": 52700 }, { "epoch": 0.3961614357850824, "grad_norm": 1.9230021238327026, "learning_rate": 4.340031062658033e-05, "loss": 2.9966, "step": 52800 }, { "epoch": 0.3969117415346754, "grad_norm": 1.3737983703613281, "learning_rate": 4.3387805530753784e-05, "loss": 2.698, "step": 52900 }, { "epoch": 0.39766204728426835, "grad_norm": 1.6333292722702026, "learning_rate": 4.337530043492723e-05, "loss": 2.8525, "step": 53000 }, { "epoch": 0.3984123530338613, "grad_norm": 1.5975350141525269, "learning_rate": 4.3362795339100686e-05, "loss": 2.8107, "step": 53100 }, { "epoch": 0.39916265878345425, "grad_norm": 1.4535393714904785, "learning_rate": 4.335029024327414e-05, "loss": 2.8859, "step": 53200 }, { "epoch": 0.3999129645330472, "grad_norm": 1.9881583452224731, "learning_rate": 4.333778514744759e-05, "loss": 2.8675, "step": 53300 }, { "epoch": 0.40066327028264015, "grad_norm": 1.7821288108825684, "learning_rate": 4.3325280051621036e-05, "loss": 2.8934, "step": 53400 }, { "epoch": 0.40141357603223315, "grad_norm": 1.5578762292861938, "learning_rate": 4.331277495579449e-05, "loss": 2.9057, "step": 53500 }, { "epoch": 0.4021638817818261, "grad_norm": 1.8474661111831665, "learning_rate": 4.330026985996794e-05, "loss": 2.8631, "step": 53600 }, { "epoch": 0.40291418753141905, "grad_norm": 1.8641854524612427, "learning_rate": 4.328776476414139e-05, "loss": 2.6901, "step": 53700 }, { "epoch": 0.403664493281012, "grad_norm": 1.805714726448059, "learning_rate": 4.327525966831484e-05, "loss": 2.8398, "step": 53800 }, { "epoch": 0.40441479903060495, "grad_norm": 2.4757421016693115, "learning_rate": 4.326275457248829e-05, "loss": 2.8828, "step": 53900 }, { "epoch": 0.4051651047801979, "grad_norm": 2.066080093383789, "learning_rate": 4.325024947666174e-05, "loss": 2.9376, "step": 54000 }, { "epoch": 0.4059154105297909, "grad_norm": 1.4848439693450928, "learning_rate": 4.3237744380835196e-05, "loss": 2.6577, "step": 54100 }, { "epoch": 0.40666571627938386, "grad_norm": 1.6162946224212646, "learning_rate": 4.322536433596691e-05, "loss": 2.7285, "step": 54200 }, { "epoch": 0.4074160220289768, "grad_norm": 2.0194997787475586, "learning_rate": 4.3212859240140355e-05, "loss": 2.7261, "step": 54300 }, { "epoch": 0.40816632777856976, "grad_norm": 2.277468681335449, "learning_rate": 4.320035414431381e-05, "loss": 2.6918, "step": 54400 }, { "epoch": 0.4089166335281627, "grad_norm": 1.91490638256073, "learning_rate": 4.3187849048487263e-05, "loss": 2.9012, "step": 54500 }, { "epoch": 0.40966693927775566, "grad_norm": 1.6008784770965576, "learning_rate": 4.317534395266071e-05, "loss": 2.8544, "step": 54600 }, { "epoch": 0.41041724502734866, "grad_norm": 2.052931547164917, "learning_rate": 4.3162838856834165e-05, "loss": 2.8756, "step": 54700 }, { "epoch": 0.4111675507769416, "grad_norm": 1.81214439868927, "learning_rate": 4.315033376100761e-05, "loss": 2.9345, "step": 54800 }, { "epoch": 0.41191785652653456, "grad_norm": 2.359877109527588, "learning_rate": 4.313782866518106e-05, "loss": 2.857, "step": 54900 }, { "epoch": 0.4126681622761275, "grad_norm": 5.058993816375732, "learning_rate": 4.3125323569354515e-05, "loss": 2.7442, "step": 55000 }, { "epoch": 0.41341846802572046, "grad_norm": 1.352443814277649, "learning_rate": 4.311281847352796e-05, "loss": 2.75, "step": 55100 }, { "epoch": 0.4141687737753134, "grad_norm": 3.0837998390197754, "learning_rate": 4.310031337770142e-05, "loss": 2.8617, "step": 55200 }, { "epoch": 0.4149190795249064, "grad_norm": 3.304598569869995, "learning_rate": 4.3087808281874865e-05, "loss": 2.8011, "step": 55300 }, { "epoch": 0.41566938527449937, "grad_norm": 2.1628146171569824, "learning_rate": 4.307530318604831e-05, "loss": 2.7037, "step": 55400 }, { "epoch": 0.4164196910240923, "grad_norm": 1.854494571685791, "learning_rate": 4.306279809022177e-05, "loss": 2.8208, "step": 55500 }, { "epoch": 0.41716999677368527, "grad_norm": 2.3402278423309326, "learning_rate": 4.305029299439522e-05, "loss": 2.9274, "step": 55600 }, { "epoch": 0.4179203025232782, "grad_norm": 2.2618114948272705, "learning_rate": 4.303778789856867e-05, "loss": 2.7998, "step": 55700 }, { "epoch": 0.41867060827287117, "grad_norm": 2.289133310317993, "learning_rate": 4.3025282802742117e-05, "loss": 2.7973, "step": 55800 }, { "epoch": 0.4194209140224642, "grad_norm": 1.7633171081542969, "learning_rate": 4.301277770691557e-05, "loss": 2.6614, "step": 55900 }, { "epoch": 0.4201712197720571, "grad_norm": 2.3616106510162354, "learning_rate": 4.300027261108902e-05, "loss": 2.6826, "step": 56000 }, { "epoch": 0.4209215255216501, "grad_norm": 2.6640560626983643, "learning_rate": 4.298776751526247e-05, "loss": 2.8727, "step": 56100 }, { "epoch": 0.421671831271243, "grad_norm": 3.2118418216705322, "learning_rate": 4.297526241943593e-05, "loss": 2.7792, "step": 56200 }, { "epoch": 0.422422137020836, "grad_norm": 1.7469470500946045, "learning_rate": 4.296275732360937e-05, "loss": 2.7721, "step": 56300 }, { "epoch": 0.4231724427704289, "grad_norm": 2.5742247104644775, "learning_rate": 4.295025222778282e-05, "loss": 2.8229, "step": 56400 }, { "epoch": 0.42392274852002193, "grad_norm": 2.614771604537964, "learning_rate": 4.293787218291454e-05, "loss": 2.7735, "step": 56500 }, { "epoch": 0.4246730542696149, "grad_norm": 1.6948806047439575, "learning_rate": 4.2925367087087994e-05, "loss": 2.877, "step": 56600 }, { "epoch": 0.42542336001920783, "grad_norm": 1.5341800451278687, "learning_rate": 4.291286199126144e-05, "loss": 2.9099, "step": 56700 }, { "epoch": 0.4261736657688008, "grad_norm": 2.3066458702087402, "learning_rate": 4.290035689543489e-05, "loss": 2.7814, "step": 56800 }, { "epoch": 0.42692397151839373, "grad_norm": 1.5436091423034668, "learning_rate": 4.2887851799608344e-05, "loss": 2.8455, "step": 56900 }, { "epoch": 0.4276742772679867, "grad_norm": 1.6862366199493408, "learning_rate": 4.287534670378179e-05, "loss": 2.7947, "step": 57000 }, { "epoch": 0.4284245830175797, "grad_norm": 2.500542163848877, "learning_rate": 4.2862841607955246e-05, "loss": 2.7616, "step": 57100 }, { "epoch": 0.42917488876717264, "grad_norm": 1.8449748754501343, "learning_rate": 4.2850336512128694e-05, "loss": 2.716, "step": 57200 }, { "epoch": 0.4299251945167656, "grad_norm": 1.2980217933654785, "learning_rate": 4.283783141630214e-05, "loss": 2.8165, "step": 57300 }, { "epoch": 0.43067550026635854, "grad_norm": 1.4376020431518555, "learning_rate": 4.2825326320475596e-05, "loss": 2.6769, "step": 57400 }, { "epoch": 0.4314258060159515, "grad_norm": 1.9046565294265747, "learning_rate": 4.2812821224649043e-05, "loss": 2.6971, "step": 57500 }, { "epoch": 0.43217611176554444, "grad_norm": 1.8515081405639648, "learning_rate": 4.28003161288225e-05, "loss": 2.6904, "step": 57600 }, { "epoch": 0.43292641751513744, "grad_norm": 2.176826000213623, "learning_rate": 4.278781103299595e-05, "loss": 2.9546, "step": 57700 }, { "epoch": 0.4336767232647304, "grad_norm": 1.7699759006500244, "learning_rate": 4.27753059371694e-05, "loss": 2.8245, "step": 57800 }, { "epoch": 0.43442702901432334, "grad_norm": 1.4172072410583496, "learning_rate": 4.276280084134285e-05, "loss": 2.7987, "step": 57900 }, { "epoch": 0.4351773347639163, "grad_norm": 1.3096961975097656, "learning_rate": 4.27502957455163e-05, "loss": 2.8386, "step": 58000 }, { "epoch": 0.43592764051350924, "grad_norm": 4.02482795715332, "learning_rate": 4.273779064968975e-05, "loss": 2.6894, "step": 58100 }, { "epoch": 0.4366779462631022, "grad_norm": 1.8112900257110596, "learning_rate": 4.2725285553863204e-05, "loss": 2.7871, "step": 58200 }, { "epoch": 0.4374282520126952, "grad_norm": 1.929822564125061, "learning_rate": 4.271278045803665e-05, "loss": 2.8116, "step": 58300 }, { "epoch": 0.43817855776228815, "grad_norm": 1.6767934560775757, "learning_rate": 4.27002753622101e-05, "loss": 2.775, "step": 58400 }, { "epoch": 0.4389288635118811, "grad_norm": 2.2732884883880615, "learning_rate": 4.2687770266383554e-05, "loss": 2.8628, "step": 58500 }, { "epoch": 0.43967916926147405, "grad_norm": 1.4579896926879883, "learning_rate": 4.267526517055701e-05, "loss": 2.8021, "step": 58600 }, { "epoch": 0.440429475011067, "grad_norm": 2.8051531314849854, "learning_rate": 4.2662760074730456e-05, "loss": 2.9186, "step": 58700 }, { "epoch": 0.44117978076065995, "grad_norm": 2.0410819053649902, "learning_rate": 4.26502549789039e-05, "loss": 2.9216, "step": 58800 }, { "epoch": 0.44193008651025295, "grad_norm": 1.4961168766021729, "learning_rate": 4.263774988307736e-05, "loss": 2.7266, "step": 58900 }, { "epoch": 0.4426803922598459, "grad_norm": 1.6399474143981934, "learning_rate": 4.2625369838209075e-05, "loss": 2.8163, "step": 59000 }, { "epoch": 0.44343069800943885, "grad_norm": 1.6868058443069458, "learning_rate": 4.261286474238252e-05, "loss": 2.7533, "step": 59100 }, { "epoch": 0.4441810037590318, "grad_norm": 1.8530505895614624, "learning_rate": 4.260035964655598e-05, "loss": 2.6576, "step": 59200 }, { "epoch": 0.44493130950862475, "grad_norm": 2.109194278717041, "learning_rate": 4.2587854550729425e-05, "loss": 2.8182, "step": 59300 }, { "epoch": 0.4456816152582177, "grad_norm": 2.178828239440918, "learning_rate": 4.257534945490287e-05, "loss": 2.7553, "step": 59400 }, { "epoch": 0.4464319210078107, "grad_norm": 1.6734192371368408, "learning_rate": 4.256284435907633e-05, "loss": 2.7861, "step": 59500 }, { "epoch": 0.44718222675740366, "grad_norm": 2.3080146312713623, "learning_rate": 4.2550339263249774e-05, "loss": 2.978, "step": 59600 }, { "epoch": 0.4479325325069966, "grad_norm": 1.3194116353988647, "learning_rate": 4.253783416742323e-05, "loss": 2.8422, "step": 59700 }, { "epoch": 0.44868283825658956, "grad_norm": 2.2628822326660156, "learning_rate": 4.2525329071596676e-05, "loss": 2.5664, "step": 59800 }, { "epoch": 0.4494331440061825, "grad_norm": 2.2251646518707275, "learning_rate": 4.2512823975770124e-05, "loss": 2.7647, "step": 59900 }, { "epoch": 0.45018344975577546, "grad_norm": 2.9580283164978027, "learning_rate": 4.250031887994358e-05, "loss": 2.9117, "step": 60000 }, { "epoch": 0.45093375550536846, "grad_norm": 2.337172031402588, "learning_rate": 4.248781378411703e-05, "loss": 2.9177, "step": 60100 }, { "epoch": 0.4516840612549614, "grad_norm": 2.0606796741485596, "learning_rate": 4.247530868829048e-05, "loss": 2.7904, "step": 60200 }, { "epoch": 0.45243436700455436, "grad_norm": 2.00978684425354, "learning_rate": 4.246280359246393e-05, "loss": 2.9042, "step": 60300 }, { "epoch": 0.4531846727541473, "grad_norm": 1.8550772666931152, "learning_rate": 4.245029849663738e-05, "loss": 2.7212, "step": 60400 }, { "epoch": 0.45393497850374026, "grad_norm": 1.8198041915893555, "learning_rate": 4.243779340081083e-05, "loss": 2.8928, "step": 60500 }, { "epoch": 0.4546852842533332, "grad_norm": 1.7957936525344849, "learning_rate": 4.2425288304984285e-05, "loss": 2.9192, "step": 60600 }, { "epoch": 0.4554355900029262, "grad_norm": 1.9763379096984863, "learning_rate": 4.241278320915774e-05, "loss": 2.841, "step": 60700 }, { "epoch": 0.45618589575251917, "grad_norm": 1.6828467845916748, "learning_rate": 4.240027811333118e-05, "loss": 2.6679, "step": 60800 }, { "epoch": 0.4569362015021121, "grad_norm": 2.0394365787506104, "learning_rate": 4.2387773017504634e-05, "loss": 2.7905, "step": 60900 }, { "epoch": 0.45768650725170507, "grad_norm": 2.5740368366241455, "learning_rate": 4.237526792167809e-05, "loss": 2.9163, "step": 61000 }, { "epoch": 0.458436813001298, "grad_norm": 2.0181334018707275, "learning_rate": 4.2362762825851536e-05, "loss": 2.7965, "step": 61100 }, { "epoch": 0.45918711875089097, "grad_norm": 2.063427686691284, "learning_rate": 4.235025773002499e-05, "loss": 2.8886, "step": 61200 }, { "epoch": 0.45993742450048397, "grad_norm": 2.078721046447754, "learning_rate": 4.233775263419844e-05, "loss": 2.9371, "step": 61300 }, { "epoch": 0.4606877302500769, "grad_norm": 1.570176362991333, "learning_rate": 4.2325247538371886e-05, "loss": 2.7937, "step": 61400 }, { "epoch": 0.46143803599966987, "grad_norm": 1.7420517206192017, "learning_rate": 4.231274244254534e-05, "loss": 2.8278, "step": 61500 }, { "epoch": 0.4621883417492628, "grad_norm": 1.9918489456176758, "learning_rate": 4.230023734671879e-05, "loss": 2.6565, "step": 61600 }, { "epoch": 0.46293864749885577, "grad_norm": 1.5920336246490479, "learning_rate": 4.228773225089224e-05, "loss": 2.8782, "step": 61700 }, { "epoch": 0.4636889532484487, "grad_norm": 1.878685712814331, "learning_rate": 4.227522715506569e-05, "loss": 2.8352, "step": 61800 }, { "epoch": 0.4644392589980417, "grad_norm": 1.5909302234649658, "learning_rate": 4.226272205923914e-05, "loss": 2.8312, "step": 61900 }, { "epoch": 0.4651895647476347, "grad_norm": 2.0783987045288086, "learning_rate": 4.225021696341259e-05, "loss": 2.833, "step": 62000 }, { "epoch": 0.4659398704972276, "grad_norm": 1.6852279901504517, "learning_rate": 4.223783691854431e-05, "loss": 2.7757, "step": 62100 }, { "epoch": 0.4666901762468206, "grad_norm": 2.162444591522217, "learning_rate": 4.2225331822717764e-05, "loss": 2.751, "step": 62200 }, { "epoch": 0.4674404819964135, "grad_norm": 2.341341018676758, "learning_rate": 4.2212826726891205e-05, "loss": 2.8185, "step": 62300 }, { "epoch": 0.4681907877460065, "grad_norm": 2.3352813720703125, "learning_rate": 4.220032163106466e-05, "loss": 2.9015, "step": 62400 }, { "epoch": 0.4689410934955995, "grad_norm": 2.246023178100586, "learning_rate": 4.2187816535238114e-05, "loss": 2.867, "step": 62500 }, { "epoch": 0.46969139924519243, "grad_norm": 2.2533183097839355, "learning_rate": 4.217531143941156e-05, "loss": 2.6517, "step": 62600 }, { "epoch": 0.4704417049947854, "grad_norm": 1.4549832344055176, "learning_rate": 4.2162806343585016e-05, "loss": 2.7506, "step": 62700 }, { "epoch": 0.47119201074437833, "grad_norm": 1.7807466983795166, "learning_rate": 4.215030124775846e-05, "loss": 2.8314, "step": 62800 }, { "epoch": 0.4719423164939713, "grad_norm": 2.079845428466797, "learning_rate": 4.213779615193191e-05, "loss": 2.8351, "step": 62900 }, { "epoch": 0.47269262224356423, "grad_norm": 1.3249704837799072, "learning_rate": 4.2125291056105365e-05, "loss": 2.8722, "step": 63000 }, { "epoch": 0.47344292799315724, "grad_norm": 2.158754825592041, "learning_rate": 4.211278596027882e-05, "loss": 2.7919, "step": 63100 }, { "epoch": 0.4741932337427502, "grad_norm": 2.3303632736206055, "learning_rate": 4.210028086445227e-05, "loss": 2.9459, "step": 63200 }, { "epoch": 0.47494353949234314, "grad_norm": 1.6230649948120117, "learning_rate": 4.2087775768625715e-05, "loss": 2.8876, "step": 63300 }, { "epoch": 0.4756938452419361, "grad_norm": 2.262199878692627, "learning_rate": 4.207527067279917e-05, "loss": 2.6611, "step": 63400 }, { "epoch": 0.47644415099152904, "grad_norm": 4.179419994354248, "learning_rate": 4.206276557697262e-05, "loss": 2.801, "step": 63500 }, { "epoch": 0.477194456741122, "grad_norm": 1.6645431518554688, "learning_rate": 4.205026048114607e-05, "loss": 2.8559, "step": 63600 }, { "epoch": 0.477944762490715, "grad_norm": 2.2447829246520996, "learning_rate": 4.203775538531952e-05, "loss": 2.7862, "step": 63700 }, { "epoch": 0.47869506824030794, "grad_norm": 1.7352089881896973, "learning_rate": 4.2025250289492967e-05, "loss": 2.8127, "step": 63800 }, { "epoch": 0.4794453739899009, "grad_norm": 2.350381374359131, "learning_rate": 4.201274519366642e-05, "loss": 2.8988, "step": 63900 }, { "epoch": 0.48019567973949384, "grad_norm": 1.9579436779022217, "learning_rate": 4.200024009783987e-05, "loss": 2.8905, "step": 64000 }, { "epoch": 0.4809459854890868, "grad_norm": 1.569189190864563, "learning_rate": 4.198773500201332e-05, "loss": 2.8713, "step": 64100 }, { "epoch": 0.48169629123867974, "grad_norm": 1.7493616342544556, "learning_rate": 4.197522990618678e-05, "loss": 2.7486, "step": 64200 }, { "epoch": 0.48244659698827275, "grad_norm": 2.3634774684906006, "learning_rate": 4.196272481036022e-05, "loss": 2.8368, "step": 64300 }, { "epoch": 0.4831969027378657, "grad_norm": 1.4868675470352173, "learning_rate": 4.1950344765491936e-05, "loss": 2.7037, "step": 64400 }, { "epoch": 0.48394720848745865, "grad_norm": 2.3445897102355957, "learning_rate": 4.193783966966539e-05, "loss": 2.657, "step": 64500 }, { "epoch": 0.4846975142370516, "grad_norm": 1.7023659944534302, "learning_rate": 4.1925334573838845e-05, "loss": 2.9026, "step": 64600 }, { "epoch": 0.48544781998664455, "grad_norm": 2.659402370452881, "learning_rate": 4.191282947801229e-05, "loss": 2.8481, "step": 64700 }, { "epoch": 0.4861981257362375, "grad_norm": 1.8160765171051025, "learning_rate": 4.190032438218574e-05, "loss": 2.7548, "step": 64800 }, { "epoch": 0.4869484314858305, "grad_norm": 1.5463461875915527, "learning_rate": 4.1887819286359194e-05, "loss": 2.7867, "step": 64900 }, { "epoch": 0.48769873723542345, "grad_norm": 1.4208579063415527, "learning_rate": 4.187531419053264e-05, "loss": 2.5768, "step": 65000 }, { "epoch": 0.4884490429850164, "grad_norm": 2.760589361190796, "learning_rate": 4.1862809094706096e-05, "loss": 2.8544, "step": 65100 }, { "epoch": 0.48919934873460935, "grad_norm": 1.8087366819381714, "learning_rate": 4.185030399887955e-05, "loss": 2.7054, "step": 65200 }, { "epoch": 0.4899496544842023, "grad_norm": 1.8927841186523438, "learning_rate": 4.183779890305299e-05, "loss": 2.7805, "step": 65300 }, { "epoch": 0.49069996023379525, "grad_norm": 1.9154362678527832, "learning_rate": 4.1825293807226446e-05, "loss": 2.8131, "step": 65400 }, { "epoch": 0.49145026598338826, "grad_norm": 2.3180346488952637, "learning_rate": 4.18127887113999e-05, "loss": 2.8272, "step": 65500 }, { "epoch": 0.4922005717329812, "grad_norm": 2.813215970993042, "learning_rate": 4.180028361557335e-05, "loss": 2.8036, "step": 65600 }, { "epoch": 0.49295087748257416, "grad_norm": 1.9439103603363037, "learning_rate": 4.17877785197468e-05, "loss": 2.7658, "step": 65700 }, { "epoch": 0.4937011832321671, "grad_norm": 1.4861825704574585, "learning_rate": 4.177527342392025e-05, "loss": 2.6556, "step": 65800 }, { "epoch": 0.49445148898176006, "grad_norm": 1.9082887172698975, "learning_rate": 4.17627683280937e-05, "loss": 2.6832, "step": 65900 }, { "epoch": 0.495201794731353, "grad_norm": 1.3060579299926758, "learning_rate": 4.175026323226715e-05, "loss": 2.817, "step": 66000 }, { "epoch": 0.495952100480946, "grad_norm": 1.5570387840270996, "learning_rate": 4.17377581364406e-05, "loss": 2.811, "step": 66100 }, { "epoch": 0.49670240623053896, "grad_norm": 1.6826465129852295, "learning_rate": 4.1725253040614054e-05, "loss": 2.8382, "step": 66200 }, { "epoch": 0.4974527119801319, "grad_norm": 1.7846342325210571, "learning_rate": 4.17127479447875e-05, "loss": 2.7685, "step": 66300 }, { "epoch": 0.49820301772972486, "grad_norm": 1.1958807706832886, "learning_rate": 4.170024284896095e-05, "loss": 2.6143, "step": 66400 }, { "epoch": 0.4989533234793178, "grad_norm": 2.001094341278076, "learning_rate": 4.168786280409267e-05, "loss": 2.9559, "step": 66500 }, { "epoch": 0.49970362922891076, "grad_norm": 2.6946887969970703, "learning_rate": 4.167535770826612e-05, "loss": 2.8068, "step": 66600 }, { "epoch": 0.5004539349785038, "grad_norm": 1.9556910991668701, "learning_rate": 4.1662852612439576e-05, "loss": 2.7805, "step": 66700 }, { "epoch": 0.5012042407280967, "grad_norm": 2.374946117401123, "learning_rate": 4.1650347516613016e-05, "loss": 2.899, "step": 66800 }, { "epoch": 0.5019545464776897, "grad_norm": 1.931427001953125, "learning_rate": 4.163784242078647e-05, "loss": 2.7465, "step": 66900 }, { "epoch": 0.5027048522272827, "grad_norm": 1.8608232736587524, "learning_rate": 4.1625337324959925e-05, "loss": 2.8958, "step": 67000 }, { "epoch": 0.5034551579768756, "grad_norm": 2.3692400455474854, "learning_rate": 4.161283222913337e-05, "loss": 2.6311, "step": 67100 }, { "epoch": 0.5042054637264686, "grad_norm": 1.650046944618225, "learning_rate": 4.160032713330683e-05, "loss": 2.6283, "step": 67200 }, { "epoch": 0.5049557694760615, "grad_norm": 1.72211492061615, "learning_rate": 4.1587822037480275e-05, "loss": 2.8102, "step": 67300 }, { "epoch": 0.5057060752256545, "grad_norm": 1.954256296157837, "learning_rate": 4.157531694165372e-05, "loss": 2.8395, "step": 67400 }, { "epoch": 0.5064563809752474, "grad_norm": 2.238466501235962, "learning_rate": 4.156281184582718e-05, "loss": 2.8306, "step": 67500 }, { "epoch": 0.5072066867248404, "grad_norm": 2.012645721435547, "learning_rate": 4.155030675000063e-05, "loss": 2.7484, "step": 67600 }, { "epoch": 0.5079569924744334, "grad_norm": 1.768333077430725, "learning_rate": 4.153780165417408e-05, "loss": 2.6276, "step": 67700 }, { "epoch": 0.5087072982240263, "grad_norm": 1.8125181198120117, "learning_rate": 4.1525296558347527e-05, "loss": 2.6868, "step": 67800 }, { "epoch": 0.5094576039736193, "grad_norm": 1.7857016324996948, "learning_rate": 4.151279146252098e-05, "loss": 2.826, "step": 67900 }, { "epoch": 0.5102079097232122, "grad_norm": 2.082529306411743, "learning_rate": 4.150028636669443e-05, "loss": 2.8671, "step": 68000 }, { "epoch": 0.5109582154728052, "grad_norm": 1.4744789600372314, "learning_rate": 4.148778127086788e-05, "loss": 2.9147, "step": 68100 }, { "epoch": 0.5117085212223982, "grad_norm": 1.5537129640579224, "learning_rate": 4.147527617504133e-05, "loss": 2.882, "step": 68200 }, { "epoch": 0.5124588269719911, "grad_norm": 1.6418806314468384, "learning_rate": 4.146277107921478e-05, "loss": 2.6326, "step": 68300 }, { "epoch": 0.5132091327215841, "grad_norm": 1.3979482650756836, "learning_rate": 4.145026598338823e-05, "loss": 2.7603, "step": 68400 }, { "epoch": 0.513959438471177, "grad_norm": 1.8105440139770508, "learning_rate": 4.143776088756168e-05, "loss": 2.7512, "step": 68500 }, { "epoch": 0.51470974422077, "grad_norm": 1.3424893617630005, "learning_rate": 4.1425255791735135e-05, "loss": 2.7541, "step": 68600 }, { "epoch": 0.5154600499703629, "grad_norm": 2.366384744644165, "learning_rate": 4.141275069590859e-05, "loss": 2.7135, "step": 68700 }, { "epoch": 0.5162103557199559, "grad_norm": 1.4834479093551636, "learning_rate": 4.140024560008203e-05, "loss": 2.8182, "step": 68800 }, { "epoch": 0.5169606614695489, "grad_norm": 2.2202532291412354, "learning_rate": 4.138786555521375e-05, "loss": 2.8641, "step": 68900 }, { "epoch": 0.5177109672191418, "grad_norm": 1.9048901796340942, "learning_rate": 4.13753604593872e-05, "loss": 2.6684, "step": 69000 }, { "epoch": 0.5184612729687348, "grad_norm": 3.86259388923645, "learning_rate": 4.1362855363560656e-05, "loss": 2.8242, "step": 69100 }, { "epoch": 0.5192115787183277, "grad_norm": 1.9474337100982666, "learning_rate": 4.1350350267734104e-05, "loss": 2.8121, "step": 69200 }, { "epoch": 0.5199618844679207, "grad_norm": 1.3200124502182007, "learning_rate": 4.133784517190755e-05, "loss": 2.766, "step": 69300 }, { "epoch": 0.5207121902175136, "grad_norm": 2.3218255043029785, "learning_rate": 4.1325340076081006e-05, "loss": 2.9561, "step": 69400 }, { "epoch": 0.5214624959671066, "grad_norm": 1.6760667562484741, "learning_rate": 4.1312834980254453e-05, "loss": 2.7438, "step": 69500 }, { "epoch": 0.5222128017166996, "grad_norm": 1.9570534229278564, "learning_rate": 4.130032988442791e-05, "loss": 2.78, "step": 69600 }, { "epoch": 0.5229631074662925, "grad_norm": 1.8822660446166992, "learning_rate": 4.128782478860136e-05, "loss": 2.8734, "step": 69700 }, { "epoch": 0.5237134132158855, "grad_norm": 2.744049072265625, "learning_rate": 4.12753196927748e-05, "loss": 2.9677, "step": 69800 }, { "epoch": 0.5244637189654784, "grad_norm": 1.9885058403015137, "learning_rate": 4.126281459694826e-05, "loss": 2.8034, "step": 69900 }, { "epoch": 0.5252140247150714, "grad_norm": 3.136559009552002, "learning_rate": 4.125030950112171e-05, "loss": 2.8176, "step": 70000 }, { "epoch": 0.5259643304646644, "grad_norm": 1.7924681901931763, "learning_rate": 4.123780440529516e-05, "loss": 2.7107, "step": 70100 }, { "epoch": 0.5267146362142573, "grad_norm": 2.04101824760437, "learning_rate": 4.1225299309468614e-05, "loss": 2.7867, "step": 70200 }, { "epoch": 0.5274649419638503, "grad_norm": 1.6148746013641357, "learning_rate": 4.121279421364206e-05, "loss": 3.0112, "step": 70300 }, { "epoch": 0.5282152477134432, "grad_norm": 1.5649806261062622, "learning_rate": 4.120028911781551e-05, "loss": 2.7449, "step": 70400 }, { "epoch": 0.5289655534630362, "grad_norm": 2.6997034549713135, "learning_rate": 4.1187784021988964e-05, "loss": 2.9302, "step": 70500 }, { "epoch": 0.5297158592126291, "grad_norm": 1.6010438203811646, "learning_rate": 4.117527892616241e-05, "loss": 2.7108, "step": 70600 }, { "epoch": 0.5304661649622221, "grad_norm": 3.609135389328003, "learning_rate": 4.1162773830335866e-05, "loss": 2.7469, "step": 70700 }, { "epoch": 0.5312164707118151, "grad_norm": 1.2075093984603882, "learning_rate": 4.115026873450931e-05, "loss": 2.8129, "step": 70800 }, { "epoch": 0.531966776461408, "grad_norm": 2.1926889419555664, "learning_rate": 4.113776363868276e-05, "loss": 2.668, "step": 70900 }, { "epoch": 0.532717082211001, "grad_norm": 3.109833002090454, "learning_rate": 4.1125258542856215e-05, "loss": 2.767, "step": 71000 }, { "epoch": 0.5334673879605939, "grad_norm": 1.7828940153121948, "learning_rate": 4.111275344702967e-05, "loss": 2.8468, "step": 71100 }, { "epoch": 0.5342176937101869, "grad_norm": 1.580435872077942, "learning_rate": 4.110024835120312e-05, "loss": 3.1312, "step": 71200 }, { "epoch": 0.5349679994597799, "grad_norm": 1.925756573677063, "learning_rate": 4.108774325537657e-05, "loss": 2.6827, "step": 71300 }, { "epoch": 0.5357183052093728, "grad_norm": 1.5760747194290161, "learning_rate": 4.107523815955002e-05, "loss": 2.8722, "step": 71400 }, { "epoch": 0.5364686109589658, "grad_norm": 1.422162413597107, "learning_rate": 4.106285811468174e-05, "loss": 2.6455, "step": 71500 }, { "epoch": 0.5372189167085587, "grad_norm": 2.0439658164978027, "learning_rate": 4.1050353018855184e-05, "loss": 2.7064, "step": 71600 }, { "epoch": 0.5379692224581517, "grad_norm": 1.7054847478866577, "learning_rate": 4.103784792302864e-05, "loss": 2.7891, "step": 71700 }, { "epoch": 0.5387195282077446, "grad_norm": 1.5061097145080566, "learning_rate": 4.1025342827202086e-05, "loss": 2.7133, "step": 71800 }, { "epoch": 0.5394698339573376, "grad_norm": 1.8058314323425293, "learning_rate": 4.1012837731375534e-05, "loss": 2.7162, "step": 71900 }, { "epoch": 0.5402201397069306, "grad_norm": 1.7157914638519287, "learning_rate": 4.100033263554899e-05, "loss": 2.8843, "step": 72000 }, { "epoch": 0.5409704454565235, "grad_norm": 1.999328374862671, "learning_rate": 4.098782753972244e-05, "loss": 2.806, "step": 72100 }, { "epoch": 0.5417207512061165, "grad_norm": 1.6352635622024536, "learning_rate": 4.097532244389589e-05, "loss": 2.734, "step": 72200 }, { "epoch": 0.5424710569557094, "grad_norm": 1.6412452459335327, "learning_rate": 4.096281734806934e-05, "loss": 2.7475, "step": 72300 }, { "epoch": 0.5432213627053024, "grad_norm": 3.26196026802063, "learning_rate": 4.095031225224279e-05, "loss": 2.7979, "step": 72400 }, { "epoch": 0.5439716684548954, "grad_norm": 1.6228463649749756, "learning_rate": 4.093780715641624e-05, "loss": 2.822, "step": 72500 }, { "epoch": 0.5447219742044883, "grad_norm": 2.119394302368164, "learning_rate": 4.0925302060589695e-05, "loss": 2.9448, "step": 72600 }, { "epoch": 0.5454722799540813, "grad_norm": 1.561208724975586, "learning_rate": 4.091279696476314e-05, "loss": 2.9051, "step": 72700 }, { "epoch": 0.5462225857036742, "grad_norm": 1.5126705169677734, "learning_rate": 4.090029186893659e-05, "loss": 2.6996, "step": 72800 }, { "epoch": 0.5469728914532672, "grad_norm": 2.0908243656158447, "learning_rate": 4.0887786773110044e-05, "loss": 2.7615, "step": 72900 }, { "epoch": 0.5477231972028601, "grad_norm": 1.8112488985061646, "learning_rate": 4.087528167728349e-05, "loss": 2.7317, "step": 73000 }, { "epoch": 0.5484735029524531, "grad_norm": 4.222007751464844, "learning_rate": 4.0862776581456946e-05, "loss": 2.916, "step": 73100 }, { "epoch": 0.5492238087020461, "grad_norm": 1.648003339767456, "learning_rate": 4.08502714856304e-05, "loss": 2.8066, "step": 73200 }, { "epoch": 0.549974114451639, "grad_norm": 1.7542273998260498, "learning_rate": 4.083776638980384e-05, "loss": 2.7309, "step": 73300 }, { "epoch": 0.550724420201232, "grad_norm": 1.8702387809753418, "learning_rate": 4.0825261293977296e-05, "loss": 2.745, "step": 73400 }, { "epoch": 0.5514747259508249, "grad_norm": 1.91477370262146, "learning_rate": 4.081275619815075e-05, "loss": 2.8942, "step": 73500 }, { "epoch": 0.5522250317004179, "grad_norm": 2.5143237113952637, "learning_rate": 4.08002511023242e-05, "loss": 2.8204, "step": 73600 }, { "epoch": 0.5529753374500109, "grad_norm": 1.7237865924835205, "learning_rate": 4.078774600649765e-05, "loss": 2.8861, "step": 73700 }, { "epoch": 0.5537256431996038, "grad_norm": 1.3884165287017822, "learning_rate": 4.07752409106711e-05, "loss": 2.8771, "step": 73800 }, { "epoch": 0.5544759489491968, "grad_norm": 1.681843876838684, "learning_rate": 4.076273581484455e-05, "loss": 2.8753, "step": 73900 }, { "epoch": 0.5552262546987897, "grad_norm": 2.1459298133850098, "learning_rate": 4.0750355769976265e-05, "loss": 2.8269, "step": 74000 }, { "epoch": 0.5559765604483827, "grad_norm": 1.7073999643325806, "learning_rate": 4.073785067414972e-05, "loss": 2.6861, "step": 74100 }, { "epoch": 0.5567268661979756, "grad_norm": 2.166677951812744, "learning_rate": 4.0725345578323174e-05, "loss": 2.7608, "step": 74200 }, { "epoch": 0.5574771719475686, "grad_norm": 1.3283852338790894, "learning_rate": 4.0712840482496615e-05, "loss": 2.7824, "step": 74300 }, { "epoch": 0.5582274776971616, "grad_norm": 1.982663869857788, "learning_rate": 4.070033538667007e-05, "loss": 2.7174, "step": 74400 }, { "epoch": 0.5589777834467545, "grad_norm": 1.9972954988479614, "learning_rate": 4.0687830290843524e-05, "loss": 2.6754, "step": 74500 }, { "epoch": 0.5597280891963475, "grad_norm": 1.800122618675232, "learning_rate": 4.067532519501697e-05, "loss": 2.7409, "step": 74600 }, { "epoch": 0.5604783949459404, "grad_norm": 2.356868267059326, "learning_rate": 4.0662820099190426e-05, "loss": 2.8028, "step": 74700 }, { "epoch": 0.5612287006955334, "grad_norm": 2.2198007106781006, "learning_rate": 4.065031500336387e-05, "loss": 2.7849, "step": 74800 }, { "epoch": 0.5619790064451264, "grad_norm": 1.9320533275604248, "learning_rate": 4.063780990753732e-05, "loss": 2.8333, "step": 74900 }, { "epoch": 0.5627293121947193, "grad_norm": 1.3204776048660278, "learning_rate": 4.0625304811710775e-05, "loss": 2.8409, "step": 75000 }, { "epoch": 0.5634796179443123, "grad_norm": 2.0959415435791016, "learning_rate": 4.061279971588422e-05, "loss": 2.7398, "step": 75100 }, { "epoch": 0.5642299236939052, "grad_norm": 1.4534437656402588, "learning_rate": 4.060029462005768e-05, "loss": 2.7395, "step": 75200 }, { "epoch": 0.5649802294434982, "grad_norm": 1.8724315166473389, "learning_rate": 4.0587789524231125e-05, "loss": 2.6887, "step": 75300 }, { "epoch": 0.5657305351930911, "grad_norm": 3.484524965286255, "learning_rate": 4.057528442840457e-05, "loss": 2.9154, "step": 75400 }, { "epoch": 0.5664808409426841, "grad_norm": 1.5693479776382446, "learning_rate": 4.056277933257803e-05, "loss": 2.7031, "step": 75500 }, { "epoch": 0.5672311466922771, "grad_norm": 2.2339892387390137, "learning_rate": 4.055027423675148e-05, "loss": 2.7883, "step": 75600 }, { "epoch": 0.56798145244187, "grad_norm": 1.772937536239624, "learning_rate": 4.053776914092493e-05, "loss": 2.8591, "step": 75700 }, { "epoch": 0.568731758191463, "grad_norm": 1.3991334438323975, "learning_rate": 4.052526404509838e-05, "loss": 2.8966, "step": 75800 }, { "epoch": 0.5694820639410559, "grad_norm": 1.5110833644866943, "learning_rate": 4.051275894927183e-05, "loss": 2.8119, "step": 75900 }, { "epoch": 0.5702323696906489, "grad_norm": 1.4663963317871094, "learning_rate": 4.050025385344528e-05, "loss": 2.8047, "step": 76000 }, { "epoch": 0.570982675440242, "grad_norm": 1.9878113269805908, "learning_rate": 4.0487873808576996e-05, "loss": 2.7245, "step": 76100 }, { "epoch": 0.5717329811898348, "grad_norm": 4.025021553039551, "learning_rate": 4.047536871275045e-05, "loss": 2.8096, "step": 76200 }, { "epoch": 0.5724832869394278, "grad_norm": 1.596652865409851, "learning_rate": 4.04628636169239e-05, "loss": 2.7618, "step": 76300 }, { "epoch": 0.5732335926890207, "grad_norm": 1.4616409540176392, "learning_rate": 4.0450358521097346e-05, "loss": 2.8286, "step": 76400 }, { "epoch": 0.5739838984386137, "grad_norm": 1.683151125907898, "learning_rate": 4.04378534252708e-05, "loss": 2.8323, "step": 76500 }, { "epoch": 0.5747342041882066, "grad_norm": 1.2921260595321655, "learning_rate": 4.0425348329444255e-05, "loss": 2.7577, "step": 76600 }, { "epoch": 0.5754845099377996, "grad_norm": 1.6592955589294434, "learning_rate": 4.04128432336177e-05, "loss": 2.7008, "step": 76700 }, { "epoch": 0.5762348156873927, "grad_norm": 2.841151237487793, "learning_rate": 4.040033813779115e-05, "loss": 2.845, "step": 76800 }, { "epoch": 0.5769851214369855, "grad_norm": 1.8082246780395508, "learning_rate": 4.0387833041964604e-05, "loss": 2.7981, "step": 76900 }, { "epoch": 0.5777354271865786, "grad_norm": 2.01246976852417, "learning_rate": 4.037532794613805e-05, "loss": 2.8506, "step": 77000 }, { "epoch": 0.5784857329361714, "grad_norm": 2.0985543727874756, "learning_rate": 4.0362822850311506e-05, "loss": 2.8285, "step": 77100 }, { "epoch": 0.5792360386857645, "grad_norm": 3.937431573867798, "learning_rate": 4.0350317754484954e-05, "loss": 2.7909, "step": 77200 }, { "epoch": 0.5799863444353575, "grad_norm": 1.587732195854187, "learning_rate": 4.03378126586584e-05, "loss": 2.8111, "step": 77300 }, { "epoch": 0.5807366501849504, "grad_norm": 2.5483558177948, "learning_rate": 4.0325307562831856e-05, "loss": 2.7307, "step": 77400 }, { "epoch": 0.5814869559345434, "grad_norm": 1.4300932884216309, "learning_rate": 4.0312802467005304e-05, "loss": 2.8374, "step": 77500 }, { "epoch": 0.5822372616841363, "grad_norm": 2.1839113235473633, "learning_rate": 4.030029737117876e-05, "loss": 2.789, "step": 77600 }, { "epoch": 0.5829875674337293, "grad_norm": 1.4038469791412354, "learning_rate": 4.028779227535221e-05, "loss": 2.742, "step": 77700 }, { "epoch": 0.5837378731833222, "grad_norm": 1.9752931594848633, "learning_rate": 4.027528717952565e-05, "loss": 2.8397, "step": 77800 }, { "epoch": 0.5844881789329152, "grad_norm": 1.536289095878601, "learning_rate": 4.026278208369911e-05, "loss": 2.9214, "step": 77900 }, { "epoch": 0.5852384846825082, "grad_norm": 1.595198154449463, "learning_rate": 4.025027698787256e-05, "loss": 2.797, "step": 78000 }, { "epoch": 0.5859887904321011, "grad_norm": 1.7838209867477417, "learning_rate": 4.023789694300428e-05, "loss": 2.77, "step": 78100 }, { "epoch": 0.5867390961816941, "grad_norm": 2.4069457054138184, "learning_rate": 4.022539184717773e-05, "loss": 2.7863, "step": 78200 }, { "epoch": 0.587489401931287, "grad_norm": 2.7392654418945312, "learning_rate": 4.0212886751351175e-05, "loss": 2.8472, "step": 78300 }, { "epoch": 0.58823970768088, "grad_norm": 1.6871455907821655, "learning_rate": 4.020038165552463e-05, "loss": 2.6905, "step": 78400 }, { "epoch": 0.588990013430473, "grad_norm": 1.8312381505966187, "learning_rate": 4.018787655969808e-05, "loss": 2.7765, "step": 78500 }, { "epoch": 0.5897403191800659, "grad_norm": 1.5674129724502563, "learning_rate": 4.017537146387153e-05, "loss": 2.8976, "step": 78600 }, { "epoch": 0.5904906249296589, "grad_norm": 2.5288989543914795, "learning_rate": 4.0162866368044986e-05, "loss": 2.9055, "step": 78700 }, { "epoch": 0.5912409306792518, "grad_norm": 1.5132687091827393, "learning_rate": 4.0150361272218426e-05, "loss": 2.6685, "step": 78800 }, { "epoch": 0.5919912364288448, "grad_norm": 1.6623607873916626, "learning_rate": 4.013785617639188e-05, "loss": 2.8445, "step": 78900 }, { "epoch": 0.5927415421784377, "grad_norm": 2.9617021083831787, "learning_rate": 4.0125351080565335e-05, "loss": 2.7764, "step": 79000 }, { "epoch": 0.5934918479280307, "grad_norm": 1.5286788940429688, "learning_rate": 4.011284598473878e-05, "loss": 2.7292, "step": 79100 }, { "epoch": 0.5942421536776237, "grad_norm": 2.454049587249756, "learning_rate": 4.010034088891224e-05, "loss": 2.7272, "step": 79200 }, { "epoch": 0.5949924594272166, "grad_norm": 1.429669737815857, "learning_rate": 4.0087835793085685e-05, "loss": 2.6463, "step": 79300 }, { "epoch": 0.5957427651768096, "grad_norm": 1.8989125490188599, "learning_rate": 4.007533069725913e-05, "loss": 2.7986, "step": 79400 }, { "epoch": 0.5964930709264025, "grad_norm": 1.393453598022461, "learning_rate": 4.006282560143259e-05, "loss": 2.842, "step": 79500 }, { "epoch": 0.5972433766759955, "grad_norm": 1.9139227867126465, "learning_rate": 4.0050320505606035e-05, "loss": 2.8637, "step": 79600 }, { "epoch": 0.5979936824255885, "grad_norm": 1.5694407224655151, "learning_rate": 4.003781540977949e-05, "loss": 2.8473, "step": 79700 }, { "epoch": 0.5987439881751814, "grad_norm": 1.3473284244537354, "learning_rate": 4.0025310313952937e-05, "loss": 2.6745, "step": 79800 }, { "epoch": 0.5994942939247744, "grad_norm": 1.710429310798645, "learning_rate": 4.0012805218126384e-05, "loss": 2.8939, "step": 79900 }, { "epoch": 0.6002445996743673, "grad_norm": 2.2774434089660645, "learning_rate": 4.000030012229984e-05, "loss": 2.8041, "step": 80000 }, { "epoch": 0.6009949054239603, "grad_norm": 1.8041768074035645, "learning_rate": 3.998779502647329e-05, "loss": 2.7862, "step": 80100 }, { "epoch": 0.6017452111735532, "grad_norm": 2.355210065841675, "learning_rate": 3.997541498160501e-05, "loss": 2.658, "step": 80200 }, { "epoch": 0.6024955169231462, "grad_norm": 2.772909641265869, "learning_rate": 3.996290988577845e-05, "loss": 2.755, "step": 80300 }, { "epoch": 0.6032458226727392, "grad_norm": 1.724805235862732, "learning_rate": 3.9950404789951906e-05, "loss": 2.7791, "step": 80400 }, { "epoch": 0.6039961284223321, "grad_norm": 1.5716328620910645, "learning_rate": 3.993789969412536e-05, "loss": 2.766, "step": 80500 }, { "epoch": 0.6047464341719251, "grad_norm": 2.0795557498931885, "learning_rate": 3.992539459829881e-05, "loss": 2.7894, "step": 80600 }, { "epoch": 0.605496739921518, "grad_norm": 1.7078999280929565, "learning_rate": 3.991288950247226e-05, "loss": 2.7966, "step": 80700 }, { "epoch": 0.606247045671111, "grad_norm": 2.1168553829193115, "learning_rate": 3.990038440664571e-05, "loss": 2.5882, "step": 80800 }, { "epoch": 0.606997351420704, "grad_norm": 1.9795398712158203, "learning_rate": 3.988787931081916e-05, "loss": 2.7496, "step": 80900 }, { "epoch": 0.6077476571702969, "grad_norm": 1.368735432624817, "learning_rate": 3.987537421499261e-05, "loss": 2.7983, "step": 81000 }, { "epoch": 0.6084979629198899, "grad_norm": 1.3658225536346436, "learning_rate": 3.9862869119166066e-05, "loss": 2.8848, "step": 81100 }, { "epoch": 0.6092482686694828, "grad_norm": 1.4999414682388306, "learning_rate": 3.9850364023339514e-05, "loss": 2.7792, "step": 81200 }, { "epoch": 0.6099985744190758, "grad_norm": 2.104743480682373, "learning_rate": 3.983785892751296e-05, "loss": 2.9037, "step": 81300 }, { "epoch": 0.6107488801686687, "grad_norm": 2.139868974685669, "learning_rate": 3.9825353831686416e-05, "loss": 2.8691, "step": 81400 }, { "epoch": 0.6114991859182617, "grad_norm": 1.5744115114212036, "learning_rate": 3.9812848735859863e-05, "loss": 2.72, "step": 81500 }, { "epoch": 0.6122494916678547, "grad_norm": 1.8060364723205566, "learning_rate": 3.980034364003332e-05, "loss": 2.7597, "step": 81600 }, { "epoch": 0.6129997974174476, "grad_norm": 1.8314858675003052, "learning_rate": 3.9787838544206765e-05, "loss": 2.7791, "step": 81700 }, { "epoch": 0.6137501031670406, "grad_norm": 2.6994240283966064, "learning_rate": 3.977533344838021e-05, "loss": 2.7547, "step": 81800 }, { "epoch": 0.6145004089166335, "grad_norm": 3.370037317276001, "learning_rate": 3.976282835255367e-05, "loss": 2.6626, "step": 81900 }, { "epoch": 0.6152507146662265, "grad_norm": 1.7389426231384277, "learning_rate": 3.9750323256727115e-05, "loss": 2.7721, "step": 82000 }, { "epoch": 0.6160010204158195, "grad_norm": 2.394606351852417, "learning_rate": 3.973781816090057e-05, "loss": 2.7565, "step": 82100 }, { "epoch": 0.6167513261654124, "grad_norm": 1.5836440324783325, "learning_rate": 3.9725313065074024e-05, "loss": 2.5944, "step": 82200 }, { "epoch": 0.6175016319150054, "grad_norm": 1.9451855421066284, "learning_rate": 3.9712933020205735e-05, "loss": 2.8239, "step": 82300 }, { "epoch": 0.6182519376645983, "grad_norm": 1.6433790922164917, "learning_rate": 3.970042792437918e-05, "loss": 2.6351, "step": 82400 }, { "epoch": 0.6190022434141913, "grad_norm": 3.749052047729492, "learning_rate": 3.968792282855264e-05, "loss": 2.6632, "step": 82500 }, { "epoch": 0.6197525491637842, "grad_norm": 1.7603769302368164, "learning_rate": 3.967541773272609e-05, "loss": 2.7379, "step": 82600 }, { "epoch": 0.6205028549133772, "grad_norm": 2.1063201427459717, "learning_rate": 3.966291263689954e-05, "loss": 2.8222, "step": 82700 }, { "epoch": 0.6212531606629702, "grad_norm": 2.149467945098877, "learning_rate": 3.9650407541072986e-05, "loss": 2.7385, "step": 82800 }, { "epoch": 0.6220034664125631, "grad_norm": 2.316251277923584, "learning_rate": 3.963790244524644e-05, "loss": 2.801, "step": 82900 }, { "epoch": 0.6227537721621561, "grad_norm": 1.716261625289917, "learning_rate": 3.962539734941989e-05, "loss": 2.6946, "step": 83000 }, { "epoch": 0.623504077911749, "grad_norm": 1.813166618347168, "learning_rate": 3.961289225359334e-05, "loss": 2.7912, "step": 83100 }, { "epoch": 0.624254383661342, "grad_norm": 1.6095467805862427, "learning_rate": 3.96003871577668e-05, "loss": 2.7384, "step": 83200 }, { "epoch": 0.625004689410935, "grad_norm": 3.193474769592285, "learning_rate": 3.958788206194024e-05, "loss": 2.7884, "step": 83300 }, { "epoch": 0.6257549951605279, "grad_norm": 1.9918606281280518, "learning_rate": 3.957537696611369e-05, "loss": 2.8825, "step": 83400 }, { "epoch": 0.6265053009101209, "grad_norm": 1.818692922592163, "learning_rate": 3.956287187028715e-05, "loss": 2.6289, "step": 83500 }, { "epoch": 0.6272556066597138, "grad_norm": 2.1321332454681396, "learning_rate": 3.9550366774460594e-05, "loss": 2.5804, "step": 83600 }, { "epoch": 0.6280059124093068, "grad_norm": 2.2757482528686523, "learning_rate": 3.953786167863405e-05, "loss": 2.8153, "step": 83700 }, { "epoch": 0.6287562181588997, "grad_norm": 2.0336246490478516, "learning_rate": 3.9525356582807496e-05, "loss": 2.7424, "step": 83800 }, { "epoch": 0.6295065239084927, "grad_norm": 1.4677085876464844, "learning_rate": 3.9512851486980944e-05, "loss": 2.7577, "step": 83900 }, { "epoch": 0.6302568296580857, "grad_norm": 1.789117693901062, "learning_rate": 3.95003463911544e-05, "loss": 2.6865, "step": 84000 }, { "epoch": 0.6310071354076786, "grad_norm": 1.722722053527832, "learning_rate": 3.9487841295327846e-05, "loss": 2.6695, "step": 84100 }, { "epoch": 0.6317574411572716, "grad_norm": 1.826897382736206, "learning_rate": 3.94753361995013e-05, "loss": 2.7752, "step": 84200 }, { "epoch": 0.6325077469068645, "grad_norm": 1.2495834827423096, "learning_rate": 3.946295615463301e-05, "loss": 2.8658, "step": 84300 }, { "epoch": 0.6332580526564575, "grad_norm": 1.3804091215133667, "learning_rate": 3.9450451058806466e-05, "loss": 2.727, "step": 84400 }, { "epoch": 0.6340083584060505, "grad_norm": 1.628247857093811, "learning_rate": 3.943794596297991e-05, "loss": 2.8504, "step": 84500 }, { "epoch": 0.6347586641556434, "grad_norm": 1.4021284580230713, "learning_rate": 3.942544086715337e-05, "loss": 2.7991, "step": 84600 }, { "epoch": 0.6355089699052364, "grad_norm": 2.0998451709747314, "learning_rate": 3.941293577132682e-05, "loss": 2.5703, "step": 84700 }, { "epoch": 0.6362592756548293, "grad_norm": 1.3721303939819336, "learning_rate": 3.940043067550026e-05, "loss": 2.7384, "step": 84800 }, { "epoch": 0.6370095814044223, "grad_norm": 1.7685487270355225, "learning_rate": 3.938792557967372e-05, "loss": 2.695, "step": 84900 }, { "epoch": 0.6377598871540152, "grad_norm": 2.222243547439575, "learning_rate": 3.937542048384717e-05, "loss": 2.7926, "step": 85000 }, { "epoch": 0.6385101929036082, "grad_norm": 1.4080082178115845, "learning_rate": 3.936291538802062e-05, "loss": 2.725, "step": 85100 }, { "epoch": 0.6392604986532012, "grad_norm": 1.6104655265808105, "learning_rate": 3.9350410292194074e-05, "loss": 2.9113, "step": 85200 }, { "epoch": 0.6400108044027941, "grad_norm": 1.9857717752456665, "learning_rate": 3.933790519636752e-05, "loss": 2.9557, "step": 85300 }, { "epoch": 0.6407611101523871, "grad_norm": 1.9994667768478394, "learning_rate": 3.932540010054097e-05, "loss": 2.8037, "step": 85400 }, { "epoch": 0.64151141590198, "grad_norm": 1.5662387609481812, "learning_rate": 3.9312895004714423e-05, "loss": 2.7574, "step": 85500 }, { "epoch": 0.642261721651573, "grad_norm": 1.7546573877334595, "learning_rate": 3.930038990888788e-05, "loss": 2.8701, "step": 85600 }, { "epoch": 0.6430120274011659, "grad_norm": 2.1866729259490967, "learning_rate": 3.9287884813061325e-05, "loss": 2.719, "step": 85700 }, { "epoch": 0.6437623331507589, "grad_norm": 2.6895482540130615, "learning_rate": 3.927537971723477e-05, "loss": 2.7216, "step": 85800 }, { "epoch": 0.6445126389003519, "grad_norm": 1.2908220291137695, "learning_rate": 3.926287462140823e-05, "loss": 2.6396, "step": 85900 }, { "epoch": 0.6452629446499448, "grad_norm": 3.0361828804016113, "learning_rate": 3.9250369525581675e-05, "loss": 2.8419, "step": 86000 }, { "epoch": 0.6460132503995378, "grad_norm": 1.8758509159088135, "learning_rate": 3.923786442975513e-05, "loss": 2.6377, "step": 86100 }, { "epoch": 0.6467635561491307, "grad_norm": 1.683797836303711, "learning_rate": 3.922535933392858e-05, "loss": 2.813, "step": 86200 }, { "epoch": 0.6475138618987237, "grad_norm": 2.422272205352783, "learning_rate": 3.9212979289060295e-05, "loss": 2.6256, "step": 86300 }, { "epoch": 0.6482641676483167, "grad_norm": 3.2017102241516113, "learning_rate": 3.920047419323374e-05, "loss": 2.7895, "step": 86400 }, { "epoch": 0.6490144733979096, "grad_norm": 2.938483953475952, "learning_rate": 3.9187969097407197e-05, "loss": 2.6608, "step": 86500 }, { "epoch": 0.6497647791475026, "grad_norm": 1.4458643198013306, "learning_rate": 3.9175464001580644e-05, "loss": 2.8399, "step": 86600 }, { "epoch": 0.6505150848970955, "grad_norm": 1.327475666999817, "learning_rate": 3.91629589057541e-05, "loss": 2.8698, "step": 86700 }, { "epoch": 0.6512653906466885, "grad_norm": 1.7359371185302734, "learning_rate": 3.9150453809927546e-05, "loss": 2.8207, "step": 86800 }, { "epoch": 0.6520156963962814, "grad_norm": 1.8355038166046143, "learning_rate": 3.9137948714100994e-05, "loss": 2.8301, "step": 86900 }, { "epoch": 0.6527660021458744, "grad_norm": 1.415246605873108, "learning_rate": 3.912544361827445e-05, "loss": 2.813, "step": 87000 }, { "epoch": 0.6535163078954674, "grad_norm": 2.1772053241729736, "learning_rate": 3.91129385224479e-05, "loss": 2.7791, "step": 87100 }, { "epoch": 0.6542666136450603, "grad_norm": 1.4628409147262573, "learning_rate": 3.910043342662135e-05, "loss": 2.8007, "step": 87200 }, { "epoch": 0.6550169193946533, "grad_norm": 1.6230412721633911, "learning_rate": 3.90879283307948e-05, "loss": 2.849, "step": 87300 }, { "epoch": 0.6557672251442462, "grad_norm": 1.7383617162704468, "learning_rate": 3.907542323496825e-05, "loss": 2.8522, "step": 87400 }, { "epoch": 0.6565175308938392, "grad_norm": 1.5055257081985474, "learning_rate": 3.90629181391417e-05, "loss": 2.6856, "step": 87500 }, { "epoch": 0.6572678366434322, "grad_norm": 1.688821792602539, "learning_rate": 3.9050413043315154e-05, "loss": 2.8062, "step": 87600 }, { "epoch": 0.6580181423930251, "grad_norm": 2.153782367706299, "learning_rate": 3.903790794748861e-05, "loss": 2.9419, "step": 87700 }, { "epoch": 0.6587684481426181, "grad_norm": 3.0772500038146973, "learning_rate": 3.902540285166205e-05, "loss": 2.7652, "step": 87800 }, { "epoch": 0.659518753892211, "grad_norm": 2.00130033493042, "learning_rate": 3.9012897755835504e-05, "loss": 2.592, "step": 87900 }, { "epoch": 0.660269059641804, "grad_norm": 1.4069727659225464, "learning_rate": 3.900039266000896e-05, "loss": 2.7772, "step": 88000 }, { "epoch": 0.6610193653913969, "grad_norm": 1.6952968835830688, "learning_rate": 3.8987887564182406e-05, "loss": 2.8245, "step": 88100 }, { "epoch": 0.66176967114099, "grad_norm": 1.4560846090316772, "learning_rate": 3.897538246835586e-05, "loss": 2.6845, "step": 88200 }, { "epoch": 0.662519976890583, "grad_norm": 1.6621476411819458, "learning_rate": 3.896287737252931e-05, "loss": 2.7295, "step": 88300 }, { "epoch": 0.6632702826401758, "grad_norm": 1.8695025444030762, "learning_rate": 3.8950372276702756e-05, "loss": 2.8119, "step": 88400 }, { "epoch": 0.6640205883897689, "grad_norm": 1.717116117477417, "learning_rate": 3.893786718087621e-05, "loss": 2.6708, "step": 88500 }, { "epoch": 0.6647708941393617, "grad_norm": 1.348455786705017, "learning_rate": 3.892536208504966e-05, "loss": 2.5963, "step": 88600 }, { "epoch": 0.6655211998889548, "grad_norm": 2.7593770027160645, "learning_rate": 3.891285698922311e-05, "loss": 2.8602, "step": 88700 }, { "epoch": 0.6662715056385478, "grad_norm": 1.7213168144226074, "learning_rate": 3.890035189339656e-05, "loss": 2.7944, "step": 88800 }, { "epoch": 0.6670218113881407, "grad_norm": 1.9056806564331055, "learning_rate": 3.888784679757001e-05, "loss": 2.8351, "step": 88900 }, { "epoch": 0.6677721171377337, "grad_norm": 1.497359275817871, "learning_rate": 3.887534170174346e-05, "loss": 2.7338, "step": 89000 }, { "epoch": 0.6685224228873266, "grad_norm": 1.2824490070343018, "learning_rate": 3.8862836605916916e-05, "loss": 2.8468, "step": 89100 }, { "epoch": 0.6692727286369196, "grad_norm": 1.4402879476547241, "learning_rate": 3.8850331510090364e-05, "loss": 2.856, "step": 89200 }, { "epoch": 0.6700230343865125, "grad_norm": 1.761098027229309, "learning_rate": 3.883782641426382e-05, "loss": 2.7629, "step": 89300 }, { "epoch": 0.6707733401361055, "grad_norm": 2.9340689182281494, "learning_rate": 3.8825321318437266e-05, "loss": 2.8025, "step": 89400 }, { "epoch": 0.6715236458856985, "grad_norm": 1.305324673652649, "learning_rate": 3.881294127356898e-05, "loss": 2.9183, "step": 89500 }, { "epoch": 0.6722739516352914, "grad_norm": 1.5187671184539795, "learning_rate": 3.880043617774243e-05, "loss": 2.8417, "step": 89600 }, { "epoch": 0.6730242573848844, "grad_norm": 1.662001609802246, "learning_rate": 3.8787931081915885e-05, "loss": 2.8168, "step": 89700 }, { "epoch": 0.6737745631344773, "grad_norm": 2.1925530433654785, "learning_rate": 3.877542598608933e-05, "loss": 2.7581, "step": 89800 }, { "epoch": 0.6745248688840703, "grad_norm": 1.7450188398361206, "learning_rate": 3.876292089026278e-05, "loss": 2.7341, "step": 89900 }, { "epoch": 0.6752751746336633, "grad_norm": 1.9281235933303833, "learning_rate": 3.8750415794436235e-05, "loss": 2.9505, "step": 90000 }, { "epoch": 0.6760254803832562, "grad_norm": 1.5302766561508179, "learning_rate": 3.873791069860969e-05, "loss": 2.7762, "step": 90100 }, { "epoch": 0.6767757861328492, "grad_norm": 1.9426904916763306, "learning_rate": 3.872540560278314e-05, "loss": 2.73, "step": 90200 }, { "epoch": 0.6775260918824421, "grad_norm": 1.7231568098068237, "learning_rate": 3.8712900506956585e-05, "loss": 2.865, "step": 90300 }, { "epoch": 0.6782763976320351, "grad_norm": 1.5762888193130493, "learning_rate": 3.870039541113004e-05, "loss": 2.7135, "step": 90400 }, { "epoch": 0.679026703381628, "grad_norm": 1.419667363166809, "learning_rate": 3.868789031530349e-05, "loss": 2.8682, "step": 90500 }, { "epoch": 0.679777009131221, "grad_norm": 1.4294648170471191, "learning_rate": 3.867538521947694e-05, "loss": 2.7074, "step": 90600 }, { "epoch": 0.680527314880814, "grad_norm": 1.840993046760559, "learning_rate": 3.866288012365039e-05, "loss": 2.6849, "step": 90700 }, { "epoch": 0.6812776206304069, "grad_norm": 1.6752036809921265, "learning_rate": 3.8650375027823836e-05, "loss": 2.7137, "step": 90800 }, { "epoch": 0.6820279263799999, "grad_norm": 1.2648653984069824, "learning_rate": 3.863786993199729e-05, "loss": 2.8028, "step": 90900 }, { "epoch": 0.6827782321295928, "grad_norm": 2.527987241744995, "learning_rate": 3.862536483617074e-05, "loss": 2.8078, "step": 91000 }, { "epoch": 0.6835285378791858, "grad_norm": 1.7342911958694458, "learning_rate": 3.861285974034419e-05, "loss": 2.7815, "step": 91100 }, { "epoch": 0.6842788436287788, "grad_norm": 1.4683769941329956, "learning_rate": 3.860035464451765e-05, "loss": 2.8754, "step": 91200 }, { "epoch": 0.6850291493783717, "grad_norm": 1.9135417938232422, "learning_rate": 3.858784954869109e-05, "loss": 2.7839, "step": 91300 }, { "epoch": 0.6857794551279647, "grad_norm": 1.6579680442810059, "learning_rate": 3.857534445286454e-05, "loss": 2.831, "step": 91400 }, { "epoch": 0.6865297608775576, "grad_norm": 1.2945892810821533, "learning_rate": 3.8562839357038e-05, "loss": 2.8645, "step": 91500 }, { "epoch": 0.6872800666271506, "grad_norm": 2.1893882751464844, "learning_rate": 3.8550334261211445e-05, "loss": 2.8919, "step": 91600 }, { "epoch": 0.6880303723767435, "grad_norm": 1.5281795263290405, "learning_rate": 3.853795421634316e-05, "loss": 2.7911, "step": 91700 }, { "epoch": 0.6887806781263365, "grad_norm": 1.7366359233856201, "learning_rate": 3.852544912051661e-05, "loss": 2.814, "step": 91800 }, { "epoch": 0.6895309838759295, "grad_norm": 2.478280782699585, "learning_rate": 3.8512944024690064e-05, "loss": 2.8107, "step": 91900 }, { "epoch": 0.6902812896255224, "grad_norm": 1.762414813041687, "learning_rate": 3.850043892886351e-05, "loss": 2.7319, "step": 92000 }, { "epoch": 0.6910315953751154, "grad_norm": 1.6761894226074219, "learning_rate": 3.8487933833036966e-05, "loss": 2.7012, "step": 92100 }, { "epoch": 0.6917819011247083, "grad_norm": 1.4257111549377441, "learning_rate": 3.847542873721042e-05, "loss": 2.8332, "step": 92200 }, { "epoch": 0.6925322068743013, "grad_norm": 2.040152072906494, "learning_rate": 3.846292364138386e-05, "loss": 2.8616, "step": 92300 }, { "epoch": 0.6932825126238943, "grad_norm": 1.800730586051941, "learning_rate": 3.8450418545557316e-05, "loss": 2.6428, "step": 92400 }, { "epoch": 0.6940328183734872, "grad_norm": 2.003187656402588, "learning_rate": 3.843791344973077e-05, "loss": 2.8484, "step": 92500 }, { "epoch": 0.6947831241230802, "grad_norm": 2.19596004486084, "learning_rate": 3.842540835390422e-05, "loss": 2.8565, "step": 92600 }, { "epoch": 0.6955334298726731, "grad_norm": 2.084886312484741, "learning_rate": 3.841290325807767e-05, "loss": 2.7884, "step": 92700 }, { "epoch": 0.6962837356222661, "grad_norm": 1.8188289403915405, "learning_rate": 3.840039816225112e-05, "loss": 2.8003, "step": 92800 }, { "epoch": 0.697034041371859, "grad_norm": 2.6200945377349854, "learning_rate": 3.838789306642457e-05, "loss": 2.6772, "step": 92900 }, { "epoch": 0.697784347121452, "grad_norm": 1.2581435441970825, "learning_rate": 3.837538797059802e-05, "loss": 2.7051, "step": 93000 }, { "epoch": 0.698534652871045, "grad_norm": 1.2769601345062256, "learning_rate": 3.836288287477147e-05, "loss": 2.7507, "step": 93100 }, { "epoch": 0.6992849586206379, "grad_norm": 1.826303482055664, "learning_rate": 3.8350377778944924e-05, "loss": 2.7096, "step": 93200 }, { "epoch": 0.7000352643702309, "grad_norm": 2.0872175693511963, "learning_rate": 3.833787268311837e-05, "loss": 2.6911, "step": 93300 }, { "epoch": 0.7007855701198238, "grad_norm": 2.1276187896728516, "learning_rate": 3.832536758729182e-05, "loss": 2.8131, "step": 93400 }, { "epoch": 0.7015358758694168, "grad_norm": 1.9493252038955688, "learning_rate": 3.8312862491465273e-05, "loss": 2.7796, "step": 93500 }, { "epoch": 0.7022861816190098, "grad_norm": 2.325148105621338, "learning_rate": 3.830035739563873e-05, "loss": 2.7707, "step": 93600 }, { "epoch": 0.7030364873686027, "grad_norm": 2.111128568649292, "learning_rate": 3.8287852299812176e-05, "loss": 2.7617, "step": 93700 }, { "epoch": 0.7037867931181957, "grad_norm": 1.8153074979782104, "learning_rate": 3.827534720398563e-05, "loss": 2.7013, "step": 93800 }, { "epoch": 0.7045370988677886, "grad_norm": 1.58577561378479, "learning_rate": 3.826296715911734e-05, "loss": 2.6117, "step": 93900 }, { "epoch": 0.7052874046173816, "grad_norm": 2.187441825866699, "learning_rate": 3.8250462063290795e-05, "loss": 2.7464, "step": 94000 }, { "epoch": 0.7060377103669745, "grad_norm": 1.3852925300598145, "learning_rate": 3.823795696746424e-05, "loss": 2.7307, "step": 94100 }, { "epoch": 0.7067880161165675, "grad_norm": 2.4628374576568604, "learning_rate": 3.82254518716377e-05, "loss": 2.8532, "step": 94200 }, { "epoch": 0.7075383218661605, "grad_norm": 1.702455997467041, "learning_rate": 3.8212946775811145e-05, "loss": 2.7851, "step": 94300 }, { "epoch": 0.7082886276157534, "grad_norm": 2.506895065307617, "learning_rate": 3.820044167998459e-05, "loss": 2.8564, "step": 94400 }, { "epoch": 0.7090389333653464, "grad_norm": 2.4300551414489746, "learning_rate": 3.818793658415805e-05, "loss": 2.7319, "step": 94500 }, { "epoch": 0.7097892391149393, "grad_norm": 1.5761444568634033, "learning_rate": 3.81754314883315e-05, "loss": 2.5851, "step": 94600 }, { "epoch": 0.7105395448645323, "grad_norm": 1.415381669998169, "learning_rate": 3.816292639250495e-05, "loss": 2.8592, "step": 94700 }, { "epoch": 0.7112898506141253, "grad_norm": 2.1672475337982178, "learning_rate": 3.8150421296678396e-05, "loss": 2.7524, "step": 94800 }, { "epoch": 0.7120401563637182, "grad_norm": 1.4041945934295654, "learning_rate": 3.813791620085185e-05, "loss": 2.8161, "step": 94900 }, { "epoch": 0.7127904621133112, "grad_norm": 2.3905975818634033, "learning_rate": 3.81254111050253e-05, "loss": 2.864, "step": 95000 }, { "epoch": 0.7135407678629041, "grad_norm": 2.06604266166687, "learning_rate": 3.811290600919875e-05, "loss": 2.8001, "step": 95100 }, { "epoch": 0.7142910736124971, "grad_norm": 1.8341466188430786, "learning_rate": 3.81004009133722e-05, "loss": 2.8347, "step": 95200 }, { "epoch": 0.71504137936209, "grad_norm": 3.114030599594116, "learning_rate": 3.808789581754565e-05, "loss": 2.7204, "step": 95300 }, { "epoch": 0.715791685111683, "grad_norm": 2.454150438308716, "learning_rate": 3.80753907217191e-05, "loss": 2.716, "step": 95400 }, { "epoch": 0.716541990861276, "grad_norm": 1.6861200332641602, "learning_rate": 3.806288562589255e-05, "loss": 2.6465, "step": 95500 }, { "epoch": 0.7172922966108689, "grad_norm": 1.577059030532837, "learning_rate": 3.8050380530066004e-05, "loss": 2.8108, "step": 95600 }, { "epoch": 0.7180426023604619, "grad_norm": 2.7634782791137695, "learning_rate": 3.803787543423946e-05, "loss": 2.6565, "step": 95700 }, { "epoch": 0.7187929081100548, "grad_norm": 1.3382186889648438, "learning_rate": 3.80253703384129e-05, "loss": 2.8116, "step": 95800 }, { "epoch": 0.7195432138596478, "grad_norm": 2.372882843017578, "learning_rate": 3.8012865242586354e-05, "loss": 2.6486, "step": 95900 }, { "epoch": 0.7202935196092408, "grad_norm": 1.5089915990829468, "learning_rate": 3.800036014675981e-05, "loss": 2.6279, "step": 96000 }, { "epoch": 0.7210438253588337, "grad_norm": 2.597088098526001, "learning_rate": 3.7987855050933256e-05, "loss": 2.8032, "step": 96100 }, { "epoch": 0.7217941311084267, "grad_norm": 2.005488395690918, "learning_rate": 3.797534995510671e-05, "loss": 2.7059, "step": 96200 }, { "epoch": 0.7225444368580196, "grad_norm": 1.6743812561035156, "learning_rate": 3.796284485928016e-05, "loss": 2.7678, "step": 96300 }, { "epoch": 0.7232947426076126, "grad_norm": 1.4720969200134277, "learning_rate": 3.7950339763453606e-05, "loss": 2.7817, "step": 96400 }, { "epoch": 0.7240450483572055, "grad_norm": 1.346518874168396, "learning_rate": 3.793783466762706e-05, "loss": 2.7007, "step": 96500 }, { "epoch": 0.7247953541067985, "grad_norm": 1.7382440567016602, "learning_rate": 3.792532957180051e-05, "loss": 2.7486, "step": 96600 }, { "epoch": 0.7255456598563915, "grad_norm": 2.893144130706787, "learning_rate": 3.791282447597396e-05, "loss": 2.7098, "step": 96700 }, { "epoch": 0.7262959656059844, "grad_norm": 2.309784173965454, "learning_rate": 3.790044443110567e-05, "loss": 2.8042, "step": 96800 }, { "epoch": 0.7270462713555774, "grad_norm": 1.7501552104949951, "learning_rate": 3.788793933527913e-05, "loss": 2.8368, "step": 96900 }, { "epoch": 0.7277965771051703, "grad_norm": 2.7407829761505127, "learning_rate": 3.787543423945258e-05, "loss": 2.8615, "step": 97000 }, { "epoch": 0.7285468828547633, "grad_norm": 1.8905185461044312, "learning_rate": 3.786292914362603e-05, "loss": 2.8466, "step": 97100 }, { "epoch": 0.7292971886043563, "grad_norm": 1.7229100465774536, "learning_rate": 3.7850424047799484e-05, "loss": 2.6711, "step": 97200 }, { "epoch": 0.7300474943539492, "grad_norm": 2.2127978801727295, "learning_rate": 3.783791895197293e-05, "loss": 2.8234, "step": 97300 }, { "epoch": 0.7307978001035422, "grad_norm": 2.269746780395508, "learning_rate": 3.782541385614638e-05, "loss": 2.7759, "step": 97400 }, { "epoch": 0.7315481058531351, "grad_norm": 1.4153506755828857, "learning_rate": 3.7812908760319833e-05, "loss": 2.7261, "step": 97500 }, { "epoch": 0.7322984116027281, "grad_norm": 1.7201610803604126, "learning_rate": 3.780040366449328e-05, "loss": 2.8254, "step": 97600 }, { "epoch": 0.733048717352321, "grad_norm": 1.8322395086288452, "learning_rate": 3.7787898568666735e-05, "loss": 2.8802, "step": 97700 }, { "epoch": 0.733799023101914, "grad_norm": 2.12200665473938, "learning_rate": 3.777539347284018e-05, "loss": 2.6897, "step": 97800 }, { "epoch": 0.734549328851507, "grad_norm": 1.8079434633255005, "learning_rate": 3.776288837701363e-05, "loss": 2.7792, "step": 97900 }, { "epoch": 0.7352996346010999, "grad_norm": 2.227283477783203, "learning_rate": 3.7750383281187085e-05, "loss": 2.6885, "step": 98000 }, { "epoch": 0.7360499403506929, "grad_norm": 3.5900604724884033, "learning_rate": 3.773787818536054e-05, "loss": 2.7592, "step": 98100 }, { "epoch": 0.7368002461002858, "grad_norm": 2.0623321533203125, "learning_rate": 3.772537308953399e-05, "loss": 2.7539, "step": 98200 }, { "epoch": 0.7375505518498788, "grad_norm": 1.833884835243225, "learning_rate": 3.771286799370744e-05, "loss": 2.7403, "step": 98300 }, { "epoch": 0.7383008575994718, "grad_norm": 1.7104076147079468, "learning_rate": 3.770036289788089e-05, "loss": 2.8434, "step": 98400 }, { "epoch": 0.7390511633490647, "grad_norm": 1.8471003770828247, "learning_rate": 3.768785780205434e-05, "loss": 2.7418, "step": 98500 }, { "epoch": 0.7398014690986577, "grad_norm": 1.7802404165267944, "learning_rate": 3.767535270622779e-05, "loss": 2.89, "step": 98600 }, { "epoch": 0.7405517748482506, "grad_norm": 1.9901918172836304, "learning_rate": 3.766284761040124e-05, "loss": 2.6452, "step": 98700 }, { "epoch": 0.7413020805978436, "grad_norm": 1.7404547929763794, "learning_rate": 3.765034251457469e-05, "loss": 2.7474, "step": 98800 }, { "epoch": 0.7420523863474365, "grad_norm": 2.1622848510742188, "learning_rate": 3.763783741874814e-05, "loss": 2.8053, "step": 98900 }, { "epoch": 0.7428026920970295, "grad_norm": 2.3333253860473633, "learning_rate": 3.762533232292159e-05, "loss": 2.7867, "step": 99000 }, { "epoch": 0.7435529978466225, "grad_norm": 1.6174910068511963, "learning_rate": 3.761295227805331e-05, "loss": 2.6893, "step": 99100 }, { "epoch": 0.7443033035962154, "grad_norm": 1.9193317890167236, "learning_rate": 3.760044718222676e-05, "loss": 2.8742, "step": 99200 }, { "epoch": 0.7450536093458084, "grad_norm": 1.9413771629333496, "learning_rate": 3.758794208640021e-05, "loss": 2.7893, "step": 99300 }, { "epoch": 0.7458039150954013, "grad_norm": 1.2409842014312744, "learning_rate": 3.757543699057366e-05, "loss": 2.8103, "step": 99400 }, { "epoch": 0.7465542208449943, "grad_norm": 2.130619525909424, "learning_rate": 3.756293189474711e-05, "loss": 2.8986, "step": 99500 }, { "epoch": 0.7473045265945873, "grad_norm": 1.9683794975280762, "learning_rate": 3.7550426798920564e-05, "loss": 2.7091, "step": 99600 }, { "epoch": 0.7480548323441802, "grad_norm": 1.419398307800293, "learning_rate": 3.753792170309401e-05, "loss": 2.6756, "step": 99700 }, { "epoch": 0.7488051380937732, "grad_norm": 3.071274995803833, "learning_rate": 3.752541660726746e-05, "loss": 2.7799, "step": 99800 }, { "epoch": 0.7495554438433661, "grad_norm": 2.1027519702911377, "learning_rate": 3.7512911511440914e-05, "loss": 2.7438, "step": 99900 }, { "epoch": 0.7503057495929591, "grad_norm": 1.5898840427398682, "learning_rate": 3.750040641561436e-05, "loss": 2.6039, "step": 100000 }, { "epoch": 0.751056055342552, "grad_norm": 1.6125653982162476, "learning_rate": 3.7487901319787816e-05, "loss": 2.7248, "step": 100100 }, { "epoch": 0.751806361092145, "grad_norm": 2.3452484607696533, "learning_rate": 3.747539622396127e-05, "loss": 2.7028, "step": 100200 }, { "epoch": 0.752556666841738, "grad_norm": 1.4424258470535278, "learning_rate": 3.746289112813472e-05, "loss": 2.7706, "step": 100300 }, { "epoch": 0.753306972591331, "grad_norm": 2.542587995529175, "learning_rate": 3.7450386032308166e-05, "loss": 2.8807, "step": 100400 }, { "epoch": 0.754057278340924, "grad_norm": 2.073136329650879, "learning_rate": 3.743788093648162e-05, "loss": 2.6034, "step": 100500 }, { "epoch": 0.7548075840905168, "grad_norm": 1.918042778968811, "learning_rate": 3.742537584065507e-05, "loss": 2.6199, "step": 100600 }, { "epoch": 0.7555578898401099, "grad_norm": 1.7857468128204346, "learning_rate": 3.741287074482852e-05, "loss": 2.794, "step": 100700 }, { "epoch": 0.7563081955897029, "grad_norm": 1.8158869743347168, "learning_rate": 3.740036564900197e-05, "loss": 2.8093, "step": 100800 }, { "epoch": 0.7570585013392958, "grad_norm": 1.683375597000122, "learning_rate": 3.738786055317542e-05, "loss": 2.6876, "step": 100900 }, { "epoch": 0.7578088070888888, "grad_norm": 1.748569369316101, "learning_rate": 3.737535545734887e-05, "loss": 2.7121, "step": 101000 }, { "epoch": 0.7585591128384817, "grad_norm": 1.5625685453414917, "learning_rate": 3.736285036152232e-05, "loss": 2.613, "step": 101100 }, { "epoch": 0.7593094185880747, "grad_norm": 3.0891995429992676, "learning_rate": 3.7350345265695774e-05, "loss": 2.7793, "step": 101200 }, { "epoch": 0.7600597243376676, "grad_norm": 2.591196298599243, "learning_rate": 3.733784016986923e-05, "loss": 2.8394, "step": 101300 }, { "epoch": 0.7608100300872606, "grad_norm": 2.1731178760528564, "learning_rate": 3.732533507404267e-05, "loss": 2.7039, "step": 101400 }, { "epoch": 0.7615603358368536, "grad_norm": 1.4708701372146606, "learning_rate": 3.7312829978216124e-05, "loss": 2.8174, "step": 101500 }, { "epoch": 0.7623106415864465, "grad_norm": 3.1619391441345215, "learning_rate": 3.730032488238958e-05, "loss": 2.8078, "step": 101600 }, { "epoch": 0.7630609473360395, "grad_norm": 1.8148353099822998, "learning_rate": 3.7287819786563026e-05, "loss": 2.6915, "step": 101700 }, { "epoch": 0.7638112530856324, "grad_norm": 1.5370335578918457, "learning_rate": 3.727531469073648e-05, "loss": 2.8469, "step": 101800 }, { "epoch": 0.7645615588352254, "grad_norm": 1.544983148574829, "learning_rate": 3.726280959490993e-05, "loss": 2.7749, "step": 101900 }, { "epoch": 0.7653118645848184, "grad_norm": 2.5176758766174316, "learning_rate": 3.7250429550041645e-05, "loss": 2.7187, "step": 102000 }, { "epoch": 0.7660621703344113, "grad_norm": 2.733774423599243, "learning_rate": 3.723792445421509e-05, "loss": 2.7311, "step": 102100 }, { "epoch": 0.7668124760840043, "grad_norm": 1.6955556869506836, "learning_rate": 3.722541935838855e-05, "loss": 2.7128, "step": 102200 }, { "epoch": 0.7675627818335972, "grad_norm": 2.4013519287109375, "learning_rate": 3.7212914262561995e-05, "loss": 2.8579, "step": 102300 }, { "epoch": 0.7683130875831902, "grad_norm": 1.762174367904663, "learning_rate": 3.720040916673544e-05, "loss": 2.7217, "step": 102400 }, { "epoch": 0.7690633933327831, "grad_norm": 1.2409788370132446, "learning_rate": 3.71879040709089e-05, "loss": 2.8546, "step": 102500 }, { "epoch": 0.7698136990823761, "grad_norm": 3.6682300567626953, "learning_rate": 3.717539897508235e-05, "loss": 2.7481, "step": 102600 }, { "epoch": 0.7705640048319691, "grad_norm": 1.4161078929901123, "learning_rate": 3.71628938792558e-05, "loss": 2.7246, "step": 102700 }, { "epoch": 0.771314310581562, "grad_norm": 2.0377416610717773, "learning_rate": 3.715038878342925e-05, "loss": 2.4723, "step": 102800 }, { "epoch": 0.772064616331155, "grad_norm": 2.522129535675049, "learning_rate": 3.71378836876027e-05, "loss": 2.7631, "step": 102900 }, { "epoch": 0.7728149220807479, "grad_norm": 1.5055625438690186, "learning_rate": 3.712537859177615e-05, "loss": 2.7533, "step": 103000 }, { "epoch": 0.7735652278303409, "grad_norm": 1.794860601425171, "learning_rate": 3.71128734959496e-05, "loss": 2.7518, "step": 103100 }, { "epoch": 0.7743155335799338, "grad_norm": 1.7509534358978271, "learning_rate": 3.710036840012305e-05, "loss": 2.7781, "step": 103200 }, { "epoch": 0.7750658393295268, "grad_norm": 2.176348924636841, "learning_rate": 3.7087863304296505e-05, "loss": 2.831, "step": 103300 }, { "epoch": 0.7758161450791198, "grad_norm": 1.7376410961151123, "learning_rate": 3.707535820846995e-05, "loss": 2.7834, "step": 103400 }, { "epoch": 0.7765664508287127, "grad_norm": 1.7925366163253784, "learning_rate": 3.70628531126434e-05, "loss": 2.8808, "step": 103500 }, { "epoch": 0.7773167565783057, "grad_norm": 1.4821797609329224, "learning_rate": 3.7050348016816855e-05, "loss": 2.9712, "step": 103600 }, { "epoch": 0.7780670623278986, "grad_norm": 1.2925777435302734, "learning_rate": 3.703784292099031e-05, "loss": 2.7764, "step": 103700 }, { "epoch": 0.7788173680774916, "grad_norm": 2.0385005474090576, "learning_rate": 3.7025337825163757e-05, "loss": 2.6196, "step": 103800 }, { "epoch": 0.7795676738270846, "grad_norm": 2.241489887237549, "learning_rate": 3.7012832729337204e-05, "loss": 2.7735, "step": 103900 }, { "epoch": 0.7803179795766775, "grad_norm": 2.0993852615356445, "learning_rate": 3.700032763351066e-05, "loss": 2.7444, "step": 104000 }, { "epoch": 0.7810682853262705, "grad_norm": 1.7474520206451416, "learning_rate": 3.6987822537684106e-05, "loss": 2.7992, "step": 104100 }, { "epoch": 0.7818185910758634, "grad_norm": 1.3772835731506348, "learning_rate": 3.697531744185756e-05, "loss": 2.8692, "step": 104200 }, { "epoch": 0.7825688968254564, "grad_norm": 1.719907522201538, "learning_rate": 3.696281234603101e-05, "loss": 2.8665, "step": 104300 }, { "epoch": 0.7833192025750493, "grad_norm": 1.5359447002410889, "learning_rate": 3.6950432301162726e-05, "loss": 2.7246, "step": 104400 }, { "epoch": 0.7840695083246423, "grad_norm": 1.6734185218811035, "learning_rate": 3.693792720533617e-05, "loss": 3.0006, "step": 104500 }, { "epoch": 0.7848198140742353, "grad_norm": 1.9300366640090942, "learning_rate": 3.692542210950963e-05, "loss": 2.6807, "step": 104600 }, { "epoch": 0.7855701198238282, "grad_norm": 1.338029384613037, "learning_rate": 3.691291701368308e-05, "loss": 2.6599, "step": 104700 }, { "epoch": 0.7863204255734212, "grad_norm": 1.6170413494110107, "learning_rate": 3.690041191785653e-05, "loss": 2.7059, "step": 104800 }, { "epoch": 0.7870707313230141, "grad_norm": 1.6450586318969727, "learning_rate": 3.688790682202998e-05, "loss": 2.6879, "step": 104900 }, { "epoch": 0.7878210370726071, "grad_norm": 2.462719202041626, "learning_rate": 3.687540172620343e-05, "loss": 2.8916, "step": 105000 }, { "epoch": 0.7885713428222001, "grad_norm": 2.550067663192749, "learning_rate": 3.686289663037688e-05, "loss": 2.699, "step": 105100 }, { "epoch": 0.789321648571793, "grad_norm": 1.1778078079223633, "learning_rate": 3.6850391534550334e-05, "loss": 2.6247, "step": 105200 }, { "epoch": 0.790071954321386, "grad_norm": 1.4586765766143799, "learning_rate": 3.683788643872378e-05, "loss": 2.7591, "step": 105300 }, { "epoch": 0.7908222600709789, "grad_norm": 1.5293437242507935, "learning_rate": 3.682538134289723e-05, "loss": 2.8423, "step": 105400 }, { "epoch": 0.7915725658205719, "grad_norm": 2.084770441055298, "learning_rate": 3.6812876247070683e-05, "loss": 2.7789, "step": 105500 }, { "epoch": 0.7923228715701648, "grad_norm": 1.434921145439148, "learning_rate": 3.680037115124413e-05, "loss": 2.8504, "step": 105600 }, { "epoch": 0.7930731773197578, "grad_norm": 2.485851526260376, "learning_rate": 3.6787866055417586e-05, "loss": 2.7656, "step": 105700 }, { "epoch": 0.7938234830693508, "grad_norm": 1.8833919763565063, "learning_rate": 3.677536095959104e-05, "loss": 2.7507, "step": 105800 }, { "epoch": 0.7945737888189437, "grad_norm": 1.4811413288116455, "learning_rate": 3.676285586376448e-05, "loss": 2.9189, "step": 105900 }, { "epoch": 0.7953240945685367, "grad_norm": 1.7611703872680664, "learning_rate": 3.6750350767937935e-05, "loss": 2.7552, "step": 106000 }, { "epoch": 0.7960744003181296, "grad_norm": 2.4732468128204346, "learning_rate": 3.673784567211139e-05, "loss": 2.7501, "step": 106100 }, { "epoch": 0.7968247060677226, "grad_norm": 1.492567539215088, "learning_rate": 3.672534057628484e-05, "loss": 2.6971, "step": 106200 }, { "epoch": 0.7975750118173156, "grad_norm": 1.4491642713546753, "learning_rate": 3.671283548045829e-05, "loss": 2.8326, "step": 106300 }, { "epoch": 0.7983253175669085, "grad_norm": 1.3526785373687744, "learning_rate": 3.670033038463174e-05, "loss": 2.7601, "step": 106400 }, { "epoch": 0.7990756233165015, "grad_norm": 2.0852034091949463, "learning_rate": 3.668782528880519e-05, "loss": 2.5959, "step": 106500 }, { "epoch": 0.7998259290660944, "grad_norm": 1.4807337522506714, "learning_rate": 3.667532019297864e-05, "loss": 2.7741, "step": 106600 }, { "epoch": 0.8005762348156874, "grad_norm": 1.4539504051208496, "learning_rate": 3.666281509715209e-05, "loss": 2.8172, "step": 106700 }, { "epoch": 0.8013265405652803, "grad_norm": 1.7447978258132935, "learning_rate": 3.665031000132554e-05, "loss": 2.5767, "step": 106800 }, { "epoch": 0.8020768463148733, "grad_norm": 1.8091270923614502, "learning_rate": 3.663780490549899e-05, "loss": 2.8978, "step": 106900 }, { "epoch": 0.8028271520644663, "grad_norm": 1.5875964164733887, "learning_rate": 3.6625299809672445e-05, "loss": 2.7566, "step": 107000 }, { "epoch": 0.8035774578140592, "grad_norm": 1.6898640394210815, "learning_rate": 3.661279471384589e-05, "loss": 2.6448, "step": 107100 }, { "epoch": 0.8043277635636522, "grad_norm": 3.132082462310791, "learning_rate": 3.660041466897761e-05, "loss": 2.8929, "step": 107200 }, { "epoch": 0.8050780693132451, "grad_norm": 1.5938949584960938, "learning_rate": 3.6587909573151065e-05, "loss": 2.8716, "step": 107300 }, { "epoch": 0.8058283750628381, "grad_norm": 1.7738118171691895, "learning_rate": 3.657540447732451e-05, "loss": 2.7529, "step": 107400 }, { "epoch": 0.8065786808124311, "grad_norm": 2.18042254447937, "learning_rate": 3.656289938149796e-05, "loss": 2.6195, "step": 107500 }, { "epoch": 0.807328986562024, "grad_norm": 2.0041494369506836, "learning_rate": 3.6550394285671414e-05, "loss": 2.7808, "step": 107600 }, { "epoch": 0.808079292311617, "grad_norm": 2.3955447673797607, "learning_rate": 3.653788918984486e-05, "loss": 2.8184, "step": 107700 }, { "epoch": 0.8088295980612099, "grad_norm": 1.7603517770767212, "learning_rate": 3.6525384094018317e-05, "loss": 2.8781, "step": 107800 }, { "epoch": 0.8095799038108029, "grad_norm": 2.0119738578796387, "learning_rate": 3.6512878998191764e-05, "loss": 2.7036, "step": 107900 }, { "epoch": 0.8103302095603958, "grad_norm": 1.3950631618499756, "learning_rate": 3.650037390236521e-05, "loss": 2.6886, "step": 108000 }, { "epoch": 0.8110805153099888, "grad_norm": 2.3001210689544678, "learning_rate": 3.6487868806538666e-05, "loss": 2.8471, "step": 108100 }, { "epoch": 0.8118308210595818, "grad_norm": 2.2222349643707275, "learning_rate": 3.647536371071212e-05, "loss": 2.7897, "step": 108200 }, { "epoch": 0.8125811268091747, "grad_norm": 2.0273375511169434, "learning_rate": 3.646285861488557e-05, "loss": 2.8668, "step": 108300 }, { "epoch": 0.8133314325587677, "grad_norm": 3.2324655055999756, "learning_rate": 3.6450353519059016e-05, "loss": 2.7935, "step": 108400 }, { "epoch": 0.8140817383083606, "grad_norm": 2.1587698459625244, "learning_rate": 3.643784842323247e-05, "loss": 2.7521, "step": 108500 }, { "epoch": 0.8148320440579536, "grad_norm": 1.7031806707382202, "learning_rate": 3.642534332740592e-05, "loss": 2.7805, "step": 108600 }, { "epoch": 0.8155823498075466, "grad_norm": 1.7716588973999023, "learning_rate": 3.641283823157937e-05, "loss": 2.7235, "step": 108700 }, { "epoch": 0.8163326555571395, "grad_norm": 2.1572530269622803, "learning_rate": 3.640033313575282e-05, "loss": 2.7115, "step": 108800 }, { "epoch": 0.8170829613067325, "grad_norm": 1.3882415294647217, "learning_rate": 3.638782803992627e-05, "loss": 2.7752, "step": 108900 }, { "epoch": 0.8178332670563254, "grad_norm": 1.4251195192337036, "learning_rate": 3.637532294409972e-05, "loss": 2.8132, "step": 109000 }, { "epoch": 0.8185835728059184, "grad_norm": 1.376570463180542, "learning_rate": 3.6362817848273176e-05, "loss": 2.7567, "step": 109100 }, { "epoch": 0.8193338785555113, "grad_norm": 2.498183250427246, "learning_rate": 3.6350312752446624e-05, "loss": 2.7279, "step": 109200 }, { "epoch": 0.8200841843051043, "grad_norm": 1.8549224138259888, "learning_rate": 3.633780765662008e-05, "loss": 2.7255, "step": 109300 }, { "epoch": 0.8208344900546973, "grad_norm": 2.7776100635528564, "learning_rate": 3.6325302560793526e-05, "loss": 2.7409, "step": 109400 }, { "epoch": 0.8215847958042902, "grad_norm": 1.5596375465393066, "learning_rate": 3.6312797464966974e-05, "loss": 2.6165, "step": 109500 }, { "epoch": 0.8223351015538832, "grad_norm": 2.800283670425415, "learning_rate": 3.630029236914043e-05, "loss": 2.7006, "step": 109600 }, { "epoch": 0.8230854073034761, "grad_norm": 1.8805420398712158, "learning_rate": 3.6287787273313876e-05, "loss": 2.7925, "step": 109700 }, { "epoch": 0.8238357130530691, "grad_norm": 3.022651433944702, "learning_rate": 3.627528217748733e-05, "loss": 2.7497, "step": 109800 }, { "epoch": 0.8245860188026621, "grad_norm": 1.5748002529144287, "learning_rate": 3.626277708166078e-05, "loss": 2.6758, "step": 109900 }, { "epoch": 0.825336324552255, "grad_norm": 1.6859346628189087, "learning_rate": 3.6250271985834225e-05, "loss": 2.5952, "step": 110000 }, { "epoch": 0.826086630301848, "grad_norm": 1.314246416091919, "learning_rate": 3.623776689000768e-05, "loss": 2.7463, "step": 110100 }, { "epoch": 0.8268369360514409, "grad_norm": 1.6181827783584595, "learning_rate": 3.6225261794181134e-05, "loss": 2.6968, "step": 110200 }, { "epoch": 0.8275872418010339, "grad_norm": 2.129987955093384, "learning_rate": 3.621275669835458e-05, "loss": 2.8191, "step": 110300 }, { "epoch": 0.8283375475506268, "grad_norm": 1.5471391677856445, "learning_rate": 3.620025160252803e-05, "loss": 2.8027, "step": 110400 }, { "epoch": 0.8290878533002198, "grad_norm": 3.0299088954925537, "learning_rate": 3.6187746506701484e-05, "loss": 2.7087, "step": 110500 }, { "epoch": 0.8298381590498128, "grad_norm": 2.2691524028778076, "learning_rate": 3.617524141087493e-05, "loss": 2.813, "step": 110600 }, { "epoch": 0.8305884647994057, "grad_norm": 1.398695945739746, "learning_rate": 3.6162736315048386e-05, "loss": 2.703, "step": 110700 }, { "epoch": 0.8313387705489987, "grad_norm": 2.483187437057495, "learning_rate": 3.6150231219221833e-05, "loss": 2.7383, "step": 110800 }, { "epoch": 0.8320890762985916, "grad_norm": 1.6507418155670166, "learning_rate": 3.613772612339529e-05, "loss": 2.731, "step": 110900 }, { "epoch": 0.8328393820481846, "grad_norm": 1.7470734119415283, "learning_rate": 3.6125221027568735e-05, "loss": 2.8131, "step": 111000 }, { "epoch": 0.8335896877977776, "grad_norm": 1.7923749685287476, "learning_rate": 3.611271593174218e-05, "loss": 2.7639, "step": 111100 }, { "epoch": 0.8343399935473705, "grad_norm": 1.8916916847229004, "learning_rate": 3.61003358868739e-05, "loss": 2.9058, "step": 111200 }, { "epoch": 0.8350902992969635, "grad_norm": 1.629653811454773, "learning_rate": 3.6087830791047355e-05, "loss": 2.6957, "step": 111300 }, { "epoch": 0.8358406050465564, "grad_norm": 2.6775405406951904, "learning_rate": 3.60753256952208e-05, "loss": 2.7495, "step": 111400 }, { "epoch": 0.8365909107961494, "grad_norm": 1.5699349641799927, "learning_rate": 3.606282059939426e-05, "loss": 2.8189, "step": 111500 }, { "epoch": 0.8373412165457423, "grad_norm": 1.9303905963897705, "learning_rate": 3.6050315503567705e-05, "loss": 2.8251, "step": 111600 }, { "epoch": 0.8380915222953353, "grad_norm": 2.170698881149292, "learning_rate": 3.603793545869942e-05, "loss": 2.7956, "step": 111700 }, { "epoch": 0.8388418280449284, "grad_norm": 2.0006892681121826, "learning_rate": 3.6025430362872876e-05, "loss": 2.8855, "step": 111800 }, { "epoch": 0.8395921337945212, "grad_norm": 1.8442364931106567, "learning_rate": 3.6012925267046324e-05, "loss": 2.8309, "step": 111900 }, { "epoch": 0.8403424395441143, "grad_norm": 2.04801344871521, "learning_rate": 3.600042017121977e-05, "loss": 2.7767, "step": 112000 }, { "epoch": 0.8410927452937071, "grad_norm": 1.5607504844665527, "learning_rate": 3.5987915075393226e-05, "loss": 2.8049, "step": 112100 }, { "epoch": 0.8418430510433002, "grad_norm": 1.4535925388336182, "learning_rate": 3.5975409979566674e-05, "loss": 2.9276, "step": 112200 }, { "epoch": 0.8425933567928932, "grad_norm": 1.7389081716537476, "learning_rate": 3.596290488374013e-05, "loss": 2.8562, "step": 112300 }, { "epoch": 0.843343662542486, "grad_norm": 1.9134509563446045, "learning_rate": 3.5950399787913576e-05, "loss": 2.7089, "step": 112400 }, { "epoch": 0.844093968292079, "grad_norm": 1.8674659729003906, "learning_rate": 3.5937894692087023e-05, "loss": 2.6855, "step": 112500 }, { "epoch": 0.844844274041672, "grad_norm": 2.053494691848755, "learning_rate": 3.592538959626048e-05, "loss": 2.5593, "step": 112600 }, { "epoch": 0.845594579791265, "grad_norm": 1.7737799882888794, "learning_rate": 3.591288450043393e-05, "loss": 2.7533, "step": 112700 }, { "epoch": 0.8463448855408578, "grad_norm": 2.3246850967407227, "learning_rate": 3.590037940460738e-05, "loss": 2.7442, "step": 112800 }, { "epoch": 0.8470951912904509, "grad_norm": 3.980879306793213, "learning_rate": 3.588787430878083e-05, "loss": 2.7195, "step": 112900 }, { "epoch": 0.8478454970400439, "grad_norm": 1.8886125087738037, "learning_rate": 3.587536921295428e-05, "loss": 2.6668, "step": 113000 }, { "epoch": 0.8485958027896368, "grad_norm": 2.0963170528411865, "learning_rate": 3.586286411712773e-05, "loss": 2.8413, "step": 113100 }, { "epoch": 0.8493461085392298, "grad_norm": 3.9102346897125244, "learning_rate": 3.5850359021301184e-05, "loss": 2.7442, "step": 113200 }, { "epoch": 0.8500964142888227, "grad_norm": 1.681741714477539, "learning_rate": 3.583785392547463e-05, "loss": 2.81, "step": 113300 }, { "epoch": 0.8508467200384157, "grad_norm": 1.4679710865020752, "learning_rate": 3.582534882964808e-05, "loss": 2.7688, "step": 113400 }, { "epoch": 0.8515970257880087, "grad_norm": 1.5894734859466553, "learning_rate": 3.5812843733821534e-05, "loss": 2.7726, "step": 113500 }, { "epoch": 0.8523473315376016, "grad_norm": 1.584130883216858, "learning_rate": 3.580033863799498e-05, "loss": 2.7338, "step": 113600 }, { "epoch": 0.8530976372871946, "grad_norm": 1.592331051826477, "learning_rate": 3.5787833542168436e-05, "loss": 2.871, "step": 113700 }, { "epoch": 0.8538479430367875, "grad_norm": 3.182648181915283, "learning_rate": 3.577532844634189e-05, "loss": 2.6299, "step": 113800 }, { "epoch": 0.8545982487863805, "grad_norm": 1.4650589227676392, "learning_rate": 3.576282335051534e-05, "loss": 2.6372, "step": 113900 }, { "epoch": 0.8553485545359734, "grad_norm": 1.7067883014678955, "learning_rate": 3.5750318254688785e-05, "loss": 2.639, "step": 114000 }, { "epoch": 0.8560988602855664, "grad_norm": 1.8653980493545532, "learning_rate": 3.573781315886224e-05, "loss": 2.7888, "step": 114100 }, { "epoch": 0.8568491660351594, "grad_norm": 1.8871147632598877, "learning_rate": 3.572530806303569e-05, "loss": 2.5622, "step": 114200 }, { "epoch": 0.8575994717847523, "grad_norm": 1.5918534994125366, "learning_rate": 3.571280296720914e-05, "loss": 2.8584, "step": 114300 }, { "epoch": 0.8583497775343453, "grad_norm": 1.4106186628341675, "learning_rate": 3.570029787138259e-05, "loss": 2.8207, "step": 114400 }, { "epoch": 0.8591000832839382, "grad_norm": 1.7402763366699219, "learning_rate": 3.568779277555604e-05, "loss": 2.6892, "step": 114500 }, { "epoch": 0.8598503890335312, "grad_norm": 2.118518352508545, "learning_rate": 3.567528767972949e-05, "loss": 2.8322, "step": 114600 }, { "epoch": 0.8606006947831242, "grad_norm": 2.4325497150421143, "learning_rate": 3.5662782583902946e-05, "loss": 2.7736, "step": 114700 }, { "epoch": 0.8613510005327171, "grad_norm": 1.4302210807800293, "learning_rate": 3.5650277488076393e-05, "loss": 2.7406, "step": 114800 }, { "epoch": 0.8621013062823101, "grad_norm": 1.4650733470916748, "learning_rate": 3.563777239224984e-05, "loss": 2.6857, "step": 114900 }, { "epoch": 0.862851612031903, "grad_norm": 2.904836893081665, "learning_rate": 3.5625267296423295e-05, "loss": 2.6409, "step": 115000 }, { "epoch": 0.863601917781496, "grad_norm": 2.1441166400909424, "learning_rate": 3.561276220059674e-05, "loss": 2.7514, "step": 115100 }, { "epoch": 0.8643522235310889, "grad_norm": 1.2985255718231201, "learning_rate": 3.56002571047702e-05, "loss": 2.7248, "step": 115200 }, { "epoch": 0.8651025292806819, "grad_norm": 1.8815816640853882, "learning_rate": 3.5587752008943645e-05, "loss": 2.7418, "step": 115300 }, { "epoch": 0.8658528350302749, "grad_norm": 1.7480928897857666, "learning_rate": 3.55752469131171e-05, "loss": 2.6493, "step": 115400 }, { "epoch": 0.8666031407798678, "grad_norm": 1.5544555187225342, "learning_rate": 3.556274181729055e-05, "loss": 2.7223, "step": 115500 }, { "epoch": 0.8673534465294608, "grad_norm": 1.6721489429473877, "learning_rate": 3.5550236721463995e-05, "loss": 2.6696, "step": 115600 }, { "epoch": 0.8681037522790537, "grad_norm": 1.9093917608261108, "learning_rate": 3.553785667659571e-05, "loss": 2.7351, "step": 115700 }, { "epoch": 0.8688540580286467, "grad_norm": 1.7973906993865967, "learning_rate": 3.5525351580769167e-05, "loss": 2.7471, "step": 115800 }, { "epoch": 0.8696043637782397, "grad_norm": 2.92752742767334, "learning_rate": 3.5512846484942614e-05, "loss": 2.7132, "step": 115900 }, { "epoch": 0.8703546695278326, "grad_norm": 1.45115327835083, "learning_rate": 3.550034138911606e-05, "loss": 2.8377, "step": 116000 }, { "epoch": 0.8711049752774256, "grad_norm": 1.5277782678604126, "learning_rate": 3.5487836293289516e-05, "loss": 2.8493, "step": 116100 }, { "epoch": 0.8718552810270185, "grad_norm": 2.604386806488037, "learning_rate": 3.547533119746297e-05, "loss": 2.8579, "step": 116200 }, { "epoch": 0.8726055867766115, "grad_norm": 1.9556127786636353, "learning_rate": 3.546282610163642e-05, "loss": 2.8713, "step": 116300 }, { "epoch": 0.8733558925262044, "grad_norm": 1.7837570905685425, "learning_rate": 3.5450321005809866e-05, "loss": 2.8298, "step": 116400 }, { "epoch": 0.8741061982757974, "grad_norm": 1.6068944931030273, "learning_rate": 3.543781590998332e-05, "loss": 2.8054, "step": 116500 }, { "epoch": 0.8748565040253904, "grad_norm": 2.4902660846710205, "learning_rate": 3.542531081415677e-05, "loss": 2.8759, "step": 116600 }, { "epoch": 0.8756068097749833, "grad_norm": 2.2662692070007324, "learning_rate": 3.541280571833022e-05, "loss": 2.7999, "step": 116700 }, { "epoch": 0.8763571155245763, "grad_norm": 1.5675501823425293, "learning_rate": 3.540030062250368e-05, "loss": 2.6321, "step": 116800 }, { "epoch": 0.8771074212741692, "grad_norm": 1.9347895383834839, "learning_rate": 3.538779552667712e-05, "loss": 2.7158, "step": 116900 }, { "epoch": 0.8778577270237622, "grad_norm": 2.0741679668426514, "learning_rate": 3.5375415481808835e-05, "loss": 2.7829, "step": 117000 }, { "epoch": 0.8786080327733552, "grad_norm": 2.197049856185913, "learning_rate": 3.536291038598229e-05, "loss": 2.6707, "step": 117100 }, { "epoch": 0.8793583385229481, "grad_norm": 1.7421218156814575, "learning_rate": 3.5350405290155744e-05, "loss": 2.8366, "step": 117200 }, { "epoch": 0.8801086442725411, "grad_norm": 1.8908511400222778, "learning_rate": 3.533790019432919e-05, "loss": 2.7173, "step": 117300 }, { "epoch": 0.880858950022134, "grad_norm": 2.1893765926361084, "learning_rate": 3.532539509850264e-05, "loss": 2.8404, "step": 117400 }, { "epoch": 0.881609255771727, "grad_norm": 2.7410125732421875, "learning_rate": 3.5312890002676093e-05, "loss": 2.7373, "step": 117500 }, { "epoch": 0.8823595615213199, "grad_norm": 1.7203521728515625, "learning_rate": 3.530038490684954e-05, "loss": 2.7673, "step": 117600 }, { "epoch": 0.8831098672709129, "grad_norm": 2.619187116622925, "learning_rate": 3.5287879811022996e-05, "loss": 2.7265, "step": 117700 }, { "epoch": 0.8838601730205059, "grad_norm": 2.001718521118164, "learning_rate": 3.527537471519644e-05, "loss": 2.5874, "step": 117800 }, { "epoch": 0.8846104787700988, "grad_norm": 2.6792314052581787, "learning_rate": 3.526286961936989e-05, "loss": 2.872, "step": 117900 }, { "epoch": 0.8853607845196918, "grad_norm": 2.03922176361084, "learning_rate": 3.5250364523543345e-05, "loss": 2.6773, "step": 118000 }, { "epoch": 0.8861110902692847, "grad_norm": 2.1245734691619873, "learning_rate": 3.523785942771679e-05, "loss": 2.7643, "step": 118100 }, { "epoch": 0.8868613960188777, "grad_norm": 1.7129524946212769, "learning_rate": 3.522535433189025e-05, "loss": 2.7724, "step": 118200 }, { "epoch": 0.8876117017684707, "grad_norm": 1.176259160041809, "learning_rate": 3.52128492360637e-05, "loss": 2.633, "step": 118300 }, { "epoch": 0.8883620075180636, "grad_norm": 2.3534629344940186, "learning_rate": 3.520034414023715e-05, "loss": 2.644, "step": 118400 }, { "epoch": 0.8891123132676566, "grad_norm": 1.2546147108078003, "learning_rate": 3.51878390444106e-05, "loss": 2.7668, "step": 118500 }, { "epoch": 0.8898626190172495, "grad_norm": 1.2196617126464844, "learning_rate": 3.517533394858405e-05, "loss": 2.9013, "step": 118600 }, { "epoch": 0.8906129247668425, "grad_norm": 1.4486210346221924, "learning_rate": 3.51628288527575e-05, "loss": 2.6838, "step": 118700 }, { "epoch": 0.8913632305164354, "grad_norm": 1.8811619281768799, "learning_rate": 3.515032375693095e-05, "loss": 2.7806, "step": 118800 }, { "epoch": 0.8921135362660284, "grad_norm": 1.6414068937301636, "learning_rate": 3.51378186611044e-05, "loss": 2.7749, "step": 118900 }, { "epoch": 0.8928638420156214, "grad_norm": 3.2242887020111084, "learning_rate": 3.512531356527785e-05, "loss": 2.7803, "step": 119000 }, { "epoch": 0.8936141477652143, "grad_norm": 1.710803747177124, "learning_rate": 3.51128084694513e-05, "loss": 2.8461, "step": 119100 }, { "epoch": 0.8943644535148073, "grad_norm": 2.035167694091797, "learning_rate": 3.510030337362476e-05, "loss": 2.5624, "step": 119200 }, { "epoch": 0.8951147592644002, "grad_norm": 1.6628140211105347, "learning_rate": 3.5087798277798205e-05, "loss": 2.8581, "step": 119300 }, { "epoch": 0.8958650650139932, "grad_norm": 1.6191177368164062, "learning_rate": 3.507529318197165e-05, "loss": 2.6881, "step": 119400 }, { "epoch": 0.8966153707635861, "grad_norm": 1.971297025680542, "learning_rate": 3.506278808614511e-05, "loss": 2.6495, "step": 119500 }, { "epoch": 0.8973656765131791, "grad_norm": 2.498589038848877, "learning_rate": 3.5050282990318555e-05, "loss": 2.8287, "step": 119600 }, { "epoch": 0.8981159822627721, "grad_norm": 1.8486723899841309, "learning_rate": 3.503777789449201e-05, "loss": 2.6908, "step": 119700 }, { "epoch": 0.898866288012365, "grad_norm": 1.3110040426254272, "learning_rate": 3.502527279866546e-05, "loss": 2.7129, "step": 119800 }, { "epoch": 0.899616593761958, "grad_norm": 1.3623932600021362, "learning_rate": 3.501276770283891e-05, "loss": 2.693, "step": 119900 }, { "epoch": 0.9003668995115509, "grad_norm": 1.754114031791687, "learning_rate": 3.500026260701236e-05, "loss": 2.7665, "step": 120000 }, { "epoch": 0.9011172052611439, "grad_norm": 2.236846446990967, "learning_rate": 3.4987757511185806e-05, "loss": 2.6589, "step": 120100 }, { "epoch": 0.9018675110107369, "grad_norm": 2.0746328830718994, "learning_rate": 3.497525241535926e-05, "loss": 2.8308, "step": 120200 }, { "epoch": 0.9026178167603298, "grad_norm": 1.9457752704620361, "learning_rate": 3.4962747319532715e-05, "loss": 2.7326, "step": 120300 }, { "epoch": 0.9033681225099228, "grad_norm": 1.267945647239685, "learning_rate": 3.495024222370616e-05, "loss": 2.8153, "step": 120400 }, { "epoch": 0.9041184282595157, "grad_norm": 1.6235109567642212, "learning_rate": 3.493773712787961e-05, "loss": 2.7465, "step": 120500 }, { "epoch": 0.9048687340091087, "grad_norm": 1.9797066450119019, "learning_rate": 3.4925232032053065e-05, "loss": 2.7858, "step": 120600 }, { "epoch": 0.9056190397587016, "grad_norm": 2.58907413482666, "learning_rate": 3.491272693622651e-05, "loss": 2.6851, "step": 120700 }, { "epoch": 0.9063693455082946, "grad_norm": 2.7708187103271484, "learning_rate": 3.490022184039997e-05, "loss": 2.6085, "step": 120800 }, { "epoch": 0.9071196512578876, "grad_norm": 1.9907089471817017, "learning_rate": 3.4887716744573415e-05, "loss": 2.7507, "step": 120900 }, { "epoch": 0.9078699570074805, "grad_norm": 1.5083345174789429, "learning_rate": 3.487533669970513e-05, "loss": 2.7097, "step": 121000 }, { "epoch": 0.9086202627570735, "grad_norm": 2.59026837348938, "learning_rate": 3.486283160387858e-05, "loss": 2.7313, "step": 121100 }, { "epoch": 0.9093705685066664, "grad_norm": 1.6815210580825806, "learning_rate": 3.4850326508052034e-05, "loss": 2.8571, "step": 121200 }, { "epoch": 0.9101208742562594, "grad_norm": 1.633857250213623, "learning_rate": 3.483782141222549e-05, "loss": 2.6137, "step": 121300 }, { "epoch": 0.9108711800058524, "grad_norm": 1.7087880373001099, "learning_rate": 3.482531631639893e-05, "loss": 2.8092, "step": 121400 }, { "epoch": 0.9116214857554453, "grad_norm": 1.9175612926483154, "learning_rate": 3.4812811220572384e-05, "loss": 2.8063, "step": 121500 }, { "epoch": 0.9123717915050383, "grad_norm": 1.7769569158554077, "learning_rate": 3.480030612474584e-05, "loss": 2.6946, "step": 121600 }, { "epoch": 0.9131220972546312, "grad_norm": 2.1602511405944824, "learning_rate": 3.4787801028919286e-05, "loss": 2.6779, "step": 121700 }, { "epoch": 0.9138724030042242, "grad_norm": 1.7881032228469849, "learning_rate": 3.477529593309274e-05, "loss": 2.6756, "step": 121800 }, { "epoch": 0.9146227087538171, "grad_norm": 1.79860258102417, "learning_rate": 3.476279083726619e-05, "loss": 2.7765, "step": 121900 }, { "epoch": 0.9153730145034101, "grad_norm": 2.142787218093872, "learning_rate": 3.4750285741439635e-05, "loss": 2.726, "step": 122000 }, { "epoch": 0.9161233202530031, "grad_norm": 2.309195041656494, "learning_rate": 3.473778064561309e-05, "loss": 2.7215, "step": 122100 }, { "epoch": 0.916873626002596, "grad_norm": 1.4414927959442139, "learning_rate": 3.472527554978654e-05, "loss": 2.6535, "step": 122200 }, { "epoch": 0.917623931752189, "grad_norm": 1.08432137966156, "learning_rate": 3.471277045395999e-05, "loss": 2.7649, "step": 122300 }, { "epoch": 0.9183742375017819, "grad_norm": 2.3241069316864014, "learning_rate": 3.4700265358133446e-05, "loss": 2.7587, "step": 122400 }, { "epoch": 0.9191245432513749, "grad_norm": 1.723753571510315, "learning_rate": 3.468776026230689e-05, "loss": 2.8067, "step": 122500 }, { "epoch": 0.9198748490009679, "grad_norm": 1.2296757698059082, "learning_rate": 3.467525516648034e-05, "loss": 2.8633, "step": 122600 }, { "epoch": 0.9206251547505608, "grad_norm": 2.3831863403320312, "learning_rate": 3.4662750070653796e-05, "loss": 2.7048, "step": 122700 }, { "epoch": 0.9213754605001538, "grad_norm": 1.888709545135498, "learning_rate": 3.4650244974827243e-05, "loss": 2.7692, "step": 122800 }, { "epoch": 0.9221257662497467, "grad_norm": 2.1730172634124756, "learning_rate": 3.4637864929958954e-05, "loss": 2.8675, "step": 122900 }, { "epoch": 0.9228760719993397, "grad_norm": 1.3621282577514648, "learning_rate": 3.462535983413241e-05, "loss": 2.7341, "step": 123000 }, { "epoch": 0.9236263777489326, "grad_norm": 2.672424793243408, "learning_rate": 3.461285473830586e-05, "loss": 2.7929, "step": 123100 }, { "epoch": 0.9243766834985256, "grad_norm": 1.5711749792099, "learning_rate": 3.460034964247931e-05, "loss": 2.7919, "step": 123200 }, { "epoch": 0.9251269892481186, "grad_norm": 1.7094898223876953, "learning_rate": 3.4587844546652765e-05, "loss": 2.7579, "step": 123300 }, { "epoch": 0.9258772949977115, "grad_norm": 2.183225393295288, "learning_rate": 3.457533945082621e-05, "loss": 2.6047, "step": 123400 }, { "epoch": 0.9266276007473045, "grad_norm": 1.210848093032837, "learning_rate": 3.456283435499966e-05, "loss": 2.8074, "step": 123500 }, { "epoch": 0.9273779064968974, "grad_norm": 2.3928842544555664, "learning_rate": 3.4550329259173115e-05, "loss": 2.8202, "step": 123600 }, { "epoch": 0.9281282122464904, "grad_norm": 1.5313529968261719, "learning_rate": 3.453782416334657e-05, "loss": 2.7452, "step": 123700 }, { "epoch": 0.9288785179960835, "grad_norm": 2.0440316200256348, "learning_rate": 3.452531906752002e-05, "loss": 2.6871, "step": 123800 }, { "epoch": 0.9296288237456763, "grad_norm": 2.232879877090454, "learning_rate": 3.4512813971693464e-05, "loss": 2.7543, "step": 123900 }, { "epoch": 0.9303791294952694, "grad_norm": 2.020946502685547, "learning_rate": 3.450030887586692e-05, "loss": 2.7296, "step": 124000 }, { "epoch": 0.9311294352448622, "grad_norm": 2.312572956085205, "learning_rate": 3.4487803780040366e-05, "loss": 2.5691, "step": 124100 }, { "epoch": 0.9318797409944553, "grad_norm": 1.8636842966079712, "learning_rate": 3.447529868421382e-05, "loss": 2.7494, "step": 124200 }, { "epoch": 0.9326300467440481, "grad_norm": 1.940819263458252, "learning_rate": 3.446279358838727e-05, "loss": 2.6738, "step": 124300 }, { "epoch": 0.9333803524936412, "grad_norm": 3.223583936691284, "learning_rate": 3.445028849256072e-05, "loss": 2.7724, "step": 124400 }, { "epoch": 0.9341306582432342, "grad_norm": 3.002791166305542, "learning_rate": 3.443778339673417e-05, "loss": 2.7523, "step": 124500 }, { "epoch": 0.934880963992827, "grad_norm": 2.473830461502075, "learning_rate": 3.442527830090762e-05, "loss": 2.6439, "step": 124600 }, { "epoch": 0.9356312697424201, "grad_norm": 2.219259738922119, "learning_rate": 3.441277320508107e-05, "loss": 2.8272, "step": 124700 }, { "epoch": 0.936381575492013, "grad_norm": 1.5327974557876587, "learning_rate": 3.440026810925453e-05, "loss": 2.5621, "step": 124800 }, { "epoch": 0.937131881241606, "grad_norm": 1.6243633031845093, "learning_rate": 3.4387763013427974e-05, "loss": 2.7055, "step": 124900 }, { "epoch": 0.937882186991199, "grad_norm": 1.8217263221740723, "learning_rate": 3.437525791760142e-05, "loss": 2.6285, "step": 125000 }, { "epoch": 0.9386324927407919, "grad_norm": 2.458644151687622, "learning_rate": 3.4362752821774876e-05, "loss": 2.7319, "step": 125100 }, { "epoch": 0.9393827984903849, "grad_norm": 1.52069091796875, "learning_rate": 3.4350247725948324e-05, "loss": 2.7196, "step": 125200 }, { "epoch": 0.9401331042399778, "grad_norm": 1.9588080644607544, "learning_rate": 3.433774263012178e-05, "loss": 2.8311, "step": 125300 }, { "epoch": 0.9408834099895708, "grad_norm": 1.884798526763916, "learning_rate": 3.4325237534295226e-05, "loss": 2.7054, "step": 125400 }, { "epoch": 0.9416337157391637, "grad_norm": 3.353766441345215, "learning_rate": 3.4312732438468674e-05, "loss": 2.7921, "step": 125500 }, { "epoch": 0.9423840214887567, "grad_norm": 3.6426191329956055, "learning_rate": 3.430022734264213e-05, "loss": 2.6437, "step": 125600 }, { "epoch": 0.9431343272383497, "grad_norm": 1.5539125204086304, "learning_rate": 3.4287722246815576e-05, "loss": 2.7212, "step": 125700 }, { "epoch": 0.9438846329879426, "grad_norm": 1.9553228616714478, "learning_rate": 3.427521715098903e-05, "loss": 2.5891, "step": 125800 }, { "epoch": 0.9446349387375356, "grad_norm": 1.6018445491790771, "learning_rate": 3.4262712055162485e-05, "loss": 2.9034, "step": 125900 }, { "epoch": 0.9453852444871285, "grad_norm": 1.3278180360794067, "learning_rate": 3.4250206959335925e-05, "loss": 2.7852, "step": 126000 }, { "epoch": 0.9461355502367215, "grad_norm": 2.068157911300659, "learning_rate": 3.423770186350938e-05, "loss": 2.9549, "step": 126100 }, { "epoch": 0.9468858559863145, "grad_norm": 2.098524332046509, "learning_rate": 3.4225196767682834e-05, "loss": 2.8356, "step": 126200 }, { "epoch": 0.9476361617359074, "grad_norm": 1.5843207836151123, "learning_rate": 3.421269167185628e-05, "loss": 2.72, "step": 126300 }, { "epoch": 0.9483864674855004, "grad_norm": 2.073000192642212, "learning_rate": 3.4200186576029736e-05, "loss": 2.7315, "step": 126400 }, { "epoch": 0.9491367732350933, "grad_norm": 1.7920998334884644, "learning_rate": 3.4187681480203184e-05, "loss": 2.8011, "step": 126500 }, { "epoch": 0.9498870789846863, "grad_norm": 1.8779215812683105, "learning_rate": 3.417517638437663e-05, "loss": 2.7343, "step": 126600 }, { "epoch": 0.9506373847342792, "grad_norm": 1.498293399810791, "learning_rate": 3.4162671288550086e-05, "loss": 2.7615, "step": 126700 }, { "epoch": 0.9513876904838722, "grad_norm": 2.3741636276245117, "learning_rate": 3.415016619272354e-05, "loss": 2.7009, "step": 126800 }, { "epoch": 0.9521379962334652, "grad_norm": 1.8603261709213257, "learning_rate": 3.413778614785526e-05, "loss": 2.6816, "step": 126900 }, { "epoch": 0.9528883019830581, "grad_norm": 1.4357452392578125, "learning_rate": 3.41252810520287e-05, "loss": 2.7732, "step": 127000 }, { "epoch": 0.9536386077326511, "grad_norm": 2.0429441928863525, "learning_rate": 3.411277595620215e-05, "loss": 2.7165, "step": 127100 }, { "epoch": 0.954388913482244, "grad_norm": 2.1929800510406494, "learning_rate": 3.410027086037561e-05, "loss": 2.73, "step": 127200 }, { "epoch": 0.955139219231837, "grad_norm": 2.2219526767730713, "learning_rate": 3.4087765764549055e-05, "loss": 2.7274, "step": 127300 }, { "epoch": 0.95588952498143, "grad_norm": 1.6733287572860718, "learning_rate": 3.407526066872251e-05, "loss": 2.8383, "step": 127400 }, { "epoch": 0.9566398307310229, "grad_norm": 1.4969576597213745, "learning_rate": 3.406275557289596e-05, "loss": 2.6044, "step": 127500 }, { "epoch": 0.9573901364806159, "grad_norm": 2.2411766052246094, "learning_rate": 3.4050250477069405e-05, "loss": 2.7348, "step": 127600 }, { "epoch": 0.9581404422302088, "grad_norm": 2.702436923980713, "learning_rate": 3.403787043220112e-05, "loss": 2.6564, "step": 127700 }, { "epoch": 0.9588907479798018, "grad_norm": 1.330409049987793, "learning_rate": 3.4025365336374577e-05, "loss": 2.6736, "step": 127800 }, { "epoch": 0.9596410537293947, "grad_norm": 2.041105270385742, "learning_rate": 3.4012860240548024e-05, "loss": 2.7199, "step": 127900 }, { "epoch": 0.9603913594789877, "grad_norm": 2.0149054527282715, "learning_rate": 3.400035514472147e-05, "loss": 2.7179, "step": 128000 }, { "epoch": 0.9611416652285807, "grad_norm": 1.575789213180542, "learning_rate": 3.3987850048894926e-05, "loss": 2.6443, "step": 128100 }, { "epoch": 0.9618919709781736, "grad_norm": 2.064389228820801, "learning_rate": 3.397534495306838e-05, "loss": 2.6911, "step": 128200 }, { "epoch": 0.9626422767277666, "grad_norm": 1.7309434413909912, "learning_rate": 3.396283985724183e-05, "loss": 2.9392, "step": 128300 }, { "epoch": 0.9633925824773595, "grad_norm": 2.3173775672912598, "learning_rate": 3.3950334761415276e-05, "loss": 2.6843, "step": 128400 }, { "epoch": 0.9641428882269525, "grad_norm": 1.5269014835357666, "learning_rate": 3.393782966558873e-05, "loss": 2.6698, "step": 128500 }, { "epoch": 0.9648931939765455, "grad_norm": 1.3337008953094482, "learning_rate": 3.392532456976218e-05, "loss": 2.7162, "step": 128600 }, { "epoch": 0.9656434997261384, "grad_norm": 1.8574769496917725, "learning_rate": 3.391281947393563e-05, "loss": 2.7265, "step": 128700 }, { "epoch": 0.9663938054757314, "grad_norm": 1.9578945636749268, "learning_rate": 3.390031437810908e-05, "loss": 2.8333, "step": 128800 }, { "epoch": 0.9671441112253243, "grad_norm": 2.1523947715759277, "learning_rate": 3.3887809282282534e-05, "loss": 2.7439, "step": 128900 }, { "epoch": 0.9678944169749173, "grad_norm": 1.7783178091049194, "learning_rate": 3.387530418645598e-05, "loss": 2.6465, "step": 129000 }, { "epoch": 0.9686447227245102, "grad_norm": 1.7966774702072144, "learning_rate": 3.386279909062943e-05, "loss": 2.6035, "step": 129100 }, { "epoch": 0.9693950284741032, "grad_norm": 1.3420403003692627, "learning_rate": 3.3850293994802884e-05, "loss": 2.8436, "step": 129200 }, { "epoch": 0.9701453342236962, "grad_norm": 1.2740237712860107, "learning_rate": 3.383778889897634e-05, "loss": 2.6052, "step": 129300 }, { "epoch": 0.9708956399732891, "grad_norm": 1.5517909526824951, "learning_rate": 3.3825283803149786e-05, "loss": 2.6122, "step": 129400 }, { "epoch": 0.9716459457228821, "grad_norm": 1.6507161855697632, "learning_rate": 3.3812778707323234e-05, "loss": 2.8153, "step": 129500 }, { "epoch": 0.972396251472475, "grad_norm": 2.305720090866089, "learning_rate": 3.380027361149669e-05, "loss": 2.6873, "step": 129600 }, { "epoch": 0.973146557222068, "grad_norm": 1.7731585502624512, "learning_rate": 3.3787768515670136e-05, "loss": 2.6995, "step": 129700 }, { "epoch": 0.973896862971661, "grad_norm": 2.615781307220459, "learning_rate": 3.377526341984359e-05, "loss": 2.6954, "step": 129800 }, { "epoch": 0.9746471687212539, "grad_norm": 2.7829885482788086, "learning_rate": 3.376275832401704e-05, "loss": 2.8878, "step": 129900 }, { "epoch": 0.9753974744708469, "grad_norm": 1.8197336196899414, "learning_rate": 3.3750253228190485e-05, "loss": 2.8647, "step": 130000 }, { "epoch": 0.9761477802204398, "grad_norm": 2.19930362701416, "learning_rate": 3.373774813236394e-05, "loss": 2.7187, "step": 130100 }, { "epoch": 0.9768980859700328, "grad_norm": 1.6864876747131348, "learning_rate": 3.372524303653739e-05, "loss": 2.6297, "step": 130200 }, { "epoch": 0.9776483917196257, "grad_norm": 1.5000112056732178, "learning_rate": 3.371273794071084e-05, "loss": 2.8038, "step": 130300 }, { "epoch": 0.9783986974692187, "grad_norm": 2.4078569412231445, "learning_rate": 3.3700232844884296e-05, "loss": 2.7426, "step": 130400 }, { "epoch": 0.9791490032188117, "grad_norm": 2.2380216121673584, "learning_rate": 3.368772774905774e-05, "loss": 2.7563, "step": 130500 }, { "epoch": 0.9798993089684046, "grad_norm": 2.6041879653930664, "learning_rate": 3.367522265323119e-05, "loss": 2.7624, "step": 130600 }, { "epoch": 0.9806496147179976, "grad_norm": 2.046618938446045, "learning_rate": 3.366284260836291e-05, "loss": 2.8178, "step": 130700 }, { "epoch": 0.9813999204675905, "grad_norm": 1.5017857551574707, "learning_rate": 3.365033751253636e-05, "loss": 2.8721, "step": 130800 }, { "epoch": 0.9821502262171835, "grad_norm": 3.5542492866516113, "learning_rate": 3.363783241670981e-05, "loss": 2.6956, "step": 130900 }, { "epoch": 0.9829005319667765, "grad_norm": 1.5019207000732422, "learning_rate": 3.362532732088326e-05, "loss": 2.6742, "step": 131000 }, { "epoch": 0.9836508377163694, "grad_norm": 1.6145564317703247, "learning_rate": 3.361282222505671e-05, "loss": 2.8067, "step": 131100 }, { "epoch": 0.9844011434659624, "grad_norm": 2.8579325675964355, "learning_rate": 3.360031712923016e-05, "loss": 2.7895, "step": 131200 }, { "epoch": 0.9851514492155553, "grad_norm": 1.9037461280822754, "learning_rate": 3.3587812033403615e-05, "loss": 2.6752, "step": 131300 }, { "epoch": 0.9859017549651483, "grad_norm": 1.7724988460540771, "learning_rate": 3.357530693757707e-05, "loss": 2.7833, "step": 131400 }, { "epoch": 0.9866520607147412, "grad_norm": 2.6155927181243896, "learning_rate": 3.356280184175051e-05, "loss": 2.6528, "step": 131500 }, { "epoch": 0.9874023664643342, "grad_norm": 2.003039836883545, "learning_rate": 3.3550296745923965e-05, "loss": 2.7613, "step": 131600 }, { "epoch": 0.9881526722139272, "grad_norm": 3.3196945190429688, "learning_rate": 3.353779165009742e-05, "loss": 2.6781, "step": 131700 }, { "epoch": 0.9889029779635201, "grad_norm": 2.918325424194336, "learning_rate": 3.352528655427087e-05, "loss": 2.8743, "step": 131800 }, { "epoch": 0.9896532837131131, "grad_norm": 2.9206578731536865, "learning_rate": 3.351278145844432e-05, "loss": 2.7875, "step": 131900 }, { "epoch": 0.990403589462706, "grad_norm": 2.351151943206787, "learning_rate": 3.350027636261777e-05, "loss": 2.7638, "step": 132000 }, { "epoch": 0.991153895212299, "grad_norm": 2.501549243927002, "learning_rate": 3.3487771266791216e-05, "loss": 2.75, "step": 132100 }, { "epoch": 0.991904200961892, "grad_norm": 1.7017168998718262, "learning_rate": 3.347526617096467e-05, "loss": 2.8501, "step": 132200 }, { "epoch": 0.9926545067114849, "grad_norm": 1.4831713438034058, "learning_rate": 3.346276107513812e-05, "loss": 2.6232, "step": 132300 }, { "epoch": 0.9934048124610779, "grad_norm": 2.125901699066162, "learning_rate": 3.345025597931157e-05, "loss": 2.829, "step": 132400 }, { "epoch": 0.9941551182106708, "grad_norm": 2.015575408935547, "learning_rate": 3.343775088348502e-05, "loss": 2.6262, "step": 132500 }, { "epoch": 0.9949054239602638, "grad_norm": 1.8156108856201172, "learning_rate": 3.342524578765847e-05, "loss": 2.8016, "step": 132600 }, { "epoch": 0.9956557297098567, "grad_norm": 1.3836588859558105, "learning_rate": 3.341274069183192e-05, "loss": 2.847, "step": 132700 }, { "epoch": 0.9964060354594497, "grad_norm": 3.261204957962036, "learning_rate": 3.340023559600538e-05, "loss": 2.7517, "step": 132800 }, { "epoch": 0.9971563412090427, "grad_norm": 2.947970390319824, "learning_rate": 3.3387730500178825e-05, "loss": 2.6518, "step": 132900 }, { "epoch": 0.9979066469586356, "grad_norm": 2.032365322113037, "learning_rate": 3.337522540435227e-05, "loss": 2.7161, "step": 133000 }, { "epoch": 0.9986569527082286, "grad_norm": 1.984210729598999, "learning_rate": 3.3362720308525727e-05, "loss": 2.7655, "step": 133100 }, { "epoch": 0.9994072584578215, "grad_norm": 1.9773662090301514, "learning_rate": 3.3350215212699174e-05, "loss": 2.7798, "step": 133200 }, { "epoch": 1.0001575642074145, "grad_norm": 1.506361722946167, "learning_rate": 3.333771011687263e-05, "loss": 2.7336, "step": 133300 }, { "epoch": 1.0009078699570075, "grad_norm": 2.4625422954559326, "learning_rate": 3.332520502104608e-05, "loss": 2.8143, "step": 133400 }, { "epoch": 1.0016581757066005, "grad_norm": 1.7205016613006592, "learning_rate": 3.3312824976177794e-05, "loss": 2.6266, "step": 133500 }, { "epoch": 1.0024084814561933, "grad_norm": 2.3706045150756836, "learning_rate": 3.330031988035124e-05, "loss": 2.7858, "step": 133600 }, { "epoch": 1.0031587872057863, "grad_norm": 1.4823657274246216, "learning_rate": 3.3287814784524696e-05, "loss": 2.6184, "step": 133700 }, { "epoch": 1.0039090929553793, "grad_norm": 1.560542106628418, "learning_rate": 3.327530968869815e-05, "loss": 2.6176, "step": 133800 }, { "epoch": 1.0046593987049723, "grad_norm": 1.8052340745925903, "learning_rate": 3.32628045928716e-05, "loss": 2.6706, "step": 133900 }, { "epoch": 1.0054097044545653, "grad_norm": 1.678030014038086, "learning_rate": 3.3250299497045045e-05, "loss": 2.8084, "step": 134000 }, { "epoch": 1.0061600102041581, "grad_norm": 2.589684247970581, "learning_rate": 3.32377944012185e-05, "loss": 2.6756, "step": 134100 }, { "epoch": 1.0069103159537511, "grad_norm": 2.017125129699707, "learning_rate": 3.322528930539195e-05, "loss": 2.5854, "step": 134200 }, { "epoch": 1.0076606217033441, "grad_norm": 1.6983333826065063, "learning_rate": 3.32127842095654e-05, "loss": 2.7299, "step": 134300 }, { "epoch": 1.0084109274529371, "grad_norm": 1.383798360824585, "learning_rate": 3.320027911373885e-05, "loss": 2.7363, "step": 134400 }, { "epoch": 1.00916123320253, "grad_norm": 1.929149866104126, "learning_rate": 3.318789906887057e-05, "loss": 2.719, "step": 134500 }, { "epoch": 1.009911538952123, "grad_norm": 2.304985523223877, "learning_rate": 3.3175393973044014e-05, "loss": 2.6797, "step": 134600 }, { "epoch": 1.010661844701716, "grad_norm": 1.9865484237670898, "learning_rate": 3.316288887721747e-05, "loss": 2.7373, "step": 134700 }, { "epoch": 1.011412150451309, "grad_norm": 1.8830870389938354, "learning_rate": 3.315038378139092e-05, "loss": 2.7337, "step": 134800 }, { "epoch": 1.012162456200902, "grad_norm": 2.6962387561798096, "learning_rate": 3.3137878685564364e-05, "loss": 2.6679, "step": 134900 }, { "epoch": 1.0129127619504947, "grad_norm": 1.7282845973968506, "learning_rate": 3.312537358973782e-05, "loss": 2.6132, "step": 135000 }, { "epoch": 1.0136630677000877, "grad_norm": 2.394880533218384, "learning_rate": 3.311286849391127e-05, "loss": 2.6539, "step": 135100 }, { "epoch": 1.0144133734496807, "grad_norm": 2.3527872562408447, "learning_rate": 3.310036339808472e-05, "loss": 2.6958, "step": 135200 }, { "epoch": 1.0151636791992737, "grad_norm": 1.6854897737503052, "learning_rate": 3.3087858302258175e-05, "loss": 2.6821, "step": 135300 }, { "epoch": 1.0159139849488668, "grad_norm": 2.3063337802886963, "learning_rate": 3.307535320643162e-05, "loss": 2.6321, "step": 135400 }, { "epoch": 1.0166642906984595, "grad_norm": 1.9544641971588135, "learning_rate": 3.306284811060507e-05, "loss": 2.6931, "step": 135500 }, { "epoch": 1.0174145964480525, "grad_norm": 3.239928960800171, "learning_rate": 3.3050343014778525e-05, "loss": 2.619, "step": 135600 }, { "epoch": 1.0181649021976455, "grad_norm": 1.8127894401550293, "learning_rate": 3.303783791895197e-05, "loss": 2.6672, "step": 135700 }, { "epoch": 1.0189152079472386, "grad_norm": 1.5246764421463013, "learning_rate": 3.302533282312543e-05, "loss": 2.648, "step": 135800 }, { "epoch": 1.0196655136968316, "grad_norm": 1.7304601669311523, "learning_rate": 3.301282772729888e-05, "loss": 2.7526, "step": 135900 }, { "epoch": 1.0204158194464243, "grad_norm": 1.716605305671692, "learning_rate": 3.300032263147232e-05, "loss": 2.776, "step": 136000 }, { "epoch": 1.0211661251960173, "grad_norm": 1.490417718887329, "learning_rate": 3.2987817535645776e-05, "loss": 2.7743, "step": 136100 }, { "epoch": 1.0219164309456104, "grad_norm": 1.623819351196289, "learning_rate": 3.297531243981923e-05, "loss": 2.8001, "step": 136200 }, { "epoch": 1.0226667366952034, "grad_norm": 1.637396216392517, "learning_rate": 3.296280734399268e-05, "loss": 2.5776, "step": 136300 }, { "epoch": 1.0234170424447964, "grad_norm": 2.362349510192871, "learning_rate": 3.295030224816613e-05, "loss": 2.7604, "step": 136400 }, { "epoch": 1.0241673481943891, "grad_norm": 2.63785982131958, "learning_rate": 3.293779715233958e-05, "loss": 2.788, "step": 136500 }, { "epoch": 1.0249176539439822, "grad_norm": 1.5033624172210693, "learning_rate": 3.292529205651303e-05, "loss": 2.8017, "step": 136600 }, { "epoch": 1.0256679596935752, "grad_norm": 2.6846485137939453, "learning_rate": 3.291278696068648e-05, "loss": 2.6088, "step": 136700 }, { "epoch": 1.0264182654431682, "grad_norm": 2.0167624950408936, "learning_rate": 3.290028186485993e-05, "loss": 2.7138, "step": 136800 }, { "epoch": 1.027168571192761, "grad_norm": 1.4350898265838623, "learning_rate": 3.2887776769033384e-05, "loss": 2.5903, "step": 136900 }, { "epoch": 1.027918876942354, "grad_norm": 1.468376636505127, "learning_rate": 3.287527167320683e-05, "loss": 2.68, "step": 137000 }, { "epoch": 1.028669182691947, "grad_norm": 1.7130348682403564, "learning_rate": 3.286276657738028e-05, "loss": 2.6729, "step": 137100 }, { "epoch": 1.02941948844154, "grad_norm": 2.2788913249969482, "learning_rate": 3.2850261481553734e-05, "loss": 2.6109, "step": 137200 }, { "epoch": 1.030169794191133, "grad_norm": 2.376413345336914, "learning_rate": 3.283775638572719e-05, "loss": 2.7101, "step": 137300 }, { "epoch": 1.0309200999407258, "grad_norm": 2.2559778690338135, "learning_rate": 3.2825251289900636e-05, "loss": 2.7735, "step": 137400 }, { "epoch": 1.0316704056903188, "grad_norm": 2.4532463550567627, "learning_rate": 3.2812746194074084e-05, "loss": 2.8164, "step": 137500 }, { "epoch": 1.0324207114399118, "grad_norm": 2.581040620803833, "learning_rate": 3.280024109824754e-05, "loss": 2.6726, "step": 137600 }, { "epoch": 1.0331710171895048, "grad_norm": 1.3125349283218384, "learning_rate": 3.2787736002420986e-05, "loss": 2.6499, "step": 137700 }, { "epoch": 1.0339213229390978, "grad_norm": 1.8186686038970947, "learning_rate": 3.277523090659444e-05, "loss": 2.5967, "step": 137800 }, { "epoch": 1.0346716286886906, "grad_norm": 1.3323516845703125, "learning_rate": 3.2762725810767895e-05, "loss": 2.7241, "step": 137900 }, { "epoch": 1.0354219344382836, "grad_norm": 1.6967620849609375, "learning_rate": 3.2750220714941335e-05, "loss": 2.6256, "step": 138000 }, { "epoch": 1.0361722401878766, "grad_norm": 2.531599521636963, "learning_rate": 3.273771561911479e-05, "loss": 2.6228, "step": 138100 }, { "epoch": 1.0369225459374696, "grad_norm": 1.6661136150360107, "learning_rate": 3.2725210523288244e-05, "loss": 2.7019, "step": 138200 }, { "epoch": 1.0376728516870626, "grad_norm": 1.604600429534912, "learning_rate": 3.271270542746169e-05, "loss": 2.7173, "step": 138300 }, { "epoch": 1.0384231574366554, "grad_norm": 1.5630218982696533, "learning_rate": 3.2700200331635146e-05, "loss": 2.7518, "step": 138400 }, { "epoch": 1.0391734631862484, "grad_norm": 2.380284547805786, "learning_rate": 3.2687695235808594e-05, "loss": 2.6603, "step": 138500 }, { "epoch": 1.0399237689358414, "grad_norm": 2.180375814437866, "learning_rate": 3.267519013998204e-05, "loss": 2.779, "step": 138600 }, { "epoch": 1.0406740746854344, "grad_norm": 1.905275821685791, "learning_rate": 3.2662685044155496e-05, "loss": 2.6897, "step": 138700 }, { "epoch": 1.0414243804350272, "grad_norm": 1.624519944190979, "learning_rate": 3.2650179948328944e-05, "loss": 2.6686, "step": 138800 }, { "epoch": 1.0421746861846202, "grad_norm": 2.193462610244751, "learning_rate": 3.26376748525024e-05, "loss": 2.7185, "step": 138900 }, { "epoch": 1.0429249919342132, "grad_norm": 1.754658579826355, "learning_rate": 3.262516975667585e-05, "loss": 2.6738, "step": 139000 }, { "epoch": 1.0436752976838062, "grad_norm": 2.1371874809265137, "learning_rate": 3.261266466084929e-05, "loss": 2.669, "step": 139100 }, { "epoch": 1.0444256034333992, "grad_norm": 1.723732829093933, "learning_rate": 3.260015956502275e-05, "loss": 2.7026, "step": 139200 }, { "epoch": 1.045175909182992, "grad_norm": 1.7795586585998535, "learning_rate": 3.25876544691962e-05, "loss": 2.8074, "step": 139300 }, { "epoch": 1.045926214932585, "grad_norm": 1.3297231197357178, "learning_rate": 3.257514937336965e-05, "loss": 2.7783, "step": 139400 }, { "epoch": 1.046676520682178, "grad_norm": 1.6649178266525269, "learning_rate": 3.2562644277543104e-05, "loss": 2.6589, "step": 139500 }, { "epoch": 1.047426826431771, "grad_norm": 1.8250395059585571, "learning_rate": 3.255013918171655e-05, "loss": 2.8648, "step": 139600 }, { "epoch": 1.048177132181364, "grad_norm": 1.7360433340072632, "learning_rate": 3.253763408589e-05, "loss": 2.6571, "step": 139700 }, { "epoch": 1.0489274379309568, "grad_norm": 1.7307641506195068, "learning_rate": 3.2525128990063454e-05, "loss": 2.6376, "step": 139800 }, { "epoch": 1.0496777436805498, "grad_norm": 2.287227153778076, "learning_rate": 3.25126238942369e-05, "loss": 2.67, "step": 139900 }, { "epoch": 1.0504280494301428, "grad_norm": 1.856973648071289, "learning_rate": 3.2500118798410356e-05, "loss": 2.5597, "step": 140000 }, { "epoch": 1.0511783551797358, "grad_norm": 1.5944766998291016, "learning_rate": 3.2487738753542066e-05, "loss": 2.5563, "step": 140100 }, { "epoch": 1.0519286609293288, "grad_norm": 3.169595718383789, "learning_rate": 3.247523365771552e-05, "loss": 2.7295, "step": 140200 }, { "epoch": 1.0526789666789216, "grad_norm": 1.6933844089508057, "learning_rate": 3.2462728561888975e-05, "loss": 2.7162, "step": 140300 }, { "epoch": 1.0534292724285146, "grad_norm": 1.4858933687210083, "learning_rate": 3.245022346606242e-05, "loss": 2.6188, "step": 140400 }, { "epoch": 1.0541795781781076, "grad_norm": 3.6662726402282715, "learning_rate": 3.243771837023587e-05, "loss": 2.713, "step": 140500 }, { "epoch": 1.0549298839277006, "grad_norm": 2.465440511703491, "learning_rate": 3.2425213274409325e-05, "loss": 2.7516, "step": 140600 }, { "epoch": 1.0556801896772936, "grad_norm": 2.070746660232544, "learning_rate": 3.241270817858277e-05, "loss": 2.6676, "step": 140700 }, { "epoch": 1.0564304954268864, "grad_norm": 1.7044037580490112, "learning_rate": 3.240020308275623e-05, "loss": 2.7636, "step": 140800 }, { "epoch": 1.0571808011764794, "grad_norm": 2.011751890182495, "learning_rate": 3.2387697986929675e-05, "loss": 2.7166, "step": 140900 }, { "epoch": 1.0579311069260724, "grad_norm": 1.9834530353546143, "learning_rate": 3.237519289110313e-05, "loss": 2.6986, "step": 141000 }, { "epoch": 1.0586814126756654, "grad_norm": 1.9185267686843872, "learning_rate": 3.236268779527658e-05, "loss": 2.6991, "step": 141100 }, { "epoch": 1.0594317184252584, "grad_norm": 3.122422695159912, "learning_rate": 3.2350182699450024e-05, "loss": 2.6759, "step": 141200 }, { "epoch": 1.0601820241748512, "grad_norm": 1.7058501243591309, "learning_rate": 3.233767760362348e-05, "loss": 2.7019, "step": 141300 }, { "epoch": 1.0609323299244442, "grad_norm": 2.3066413402557373, "learning_rate": 3.232517250779693e-05, "loss": 2.8659, "step": 141400 }, { "epoch": 1.0616826356740372, "grad_norm": 1.9399627447128296, "learning_rate": 3.231266741197038e-05, "loss": 2.7034, "step": 141500 }, { "epoch": 1.0624329414236302, "grad_norm": 1.642952799797058, "learning_rate": 3.230016231614383e-05, "loss": 2.7501, "step": 141600 }, { "epoch": 1.063183247173223, "grad_norm": 2.3571033477783203, "learning_rate": 3.228765722031728e-05, "loss": 2.7276, "step": 141700 }, { "epoch": 1.063933552922816, "grad_norm": 1.7665098905563354, "learning_rate": 3.227515212449073e-05, "loss": 2.812, "step": 141800 }, { "epoch": 1.064683858672409, "grad_norm": 3.093867540359497, "learning_rate": 3.2262647028664185e-05, "loss": 2.6085, "step": 141900 }, { "epoch": 1.065434164422002, "grad_norm": 2.1021785736083984, "learning_rate": 3.225014193283763e-05, "loss": 2.6252, "step": 142000 }, { "epoch": 1.066184470171595, "grad_norm": 1.6829279661178589, "learning_rate": 3.223763683701108e-05, "loss": 2.7947, "step": 142100 }, { "epoch": 1.0669347759211878, "grad_norm": 3.02508544921875, "learning_rate": 3.2225131741184534e-05, "loss": 2.7863, "step": 142200 }, { "epoch": 1.0676850816707808, "grad_norm": 1.6394861936569214, "learning_rate": 3.221275169631625e-05, "loss": 2.54, "step": 142300 }, { "epoch": 1.0684353874203738, "grad_norm": 2.2882561683654785, "learning_rate": 3.2200246600489706e-05, "loss": 2.6612, "step": 142400 }, { "epoch": 1.0691856931699668, "grad_norm": 1.3915249109268188, "learning_rate": 3.218774150466315e-05, "loss": 2.584, "step": 142500 }, { "epoch": 1.0699359989195598, "grad_norm": 1.8198875188827515, "learning_rate": 3.21752364088366e-05, "loss": 2.7196, "step": 142600 }, { "epoch": 1.0706863046691526, "grad_norm": 1.5563181638717651, "learning_rate": 3.2162731313010056e-05, "loss": 2.7023, "step": 142700 }, { "epoch": 1.0714366104187456, "grad_norm": 2.2631826400756836, "learning_rate": 3.2150226217183504e-05, "loss": 2.6798, "step": 142800 }, { "epoch": 1.0721869161683386, "grad_norm": 1.9704835414886475, "learning_rate": 3.213772112135696e-05, "loss": 2.6157, "step": 142900 }, { "epoch": 1.0729372219179316, "grad_norm": 2.0767343044281006, "learning_rate": 3.2125216025530406e-05, "loss": 2.6624, "step": 143000 }, { "epoch": 1.0736875276675244, "grad_norm": 1.878402590751648, "learning_rate": 3.211271092970385e-05, "loss": 2.5832, "step": 143100 }, { "epoch": 1.0744378334171174, "grad_norm": 1.926633358001709, "learning_rate": 3.210020583387731e-05, "loss": 2.7511, "step": 143200 }, { "epoch": 1.0751881391667104, "grad_norm": 1.7133803367614746, "learning_rate": 3.2087700738050755e-05, "loss": 2.6203, "step": 143300 }, { "epoch": 1.0759384449163034, "grad_norm": 2.112187385559082, "learning_rate": 3.207519564222421e-05, "loss": 2.7005, "step": 143400 }, { "epoch": 1.0766887506658964, "grad_norm": 1.5935884714126587, "learning_rate": 3.2062690546397664e-05, "loss": 2.7082, "step": 143500 }, { "epoch": 1.0774390564154892, "grad_norm": 1.699811339378357, "learning_rate": 3.2050185450571105e-05, "loss": 2.6739, "step": 143600 }, { "epoch": 1.0781893621650822, "grad_norm": 1.8134757280349731, "learning_rate": 3.203768035474456e-05, "loss": 2.6388, "step": 143700 }, { "epoch": 1.0789396679146752, "grad_norm": 1.9316571950912476, "learning_rate": 3.2025175258918014e-05, "loss": 2.7403, "step": 143800 }, { "epoch": 1.0796899736642682, "grad_norm": 1.7423657178878784, "learning_rate": 3.201267016309146e-05, "loss": 2.627, "step": 143900 }, { "epoch": 1.0804402794138612, "grad_norm": 1.938178300857544, "learning_rate": 3.2000165067264916e-05, "loss": 2.6752, "step": 144000 }, { "epoch": 1.081190585163454, "grad_norm": 1.640541911125183, "learning_rate": 3.1987659971438363e-05, "loss": 2.6819, "step": 144100 }, { "epoch": 1.081940890913047, "grad_norm": 1.7880045175552368, "learning_rate": 3.197515487561181e-05, "loss": 2.7787, "step": 144200 }, { "epoch": 1.08269119666264, "grad_norm": 2.460940361022949, "learning_rate": 3.1962649779785265e-05, "loss": 2.7166, "step": 144300 }, { "epoch": 1.083441502412233, "grad_norm": 2.1047940254211426, "learning_rate": 3.195014468395871e-05, "loss": 2.767, "step": 144400 }, { "epoch": 1.084191808161826, "grad_norm": 1.5028904676437378, "learning_rate": 3.193763958813217e-05, "loss": 2.8118, "step": 144500 }, { "epoch": 1.0849421139114188, "grad_norm": 1.6947925090789795, "learning_rate": 3.1925134492305615e-05, "loss": 2.6606, "step": 144600 }, { "epoch": 1.0856924196610118, "grad_norm": 1.5537571907043457, "learning_rate": 3.191262939647906e-05, "loss": 2.6762, "step": 144700 }, { "epoch": 1.0864427254106048, "grad_norm": 3.850475788116455, "learning_rate": 3.190012430065252e-05, "loss": 2.6948, "step": 144800 }, { "epoch": 1.0871930311601978, "grad_norm": 1.6276918649673462, "learning_rate": 3.188761920482597e-05, "loss": 2.6808, "step": 144900 }, { "epoch": 1.0879433369097908, "grad_norm": 1.6695961952209473, "learning_rate": 3.187511410899942e-05, "loss": 2.7065, "step": 145000 }, { "epoch": 1.0886936426593836, "grad_norm": 2.7308807373046875, "learning_rate": 3.186260901317287e-05, "loss": 2.7374, "step": 145100 }, { "epoch": 1.0894439484089766, "grad_norm": 1.8368226289749146, "learning_rate": 3.185010391734632e-05, "loss": 2.7499, "step": 145200 }, { "epoch": 1.0901942541585696, "grad_norm": 1.7490047216415405, "learning_rate": 3.183759882151977e-05, "loss": 2.8001, "step": 145300 }, { "epoch": 1.0909445599081626, "grad_norm": 3.0912508964538574, "learning_rate": 3.182509372569322e-05, "loss": 2.6486, "step": 145400 }, { "epoch": 1.0916948656577556, "grad_norm": 2.133638858795166, "learning_rate": 3.181258862986667e-05, "loss": 2.6032, "step": 145500 }, { "epoch": 1.0924451714073484, "grad_norm": 1.4417284727096558, "learning_rate": 3.180008353404012e-05, "loss": 2.6788, "step": 145600 }, { "epoch": 1.0931954771569414, "grad_norm": 1.137153148651123, "learning_rate": 3.178757843821357e-05, "loss": 2.7107, "step": 145700 }, { "epoch": 1.0939457829065344, "grad_norm": 1.8389806747436523, "learning_rate": 3.177519839334529e-05, "loss": 2.5441, "step": 145800 }, { "epoch": 1.0946960886561274, "grad_norm": 2.7369024753570557, "learning_rate": 3.1762693297518745e-05, "loss": 2.7948, "step": 145900 }, { "epoch": 1.0954463944057204, "grad_norm": 1.8081830739974976, "learning_rate": 3.175018820169219e-05, "loss": 2.503, "step": 146000 }, { "epoch": 1.0961967001553132, "grad_norm": 2.269320487976074, "learning_rate": 3.173768310586564e-05, "loss": 2.5473, "step": 146100 }, { "epoch": 1.0969470059049062, "grad_norm": 1.7379920482635498, "learning_rate": 3.1725178010039094e-05, "loss": 2.8176, "step": 146200 }, { "epoch": 1.0976973116544992, "grad_norm": 1.775290846824646, "learning_rate": 3.171267291421254e-05, "loss": 2.735, "step": 146300 }, { "epoch": 1.0984476174040922, "grad_norm": 2.2588813304901123, "learning_rate": 3.1700167818385996e-05, "loss": 2.8287, "step": 146400 }, { "epoch": 1.099197923153685, "grad_norm": 1.7880065441131592, "learning_rate": 3.1687662722559444e-05, "loss": 2.7066, "step": 146500 }, { "epoch": 1.099948228903278, "grad_norm": 2.1924448013305664, "learning_rate": 3.167515762673289e-05, "loss": 2.758, "step": 146600 }, { "epoch": 1.100698534652871, "grad_norm": 1.9008406400680542, "learning_rate": 3.1662652530906346e-05, "loss": 2.7251, "step": 146700 }, { "epoch": 1.101448840402464, "grad_norm": 3.2348170280456543, "learning_rate": 3.1650147435079794e-05, "loss": 2.7241, "step": 146800 }, { "epoch": 1.102199146152057, "grad_norm": 2.194167137145996, "learning_rate": 3.163764233925325e-05, "loss": 2.794, "step": 146900 }, { "epoch": 1.1029494519016498, "grad_norm": 1.1891282796859741, "learning_rate": 3.16251372434267e-05, "loss": 2.6023, "step": 147000 }, { "epoch": 1.1036997576512428, "grad_norm": 1.757144570350647, "learning_rate": 3.161263214760014e-05, "loss": 2.7159, "step": 147100 }, { "epoch": 1.1044500634008358, "grad_norm": 1.9053447246551514, "learning_rate": 3.16001270517736e-05, "loss": 2.7657, "step": 147200 }, { "epoch": 1.1052003691504289, "grad_norm": 1.7315359115600586, "learning_rate": 3.158762195594705e-05, "loss": 2.8537, "step": 147300 }, { "epoch": 1.1059506749000219, "grad_norm": 1.8078224658966064, "learning_rate": 3.15751168601205e-05, "loss": 2.5029, "step": 147400 }, { "epoch": 1.1067009806496146, "grad_norm": 1.7773092985153198, "learning_rate": 3.1562611764293954e-05, "loss": 2.618, "step": 147500 }, { "epoch": 1.1074512863992076, "grad_norm": 1.4957611560821533, "learning_rate": 3.15501066684674e-05, "loss": 2.6108, "step": 147600 }, { "epoch": 1.1082015921488007, "grad_norm": 5.628758907318115, "learning_rate": 3.153760157264085e-05, "loss": 2.6727, "step": 147700 }, { "epoch": 1.1089518978983937, "grad_norm": 1.462573528289795, "learning_rate": 3.1525096476814304e-05, "loss": 2.6107, "step": 147800 }, { "epoch": 1.1097022036479864, "grad_norm": 2.569810390472412, "learning_rate": 3.151259138098775e-05, "loss": 2.8218, "step": 147900 }, { "epoch": 1.1104525093975794, "grad_norm": 1.927727460861206, "learning_rate": 3.1500086285161206e-05, "loss": 2.5993, "step": 148000 }, { "epoch": 1.1112028151471725, "grad_norm": 1.0529073476791382, "learning_rate": 3.1487581189334654e-05, "loss": 2.6278, "step": 148100 }, { "epoch": 1.1119531208967655, "grad_norm": 1.6384506225585938, "learning_rate": 3.147507609350811e-05, "loss": 2.7817, "step": 148200 }, { "epoch": 1.1127034266463585, "grad_norm": 1.5371356010437012, "learning_rate": 3.1462570997681556e-05, "loss": 2.7614, "step": 148300 }, { "epoch": 1.1134537323959512, "grad_norm": 2.2915663719177246, "learning_rate": 3.145006590185501e-05, "loss": 2.7123, "step": 148400 }, { "epoch": 1.1142040381455443, "grad_norm": 1.4268431663513184, "learning_rate": 3.143768585698673e-05, "loss": 2.7404, "step": 148500 }, { "epoch": 1.1149543438951373, "grad_norm": 1.7134815454483032, "learning_rate": 3.1425180761160175e-05, "loss": 2.7914, "step": 148600 }, { "epoch": 1.1157046496447303, "grad_norm": 1.730419397354126, "learning_rate": 3.141267566533362e-05, "loss": 2.629, "step": 148700 }, { "epoch": 1.1164549553943233, "grad_norm": 1.610206961631775, "learning_rate": 3.140017056950708e-05, "loss": 2.7143, "step": 148800 }, { "epoch": 1.117205261143916, "grad_norm": 1.5310837030410767, "learning_rate": 3.1387665473680525e-05, "loss": 2.6911, "step": 148900 }, { "epoch": 1.117955566893509, "grad_norm": 1.5578666925430298, "learning_rate": 3.137516037785398e-05, "loss": 2.6527, "step": 149000 }, { "epoch": 1.118705872643102, "grad_norm": 1.4954904317855835, "learning_rate": 3.136265528202743e-05, "loss": 2.6778, "step": 149100 }, { "epoch": 1.119456178392695, "grad_norm": 1.4160420894622803, "learning_rate": 3.1350150186200874e-05, "loss": 2.749, "step": 149200 }, { "epoch": 1.120206484142288, "grad_norm": 1.5255879163742065, "learning_rate": 3.133764509037433e-05, "loss": 2.6752, "step": 149300 }, { "epoch": 1.1209567898918809, "grad_norm": 1.9648244380950928, "learning_rate": 3.132513999454778e-05, "loss": 2.7962, "step": 149400 }, { "epoch": 1.1217070956414739, "grad_norm": 1.9490673542022705, "learning_rate": 3.131263489872123e-05, "loss": 2.742, "step": 149500 }, { "epoch": 1.1224574013910669, "grad_norm": 1.459101915359497, "learning_rate": 3.130012980289468e-05, "loss": 2.7001, "step": 149600 }, { "epoch": 1.1232077071406599, "grad_norm": 2.101391553878784, "learning_rate": 3.128762470706813e-05, "loss": 2.6723, "step": 149700 }, { "epoch": 1.1239580128902529, "grad_norm": 1.5564522743225098, "learning_rate": 3.127511961124158e-05, "loss": 2.7229, "step": 149800 }, { "epoch": 1.1247083186398457, "grad_norm": 1.6671067476272583, "learning_rate": 3.1262614515415035e-05, "loss": 2.6826, "step": 149900 }, { "epoch": 1.1254586243894387, "grad_norm": 2.2309608459472656, "learning_rate": 3.125010941958848e-05, "loss": 2.6896, "step": 150000 }, { "epoch": 1.1262089301390317, "grad_norm": 1.4956547021865845, "learning_rate": 3.123760432376193e-05, "loss": 2.7026, "step": 150100 }, { "epoch": 1.1269592358886247, "grad_norm": 3.7093396186828613, "learning_rate": 3.1225099227935385e-05, "loss": 2.7188, "step": 150200 }, { "epoch": 1.1277095416382177, "grad_norm": 3.313662528991699, "learning_rate": 3.121259413210884e-05, "loss": 2.6051, "step": 150300 }, { "epoch": 1.1284598473878105, "grad_norm": 1.7080368995666504, "learning_rate": 3.1200089036282287e-05, "loss": 2.6557, "step": 150400 }, { "epoch": 1.1292101531374035, "grad_norm": 2.5746982097625732, "learning_rate": 3.118758394045574e-05, "loss": 2.6022, "step": 150500 }, { "epoch": 1.1299604588869965, "grad_norm": 1.6665984392166138, "learning_rate": 3.117507884462919e-05, "loss": 2.7079, "step": 150600 }, { "epoch": 1.1307107646365895, "grad_norm": 1.9325661659240723, "learning_rate": 3.1162573748802636e-05, "loss": 2.5807, "step": 150700 }, { "epoch": 1.1314610703861825, "grad_norm": 2.8675858974456787, "learning_rate": 3.115006865297609e-05, "loss": 2.774, "step": 150800 }, { "epoch": 1.1322113761357753, "grad_norm": 2.237074375152588, "learning_rate": 3.113768860810781e-05, "loss": 2.7844, "step": 150900 }, { "epoch": 1.1329616818853683, "grad_norm": 2.5213379859924316, "learning_rate": 3.1125183512281256e-05, "loss": 2.6561, "step": 151000 }, { "epoch": 1.1337119876349613, "grad_norm": 2.2290422916412354, "learning_rate": 3.11126784164547e-05, "loss": 2.744, "step": 151100 }, { "epoch": 1.1344622933845543, "grad_norm": 2.211362600326538, "learning_rate": 3.110017332062816e-05, "loss": 2.7428, "step": 151200 }, { "epoch": 1.135212599134147, "grad_norm": 2.054154634475708, "learning_rate": 3.1087668224801605e-05, "loss": 2.6372, "step": 151300 }, { "epoch": 1.13596290488374, "grad_norm": 1.7715333700180054, "learning_rate": 3.107516312897506e-05, "loss": 2.6236, "step": 151400 }, { "epoch": 1.136713210633333, "grad_norm": 2.8540232181549072, "learning_rate": 3.1062658033148514e-05, "loss": 2.7841, "step": 151500 }, { "epoch": 1.137463516382926, "grad_norm": 3.078665256500244, "learning_rate": 3.1050152937321955e-05, "loss": 2.7718, "step": 151600 }, { "epoch": 1.138213822132519, "grad_norm": 1.6878719329833984, "learning_rate": 3.103764784149541e-05, "loss": 2.676, "step": 151700 }, { "epoch": 1.1389641278821119, "grad_norm": 2.170750617980957, "learning_rate": 3.1025142745668864e-05, "loss": 2.7278, "step": 151800 }, { "epoch": 1.1397144336317049, "grad_norm": 3.598398447036743, "learning_rate": 3.101263764984231e-05, "loss": 2.6585, "step": 151900 }, { "epoch": 1.1404647393812979, "grad_norm": 1.2039016485214233, "learning_rate": 3.1000132554015766e-05, "loss": 2.5206, "step": 152000 }, { "epoch": 1.141215045130891, "grad_norm": 1.3850873708724976, "learning_rate": 3.0987627458189213e-05, "loss": 2.6587, "step": 152100 }, { "epoch": 1.1419653508804837, "grad_norm": 2.2425103187561035, "learning_rate": 3.097512236236266e-05, "loss": 2.643, "step": 152200 }, { "epoch": 1.1427156566300767, "grad_norm": 1.619978904724121, "learning_rate": 3.0962617266536116e-05, "loss": 2.8041, "step": 152300 }, { "epoch": 1.1434659623796697, "grad_norm": 2.4051244258880615, "learning_rate": 3.095011217070956e-05, "loss": 2.7189, "step": 152400 }, { "epoch": 1.1442162681292627, "grad_norm": 1.6488378047943115, "learning_rate": 3.093760707488302e-05, "loss": 2.7634, "step": 152500 }, { "epoch": 1.1449665738788557, "grad_norm": 2.0663297176361084, "learning_rate": 3.0925101979056465e-05, "loss": 2.6367, "step": 152600 }, { "epoch": 1.1457168796284485, "grad_norm": 1.5859345197677612, "learning_rate": 3.091259688322992e-05, "loss": 2.6021, "step": 152700 }, { "epoch": 1.1464671853780415, "grad_norm": 3.0242671966552734, "learning_rate": 3.090009178740337e-05, "loss": 2.8734, "step": 152800 }, { "epoch": 1.1472174911276345, "grad_norm": 2.707847833633423, "learning_rate": 3.088758669157682e-05, "loss": 2.7207, "step": 152900 }, { "epoch": 1.1479677968772275, "grad_norm": 4.639288425445557, "learning_rate": 3.087508159575027e-05, "loss": 2.6921, "step": 153000 }, { "epoch": 1.1487181026268205, "grad_norm": 2.0065853595733643, "learning_rate": 3.086257649992372e-05, "loss": 2.5842, "step": 153100 }, { "epoch": 1.1494684083764133, "grad_norm": 2.14754056930542, "learning_rate": 3.085007140409717e-05, "loss": 2.681, "step": 153200 }, { "epoch": 1.1502187141260063, "grad_norm": 1.9091839790344238, "learning_rate": 3.083756630827062e-05, "loss": 2.6607, "step": 153300 }, { "epoch": 1.1509690198755993, "grad_norm": 3.3357298374176025, "learning_rate": 3.082506121244407e-05, "loss": 2.5527, "step": 153400 }, { "epoch": 1.1517193256251923, "grad_norm": 2.737943649291992, "learning_rate": 3.081255611661753e-05, "loss": 2.7394, "step": 153500 }, { "epoch": 1.1524696313747853, "grad_norm": 1.965874433517456, "learning_rate": 3.0800051020790975e-05, "loss": 2.7747, "step": 153600 }, { "epoch": 1.153219937124378, "grad_norm": 1.9635308980941772, "learning_rate": 3.078754592496442e-05, "loss": 2.7288, "step": 153700 }, { "epoch": 1.153970242873971, "grad_norm": 1.6183407306671143, "learning_rate": 3.077504082913788e-05, "loss": 2.7716, "step": 153800 }, { "epoch": 1.154720548623564, "grad_norm": 1.4296313524246216, "learning_rate": 3.0762535733311325e-05, "loss": 2.746, "step": 153900 }, { "epoch": 1.155470854373157, "grad_norm": 1.8811399936676025, "learning_rate": 3.075003063748478e-05, "loss": 2.7069, "step": 154000 }, { "epoch": 1.1562211601227501, "grad_norm": 2.353447198867798, "learning_rate": 3.073752554165823e-05, "loss": 2.8591, "step": 154100 }, { "epoch": 1.156971465872343, "grad_norm": 2.5770363807678223, "learning_rate": 3.0725020445831675e-05, "loss": 2.6469, "step": 154200 }, { "epoch": 1.157721771621936, "grad_norm": 1.4603722095489502, "learning_rate": 3.071251535000513e-05, "loss": 2.669, "step": 154300 }, { "epoch": 1.158472077371529, "grad_norm": 2.0990638732910156, "learning_rate": 3.070001025417858e-05, "loss": 2.6906, "step": 154400 }, { "epoch": 1.159222383121122, "grad_norm": 1.8391274213790894, "learning_rate": 3.068750515835203e-05, "loss": 2.5282, "step": 154500 }, { "epoch": 1.159972688870715, "grad_norm": 1.4975647926330566, "learning_rate": 3.0675000062525486e-05, "loss": 2.5839, "step": 154600 }, { "epoch": 1.1607229946203077, "grad_norm": 1.7333004474639893, "learning_rate": 3.0662494966698926e-05, "loss": 2.5594, "step": 154700 }, { "epoch": 1.1614733003699007, "grad_norm": 2.1990020275115967, "learning_rate": 3.064998987087238e-05, "loss": 2.628, "step": 154800 }, { "epoch": 1.1622236061194937, "grad_norm": 1.7170847654342651, "learning_rate": 3.06376098260041e-05, "loss": 2.6937, "step": 154900 }, { "epoch": 1.1629739118690867, "grad_norm": 1.9457818269729614, "learning_rate": 3.062510473017755e-05, "loss": 2.6377, "step": 155000 }, { "epoch": 1.1637242176186797, "grad_norm": 1.6830402612686157, "learning_rate": 3.0612599634351e-05, "loss": 2.6288, "step": 155100 }, { "epoch": 1.1644745233682725, "grad_norm": 2.2886171340942383, "learning_rate": 3.060009453852445e-05, "loss": 2.7032, "step": 155200 }, { "epoch": 1.1652248291178655, "grad_norm": 1.4968938827514648, "learning_rate": 3.05875894426979e-05, "loss": 2.6792, "step": 155300 }, { "epoch": 1.1659751348674585, "grad_norm": 1.4467170238494873, "learning_rate": 3.057508434687135e-05, "loss": 2.7573, "step": 155400 }, { "epoch": 1.1667254406170515, "grad_norm": 1.5763930082321167, "learning_rate": 3.0562579251044804e-05, "loss": 2.7562, "step": 155500 }, { "epoch": 1.1674757463666445, "grad_norm": 2.221635580062866, "learning_rate": 3.0550199206176515e-05, "loss": 2.6531, "step": 155600 }, { "epoch": 1.1682260521162373, "grad_norm": 2.162463665008545, "learning_rate": 3.053769411034997e-05, "loss": 2.7684, "step": 155700 }, { "epoch": 1.1689763578658303, "grad_norm": 1.825631022453308, "learning_rate": 3.052518901452342e-05, "loss": 2.7418, "step": 155800 }, { "epoch": 1.1697266636154233, "grad_norm": 1.8772563934326172, "learning_rate": 3.051268391869687e-05, "loss": 2.5558, "step": 155900 }, { "epoch": 1.1704769693650163, "grad_norm": 1.708125352859497, "learning_rate": 3.0500178822870322e-05, "loss": 2.6556, "step": 156000 }, { "epoch": 1.1712272751146091, "grad_norm": 3.0531585216522217, "learning_rate": 3.048767372704377e-05, "loss": 2.6389, "step": 156100 }, { "epoch": 1.1719775808642021, "grad_norm": 1.5534058809280396, "learning_rate": 3.047516863121722e-05, "loss": 2.6381, "step": 156200 }, { "epoch": 1.1727278866137951, "grad_norm": 2.130002021789551, "learning_rate": 3.0462663535390672e-05, "loss": 2.8474, "step": 156300 }, { "epoch": 1.1734781923633881, "grad_norm": 1.8062463998794556, "learning_rate": 3.0450158439564126e-05, "loss": 2.6704, "step": 156400 }, { "epoch": 1.1742284981129811, "grad_norm": 1.9708492755889893, "learning_rate": 3.0437653343737577e-05, "loss": 2.7217, "step": 156500 }, { "epoch": 1.174978803862574, "grad_norm": 1.7808706760406494, "learning_rate": 3.0425148247911022e-05, "loss": 2.7329, "step": 156600 }, { "epoch": 1.175729109612167, "grad_norm": 1.7159823179244995, "learning_rate": 3.0412643152084476e-05, "loss": 2.6183, "step": 156700 }, { "epoch": 1.17647941536176, "grad_norm": 1.6477577686309814, "learning_rate": 3.0400138056257927e-05, "loss": 2.6654, "step": 156800 }, { "epoch": 1.177229721111353, "grad_norm": 3.1671056747436523, "learning_rate": 3.0387632960431378e-05, "loss": 2.7036, "step": 156900 }, { "epoch": 1.1779800268609457, "grad_norm": 2.826876640319824, "learning_rate": 3.037512786460483e-05, "loss": 2.7431, "step": 157000 }, { "epoch": 1.1787303326105387, "grad_norm": 2.241215944290161, "learning_rate": 3.0362622768778277e-05, "loss": 2.7669, "step": 157100 }, { "epoch": 1.1794806383601317, "grad_norm": 1.572340488433838, "learning_rate": 3.0350117672951728e-05, "loss": 2.8354, "step": 157200 }, { "epoch": 1.1802309441097247, "grad_norm": 1.9356483221054077, "learning_rate": 3.033761257712518e-05, "loss": 2.7124, "step": 157300 }, { "epoch": 1.1809812498593177, "grad_norm": 2.507463216781616, "learning_rate": 3.032510748129863e-05, "loss": 2.7232, "step": 157400 }, { "epoch": 1.1817315556089105, "grad_norm": 1.892715573310852, "learning_rate": 3.0312602385472084e-05, "loss": 2.6658, "step": 157500 }, { "epoch": 1.1824818613585035, "grad_norm": 2.244960308074951, "learning_rate": 3.030009728964553e-05, "loss": 2.7259, "step": 157600 }, { "epoch": 1.1832321671080965, "grad_norm": 1.8760526180267334, "learning_rate": 3.028759219381898e-05, "loss": 2.6372, "step": 157700 }, { "epoch": 1.1839824728576895, "grad_norm": 1.6014811992645264, "learning_rate": 3.0275087097992434e-05, "loss": 2.5321, "step": 157800 }, { "epoch": 1.1847327786072825, "grad_norm": 2.5671606063842773, "learning_rate": 3.0262582002165885e-05, "loss": 2.6973, "step": 157900 }, { "epoch": 1.1854830843568753, "grad_norm": 1.7399225234985352, "learning_rate": 3.0250076906339336e-05, "loss": 2.5715, "step": 158000 }, { "epoch": 1.1862333901064683, "grad_norm": 1.6961214542388916, "learning_rate": 3.0237571810512787e-05, "loss": 2.639, "step": 158100 }, { "epoch": 1.1869836958560613, "grad_norm": 1.6794229745864868, "learning_rate": 3.0225066714686235e-05, "loss": 2.6565, "step": 158200 }, { "epoch": 1.1877340016056543, "grad_norm": 1.5944491624832153, "learning_rate": 3.0212561618859686e-05, "loss": 2.7795, "step": 158300 }, { "epoch": 1.1884843073552473, "grad_norm": 1.5023751258850098, "learning_rate": 3.0200056523033137e-05, "loss": 2.5435, "step": 158400 }, { "epoch": 1.1892346131048401, "grad_norm": 1.3463249206542969, "learning_rate": 3.018755142720659e-05, "loss": 2.6435, "step": 158500 }, { "epoch": 1.1899849188544331, "grad_norm": 1.6961604356765747, "learning_rate": 3.0175046331380042e-05, "loss": 2.6365, "step": 158600 }, { "epoch": 1.1907352246040261, "grad_norm": 2.166304349899292, "learning_rate": 3.0162541235553486e-05, "loss": 2.4933, "step": 158700 }, { "epoch": 1.1914855303536191, "grad_norm": 1.7747185230255127, "learning_rate": 3.015003613972694e-05, "loss": 2.7195, "step": 158800 }, { "epoch": 1.1922358361032122, "grad_norm": 1.9803110361099243, "learning_rate": 3.0137531043900392e-05, "loss": 2.6306, "step": 158900 }, { "epoch": 1.192986141852805, "grad_norm": 2.092134475708008, "learning_rate": 3.0125025948073843e-05, "loss": 2.5334, "step": 159000 }, { "epoch": 1.193736447602398, "grad_norm": 1.6333526372909546, "learning_rate": 3.0112520852247294e-05, "loss": 2.6131, "step": 159100 }, { "epoch": 1.194486753351991, "grad_norm": 1.6592342853546143, "learning_rate": 3.010001575642074e-05, "loss": 2.6525, "step": 159200 }, { "epoch": 1.195237059101584, "grad_norm": 2.1927566528320312, "learning_rate": 3.0087510660594192e-05, "loss": 2.6097, "step": 159300 }, { "epoch": 1.195987364851177, "grad_norm": 1.481833815574646, "learning_rate": 3.0075005564767643e-05, "loss": 2.5798, "step": 159400 }, { "epoch": 1.1967376706007697, "grad_norm": 1.8503817319869995, "learning_rate": 3.0062500468941094e-05, "loss": 2.7258, "step": 159500 }, { "epoch": 1.1974879763503627, "grad_norm": 2.3780412673950195, "learning_rate": 3.005012042407281e-05, "loss": 2.6784, "step": 159600 }, { "epoch": 1.1982382820999558, "grad_norm": 3.0807063579559326, "learning_rate": 3.003761532824626e-05, "loss": 2.6058, "step": 159700 }, { "epoch": 1.1989885878495488, "grad_norm": 2.0139012336730957, "learning_rate": 3.002511023241971e-05, "loss": 2.549, "step": 159800 }, { "epoch": 1.1997388935991418, "grad_norm": 2.4510881900787354, "learning_rate": 3.0012605136593165e-05, "loss": 2.8031, "step": 159900 }, { "epoch": 1.2004891993487345, "grad_norm": 1.5510354042053223, "learning_rate": 3.0000100040766616e-05, "loss": 2.677, "step": 160000 }, { "epoch": 1.2012395050983276, "grad_norm": 2.581087589263916, "learning_rate": 2.9987594944940067e-05, "loss": 2.5421, "step": 160100 }, { "epoch": 1.2019898108479206, "grad_norm": 1.8449212312698364, "learning_rate": 2.9975089849113515e-05, "loss": 2.5985, "step": 160200 }, { "epoch": 1.2027401165975136, "grad_norm": 1.5238192081451416, "learning_rate": 2.9962584753286966e-05, "loss": 2.7659, "step": 160300 }, { "epoch": 1.2034904223471066, "grad_norm": 1.582699179649353, "learning_rate": 2.9950079657460417e-05, "loss": 2.7722, "step": 160400 }, { "epoch": 1.2042407280966994, "grad_norm": 1.7945042848587036, "learning_rate": 2.9937574561633868e-05, "loss": 2.5941, "step": 160500 }, { "epoch": 1.2049910338462924, "grad_norm": 2.8288733959198, "learning_rate": 2.9925069465807322e-05, "loss": 2.7356, "step": 160600 }, { "epoch": 1.2057413395958854, "grad_norm": 1.7147351503372192, "learning_rate": 2.9912564369980766e-05, "loss": 2.7291, "step": 160700 }, { "epoch": 1.2064916453454784, "grad_norm": 2.1863229274749756, "learning_rate": 2.9900059274154217e-05, "loss": 2.7086, "step": 160800 }, { "epoch": 1.2072419510950712, "grad_norm": 2.4298956394195557, "learning_rate": 2.988755417832767e-05, "loss": 2.6959, "step": 160900 }, { "epoch": 1.2079922568446642, "grad_norm": 2.350435733795166, "learning_rate": 2.9875049082501123e-05, "loss": 2.8071, "step": 161000 }, { "epoch": 1.2087425625942572, "grad_norm": 1.7936413288116455, "learning_rate": 2.9862543986674574e-05, "loss": 2.6278, "step": 161100 }, { "epoch": 1.2094928683438502, "grad_norm": 1.4054607152938843, "learning_rate": 2.9850163941806288e-05, "loss": 2.6061, "step": 161200 }, { "epoch": 1.2102431740934432, "grad_norm": 1.736035943031311, "learning_rate": 2.983765884597974e-05, "loss": 2.7669, "step": 161300 }, { "epoch": 1.210993479843036, "grad_norm": 3.3650176525115967, "learning_rate": 2.982515375015319e-05, "loss": 2.6344, "step": 161400 }, { "epoch": 1.211743785592629, "grad_norm": 2.7249937057495117, "learning_rate": 2.981264865432664e-05, "loss": 2.5065, "step": 161500 }, { "epoch": 1.212494091342222, "grad_norm": 1.625995397567749, "learning_rate": 2.980014355850009e-05, "loss": 2.7288, "step": 161600 }, { "epoch": 1.213244397091815, "grad_norm": 2.309746742248535, "learning_rate": 2.978763846267354e-05, "loss": 2.7863, "step": 161700 }, { "epoch": 1.2139947028414078, "grad_norm": 1.781184196472168, "learning_rate": 2.977513336684699e-05, "loss": 2.7011, "step": 161800 }, { "epoch": 1.2147450085910008, "grad_norm": 1.9482370615005493, "learning_rate": 2.976262827102044e-05, "loss": 2.5662, "step": 161900 }, { "epoch": 1.2154953143405938, "grad_norm": 2.2134320735931396, "learning_rate": 2.9750123175193896e-05, "loss": 2.7655, "step": 162000 }, { "epoch": 1.2162456200901868, "grad_norm": 1.5169562101364136, "learning_rate": 2.973761807936734e-05, "loss": 2.6234, "step": 162100 }, { "epoch": 1.2169959258397798, "grad_norm": 2.615671396255493, "learning_rate": 2.972511298354079e-05, "loss": 2.6987, "step": 162200 }, { "epoch": 1.2177462315893726, "grad_norm": 2.379973888397217, "learning_rate": 2.9712607887714246e-05, "loss": 2.7009, "step": 162300 }, { "epoch": 1.2184965373389656, "grad_norm": 1.5326720476150513, "learning_rate": 2.9700102791887697e-05, "loss": 2.696, "step": 162400 }, { "epoch": 1.2192468430885586, "grad_norm": 1.679052710533142, "learning_rate": 2.9687597696061148e-05, "loss": 2.7507, "step": 162500 }, { "epoch": 1.2199971488381516, "grad_norm": 1.787352204322815, "learning_rate": 2.96750926002346e-05, "loss": 2.774, "step": 162600 }, { "epoch": 1.2207474545877446, "grad_norm": 1.4553879499435425, "learning_rate": 2.9662587504408046e-05, "loss": 2.6296, "step": 162700 }, { "epoch": 1.2214977603373374, "grad_norm": 1.9878545999526978, "learning_rate": 2.9650082408581497e-05, "loss": 2.7348, "step": 162800 }, { "epoch": 1.2222480660869304, "grad_norm": 2.101619005203247, "learning_rate": 2.9637577312754948e-05, "loss": 2.6626, "step": 162900 }, { "epoch": 1.2229983718365234, "grad_norm": 1.6041131019592285, "learning_rate": 2.9625072216928403e-05, "loss": 2.6674, "step": 163000 }, { "epoch": 1.2237486775861164, "grad_norm": 2.1617913246154785, "learning_rate": 2.9612567121101854e-05, "loss": 2.6638, "step": 163100 }, { "epoch": 1.2244989833357094, "grad_norm": 1.5291482210159302, "learning_rate": 2.9600062025275298e-05, "loss": 2.6456, "step": 163200 }, { "epoch": 1.2252492890853022, "grad_norm": 1.4028154611587524, "learning_rate": 2.9587556929448752e-05, "loss": 2.6593, "step": 163300 }, { "epoch": 1.2259995948348952, "grad_norm": 3.2948403358459473, "learning_rate": 2.9575051833622203e-05, "loss": 2.6425, "step": 163400 }, { "epoch": 1.2267499005844882, "grad_norm": 2.444234609603882, "learning_rate": 2.9562546737795654e-05, "loss": 2.6165, "step": 163500 }, { "epoch": 1.2275002063340812, "grad_norm": 1.6614465713500977, "learning_rate": 2.9550041641969105e-05, "loss": 2.582, "step": 163600 }, { "epoch": 1.2282505120836742, "grad_norm": 1.928938627243042, "learning_rate": 2.9537536546142553e-05, "loss": 2.7929, "step": 163700 }, { "epoch": 1.229000817833267, "grad_norm": 1.6945748329162598, "learning_rate": 2.9525031450316004e-05, "loss": 2.7175, "step": 163800 }, { "epoch": 1.22975112358286, "grad_norm": 1.9175260066986084, "learning_rate": 2.9512526354489455e-05, "loss": 2.7486, "step": 163900 }, { "epoch": 1.230501429332453, "grad_norm": 1.5497617721557617, "learning_rate": 2.9500021258662906e-05, "loss": 2.6773, "step": 164000 }, { "epoch": 1.231251735082046, "grad_norm": 1.9723308086395264, "learning_rate": 2.948751616283636e-05, "loss": 2.6528, "step": 164100 }, { "epoch": 1.232002040831639, "grad_norm": 2.185891628265381, "learning_rate": 2.9475011067009805e-05, "loss": 2.5992, "step": 164200 }, { "epoch": 1.2327523465812318, "grad_norm": 1.849042534828186, "learning_rate": 2.9462505971183256e-05, "loss": 2.6034, "step": 164300 }, { "epoch": 1.2335026523308248, "grad_norm": 1.3349130153656006, "learning_rate": 2.945000087535671e-05, "loss": 2.7661, "step": 164400 }, { "epoch": 1.2342529580804178, "grad_norm": 1.2598233222961426, "learning_rate": 2.9437620830488428e-05, "loss": 2.8027, "step": 164500 }, { "epoch": 1.2350032638300108, "grad_norm": 1.7447640895843506, "learning_rate": 2.942511573466188e-05, "loss": 2.6428, "step": 164600 }, { "epoch": 1.2357535695796038, "grad_norm": 2.4438281059265137, "learning_rate": 2.9412610638835326e-05, "loss": 2.8315, "step": 164700 }, { "epoch": 1.2365038753291966, "grad_norm": 1.6094211339950562, "learning_rate": 2.9400105543008777e-05, "loss": 2.7163, "step": 164800 }, { "epoch": 1.2372541810787896, "grad_norm": 1.6952067613601685, "learning_rate": 2.9387600447182228e-05, "loss": 2.6936, "step": 164900 }, { "epoch": 1.2380044868283826, "grad_norm": 1.5326900482177734, "learning_rate": 2.937509535135568e-05, "loss": 2.6436, "step": 165000 }, { "epoch": 1.2387547925779756, "grad_norm": 2.3361260890960693, "learning_rate": 2.9362590255529134e-05, "loss": 2.8245, "step": 165100 }, { "epoch": 1.2395050983275684, "grad_norm": 1.1241310834884644, "learning_rate": 2.9350085159702578e-05, "loss": 2.6486, "step": 165200 }, { "epoch": 1.2402554040771614, "grad_norm": 1.4676048755645752, "learning_rate": 2.933758006387603e-05, "loss": 2.8012, "step": 165300 }, { "epoch": 1.2410057098267544, "grad_norm": 1.4275197982788086, "learning_rate": 2.9325074968049483e-05, "loss": 2.6708, "step": 165400 }, { "epoch": 1.2417560155763474, "grad_norm": 1.7790518999099731, "learning_rate": 2.9312569872222934e-05, "loss": 2.7277, "step": 165500 }, { "epoch": 1.2425063213259404, "grad_norm": 1.4538968801498413, "learning_rate": 2.9300064776396385e-05, "loss": 2.6296, "step": 165600 }, { "epoch": 1.2432566270755332, "grad_norm": 1.7014902830123901, "learning_rate": 2.9287559680569833e-05, "loss": 2.429, "step": 165700 }, { "epoch": 1.2440069328251262, "grad_norm": 1.9662827253341675, "learning_rate": 2.9275054584743284e-05, "loss": 2.5821, "step": 165800 }, { "epoch": 1.2447572385747192, "grad_norm": 3.009957790374756, "learning_rate": 2.9262549488916735e-05, "loss": 2.5839, "step": 165900 }, { "epoch": 1.2455075443243122, "grad_norm": 1.9526479244232178, "learning_rate": 2.9250044393090186e-05, "loss": 2.5271, "step": 166000 }, { "epoch": 1.246257850073905, "grad_norm": 1.5931519269943237, "learning_rate": 2.9237539297263637e-05, "loss": 2.6699, "step": 166100 }, { "epoch": 1.247008155823498, "grad_norm": 2.6722753047943115, "learning_rate": 2.9225034201437085e-05, "loss": 2.7369, "step": 166200 }, { "epoch": 1.247758461573091, "grad_norm": 1.5257519483566284, "learning_rate": 2.9212529105610536e-05, "loss": 2.4985, "step": 166300 }, { "epoch": 1.248508767322684, "grad_norm": 1.8070734739303589, "learning_rate": 2.9200024009783987e-05, "loss": 2.6458, "step": 166400 }, { "epoch": 1.249259073072277, "grad_norm": 2.3379578590393066, "learning_rate": 2.918751891395744e-05, "loss": 2.7396, "step": 166500 }, { "epoch": 1.2500093788218698, "grad_norm": 1.8366920948028564, "learning_rate": 2.9175013818130892e-05, "loss": 2.6383, "step": 166600 }, { "epoch": 1.2507596845714628, "grad_norm": 1.6428078413009644, "learning_rate": 2.9162633773262603e-05, "loss": 2.7276, "step": 166700 }, { "epoch": 1.2515099903210558, "grad_norm": 2.436091661453247, "learning_rate": 2.9150128677436057e-05, "loss": 2.7223, "step": 166800 }, { "epoch": 1.2522602960706488, "grad_norm": 1.678900122642517, "learning_rate": 2.9137623581609508e-05, "loss": 2.6526, "step": 166900 }, { "epoch": 1.2530106018202418, "grad_norm": 1.6463650465011597, "learning_rate": 2.912511848578296e-05, "loss": 2.6908, "step": 167000 }, { "epoch": 1.2537609075698346, "grad_norm": 1.8483959436416626, "learning_rate": 2.911261338995641e-05, "loss": 2.7775, "step": 167100 }, { "epoch": 1.2545112133194276, "grad_norm": 2.021369695663452, "learning_rate": 2.9100108294129858e-05, "loss": 2.6783, "step": 167200 }, { "epoch": 1.2552615190690206, "grad_norm": 1.763389229774475, "learning_rate": 2.908760319830331e-05, "loss": 2.7528, "step": 167300 }, { "epoch": 1.2560118248186136, "grad_norm": 1.913179636001587, "learning_rate": 2.907509810247676e-05, "loss": 2.6899, "step": 167400 }, { "epoch": 1.2567621305682066, "grad_norm": 1.7071776390075684, "learning_rate": 2.9062593006650214e-05, "loss": 2.7021, "step": 167500 }, { "epoch": 1.2575124363177994, "grad_norm": 1.9597529172897339, "learning_rate": 2.9050087910823665e-05, "loss": 2.6033, "step": 167600 }, { "epoch": 1.2582627420673924, "grad_norm": 2.172943592071533, "learning_rate": 2.903758281499711e-05, "loss": 2.8595, "step": 167700 }, { "epoch": 1.2590130478169854, "grad_norm": 1.6071462631225586, "learning_rate": 2.9025077719170564e-05, "loss": 2.5005, "step": 167800 }, { "epoch": 1.2597633535665784, "grad_norm": 2.779536485671997, "learning_rate": 2.9012572623344015e-05, "loss": 2.618, "step": 167900 }, { "epoch": 1.2605136593161714, "grad_norm": 1.70002281665802, "learning_rate": 2.9000067527517466e-05, "loss": 2.7078, "step": 168000 }, { "epoch": 1.2612639650657642, "grad_norm": 3.2119832038879395, "learning_rate": 2.8987562431690917e-05, "loss": 2.6224, "step": 168100 }, { "epoch": 1.2620142708153572, "grad_norm": 1.3843220472335815, "learning_rate": 2.8975057335864365e-05, "loss": 2.7589, "step": 168200 }, { "epoch": 1.2627645765649502, "grad_norm": 1.571257472038269, "learning_rate": 2.8962552240037816e-05, "loss": 2.6658, "step": 168300 }, { "epoch": 1.2635148823145432, "grad_norm": 1.6021267175674438, "learning_rate": 2.8950047144211267e-05, "loss": 2.8323, "step": 168400 }, { "epoch": 1.2642651880641362, "grad_norm": 2.9832842350006104, "learning_rate": 2.8937542048384718e-05, "loss": 2.5681, "step": 168500 }, { "epoch": 1.265015493813729, "grad_norm": 1.6035830974578857, "learning_rate": 2.8925036952558172e-05, "loss": 2.7778, "step": 168600 }, { "epoch": 1.265765799563322, "grad_norm": 1.4419310092926025, "learning_rate": 2.8912531856731616e-05, "loss": 2.8264, "step": 168700 }, { "epoch": 1.266516105312915, "grad_norm": 1.534411072731018, "learning_rate": 2.8900026760905067e-05, "loss": 2.7753, "step": 168800 }, { "epoch": 1.267266411062508, "grad_norm": 1.6933016777038574, "learning_rate": 2.8887521665078522e-05, "loss": 2.6418, "step": 168900 }, { "epoch": 1.268016716812101, "grad_norm": 1.3642734289169312, "learning_rate": 2.8875016569251973e-05, "loss": 2.6903, "step": 169000 }, { "epoch": 1.2687670225616938, "grad_norm": 2.5530407428741455, "learning_rate": 2.8862511473425424e-05, "loss": 2.6112, "step": 169100 }, { "epoch": 1.2695173283112868, "grad_norm": 3.1722495555877686, "learning_rate": 2.885000637759887e-05, "loss": 2.6536, "step": 169200 }, { "epoch": 1.2702676340608798, "grad_norm": 1.8804843425750732, "learning_rate": 2.8837501281772322e-05, "loss": 2.6819, "step": 169300 }, { "epoch": 1.2710179398104728, "grad_norm": 2.3999500274658203, "learning_rate": 2.8824996185945773e-05, "loss": 2.7392, "step": 169400 }, { "epoch": 1.2717682455600658, "grad_norm": 1.7894198894500732, "learning_rate": 2.8812491090119224e-05, "loss": 2.7091, "step": 169500 }, { "epoch": 1.2725185513096586, "grad_norm": 1.6150710582733154, "learning_rate": 2.879998599429268e-05, "loss": 2.6922, "step": 169600 }, { "epoch": 1.2732688570592516, "grad_norm": 1.737809181213379, "learning_rate": 2.8787480898466123e-05, "loss": 2.6952, "step": 169700 }, { "epoch": 1.2740191628088446, "grad_norm": 2.9736130237579346, "learning_rate": 2.8774975802639574e-05, "loss": 2.5163, "step": 169800 }, { "epoch": 1.2747694685584376, "grad_norm": 3.0098166465759277, "learning_rate": 2.876247070681303e-05, "loss": 2.6202, "step": 169900 }, { "epoch": 1.2755197743080307, "grad_norm": 1.6261156797409058, "learning_rate": 2.874996561098648e-05, "loss": 2.5528, "step": 170000 }, { "epoch": 1.2762700800576234, "grad_norm": 1.4931358098983765, "learning_rate": 2.873746051515993e-05, "loss": 2.7528, "step": 170100 }, { "epoch": 1.2770203858072164, "grad_norm": 1.9534275531768799, "learning_rate": 2.8724955419333378e-05, "loss": 2.658, "step": 170200 }, { "epoch": 1.2777706915568094, "grad_norm": 2.727031707763672, "learning_rate": 2.871245032350683e-05, "loss": 2.7795, "step": 170300 }, { "epoch": 1.2785209973064022, "grad_norm": 1.514417290687561, "learning_rate": 2.869994522768028e-05, "loss": 2.7522, "step": 170400 }, { "epoch": 1.2792713030559955, "grad_norm": 2.1546642780303955, "learning_rate": 2.868744013185373e-05, "loss": 2.7298, "step": 170500 }, { "epoch": 1.2800216088055882, "grad_norm": 1.7461342811584473, "learning_rate": 2.8674935036027186e-05, "loss": 2.6998, "step": 170600 }, { "epoch": 1.2807719145551812, "grad_norm": 1.9018776416778564, "learning_rate": 2.8662554991158896e-05, "loss": 2.6864, "step": 170700 }, { "epoch": 1.2815222203047743, "grad_norm": 2.210263729095459, "learning_rate": 2.8650049895332347e-05, "loss": 2.6959, "step": 170800 }, { "epoch": 1.282272526054367, "grad_norm": 2.556363344192505, "learning_rate": 2.86375447995058e-05, "loss": 2.7583, "step": 170900 }, { "epoch": 1.28302283180396, "grad_norm": 1.725257396697998, "learning_rate": 2.8625039703679253e-05, "loss": 2.7045, "step": 171000 }, { "epoch": 1.283773137553553, "grad_norm": 2.2728281021118164, "learning_rate": 2.8612534607852704e-05, "loss": 2.7536, "step": 171100 }, { "epoch": 1.284523443303146, "grad_norm": 1.400018334388733, "learning_rate": 2.8600029512026148e-05, "loss": 2.6634, "step": 171200 }, { "epoch": 1.285273749052739, "grad_norm": 1.8337582349777222, "learning_rate": 2.858764946715787e-05, "loss": 2.8339, "step": 171300 }, { "epoch": 1.2860240548023318, "grad_norm": 1.6151295900344849, "learning_rate": 2.857514437133132e-05, "loss": 2.5369, "step": 171400 }, { "epoch": 1.2867743605519248, "grad_norm": 1.5595643520355225, "learning_rate": 2.856263927550477e-05, "loss": 2.7579, "step": 171500 }, { "epoch": 1.2875246663015179, "grad_norm": 2.375655174255371, "learning_rate": 2.8550134179678222e-05, "loss": 2.6859, "step": 171600 }, { "epoch": 1.2882749720511109, "grad_norm": 2.5121374130249023, "learning_rate": 2.853762908385167e-05, "loss": 2.7214, "step": 171700 }, { "epoch": 1.2890252778007039, "grad_norm": 2.055074453353882, "learning_rate": 2.852512398802512e-05, "loss": 2.5287, "step": 171800 }, { "epoch": 1.2897755835502966, "grad_norm": 2.078190565109253, "learning_rate": 2.851261889219857e-05, "loss": 2.6921, "step": 171900 }, { "epoch": 1.2905258892998897, "grad_norm": 1.8961753845214844, "learning_rate": 2.8500113796372026e-05, "loss": 2.69, "step": 172000 }, { "epoch": 1.2912761950494827, "grad_norm": 2.691211462020874, "learning_rate": 2.8487608700545477e-05, "loss": 2.762, "step": 172100 }, { "epoch": 1.2920265007990757, "grad_norm": 2.3475282192230225, "learning_rate": 2.847510360471892e-05, "loss": 2.776, "step": 172200 }, { "epoch": 1.2927768065486687, "grad_norm": 2.771507740020752, "learning_rate": 2.8462598508892376e-05, "loss": 2.653, "step": 172300 }, { "epoch": 1.2935271122982615, "grad_norm": 1.907757043838501, "learning_rate": 2.8450093413065827e-05, "loss": 2.5867, "step": 172400 }, { "epoch": 1.2942774180478545, "grad_norm": 3.4639267921447754, "learning_rate": 2.8437588317239278e-05, "loss": 2.6152, "step": 172500 }, { "epoch": 1.2950277237974475, "grad_norm": 1.4565232992172241, "learning_rate": 2.842508322141273e-05, "loss": 2.6084, "step": 172600 }, { "epoch": 1.2957780295470405, "grad_norm": 1.739876627922058, "learning_rate": 2.8412578125586176e-05, "loss": 2.6198, "step": 172700 }, { "epoch": 1.2965283352966335, "grad_norm": 1.7565710544586182, "learning_rate": 2.8400073029759627e-05, "loss": 2.8882, "step": 172800 }, { "epoch": 1.2972786410462263, "grad_norm": 3.0085716247558594, "learning_rate": 2.8387567933933078e-05, "loss": 2.5427, "step": 172900 }, { "epoch": 1.2980289467958193, "grad_norm": 2.044863224029541, "learning_rate": 2.837506283810653e-05, "loss": 2.7407, "step": 173000 }, { "epoch": 1.2987792525454123, "grad_norm": 3.349438190460205, "learning_rate": 2.8362557742279984e-05, "loss": 2.837, "step": 173100 }, { "epoch": 1.2995295582950053, "grad_norm": 1.5186107158660889, "learning_rate": 2.8350052646453428e-05, "loss": 2.6643, "step": 173200 }, { "epoch": 1.3002798640445983, "grad_norm": 2.1541013717651367, "learning_rate": 2.833754755062688e-05, "loss": 2.5516, "step": 173300 }, { "epoch": 1.301030169794191, "grad_norm": 2.3283751010894775, "learning_rate": 2.8325042454800333e-05, "loss": 2.664, "step": 173400 }, { "epoch": 1.301780475543784, "grad_norm": 1.9946272373199463, "learning_rate": 2.8312537358973784e-05, "loss": 2.7081, "step": 173500 }, { "epoch": 1.302530781293377, "grad_norm": 1.8153612613677979, "learning_rate": 2.8300032263147235e-05, "loss": 2.6905, "step": 173600 }, { "epoch": 1.30328108704297, "grad_norm": 2.1266720294952393, "learning_rate": 2.8287527167320683e-05, "loss": 2.6519, "step": 173700 }, { "epoch": 1.304031392792563, "grad_norm": 1.9597357511520386, "learning_rate": 2.8275022071494134e-05, "loss": 2.5832, "step": 173800 }, { "epoch": 1.3047816985421559, "grad_norm": 2.15895414352417, "learning_rate": 2.8262516975667585e-05, "loss": 2.81, "step": 173900 }, { "epoch": 1.3055320042917489, "grad_norm": 2.353057861328125, "learning_rate": 2.8250011879841036e-05, "loss": 2.7387, "step": 174000 }, { "epoch": 1.3062823100413419, "grad_norm": 2.2740776538848877, "learning_rate": 2.823750678401449e-05, "loss": 2.692, "step": 174100 }, { "epoch": 1.3070326157909349, "grad_norm": 1.63338041305542, "learning_rate": 2.8225001688187935e-05, "loss": 2.5133, "step": 174200 }, { "epoch": 1.3077829215405279, "grad_norm": 2.8563575744628906, "learning_rate": 2.8212496592361386e-05, "loss": 2.7152, "step": 174300 }, { "epoch": 1.3085332272901207, "grad_norm": 1.4785983562469482, "learning_rate": 2.819999149653484e-05, "loss": 2.7634, "step": 174400 }, { "epoch": 1.3092835330397137, "grad_norm": 2.421739339828491, "learning_rate": 2.818748640070829e-05, "loss": 2.7682, "step": 174500 }, { "epoch": 1.3100338387893067, "grad_norm": 1.8378455638885498, "learning_rate": 2.8174981304881742e-05, "loss": 2.7205, "step": 174600 }, { "epoch": 1.3107841445388995, "grad_norm": 1.8934015035629272, "learning_rate": 2.816247620905519e-05, "loss": 2.6974, "step": 174700 }, { "epoch": 1.3115344502884927, "grad_norm": 2.1176929473876953, "learning_rate": 2.814997111322864e-05, "loss": 2.5878, "step": 174800 }, { "epoch": 1.3122847560380855, "grad_norm": 2.1738133430480957, "learning_rate": 2.8137466017402092e-05, "loss": 2.5436, "step": 174900 }, { "epoch": 1.3130350617876785, "grad_norm": 1.8557885885238647, "learning_rate": 2.8124960921575543e-05, "loss": 2.6754, "step": 175000 }, { "epoch": 1.3137853675372715, "grad_norm": 1.2751420736312866, "learning_rate": 2.811258087670726e-05, "loss": 2.6224, "step": 175100 }, { "epoch": 1.3145356732868643, "grad_norm": 1.8054916858673096, "learning_rate": 2.8100075780880708e-05, "loss": 2.6204, "step": 175200 }, { "epoch": 1.3152859790364573, "grad_norm": 2.2539968490600586, "learning_rate": 2.808757068505416e-05, "loss": 2.6995, "step": 175300 }, { "epoch": 1.3160362847860503, "grad_norm": 2.7267472743988037, "learning_rate": 2.807506558922761e-05, "loss": 2.7608, "step": 175400 }, { "epoch": 1.3167865905356433, "grad_norm": 3.0948050022125244, "learning_rate": 2.8062560493401064e-05, "loss": 2.5935, "step": 175500 }, { "epoch": 1.3175368962852363, "grad_norm": 1.3742412328720093, "learning_rate": 2.8050055397574515e-05, "loss": 2.705, "step": 175600 }, { "epoch": 1.318287202034829, "grad_norm": 2.3074605464935303, "learning_rate": 2.803755030174796e-05, "loss": 2.8622, "step": 175700 }, { "epoch": 1.319037507784422, "grad_norm": 1.690291404724121, "learning_rate": 2.8025045205921414e-05, "loss": 2.6682, "step": 175800 }, { "epoch": 1.319787813534015, "grad_norm": 2.6199307441711426, "learning_rate": 2.8012540110094865e-05, "loss": 2.7105, "step": 175900 }, { "epoch": 1.320538119283608, "grad_norm": 1.5092711448669434, "learning_rate": 2.8000035014268316e-05, "loss": 2.7221, "step": 176000 }, { "epoch": 1.321288425033201, "grad_norm": 1.232857584953308, "learning_rate": 2.7987529918441767e-05, "loss": 2.6511, "step": 176100 }, { "epoch": 1.3220387307827939, "grad_norm": 1.3577404022216797, "learning_rate": 2.7975024822615215e-05, "loss": 2.7534, "step": 176200 }, { "epoch": 1.3227890365323869, "grad_norm": 2.358859062194824, "learning_rate": 2.7962519726788666e-05, "loss": 2.6467, "step": 176300 }, { "epoch": 1.32353934228198, "grad_norm": 1.9058277606964111, "learning_rate": 2.7950014630962117e-05, "loss": 2.6958, "step": 176400 }, { "epoch": 1.324289648031573, "grad_norm": 1.4720020294189453, "learning_rate": 2.793750953513557e-05, "loss": 2.6891, "step": 176500 }, { "epoch": 1.325039953781166, "grad_norm": 1.8331005573272705, "learning_rate": 2.7925004439309022e-05, "loss": 2.6581, "step": 176600 }, { "epoch": 1.3257902595307587, "grad_norm": 3.6002492904663086, "learning_rate": 2.7912499343482466e-05, "loss": 2.6638, "step": 176700 }, { "epoch": 1.3265405652803517, "grad_norm": 1.440679907798767, "learning_rate": 2.789999424765592e-05, "loss": 2.7371, "step": 176800 }, { "epoch": 1.3272908710299447, "grad_norm": 1.749809980392456, "learning_rate": 2.7887489151829372e-05, "loss": 2.6452, "step": 176900 }, { "epoch": 1.3280411767795377, "grad_norm": 2.1863925457000732, "learning_rate": 2.7874984056002823e-05, "loss": 2.6275, "step": 177000 }, { "epoch": 1.3287914825291307, "grad_norm": 2.6005706787109375, "learning_rate": 2.7862478960176274e-05, "loss": 2.7289, "step": 177100 }, { "epoch": 1.3295417882787235, "grad_norm": 1.6927862167358398, "learning_rate": 2.7849973864349725e-05, "loss": 2.6331, "step": 177200 }, { "epoch": 1.3302920940283165, "grad_norm": 3.9180355072021484, "learning_rate": 2.7837468768523173e-05, "loss": 2.7992, "step": 177300 }, { "epoch": 1.3310423997779095, "grad_norm": 2.137777090072632, "learning_rate": 2.7824963672696624e-05, "loss": 2.6736, "step": 177400 }, { "epoch": 1.3317927055275025, "grad_norm": 1.6137255430221558, "learning_rate": 2.7812458576870078e-05, "loss": 2.6464, "step": 177500 }, { "epoch": 1.3325430112770955, "grad_norm": 1.6831631660461426, "learning_rate": 2.779995348104353e-05, "loss": 2.6575, "step": 177600 }, { "epoch": 1.3332933170266883, "grad_norm": 1.7758989334106445, "learning_rate": 2.778744838521698e-05, "loss": 2.8881, "step": 177700 }, { "epoch": 1.3340436227762813, "grad_norm": 1.664899230003357, "learning_rate": 2.7774943289390428e-05, "loss": 2.5666, "step": 177800 }, { "epoch": 1.3347939285258743, "grad_norm": 1.3698954582214355, "learning_rate": 2.776243819356388e-05, "loss": 2.8527, "step": 177900 }, { "epoch": 1.3355442342754673, "grad_norm": 2.3016505241394043, "learning_rate": 2.774993309773733e-05, "loss": 2.7436, "step": 178000 }, { "epoch": 1.3362945400250603, "grad_norm": 1.6525745391845703, "learning_rate": 2.773742800191078e-05, "loss": 2.7173, "step": 178100 }, { "epoch": 1.337044845774653, "grad_norm": 1.8210221529006958, "learning_rate": 2.772492290608423e-05, "loss": 2.7148, "step": 178200 }, { "epoch": 1.337795151524246, "grad_norm": 1.9526591300964355, "learning_rate": 2.771241781025768e-05, "loss": 2.7333, "step": 178300 }, { "epoch": 1.3385454572738391, "grad_norm": 1.858132004737854, "learning_rate": 2.769991271443113e-05, "loss": 2.6856, "step": 178400 }, { "epoch": 1.3392957630234321, "grad_norm": 2.316981077194214, "learning_rate": 2.768740761860458e-05, "loss": 2.7052, "step": 178500 }, { "epoch": 1.3400460687730251, "grad_norm": 1.5918549299240112, "learning_rate": 2.7674902522778036e-05, "loss": 2.8034, "step": 178600 }, { "epoch": 1.340796374522618, "grad_norm": 2.0680387020111084, "learning_rate": 2.7662397426951487e-05, "loss": 2.6955, "step": 178700 }, { "epoch": 1.341546680272211, "grad_norm": 2.0887560844421387, "learning_rate": 2.764989233112493e-05, "loss": 2.5605, "step": 178800 }, { "epoch": 1.342296986021804, "grad_norm": 2.3032336235046387, "learning_rate": 2.7637387235298385e-05, "loss": 2.6484, "step": 178900 }, { "epoch": 1.343047291771397, "grad_norm": 2.8582942485809326, "learning_rate": 2.7624882139471836e-05, "loss": 2.6655, "step": 179000 }, { "epoch": 1.34379759752099, "grad_norm": 1.8620680570602417, "learning_rate": 2.7612377043645287e-05, "loss": 2.7944, "step": 179100 }, { "epoch": 1.3445479032705827, "grad_norm": 1.9464343786239624, "learning_rate": 2.7599996998777e-05, "loss": 2.7469, "step": 179200 }, { "epoch": 1.3452982090201757, "grad_norm": 1.2662584781646729, "learning_rate": 2.7587491902950452e-05, "loss": 2.7376, "step": 179300 }, { "epoch": 1.3460485147697687, "grad_norm": 1.8883945941925049, "learning_rate": 2.7574986807123903e-05, "loss": 2.5784, "step": 179400 }, { "epoch": 1.3467988205193615, "grad_norm": 2.5625102519989014, "learning_rate": 2.7562481711297355e-05, "loss": 2.7406, "step": 179500 }, { "epoch": 1.3475491262689547, "grad_norm": 1.5699968338012695, "learning_rate": 2.7549976615470806e-05, "loss": 2.6782, "step": 179600 }, { "epoch": 1.3482994320185475, "grad_norm": 1.8465286493301392, "learning_rate": 2.753747151964426e-05, "loss": 2.6272, "step": 179700 }, { "epoch": 1.3490497377681405, "grad_norm": 2.1160318851470947, "learning_rate": 2.7524966423817704e-05, "loss": 2.7329, "step": 179800 }, { "epoch": 1.3498000435177335, "grad_norm": 1.489628553390503, "learning_rate": 2.751246132799116e-05, "loss": 2.7616, "step": 179900 }, { "epoch": 1.3505503492673263, "grad_norm": 1.4126601219177246, "learning_rate": 2.749995623216461e-05, "loss": 2.5903, "step": 180000 }, { "epoch": 1.3513006550169193, "grad_norm": 2.2393507957458496, "learning_rate": 2.748745113633806e-05, "loss": 2.8505, "step": 180100 }, { "epoch": 1.3520509607665123, "grad_norm": 1.5649322271347046, "learning_rate": 2.747494604051151e-05, "loss": 2.7762, "step": 180200 }, { "epoch": 1.3528012665161053, "grad_norm": 3.5221967697143555, "learning_rate": 2.746244094468496e-05, "loss": 2.5911, "step": 180300 }, { "epoch": 1.3535515722656983, "grad_norm": 2.950406312942505, "learning_rate": 2.744993584885841e-05, "loss": 2.8192, "step": 180400 }, { "epoch": 1.3543018780152911, "grad_norm": 1.735953450202942, "learning_rate": 2.743743075303186e-05, "loss": 2.6675, "step": 180500 }, { "epoch": 1.3550521837648841, "grad_norm": 2.3017523288726807, "learning_rate": 2.7424925657205312e-05, "loss": 2.6132, "step": 180600 }, { "epoch": 1.3558024895144771, "grad_norm": 1.3577923774719238, "learning_rate": 2.7412420561378767e-05, "loss": 2.7918, "step": 180700 }, { "epoch": 1.3565527952640701, "grad_norm": 1.9505960941314697, "learning_rate": 2.739991546555221e-05, "loss": 2.5938, "step": 180800 }, { "epoch": 1.3573031010136631, "grad_norm": 2.025113821029663, "learning_rate": 2.7387410369725662e-05, "loss": 2.7157, "step": 180900 }, { "epoch": 1.358053406763256, "grad_norm": 1.755550742149353, "learning_rate": 2.7374905273899116e-05, "loss": 2.7459, "step": 181000 }, { "epoch": 1.358803712512849, "grad_norm": 2.845264196395874, "learning_rate": 2.7362400178072567e-05, "loss": 2.7302, "step": 181100 }, { "epoch": 1.359554018262442, "grad_norm": 1.586623191833496, "learning_rate": 2.7350020133204278e-05, "loss": 2.7027, "step": 181200 }, { "epoch": 1.360304324012035, "grad_norm": 2.359431505203247, "learning_rate": 2.7337515037377732e-05, "loss": 2.5549, "step": 181300 }, { "epoch": 1.361054629761628, "grad_norm": 1.9929102659225464, "learning_rate": 2.7325009941551183e-05, "loss": 2.5359, "step": 181400 }, { "epoch": 1.3618049355112207, "grad_norm": 2.412789821624756, "learning_rate": 2.7312504845724634e-05, "loss": 2.7568, "step": 181500 }, { "epoch": 1.3625552412608137, "grad_norm": 1.527474284172058, "learning_rate": 2.7299999749898086e-05, "loss": 2.5139, "step": 181600 }, { "epoch": 1.3633055470104067, "grad_norm": 2.3984882831573486, "learning_rate": 2.7287494654071537e-05, "loss": 2.5916, "step": 181700 }, { "epoch": 1.3640558527599997, "grad_norm": 1.87788987159729, "learning_rate": 2.7274989558244984e-05, "loss": 2.6517, "step": 181800 }, { "epoch": 1.3648061585095927, "grad_norm": 1.672071933746338, "learning_rate": 2.7262484462418435e-05, "loss": 2.6619, "step": 181900 }, { "epoch": 1.3655564642591855, "grad_norm": 1.8514885902404785, "learning_rate": 2.724997936659189e-05, "loss": 2.5663, "step": 182000 }, { "epoch": 1.3663067700087785, "grad_norm": 1.9372291564941406, "learning_rate": 2.723747427076534e-05, "loss": 2.6008, "step": 182100 }, { "epoch": 1.3670570757583715, "grad_norm": 1.8502960205078125, "learning_rate": 2.722496917493879e-05, "loss": 2.6861, "step": 182200 }, { "epoch": 1.3678073815079645, "grad_norm": 1.8316816091537476, "learning_rate": 2.721246407911224e-05, "loss": 2.7762, "step": 182300 }, { "epoch": 1.3685576872575576, "grad_norm": 1.9478440284729004, "learning_rate": 2.719995898328569e-05, "loss": 2.6869, "step": 182400 }, { "epoch": 1.3693079930071503, "grad_norm": 1.7097535133361816, "learning_rate": 2.718745388745914e-05, "loss": 2.6382, "step": 182500 }, { "epoch": 1.3700582987567433, "grad_norm": 2.414336681365967, "learning_rate": 2.7174948791632592e-05, "loss": 2.6593, "step": 182600 }, { "epoch": 1.3708086045063363, "grad_norm": 2.4093520641326904, "learning_rate": 2.7162443695806043e-05, "loss": 2.5829, "step": 182700 }, { "epoch": 1.3715589102559294, "grad_norm": 1.4354403018951416, "learning_rate": 2.714993859997949e-05, "loss": 2.5505, "step": 182800 }, { "epoch": 1.3723092160055224, "grad_norm": 1.5392740964889526, "learning_rate": 2.7137433504152942e-05, "loss": 2.6814, "step": 182900 }, { "epoch": 1.3730595217551151, "grad_norm": 2.110496759414673, "learning_rate": 2.7124928408326393e-05, "loss": 2.699, "step": 183000 }, { "epoch": 1.3738098275047081, "grad_norm": 1.6475162506103516, "learning_rate": 2.7112548363458114e-05, "loss": 2.6362, "step": 183100 }, { "epoch": 1.3745601332543012, "grad_norm": 1.4381693601608276, "learning_rate": 2.7100043267631558e-05, "loss": 2.5843, "step": 183200 }, { "epoch": 1.3753104390038942, "grad_norm": 2.1145710945129395, "learning_rate": 2.708753817180501e-05, "loss": 2.7781, "step": 183300 }, { "epoch": 1.3760607447534872, "grad_norm": 1.829634189605713, "learning_rate": 2.7075033075978463e-05, "loss": 2.5143, "step": 183400 }, { "epoch": 1.37681105050308, "grad_norm": 2.2611563205718994, "learning_rate": 2.7062527980151914e-05, "loss": 2.6393, "step": 183500 }, { "epoch": 1.377561356252673, "grad_norm": 1.7687994241714478, "learning_rate": 2.7050022884325365e-05, "loss": 2.6841, "step": 183600 }, { "epoch": 1.378311662002266, "grad_norm": 1.7421174049377441, "learning_rate": 2.7037517788498816e-05, "loss": 2.6536, "step": 183700 }, { "epoch": 1.379061967751859, "grad_norm": 1.5058093070983887, "learning_rate": 2.7025012692672264e-05, "loss": 2.6927, "step": 183800 }, { "epoch": 1.379812273501452, "grad_norm": 1.6666686534881592, "learning_rate": 2.7012507596845715e-05, "loss": 2.6084, "step": 183900 }, { "epoch": 1.3805625792510448, "grad_norm": 2.0162296295166016, "learning_rate": 2.7000002501019166e-05, "loss": 2.6809, "step": 184000 }, { "epoch": 1.3813128850006378, "grad_norm": 1.9707823991775513, "learning_rate": 2.6987497405192617e-05, "loss": 2.603, "step": 184100 }, { "epoch": 1.3820631907502308, "grad_norm": 2.724116563796997, "learning_rate": 2.697499230936607e-05, "loss": 2.675, "step": 184200 }, { "epoch": 1.3828134964998235, "grad_norm": 1.35624361038208, "learning_rate": 2.6962487213539516e-05, "loss": 2.5504, "step": 184300 }, { "epoch": 1.3835638022494168, "grad_norm": 1.9325608015060425, "learning_rate": 2.694998211771297e-05, "loss": 2.5436, "step": 184400 }, { "epoch": 1.3843141079990096, "grad_norm": 2.2514398097991943, "learning_rate": 2.693747702188642e-05, "loss": 2.669, "step": 184500 }, { "epoch": 1.3850644137486026, "grad_norm": 2.292560577392578, "learning_rate": 2.6924971926059872e-05, "loss": 2.7252, "step": 184600 }, { "epoch": 1.3858147194981956, "grad_norm": 1.6606899499893188, "learning_rate": 2.6912466830233323e-05, "loss": 2.542, "step": 184700 }, { "epoch": 1.3865650252477884, "grad_norm": 1.4699032306671143, "learning_rate": 2.689996173440677e-05, "loss": 2.635, "step": 184800 }, { "epoch": 1.3873153309973814, "grad_norm": 1.3151440620422363, "learning_rate": 2.6887456638580222e-05, "loss": 2.5479, "step": 184900 }, { "epoch": 1.3880656367469744, "grad_norm": 1.2677141427993774, "learning_rate": 2.6874951542753673e-05, "loss": 2.6231, "step": 185000 }, { "epoch": 1.3888159424965674, "grad_norm": 1.9233605861663818, "learning_rate": 2.6862446446927124e-05, "loss": 2.5735, "step": 185100 }, { "epoch": 1.3895662482461604, "grad_norm": 2.156437873840332, "learning_rate": 2.684994135110058e-05, "loss": 2.5101, "step": 185200 }, { "epoch": 1.3903165539957532, "grad_norm": 2.441939115524292, "learning_rate": 2.6837436255274023e-05, "loss": 2.6515, "step": 185300 }, { "epoch": 1.3910668597453462, "grad_norm": 1.5016742944717407, "learning_rate": 2.6824931159447474e-05, "loss": 2.6244, "step": 185400 }, { "epoch": 1.3918171654949392, "grad_norm": 2.2767980098724365, "learning_rate": 2.6812426063620928e-05, "loss": 2.799, "step": 185500 }, { "epoch": 1.3925674712445322, "grad_norm": 2.2884340286254883, "learning_rate": 2.679992096779438e-05, "loss": 2.8001, "step": 185600 }, { "epoch": 1.3933177769941252, "grad_norm": 1.270044207572937, "learning_rate": 2.678741587196783e-05, "loss": 2.6535, "step": 185700 }, { "epoch": 1.394068082743718, "grad_norm": 2.1062402725219727, "learning_rate": 2.6774910776141278e-05, "loss": 2.6682, "step": 185800 }, { "epoch": 1.394818388493311, "grad_norm": 2.634737730026245, "learning_rate": 2.676240568031473e-05, "loss": 2.6519, "step": 185900 }, { "epoch": 1.395568694242904, "grad_norm": 1.734197974205017, "learning_rate": 2.674990058448818e-05, "loss": 2.71, "step": 186000 }, { "epoch": 1.396318999992497, "grad_norm": 2.571078300476074, "learning_rate": 2.673739548866163e-05, "loss": 2.7211, "step": 186100 }, { "epoch": 1.39706930574209, "grad_norm": 2.2524309158325195, "learning_rate": 2.6724890392835085e-05, "loss": 2.5555, "step": 186200 }, { "epoch": 1.3978196114916828, "grad_norm": 1.581094741821289, "learning_rate": 2.671238529700853e-05, "loss": 2.7617, "step": 186300 }, { "epoch": 1.3985699172412758, "grad_norm": 2.021963119506836, "learning_rate": 2.669988020118198e-05, "loss": 2.7532, "step": 186400 }, { "epoch": 1.3993202229908688, "grad_norm": 1.8687928915023804, "learning_rate": 2.6687375105355435e-05, "loss": 2.6939, "step": 186500 }, { "epoch": 1.4000705287404618, "grad_norm": 2.4565229415893555, "learning_rate": 2.6674870009528886e-05, "loss": 2.6402, "step": 186600 }, { "epoch": 1.4008208344900548, "grad_norm": 2.416722059249878, "learning_rate": 2.6662364913702337e-05, "loss": 2.5866, "step": 186700 }, { "epoch": 1.4015711402396476, "grad_norm": 2.8118159770965576, "learning_rate": 2.6649859817875784e-05, "loss": 2.7538, "step": 186800 }, { "epoch": 1.4023214459892406, "grad_norm": 2.274078130722046, "learning_rate": 2.6637354722049235e-05, "loss": 2.7467, "step": 186900 }, { "epoch": 1.4030717517388336, "grad_norm": 1.8635174036026, "learning_rate": 2.6624849626222686e-05, "loss": 2.5351, "step": 187000 }, { "epoch": 1.4038220574884266, "grad_norm": 2.162843942642212, "learning_rate": 2.6612469581354404e-05, "loss": 2.6449, "step": 187100 }, { "epoch": 1.4045723632380196, "grad_norm": 1.211068034172058, "learning_rate": 2.6600089536486118e-05, "loss": 2.5427, "step": 187200 }, { "epoch": 1.4053226689876124, "grad_norm": 1.7736153602600098, "learning_rate": 2.658758444065957e-05, "loss": 2.7136, "step": 187300 }, { "epoch": 1.4060729747372054, "grad_norm": 1.4300193786621094, "learning_rate": 2.657507934483302e-05, "loss": 2.5798, "step": 187400 }, { "epoch": 1.4068232804867984, "grad_norm": 2.9687020778656006, "learning_rate": 2.656257424900647e-05, "loss": 2.6182, "step": 187500 }, { "epoch": 1.4075735862363914, "grad_norm": 2.8161816596984863, "learning_rate": 2.6550069153179925e-05, "loss": 2.7002, "step": 187600 }, { "epoch": 1.4083238919859844, "grad_norm": 2.3275234699249268, "learning_rate": 2.653756405735337e-05, "loss": 2.5556, "step": 187700 }, { "epoch": 1.4090741977355772, "grad_norm": 2.299506902694702, "learning_rate": 2.652505896152682e-05, "loss": 2.6807, "step": 187800 }, { "epoch": 1.4098245034851702, "grad_norm": 1.4807440042495728, "learning_rate": 2.6512553865700275e-05, "loss": 2.7716, "step": 187900 }, { "epoch": 1.4105748092347632, "grad_norm": 1.7656402587890625, "learning_rate": 2.6500048769873726e-05, "loss": 2.6359, "step": 188000 }, { "epoch": 1.4113251149843562, "grad_norm": 2.135549783706665, "learning_rate": 2.6487543674047177e-05, "loss": 2.7052, "step": 188100 }, { "epoch": 1.4120754207339492, "grad_norm": 1.688325047492981, "learning_rate": 2.6475038578220628e-05, "loss": 2.647, "step": 188200 }, { "epoch": 1.412825726483542, "grad_norm": 2.415625810623169, "learning_rate": 2.6462533482394076e-05, "loss": 2.5737, "step": 188300 }, { "epoch": 1.413576032233135, "grad_norm": 1.8799352645874023, "learning_rate": 2.6450028386567527e-05, "loss": 2.8259, "step": 188400 }, { "epoch": 1.414326337982728, "grad_norm": 1.6393635272979736, "learning_rate": 2.6437523290740978e-05, "loss": 2.5927, "step": 188500 }, { "epoch": 1.4150766437323208, "grad_norm": 1.819187045097351, "learning_rate": 2.642501819491443e-05, "loss": 2.7394, "step": 188600 }, { "epoch": 1.415826949481914, "grad_norm": 1.6083693504333496, "learning_rate": 2.6412513099087883e-05, "loss": 2.6093, "step": 188700 }, { "epoch": 1.4165772552315068, "grad_norm": 1.4180208444595337, "learning_rate": 2.6400008003261327e-05, "loss": 2.8009, "step": 188800 }, { "epoch": 1.4173275609810998, "grad_norm": 2.539964199066162, "learning_rate": 2.638750290743478e-05, "loss": 2.5444, "step": 188900 }, { "epoch": 1.4180778667306928, "grad_norm": 1.5936284065246582, "learning_rate": 2.6374997811608233e-05, "loss": 2.7157, "step": 189000 }, { "epoch": 1.4188281724802856, "grad_norm": 2.6064798831939697, "learning_rate": 2.6362492715781684e-05, "loss": 2.7845, "step": 189100 }, { "epoch": 1.4195784782298786, "grad_norm": 1.4355292320251465, "learning_rate": 2.6349987619955135e-05, "loss": 2.5823, "step": 189200 }, { "epoch": 1.4203287839794716, "grad_norm": 1.4292118549346924, "learning_rate": 2.6337482524128583e-05, "loss": 2.7451, "step": 189300 }, { "epoch": 1.4210790897290646, "grad_norm": 1.7892532348632812, "learning_rate": 2.6324977428302034e-05, "loss": 2.6004, "step": 189400 }, { "epoch": 1.4218293954786576, "grad_norm": 2.1332509517669678, "learning_rate": 2.6312472332475485e-05, "loss": 2.5342, "step": 189500 }, { "epoch": 1.4225797012282504, "grad_norm": 2.013422727584839, "learning_rate": 2.6299967236648936e-05, "loss": 2.6959, "step": 189600 }, { "epoch": 1.4233300069778434, "grad_norm": 1.7371375560760498, "learning_rate": 2.628746214082239e-05, "loss": 2.6992, "step": 189700 }, { "epoch": 1.4240803127274364, "grad_norm": 2.10980486869812, "learning_rate": 2.6274957044995834e-05, "loss": 2.6651, "step": 189800 }, { "epoch": 1.4248306184770294, "grad_norm": 2.06339168548584, "learning_rate": 2.6262451949169285e-05, "loss": 2.6629, "step": 189900 }, { "epoch": 1.4255809242266224, "grad_norm": 1.770938754081726, "learning_rate": 2.624994685334274e-05, "loss": 2.7459, "step": 190000 }, { "epoch": 1.4263312299762152, "grad_norm": 1.9833102226257324, "learning_rate": 2.623744175751619e-05, "loss": 2.5761, "step": 190100 }, { "epoch": 1.4270815357258082, "grad_norm": 1.6441419124603271, "learning_rate": 2.622493666168964e-05, "loss": 2.6509, "step": 190200 }, { "epoch": 1.4278318414754012, "grad_norm": 2.4493520259857178, "learning_rate": 2.621243156586309e-05, "loss": 2.7483, "step": 190300 }, { "epoch": 1.4285821472249942, "grad_norm": 1.5390734672546387, "learning_rate": 2.619992647003654e-05, "loss": 2.5349, "step": 190400 }, { "epoch": 1.4293324529745872, "grad_norm": 2.2572429180145264, "learning_rate": 2.618742137420999e-05, "loss": 2.6748, "step": 190500 }, { "epoch": 1.43008275872418, "grad_norm": 1.5981416702270508, "learning_rate": 2.6174916278383442e-05, "loss": 2.6439, "step": 190600 }, { "epoch": 1.430833064473773, "grad_norm": 2.432184934616089, "learning_rate": 2.6162411182556897e-05, "loss": 2.6611, "step": 190700 }, { "epoch": 1.431583370223366, "grad_norm": 2.1028635501861572, "learning_rate": 2.614990608673034e-05, "loss": 2.6312, "step": 190800 }, { "epoch": 1.432333675972959, "grad_norm": 1.219407081604004, "learning_rate": 2.6137400990903792e-05, "loss": 2.6237, "step": 190900 }, { "epoch": 1.433083981722552, "grad_norm": 2.1047921180725098, "learning_rate": 2.6124895895077246e-05, "loss": 2.7589, "step": 191000 }, { "epoch": 1.4338342874721448, "grad_norm": 2.0609841346740723, "learning_rate": 2.6112390799250697e-05, "loss": 2.7114, "step": 191100 }, { "epoch": 1.4345845932217378, "grad_norm": 1.6695895195007324, "learning_rate": 2.6100010754382415e-05, "loss": 2.5295, "step": 191200 }, { "epoch": 1.4353348989713308, "grad_norm": 2.1893577575683594, "learning_rate": 2.6087505658555862e-05, "loss": 2.8611, "step": 191300 }, { "epoch": 1.4360852047209238, "grad_norm": 3.248234987258911, "learning_rate": 2.6075000562729314e-05, "loss": 2.7034, "step": 191400 }, { "epoch": 1.4368355104705168, "grad_norm": 2.1774096488952637, "learning_rate": 2.6062495466902765e-05, "loss": 2.7603, "step": 191500 }, { "epoch": 1.4375858162201096, "grad_norm": 2.106631278991699, "learning_rate": 2.6049990371076216e-05, "loss": 2.7165, "step": 191600 }, { "epoch": 1.4383361219697026, "grad_norm": 1.764487862586975, "learning_rate": 2.6037485275249667e-05, "loss": 2.7103, "step": 191700 }, { "epoch": 1.4390864277192956, "grad_norm": 2.1316723823547363, "learning_rate": 2.6024980179423114e-05, "loss": 2.6783, "step": 191800 }, { "epoch": 1.4398367334688886, "grad_norm": 1.6774919033050537, "learning_rate": 2.6012475083596565e-05, "loss": 2.789, "step": 191900 }, { "epoch": 1.4405870392184816, "grad_norm": 1.7075845003128052, "learning_rate": 2.5999969987770016e-05, "loss": 2.5208, "step": 192000 }, { "epoch": 1.4413373449680744, "grad_norm": 2.232797622680664, "learning_rate": 2.598746489194347e-05, "loss": 2.6718, "step": 192100 }, { "epoch": 1.4420876507176674, "grad_norm": 2.048640012741089, "learning_rate": 2.597495979611692e-05, "loss": 2.7327, "step": 192200 }, { "epoch": 1.4428379564672604, "grad_norm": 2.3327572345733643, "learning_rate": 2.5962454700290366e-05, "loss": 2.6294, "step": 192300 }, { "epoch": 1.4435882622168534, "grad_norm": 1.7062629461288452, "learning_rate": 2.594994960446382e-05, "loss": 2.4304, "step": 192400 }, { "epoch": 1.4443385679664464, "grad_norm": 2.093198776245117, "learning_rate": 2.593744450863727e-05, "loss": 2.5946, "step": 192500 }, { "epoch": 1.4450888737160392, "grad_norm": 1.664161205291748, "learning_rate": 2.5924939412810722e-05, "loss": 2.706, "step": 192600 }, { "epoch": 1.4458391794656322, "grad_norm": 2.0000665187835693, "learning_rate": 2.5912434316984173e-05, "loss": 2.7427, "step": 192700 }, { "epoch": 1.4465894852152252, "grad_norm": 1.5687319040298462, "learning_rate": 2.589992922115762e-05, "loss": 2.7019, "step": 192800 }, { "epoch": 1.4473397909648182, "grad_norm": 2.2011959552764893, "learning_rate": 2.5887424125331072e-05, "loss": 2.6538, "step": 192900 }, { "epoch": 1.4480900967144112, "grad_norm": 2.5736420154571533, "learning_rate": 2.5874919029504523e-05, "loss": 2.6196, "step": 193000 }, { "epoch": 1.448840402464004, "grad_norm": 1.945537805557251, "learning_rate": 2.5862413933677977e-05, "loss": 2.7982, "step": 193100 }, { "epoch": 1.449590708213597, "grad_norm": 2.2626149654388428, "learning_rate": 2.5850033888809695e-05, "loss": 2.8169, "step": 193200 }, { "epoch": 1.45034101396319, "grad_norm": 1.3258017301559448, "learning_rate": 2.583752879298314e-05, "loss": 2.6852, "step": 193300 }, { "epoch": 1.4510913197127828, "grad_norm": 1.668379545211792, "learning_rate": 2.582502369715659e-05, "loss": 2.6351, "step": 193400 }, { "epoch": 1.451841625462376, "grad_norm": 1.202345609664917, "learning_rate": 2.5812518601330044e-05, "loss": 2.7061, "step": 193500 }, { "epoch": 1.4525919312119688, "grad_norm": 1.5924277305603027, "learning_rate": 2.5800013505503496e-05, "loss": 2.6885, "step": 193600 }, { "epoch": 1.4533422369615618, "grad_norm": 1.5975316762924194, "learning_rate": 2.5787508409676947e-05, "loss": 2.6693, "step": 193700 }, { "epoch": 1.4540925427111548, "grad_norm": 1.5682274103164673, "learning_rate": 2.5775003313850394e-05, "loss": 2.7463, "step": 193800 }, { "epoch": 1.4548428484607476, "grad_norm": 2.051135540008545, "learning_rate": 2.5762498218023845e-05, "loss": 2.6687, "step": 193900 }, { "epoch": 1.4555931542103406, "grad_norm": 2.879499912261963, "learning_rate": 2.5749993122197296e-05, "loss": 2.7575, "step": 194000 }, { "epoch": 1.4563434599599336, "grad_norm": 2.640082597732544, "learning_rate": 2.5737488026370747e-05, "loss": 2.6265, "step": 194100 }, { "epoch": 1.4570937657095266, "grad_norm": 1.9398301839828491, "learning_rate": 2.57249829305442e-05, "loss": 2.7479, "step": 194200 }, { "epoch": 1.4578440714591197, "grad_norm": 1.7896203994750977, "learning_rate": 2.5712477834717646e-05, "loss": 2.7008, "step": 194300 }, { "epoch": 1.4585943772087124, "grad_norm": 2.4437692165374756, "learning_rate": 2.5699972738891097e-05, "loss": 2.8315, "step": 194400 }, { "epoch": 1.4593446829583054, "grad_norm": 2.0581753253936768, "learning_rate": 2.568746764306455e-05, "loss": 2.7329, "step": 194500 }, { "epoch": 1.4600949887078984, "grad_norm": 2.5018150806427, "learning_rate": 2.5674962547238002e-05, "loss": 2.6206, "step": 194600 }, { "epoch": 1.4608452944574915, "grad_norm": 2.230682611465454, "learning_rate": 2.5662457451411453e-05, "loss": 2.6169, "step": 194700 }, { "epoch": 1.4615956002070845, "grad_norm": 2.6271965503692627, "learning_rate": 2.56499523555849e-05, "loss": 2.5469, "step": 194800 }, { "epoch": 1.4623459059566772, "grad_norm": 1.5373567342758179, "learning_rate": 2.5637447259758352e-05, "loss": 2.6014, "step": 194900 }, { "epoch": 1.4630962117062702, "grad_norm": 1.5876331329345703, "learning_rate": 2.5624942163931803e-05, "loss": 2.741, "step": 195000 }, { "epoch": 1.4638465174558632, "grad_norm": 1.8301210403442383, "learning_rate": 2.5612437068105254e-05, "loss": 2.6897, "step": 195100 }, { "epoch": 1.4645968232054563, "grad_norm": 1.9007768630981445, "learning_rate": 2.559993197227871e-05, "loss": 2.7419, "step": 195200 }, { "epoch": 1.4653471289550493, "grad_norm": 2.168782949447632, "learning_rate": 2.558755192741042e-05, "loss": 2.6768, "step": 195300 }, { "epoch": 1.466097434704642, "grad_norm": 2.235910177230835, "learning_rate": 2.557504683158387e-05, "loss": 2.7035, "step": 195400 }, { "epoch": 1.466847740454235, "grad_norm": 2.2615158557891846, "learning_rate": 2.556254173575732e-05, "loss": 2.7318, "step": 195500 }, { "epoch": 1.467598046203828, "grad_norm": 1.891579508781433, "learning_rate": 2.5550036639930775e-05, "loss": 2.7258, "step": 195600 }, { "epoch": 1.468348351953421, "grad_norm": 1.5511633157730103, "learning_rate": 2.5537531544104226e-05, "loss": 2.5997, "step": 195700 }, { "epoch": 1.469098657703014, "grad_norm": 1.5903886556625366, "learning_rate": 2.552502644827767e-05, "loss": 2.6115, "step": 195800 }, { "epoch": 1.4698489634526068, "grad_norm": 1.364157795906067, "learning_rate": 2.5512521352451125e-05, "loss": 2.6784, "step": 195900 }, { "epoch": 1.4705992692021999, "grad_norm": 1.5906531810760498, "learning_rate": 2.5500016256624576e-05, "loss": 2.7046, "step": 196000 }, { "epoch": 1.4713495749517929, "grad_norm": 2.0848536491394043, "learning_rate": 2.5487511160798027e-05, "loss": 2.6201, "step": 196100 }, { "epoch": 1.4720998807013859, "grad_norm": 1.3955910205841064, "learning_rate": 2.547513111592974e-05, "loss": 2.7428, "step": 196200 }, { "epoch": 1.4728501864509789, "grad_norm": 2.071633815765381, "learning_rate": 2.5462626020103192e-05, "loss": 2.673, "step": 196300 }, { "epoch": 1.4736004922005717, "grad_norm": 2.3613293170928955, "learning_rate": 2.5450120924276643e-05, "loss": 2.6643, "step": 196400 }, { "epoch": 1.4743507979501647, "grad_norm": 1.3429721593856812, "learning_rate": 2.5437615828450094e-05, "loss": 2.7017, "step": 196500 }, { "epoch": 1.4751011036997577, "grad_norm": 2.3996503353118896, "learning_rate": 2.542511073262355e-05, "loss": 2.6209, "step": 196600 }, { "epoch": 1.4758514094493507, "grad_norm": 1.7078642845153809, "learning_rate": 2.5412605636796993e-05, "loss": 2.675, "step": 196700 }, { "epoch": 1.4766017151989437, "grad_norm": 2.2044920921325684, "learning_rate": 2.5400100540970444e-05, "loss": 2.5535, "step": 196800 }, { "epoch": 1.4773520209485365, "grad_norm": 2.227616786956787, "learning_rate": 2.53875954451439e-05, "loss": 2.6645, "step": 196900 }, { "epoch": 1.4781023266981295, "grad_norm": 1.995450735092163, "learning_rate": 2.537509034931735e-05, "loss": 2.7358, "step": 197000 }, { "epoch": 1.4788526324477225, "grad_norm": 1.438651442527771, "learning_rate": 2.53625852534908e-05, "loss": 2.6715, "step": 197100 }, { "epoch": 1.4796029381973155, "grad_norm": 2.525571823120117, "learning_rate": 2.535008015766425e-05, "loss": 2.65, "step": 197200 }, { "epoch": 1.4803532439469085, "grad_norm": 2.2555627822875977, "learning_rate": 2.53375750618377e-05, "loss": 2.6959, "step": 197300 }, { "epoch": 1.4811035496965013, "grad_norm": 1.453940749168396, "learning_rate": 2.532506996601115e-05, "loss": 2.6445, "step": 197400 }, { "epoch": 1.4818538554460943, "grad_norm": 1.4563626050949097, "learning_rate": 2.53125648701846e-05, "loss": 2.6054, "step": 197500 }, { "epoch": 1.4826041611956873, "grad_norm": 1.7528483867645264, "learning_rate": 2.5300059774358052e-05, "loss": 2.7386, "step": 197600 }, { "epoch": 1.4833544669452803, "grad_norm": 2.723092794418335, "learning_rate": 2.5287554678531506e-05, "loss": 2.6736, "step": 197700 }, { "epoch": 1.4841047726948733, "grad_norm": 1.7552059888839722, "learning_rate": 2.527504958270495e-05, "loss": 2.6755, "step": 197800 }, { "epoch": 1.484855078444466, "grad_norm": 1.6301065683364868, "learning_rate": 2.5262544486878402e-05, "loss": 2.5695, "step": 197900 }, { "epoch": 1.485605384194059, "grad_norm": 1.4971996545791626, "learning_rate": 2.5250039391051856e-05, "loss": 2.5767, "step": 198000 }, { "epoch": 1.486355689943652, "grad_norm": 1.5230311155319214, "learning_rate": 2.5237534295225307e-05, "loss": 2.8554, "step": 198100 }, { "epoch": 1.4871059956932449, "grad_norm": 1.7575076818466187, "learning_rate": 2.5225029199398758e-05, "loss": 2.6361, "step": 198200 }, { "epoch": 1.487856301442838, "grad_norm": 1.621943712234497, "learning_rate": 2.5212524103572206e-05, "loss": 2.8841, "step": 198300 }, { "epoch": 1.4886066071924309, "grad_norm": 2.9874024391174316, "learning_rate": 2.5200019007745657e-05, "loss": 2.5291, "step": 198400 }, { "epoch": 1.4893569129420239, "grad_norm": 2.34475040435791, "learning_rate": 2.5187513911919108e-05, "loss": 2.7192, "step": 198500 }, { "epoch": 1.4901072186916169, "grad_norm": 2.489389419555664, "learning_rate": 2.517500881609256e-05, "loss": 2.5694, "step": 198600 }, { "epoch": 1.4908575244412097, "grad_norm": 1.5018150806427002, "learning_rate": 2.5162503720266013e-05, "loss": 2.6394, "step": 198700 }, { "epoch": 1.4916078301908027, "grad_norm": 2.3877789974212646, "learning_rate": 2.5149998624439457e-05, "loss": 2.7111, "step": 198800 }, { "epoch": 1.4923581359403957, "grad_norm": 3.0600132942199707, "learning_rate": 2.513749352861291e-05, "loss": 2.7566, "step": 198900 }, { "epoch": 1.4931084416899887, "grad_norm": 1.646432876586914, "learning_rate": 2.5124988432786363e-05, "loss": 2.8064, "step": 199000 }, { "epoch": 1.4938587474395817, "grad_norm": 2.063530921936035, "learning_rate": 2.5112483336959814e-05, "loss": 2.637, "step": 199100 }, { "epoch": 1.4946090531891745, "grad_norm": 1.6920430660247803, "learning_rate": 2.5099978241133265e-05, "loss": 2.6988, "step": 199200 }, { "epoch": 1.4953593589387675, "grad_norm": 1.4087765216827393, "learning_rate": 2.5087473145306713e-05, "loss": 2.6763, "step": 199300 }, { "epoch": 1.4961096646883605, "grad_norm": 2.051161050796509, "learning_rate": 2.5074968049480164e-05, "loss": 2.711, "step": 199400 }, { "epoch": 1.4968599704379535, "grad_norm": 2.586760997772217, "learning_rate": 2.5062462953653615e-05, "loss": 2.7577, "step": 199500 }, { "epoch": 1.4976102761875465, "grad_norm": 3.5310282707214355, "learning_rate": 2.5049957857827066e-05, "loss": 2.5457, "step": 199600 }, { "epoch": 1.4983605819371393, "grad_norm": 2.8686726093292236, "learning_rate": 2.503745276200052e-05, "loss": 2.6657, "step": 199700 }, { "epoch": 1.4991108876867323, "grad_norm": 1.6192210912704468, "learning_rate": 2.5024947666173964e-05, "loss": 2.5669, "step": 199800 }, { "epoch": 1.4998611934363253, "grad_norm": 1.6960114240646362, "learning_rate": 2.5012442570347415e-05, "loss": 2.6935, "step": 199900 }, { "epoch": 1.5006114991859183, "grad_norm": 1.81368887424469, "learning_rate": 2.499993747452087e-05, "loss": 2.5692, "step": 200000 }, { "epoch": 1.5013618049355113, "grad_norm": 2.1089258193969727, "learning_rate": 2.498743237869432e-05, "loss": 2.6062, "step": 200100 }, { "epoch": 1.502112110685104, "grad_norm": 1.7970737218856812, "learning_rate": 2.497492728286777e-05, "loss": 2.6931, "step": 200200 }, { "epoch": 1.502862416434697, "grad_norm": 2.4378740787506104, "learning_rate": 2.4962547237999486e-05, "loss": 2.7058, "step": 200300 }, { "epoch": 1.50361272218429, "grad_norm": 1.6912623643875122, "learning_rate": 2.4950042142172937e-05, "loss": 2.8058, "step": 200400 }, { "epoch": 1.504363027933883, "grad_norm": 3.236279010772705, "learning_rate": 2.4937537046346388e-05, "loss": 2.6846, "step": 200500 }, { "epoch": 1.505113333683476, "grad_norm": 1.6042827367782593, "learning_rate": 2.492503195051984e-05, "loss": 2.6887, "step": 200600 }, { "epoch": 1.505863639433069, "grad_norm": 1.388132929801941, "learning_rate": 2.491252685469329e-05, "loss": 2.5912, "step": 200700 }, { "epoch": 1.506613945182662, "grad_norm": 2.1201438903808594, "learning_rate": 2.490002175886674e-05, "loss": 2.7698, "step": 200800 }, { "epoch": 1.507364250932255, "grad_norm": 2.100926160812378, "learning_rate": 2.488751666304019e-05, "loss": 2.5988, "step": 200900 }, { "epoch": 1.508114556681848, "grad_norm": 1.4231605529785156, "learning_rate": 2.487501156721364e-05, "loss": 2.7404, "step": 201000 }, { "epoch": 1.508864862431441, "grad_norm": 1.8083412647247314, "learning_rate": 2.4862506471387094e-05, "loss": 2.5918, "step": 201100 }, { "epoch": 1.5096151681810337, "grad_norm": 1.9285329580307007, "learning_rate": 2.485000137556054e-05, "loss": 2.7443, "step": 201200 }, { "epoch": 1.5103654739306267, "grad_norm": 2.036794662475586, "learning_rate": 2.4837496279733993e-05, "loss": 2.5553, "step": 201300 }, { "epoch": 1.5111157796802197, "grad_norm": 3.0540273189544678, "learning_rate": 2.482511623486571e-05, "loss": 2.7236, "step": 201400 }, { "epoch": 1.5118660854298125, "grad_norm": 2.1343636512756348, "learning_rate": 2.481261113903916e-05, "loss": 2.9027, "step": 201500 }, { "epoch": 1.5126163911794057, "grad_norm": 2.210637092590332, "learning_rate": 2.480010604321261e-05, "loss": 2.6479, "step": 201600 }, { "epoch": 1.5133666969289985, "grad_norm": 1.287975788116455, "learning_rate": 2.478760094738606e-05, "loss": 2.5973, "step": 201700 }, { "epoch": 1.5141170026785915, "grad_norm": 1.9121443033218384, "learning_rate": 2.4775095851559514e-05, "loss": 2.5793, "step": 201800 }, { "epoch": 1.5148673084281845, "grad_norm": 1.5482051372528076, "learning_rate": 2.476259075573296e-05, "loss": 2.5291, "step": 201900 }, { "epoch": 1.5156176141777773, "grad_norm": 1.4475493431091309, "learning_rate": 2.4750085659906413e-05, "loss": 2.6815, "step": 202000 }, { "epoch": 1.5163679199273705, "grad_norm": 2.010852098464966, "learning_rate": 2.4737580564079864e-05, "loss": 2.7025, "step": 202100 }, { "epoch": 1.5171182256769633, "grad_norm": 1.8609579801559448, "learning_rate": 2.4725075468253315e-05, "loss": 2.6428, "step": 202200 }, { "epoch": 1.5178685314265563, "grad_norm": 2.150235176086426, "learning_rate": 2.4712570372426766e-05, "loss": 2.6444, "step": 202300 }, { "epoch": 1.5186188371761493, "grad_norm": 2.0972821712493896, "learning_rate": 2.4700065276600213e-05, "loss": 2.5769, "step": 202400 }, { "epoch": 1.519369142925742, "grad_norm": 1.7547329664230347, "learning_rate": 2.4687560180773668e-05, "loss": 2.7014, "step": 202500 }, { "epoch": 1.5201194486753353, "grad_norm": 1.5560473203659058, "learning_rate": 2.467505508494712e-05, "loss": 2.7016, "step": 202600 }, { "epoch": 1.520869754424928, "grad_norm": 2.1379055976867676, "learning_rate": 2.4662549989120566e-05, "loss": 2.5587, "step": 202700 }, { "epoch": 1.5216200601745211, "grad_norm": 3.509908437728882, "learning_rate": 2.465004489329402e-05, "loss": 2.6586, "step": 202800 }, { "epoch": 1.5223703659241141, "grad_norm": 2.155240058898926, "learning_rate": 2.463753979746747e-05, "loss": 2.6631, "step": 202900 }, { "epoch": 1.523120671673707, "grad_norm": 2.161431074142456, "learning_rate": 2.462503470164092e-05, "loss": 2.6696, "step": 203000 }, { "epoch": 1.5238709774233001, "grad_norm": 2.180020332336426, "learning_rate": 2.461252960581437e-05, "loss": 2.7836, "step": 203100 }, { "epoch": 1.524621283172893, "grad_norm": 2.2725253105163574, "learning_rate": 2.460002450998782e-05, "loss": 2.8005, "step": 203200 }, { "epoch": 1.525371588922486, "grad_norm": 2.145050525665283, "learning_rate": 2.4587519414161273e-05, "loss": 2.6099, "step": 203300 }, { "epoch": 1.526121894672079, "grad_norm": 2.1631505489349365, "learning_rate": 2.457501431833472e-05, "loss": 2.5644, "step": 203400 }, { "epoch": 1.5268722004216717, "grad_norm": 2.404207944869995, "learning_rate": 2.4562509222508175e-05, "loss": 2.6781, "step": 203500 }, { "epoch": 1.527622506171265, "grad_norm": 2.0282812118530273, "learning_rate": 2.4550004126681626e-05, "loss": 2.6712, "step": 203600 }, { "epoch": 1.5283728119208577, "grad_norm": 1.9803839921951294, "learning_rate": 2.4537499030855073e-05, "loss": 2.7518, "step": 203700 }, { "epoch": 1.5291231176704507, "grad_norm": 2.2950377464294434, "learning_rate": 2.4524993935028524e-05, "loss": 2.7144, "step": 203800 }, { "epoch": 1.5298734234200437, "grad_norm": 2.332221746444702, "learning_rate": 2.4512488839201975e-05, "loss": 2.6444, "step": 203900 }, { "epoch": 1.5306237291696365, "grad_norm": 2.472106695175171, "learning_rate": 2.4499983743375426e-05, "loss": 2.5933, "step": 204000 }, { "epoch": 1.5313740349192297, "grad_norm": 1.7040878534317017, "learning_rate": 2.4487478647548877e-05, "loss": 2.6514, "step": 204100 }, { "epoch": 1.5321243406688225, "grad_norm": 1.5852810144424438, "learning_rate": 2.4474973551722328e-05, "loss": 2.641, "step": 204200 }, { "epoch": 1.5328746464184155, "grad_norm": 1.931869626045227, "learning_rate": 2.446246845589578e-05, "loss": 2.6974, "step": 204300 }, { "epoch": 1.5336249521680085, "grad_norm": 1.9322582483291626, "learning_rate": 2.444996336006923e-05, "loss": 2.6402, "step": 204400 }, { "epoch": 1.5343752579176013, "grad_norm": 1.6241053342819214, "learning_rate": 2.4437583315200944e-05, "loss": 2.5481, "step": 204500 }, { "epoch": 1.5351255636671945, "grad_norm": 1.5189838409423828, "learning_rate": 2.4425078219374395e-05, "loss": 2.4382, "step": 204600 }, { "epoch": 1.5358758694167873, "grad_norm": 1.634372353553772, "learning_rate": 2.4412573123547846e-05, "loss": 2.8249, "step": 204700 }, { "epoch": 1.5366261751663803, "grad_norm": 2.016907215118408, "learning_rate": 2.4400068027721297e-05, "loss": 2.8031, "step": 204800 }, { "epoch": 1.5373764809159733, "grad_norm": 1.4836604595184326, "learning_rate": 2.438756293189475e-05, "loss": 2.482, "step": 204900 }, { "epoch": 1.5381267866655661, "grad_norm": 2.349299192428589, "learning_rate": 2.43750578360682e-05, "loss": 2.5149, "step": 205000 }, { "epoch": 1.5388770924151591, "grad_norm": 1.325929880142212, "learning_rate": 2.436255274024165e-05, "loss": 2.6667, "step": 205100 }, { "epoch": 1.5396273981647521, "grad_norm": 2.0187489986419678, "learning_rate": 2.43500476444151e-05, "loss": 2.7668, "step": 205200 }, { "epoch": 1.5403777039143451, "grad_norm": 2.074080228805542, "learning_rate": 2.4337542548588552e-05, "loss": 2.6161, "step": 205300 }, { "epoch": 1.5411280096639381, "grad_norm": 3.2694406509399414, "learning_rate": 2.4325037452762e-05, "loss": 2.6589, "step": 205400 }, { "epoch": 1.541878315413531, "grad_norm": 1.896914005279541, "learning_rate": 2.431253235693545e-05, "loss": 2.7554, "step": 205500 }, { "epoch": 1.542628621163124, "grad_norm": 1.5034841299057007, "learning_rate": 2.4300027261108906e-05, "loss": 2.7931, "step": 205600 }, { "epoch": 1.543378926912717, "grad_norm": 1.6821403503417969, "learning_rate": 2.4287522165282353e-05, "loss": 2.653, "step": 205700 }, { "epoch": 1.5441292326623097, "grad_norm": 1.7687114477157593, "learning_rate": 2.4275017069455804e-05, "loss": 2.6999, "step": 205800 }, { "epoch": 1.544879538411903, "grad_norm": 1.9460889101028442, "learning_rate": 2.4262511973629255e-05, "loss": 2.5456, "step": 205900 }, { "epoch": 1.5456298441614957, "grad_norm": 1.8550443649291992, "learning_rate": 2.4250006877802706e-05, "loss": 2.522, "step": 206000 }, { "epoch": 1.5463801499110887, "grad_norm": 1.4341129064559937, "learning_rate": 2.4237501781976157e-05, "loss": 2.6996, "step": 206100 }, { "epoch": 1.5471304556606817, "grad_norm": 2.889885902404785, "learning_rate": 2.4224996686149605e-05, "loss": 2.6969, "step": 206200 }, { "epoch": 1.5478807614102745, "grad_norm": 1.4495594501495361, "learning_rate": 2.421249159032306e-05, "loss": 2.4954, "step": 206300 }, { "epoch": 1.5486310671598678, "grad_norm": 2.2730181217193604, "learning_rate": 2.4199986494496507e-05, "loss": 2.5867, "step": 206400 }, { "epoch": 1.5493813729094605, "grad_norm": 1.7124449014663696, "learning_rate": 2.4187481398669958e-05, "loss": 2.8189, "step": 206500 }, { "epoch": 1.5501316786590535, "grad_norm": 1.7644494771957397, "learning_rate": 2.4174976302843412e-05, "loss": 2.6403, "step": 206600 }, { "epoch": 1.5508819844086466, "grad_norm": 2.0883936882019043, "learning_rate": 2.416247120701686e-05, "loss": 2.6653, "step": 206700 }, { "epoch": 1.5516322901582393, "grad_norm": 1.8128715753555298, "learning_rate": 2.414996611119031e-05, "loss": 2.6139, "step": 206800 }, { "epoch": 1.5523825959078326, "grad_norm": 1.7460479736328125, "learning_rate": 2.4137461015363762e-05, "loss": 2.6781, "step": 206900 }, { "epoch": 1.5531329016574253, "grad_norm": 1.8737783432006836, "learning_rate": 2.4124955919537213e-05, "loss": 2.6852, "step": 207000 }, { "epoch": 1.5538832074070184, "grad_norm": 1.9360527992248535, "learning_rate": 2.4112450823710664e-05, "loss": 2.7422, "step": 207100 }, { "epoch": 1.5546335131566114, "grad_norm": 2.1941254138946533, "learning_rate": 2.409994572788411e-05, "loss": 2.6793, "step": 207200 }, { "epoch": 1.5553838189062041, "grad_norm": 1.4945056438446045, "learning_rate": 2.4087440632057566e-05, "loss": 2.5977, "step": 207300 }, { "epoch": 1.5561341246557974, "grad_norm": 1.74807608127594, "learning_rate": 2.4074935536231017e-05, "loss": 2.5972, "step": 207400 }, { "epoch": 1.5568844304053902, "grad_norm": 1.8540807962417603, "learning_rate": 2.4062430440404465e-05, "loss": 2.693, "step": 207500 }, { "epoch": 1.5576347361549832, "grad_norm": 1.523244023323059, "learning_rate": 2.4049925344577916e-05, "loss": 2.689, "step": 207600 }, { "epoch": 1.5583850419045762, "grad_norm": 2.089001178741455, "learning_rate": 2.4037420248751367e-05, "loss": 2.6395, "step": 207700 }, { "epoch": 1.559135347654169, "grad_norm": 2.2964847087860107, "learning_rate": 2.4024915152924818e-05, "loss": 2.418, "step": 207800 }, { "epoch": 1.5598856534037622, "grad_norm": 2.1706175804138184, "learning_rate": 2.401241005709827e-05, "loss": 2.6841, "step": 207900 }, { "epoch": 1.560635959153355, "grad_norm": 2.294679880142212, "learning_rate": 2.399990496127172e-05, "loss": 2.6833, "step": 208000 }, { "epoch": 1.561386264902948, "grad_norm": 1.4239764213562012, "learning_rate": 2.398739986544517e-05, "loss": 2.6509, "step": 208100 }, { "epoch": 1.562136570652541, "grad_norm": 1.590937614440918, "learning_rate": 2.397489476961862e-05, "loss": 2.7791, "step": 208200 }, { "epoch": 1.5628868764021338, "grad_norm": 2.7314019203186035, "learning_rate": 2.396238967379207e-05, "loss": 2.7021, "step": 208300 }, { "epoch": 1.563637182151727, "grad_norm": 6.89345645904541, "learning_rate": 2.3949884577965524e-05, "loss": 2.8637, "step": 208400 }, { "epoch": 1.5643874879013198, "grad_norm": 1.6298549175262451, "learning_rate": 2.393737948213897e-05, "loss": 2.7037, "step": 208500 }, { "epoch": 1.5651377936509128, "grad_norm": 2.202756404876709, "learning_rate": 2.3924874386312422e-05, "loss": 2.7167, "step": 208600 }, { "epoch": 1.5658880994005058, "grad_norm": 2.0448014736175537, "learning_rate": 2.391249434144414e-05, "loss": 2.6456, "step": 208700 }, { "epoch": 1.5666384051500986, "grad_norm": 1.9135278463363647, "learning_rate": 2.389998924561759e-05, "loss": 2.5602, "step": 208800 }, { "epoch": 1.5673887108996918, "grad_norm": 2.2612831592559814, "learning_rate": 2.3887484149791042e-05, "loss": 2.6379, "step": 208900 }, { "epoch": 1.5681390166492846, "grad_norm": 1.869597315788269, "learning_rate": 2.3874979053964493e-05, "loss": 2.7398, "step": 209000 }, { "epoch": 1.5688893223988776, "grad_norm": 1.4013866186141968, "learning_rate": 2.3862473958137944e-05, "loss": 2.6511, "step": 209100 }, { "epoch": 1.5696396281484706, "grad_norm": 1.6572636365890503, "learning_rate": 2.384996886231139e-05, "loss": 2.7466, "step": 209200 }, { "epoch": 1.5703899338980634, "grad_norm": 2.6015913486480713, "learning_rate": 2.3837463766484843e-05, "loss": 2.6177, "step": 209300 }, { "epoch": 1.5711402396476564, "grad_norm": 1.7559616565704346, "learning_rate": 2.3824958670658297e-05, "loss": 2.6395, "step": 209400 }, { "epoch": 1.5718905453972494, "grad_norm": 2.1540348529815674, "learning_rate": 2.3812453574831745e-05, "loss": 2.7712, "step": 209500 }, { "epoch": 1.5726408511468424, "grad_norm": 1.4391160011291504, "learning_rate": 2.3799948479005196e-05, "loss": 2.8449, "step": 209600 }, { "epoch": 1.5733911568964354, "grad_norm": 1.5159740447998047, "learning_rate": 2.3787443383178647e-05, "loss": 2.7511, "step": 209700 }, { "epoch": 1.5741414626460282, "grad_norm": 1.6683372259140015, "learning_rate": 2.3774938287352098e-05, "loss": 2.6819, "step": 209800 }, { "epoch": 1.5748917683956212, "grad_norm": 1.9487346410751343, "learning_rate": 2.376243319152555e-05, "loss": 2.6418, "step": 209900 }, { "epoch": 1.5756420741452142, "grad_norm": 2.121859550476074, "learning_rate": 2.3749928095698996e-05, "loss": 2.6972, "step": 210000 }, { "epoch": 1.5763923798948072, "grad_norm": 2.3342883586883545, "learning_rate": 2.373742299987245e-05, "loss": 2.5339, "step": 210100 }, { "epoch": 1.5771426856444002, "grad_norm": 1.668552041053772, "learning_rate": 2.37249179040459e-05, "loss": 2.5567, "step": 210200 }, { "epoch": 1.577892991393993, "grad_norm": 2.04553484916687, "learning_rate": 2.371241280821935e-05, "loss": 2.8484, "step": 210300 }, { "epoch": 1.578643297143586, "grad_norm": 1.8889521360397339, "learning_rate": 2.36999077123928e-05, "loss": 2.5426, "step": 210400 }, { "epoch": 1.579393602893179, "grad_norm": 2.0191898345947266, "learning_rate": 2.368740261656625e-05, "loss": 2.7811, "step": 210500 }, { "epoch": 1.5801439086427718, "grad_norm": 1.3545690774917603, "learning_rate": 2.3674897520739702e-05, "loss": 2.7096, "step": 210600 }, { "epoch": 1.580894214392365, "grad_norm": 3.534245014190674, "learning_rate": 2.3662517475871416e-05, "loss": 2.6967, "step": 210700 }, { "epoch": 1.5816445201419578, "grad_norm": 2.194981098175049, "learning_rate": 2.365001238004487e-05, "loss": 2.7012, "step": 210800 }, { "epoch": 1.5823948258915508, "grad_norm": 2.736421585083008, "learning_rate": 2.363750728421832e-05, "loss": 2.7633, "step": 210900 }, { "epoch": 1.5831451316411438, "grad_norm": 2.223950147628784, "learning_rate": 2.362500218839177e-05, "loss": 2.5336, "step": 211000 }, { "epoch": 1.5838954373907366, "grad_norm": 1.7940837144851685, "learning_rate": 2.3612497092565224e-05, "loss": 2.6213, "step": 211100 }, { "epoch": 1.5846457431403298, "grad_norm": 1.8259481191635132, "learning_rate": 2.359999199673867e-05, "loss": 2.5849, "step": 211200 }, { "epoch": 1.5853960488899226, "grad_norm": 1.7804980278015137, "learning_rate": 2.3587486900912123e-05, "loss": 2.6584, "step": 211300 }, { "epoch": 1.5861463546395156, "grad_norm": 2.755124568939209, "learning_rate": 2.3574981805085574e-05, "loss": 2.6765, "step": 211400 }, { "epoch": 1.5868966603891086, "grad_norm": 1.8881587982177734, "learning_rate": 2.3562476709259025e-05, "loss": 2.695, "step": 211500 }, { "epoch": 1.5876469661387014, "grad_norm": 2.220754861831665, "learning_rate": 2.3549971613432476e-05, "loss": 2.5886, "step": 211600 }, { "epoch": 1.5883972718882946, "grad_norm": 2.372572660446167, "learning_rate": 2.3537466517605923e-05, "loss": 2.6225, "step": 211700 }, { "epoch": 1.5891475776378874, "grad_norm": 2.303907871246338, "learning_rate": 2.3524961421779378e-05, "loss": 2.5309, "step": 211800 }, { "epoch": 1.5898978833874804, "grad_norm": 1.602046251296997, "learning_rate": 2.351245632595283e-05, "loss": 2.5845, "step": 211900 }, { "epoch": 1.5906481891370734, "grad_norm": 1.4311492443084717, "learning_rate": 2.3499951230126276e-05, "loss": 2.7077, "step": 212000 }, { "epoch": 1.5913984948866662, "grad_norm": 2.0976390838623047, "learning_rate": 2.3487446134299727e-05, "loss": 2.6936, "step": 212100 }, { "epoch": 1.5921488006362594, "grad_norm": 2.124450206756592, "learning_rate": 2.347494103847318e-05, "loss": 2.6146, "step": 212200 }, { "epoch": 1.5928991063858522, "grad_norm": 2.457709789276123, "learning_rate": 2.346243594264663e-05, "loss": 2.7673, "step": 212300 }, { "epoch": 1.5936494121354452, "grad_norm": 1.560652732849121, "learning_rate": 2.344993084682008e-05, "loss": 2.6716, "step": 212400 }, { "epoch": 1.5943997178850382, "grad_norm": 1.622583031654358, "learning_rate": 2.343742575099353e-05, "loss": 2.5689, "step": 212500 }, { "epoch": 1.595150023634631, "grad_norm": 2.3172354698181152, "learning_rate": 2.3424920655166982e-05, "loss": 2.6595, "step": 212600 }, { "epoch": 1.5959003293842242, "grad_norm": 1.8477153778076172, "learning_rate": 2.341241555934043e-05, "loss": 2.8263, "step": 212700 }, { "epoch": 1.596650635133817, "grad_norm": 2.0466196537017822, "learning_rate": 2.339991046351388e-05, "loss": 2.7022, "step": 212800 }, { "epoch": 1.59740094088341, "grad_norm": 1.6608169078826904, "learning_rate": 2.33875304186456e-05, "loss": 2.7904, "step": 212900 }, { "epoch": 1.598151246633003, "grad_norm": 1.597933292388916, "learning_rate": 2.337502532281905e-05, "loss": 2.6293, "step": 213000 }, { "epoch": 1.5989015523825958, "grad_norm": 1.8194364309310913, "learning_rate": 2.33625202269925e-05, "loss": 2.6762, "step": 213100 }, { "epoch": 1.599651858132189, "grad_norm": 1.8341060876846313, "learning_rate": 2.335001513116595e-05, "loss": 2.7249, "step": 213200 }, { "epoch": 1.6004021638817818, "grad_norm": 1.4455487728118896, "learning_rate": 2.3337510035339403e-05, "loss": 2.6612, "step": 213300 }, { "epoch": 1.6011524696313748, "grad_norm": 2.053220510482788, "learning_rate": 2.3325129990471117e-05, "loss": 2.622, "step": 213400 }, { "epoch": 1.6019027753809678, "grad_norm": 1.6470357179641724, "learning_rate": 2.3312624894644568e-05, "loss": 2.6866, "step": 213500 }, { "epoch": 1.6026530811305606, "grad_norm": 1.7653957605361938, "learning_rate": 2.330011979881802e-05, "loss": 2.6105, "step": 213600 }, { "epoch": 1.6034033868801538, "grad_norm": 1.8257803916931152, "learning_rate": 2.328761470299147e-05, "loss": 2.5839, "step": 213700 }, { "epoch": 1.6041536926297466, "grad_norm": 1.349300503730774, "learning_rate": 2.327510960716492e-05, "loss": 2.6873, "step": 213800 }, { "epoch": 1.6049039983793396, "grad_norm": 2.191150188446045, "learning_rate": 2.326260451133837e-05, "loss": 2.6145, "step": 213900 }, { "epoch": 1.6056543041289326, "grad_norm": 1.4239863157272339, "learning_rate": 2.3250099415511823e-05, "loss": 2.6813, "step": 214000 }, { "epoch": 1.6064046098785254, "grad_norm": 2.0201666355133057, "learning_rate": 2.3237594319685274e-05, "loss": 2.7, "step": 214100 }, { "epoch": 1.6071549156281184, "grad_norm": 1.7763317823410034, "learning_rate": 2.3225089223858725e-05, "loss": 2.8168, "step": 214200 }, { "epoch": 1.6079052213777114, "grad_norm": 3.5185296535491943, "learning_rate": 2.3212584128032176e-05, "loss": 2.5623, "step": 214300 }, { "epoch": 1.6086555271273044, "grad_norm": 5.291778087615967, "learning_rate": 2.3200079032205623e-05, "loss": 2.6869, "step": 214400 }, { "epoch": 1.6094058328768974, "grad_norm": 1.6559803485870361, "learning_rate": 2.3187573936379074e-05, "loss": 2.5925, "step": 214500 }, { "epoch": 1.6101561386264902, "grad_norm": 1.5434479713439941, "learning_rate": 2.317506884055253e-05, "loss": 2.7446, "step": 214600 }, { "epoch": 1.6109064443760832, "grad_norm": 1.5879334211349487, "learning_rate": 2.3162563744725976e-05, "loss": 2.865, "step": 214700 }, { "epoch": 1.6116567501256762, "grad_norm": 2.0916430950164795, "learning_rate": 2.3150058648899427e-05, "loss": 2.4523, "step": 214800 }, { "epoch": 1.6124070558752692, "grad_norm": 1.5642669200897217, "learning_rate": 2.313755355307288e-05, "loss": 2.7002, "step": 214900 }, { "epoch": 1.6131573616248622, "grad_norm": 1.3920570611953735, "learning_rate": 2.312504845724633e-05, "loss": 2.6553, "step": 215000 }, { "epoch": 1.613907667374455, "grad_norm": 2.2546136379241943, "learning_rate": 2.311254336141978e-05, "loss": 2.5907, "step": 215100 }, { "epoch": 1.614657973124048, "grad_norm": 1.736397385597229, "learning_rate": 2.3100038265593228e-05, "loss": 2.6002, "step": 215200 }, { "epoch": 1.615408278873641, "grad_norm": 2.0272679328918457, "learning_rate": 2.3087533169766683e-05, "loss": 2.7074, "step": 215300 }, { "epoch": 1.6161585846232338, "grad_norm": 1.900700330734253, "learning_rate": 2.307502807394013e-05, "loss": 2.6552, "step": 215400 }, { "epoch": 1.616908890372827, "grad_norm": 1.8870704174041748, "learning_rate": 2.306252297811358e-05, "loss": 2.7129, "step": 215500 }, { "epoch": 1.6176591961224198, "grad_norm": 2.2378344535827637, "learning_rate": 2.3050017882287032e-05, "loss": 2.6663, "step": 215600 }, { "epoch": 1.6184095018720128, "grad_norm": 1.8033249378204346, "learning_rate": 2.3037512786460483e-05, "loss": 2.6056, "step": 215700 }, { "epoch": 1.6191598076216058, "grad_norm": 2.8534274101257324, "learning_rate": 2.3025007690633934e-05, "loss": 2.5439, "step": 215800 }, { "epoch": 1.6199101133711986, "grad_norm": 1.9303184747695923, "learning_rate": 2.3012502594807385e-05, "loss": 2.6991, "step": 215900 }, { "epoch": 1.6206604191207918, "grad_norm": 2.143984079360962, "learning_rate": 2.2999997498980836e-05, "loss": 2.6821, "step": 216000 }, { "epoch": 1.6214107248703846, "grad_norm": 1.30686616897583, "learning_rate": 2.2987492403154287e-05, "loss": 2.7227, "step": 216100 }, { "epoch": 1.6221610306199776, "grad_norm": 1.8607473373413086, "learning_rate": 2.2974987307327735e-05, "loss": 2.7598, "step": 216200 }, { "epoch": 1.6229113363695706, "grad_norm": 1.5802544355392456, "learning_rate": 2.296248221150119e-05, "loss": 2.6847, "step": 216300 }, { "epoch": 1.6236616421191634, "grad_norm": 1.7330067157745361, "learning_rate": 2.294997711567464e-05, "loss": 2.7945, "step": 216400 }, { "epoch": 1.6244119478687566, "grad_norm": 2.3764166831970215, "learning_rate": 2.2937472019848088e-05, "loss": 2.7287, "step": 216500 }, { "epoch": 1.6251622536183494, "grad_norm": 2.0163722038269043, "learning_rate": 2.292496692402154e-05, "loss": 2.696, "step": 216600 }, { "epoch": 1.6259125593679424, "grad_norm": 1.5997662544250488, "learning_rate": 2.291246182819499e-05, "loss": 2.6259, "step": 216700 }, { "epoch": 1.6266628651175354, "grad_norm": 2.424314260482788, "learning_rate": 2.289995673236844e-05, "loss": 2.6652, "step": 216800 }, { "epoch": 1.6274131708671282, "grad_norm": 1.9379658699035645, "learning_rate": 2.2887451636541892e-05, "loss": 2.6117, "step": 216900 }, { "epoch": 1.6281634766167215, "grad_norm": 2.0159506797790527, "learning_rate": 2.2874946540715343e-05, "loss": 2.8187, "step": 217000 }, { "epoch": 1.6289137823663142, "grad_norm": 1.616491436958313, "learning_rate": 2.2862441444888794e-05, "loss": 2.6354, "step": 217100 }, { "epoch": 1.6296640881159072, "grad_norm": 1.5793458223342896, "learning_rate": 2.2849936349062245e-05, "loss": 2.6718, "step": 217200 }, { "epoch": 1.6304143938655002, "grad_norm": 1.9046307802200317, "learning_rate": 2.2837431253235693e-05, "loss": 2.679, "step": 217300 }, { "epoch": 1.631164699615093, "grad_norm": 1.6559135913848877, "learning_rate": 2.2824926157409147e-05, "loss": 2.7831, "step": 217400 }, { "epoch": 1.6319150053646863, "grad_norm": 3.4604032039642334, "learning_rate": 2.281254611254086e-05, "loss": 2.6784, "step": 217500 }, { "epoch": 1.632665311114279, "grad_norm": 1.9900543689727783, "learning_rate": 2.2800041016714312e-05, "loss": 2.601, "step": 217600 }, { "epoch": 1.633415616863872, "grad_norm": 1.3109620809555054, "learning_rate": 2.2787535920887763e-05, "loss": 2.6667, "step": 217700 }, { "epoch": 1.634165922613465, "grad_norm": 1.899025797843933, "learning_rate": 2.277515587601948e-05, "loss": 2.6962, "step": 217800 }, { "epoch": 1.6349162283630578, "grad_norm": 2.1243443489074707, "learning_rate": 2.2762650780192928e-05, "loss": 2.8646, "step": 217900 }, { "epoch": 1.635666534112651, "grad_norm": 2.6961724758148193, "learning_rate": 2.275014568436638e-05, "loss": 2.72, "step": 218000 }, { "epoch": 1.6364168398622438, "grad_norm": 1.3051576614379883, "learning_rate": 2.273764058853983e-05, "loss": 2.7848, "step": 218100 }, { "epoch": 1.6371671456118368, "grad_norm": 1.7395566701889038, "learning_rate": 2.272513549271328e-05, "loss": 2.5541, "step": 218200 }, { "epoch": 1.6379174513614299, "grad_norm": 1.7536131143569946, "learning_rate": 2.2712630396886732e-05, "loss": 2.6417, "step": 218300 }, { "epoch": 1.6386677571110226, "grad_norm": 1.4140440225601196, "learning_rate": 2.2700125301060183e-05, "loss": 2.7121, "step": 218400 }, { "epoch": 1.6394180628606159, "grad_norm": 1.4018161296844482, "learning_rate": 2.2687620205233634e-05, "loss": 2.5566, "step": 218500 }, { "epoch": 1.6401683686102086, "grad_norm": 1.7409790754318237, "learning_rate": 2.2675115109407085e-05, "loss": 2.681, "step": 218600 }, { "epoch": 1.6409186743598017, "grad_norm": 2.0826380252838135, "learning_rate": 2.2662610013580536e-05, "loss": 2.6333, "step": 218700 }, { "epoch": 1.6416689801093947, "grad_norm": 1.5947473049163818, "learning_rate": 2.2650104917753987e-05, "loss": 2.799, "step": 218800 }, { "epoch": 1.6424192858589874, "grad_norm": 2.5116727352142334, "learning_rate": 2.2637599821927435e-05, "loss": 2.6926, "step": 218900 }, { "epoch": 1.6431695916085804, "grad_norm": 2.3083572387695312, "learning_rate": 2.2625094726100886e-05, "loss": 2.6566, "step": 219000 }, { "epoch": 1.6439198973581735, "grad_norm": 1.164133906364441, "learning_rate": 2.261258963027434e-05, "loss": 2.614, "step": 219100 }, { "epoch": 1.6446702031077665, "grad_norm": 2.942304849624634, "learning_rate": 2.2600084534447788e-05, "loss": 2.6805, "step": 219200 }, { "epoch": 1.6454205088573595, "grad_norm": 1.9932761192321777, "learning_rate": 2.258757943862124e-05, "loss": 2.7188, "step": 219300 }, { "epoch": 1.6461708146069522, "grad_norm": 2.2600295543670654, "learning_rate": 2.257507434279469e-05, "loss": 2.8422, "step": 219400 }, { "epoch": 1.6469211203565453, "grad_norm": 2.175851821899414, "learning_rate": 2.256256924696814e-05, "loss": 2.4742, "step": 219500 }, { "epoch": 1.6476714261061383, "grad_norm": 2.1297738552093506, "learning_rate": 2.2550064151141592e-05, "loss": 2.6876, "step": 219600 }, { "epoch": 1.6484217318557313, "grad_norm": 1.4707212448120117, "learning_rate": 2.253755905531504e-05, "loss": 2.7152, "step": 219700 }, { "epoch": 1.6491720376053243, "grad_norm": 2.2523763179779053, "learning_rate": 2.2525053959488494e-05, "loss": 2.7461, "step": 219800 }, { "epoch": 1.649922343354917, "grad_norm": 1.9453898668289185, "learning_rate": 2.2512548863661942e-05, "loss": 2.5417, "step": 219900 }, { "epoch": 1.65067264910451, "grad_norm": 2.9125006198883057, "learning_rate": 2.2500043767835393e-05, "loss": 2.6484, "step": 220000 }, { "epoch": 1.651422954854103, "grad_norm": 2.2684895992279053, "learning_rate": 2.2487538672008844e-05, "loss": 2.8075, "step": 220100 }, { "epoch": 1.6521732606036958, "grad_norm": 3.027303695678711, "learning_rate": 2.2475033576182295e-05, "loss": 2.6683, "step": 220200 }, { "epoch": 1.652923566353289, "grad_norm": 2.8337433338165283, "learning_rate": 2.2462528480355746e-05, "loss": 2.5183, "step": 220300 }, { "epoch": 1.6536738721028819, "grad_norm": 1.5271075963974, "learning_rate": 2.2450023384529197e-05, "loss": 2.5551, "step": 220400 }, { "epoch": 1.6544241778524749, "grad_norm": 1.5589492321014404, "learning_rate": 2.2437518288702648e-05, "loss": 2.9156, "step": 220500 }, { "epoch": 1.6551744836020679, "grad_norm": 1.7837162017822266, "learning_rate": 2.24250131928761e-05, "loss": 2.6906, "step": 220600 }, { "epoch": 1.6559247893516607, "grad_norm": 1.4313409328460693, "learning_rate": 2.2412508097049547e-05, "loss": 2.6849, "step": 220700 }, { "epoch": 1.6566750951012539, "grad_norm": 2.002150297164917, "learning_rate": 2.2400003001223e-05, "loss": 2.6866, "step": 220800 }, { "epoch": 1.6574254008508467, "grad_norm": 1.8767738342285156, "learning_rate": 2.2387497905396452e-05, "loss": 2.822, "step": 220900 }, { "epoch": 1.6581757066004397, "grad_norm": 2.5035178661346436, "learning_rate": 2.23749928095699e-05, "loss": 2.7354, "step": 221000 }, { "epoch": 1.6589260123500327, "grad_norm": 1.5552382469177246, "learning_rate": 2.236248771374335e-05, "loss": 2.6986, "step": 221100 }, { "epoch": 1.6596763180996255, "grad_norm": 2.1604185104370117, "learning_rate": 2.23499826179168e-05, "loss": 2.639, "step": 221200 }, { "epoch": 1.6604266238492187, "grad_norm": 2.173304796218872, "learning_rate": 2.2337477522090253e-05, "loss": 2.6682, "step": 221300 }, { "epoch": 1.6611769295988115, "grad_norm": 2.228212356567383, "learning_rate": 2.2324972426263704e-05, "loss": 2.5692, "step": 221400 }, { "epoch": 1.6619272353484045, "grad_norm": 2.0109689235687256, "learning_rate": 2.2312467330437155e-05, "loss": 2.6308, "step": 221500 }, { "epoch": 1.6626775410979975, "grad_norm": 1.7450299263000488, "learning_rate": 2.2299962234610606e-05, "loss": 2.7932, "step": 221600 }, { "epoch": 1.6634278468475903, "grad_norm": 1.769237756729126, "learning_rate": 2.2287457138784057e-05, "loss": 2.7926, "step": 221700 }, { "epoch": 1.6641781525971835, "grad_norm": 1.392811894416809, "learning_rate": 2.2274952042957504e-05, "loss": 2.6987, "step": 221800 }, { "epoch": 1.6649284583467763, "grad_norm": 2.527249813079834, "learning_rate": 2.2262571998089222e-05, "loss": 2.6568, "step": 221900 }, { "epoch": 1.6656787640963693, "grad_norm": 1.9440959692001343, "learning_rate": 2.2250066902262673e-05, "loss": 2.7129, "step": 222000 }, { "epoch": 1.6664290698459623, "grad_norm": 2.2354142665863037, "learning_rate": 2.2237561806436124e-05, "loss": 2.6893, "step": 222100 }, { "epoch": 1.667179375595555, "grad_norm": 1.7800205945968628, "learning_rate": 2.2225056710609575e-05, "loss": 2.72, "step": 222200 }, { "epoch": 1.6679296813451483, "grad_norm": 1.9350738525390625, "learning_rate": 2.2212551614783026e-05, "loss": 2.6342, "step": 222300 }, { "epoch": 1.668679987094741, "grad_norm": 2.9655203819274902, "learning_rate": 2.2200046518956477e-05, "loss": 2.7425, "step": 222400 }, { "epoch": 1.669430292844334, "grad_norm": 1.7410802841186523, "learning_rate": 2.2187541423129924e-05, "loss": 2.7619, "step": 222500 }, { "epoch": 1.670180598593927, "grad_norm": 2.5167412757873535, "learning_rate": 2.217503632730338e-05, "loss": 2.6963, "step": 222600 }, { "epoch": 1.6709309043435199, "grad_norm": 2.2056829929351807, "learning_rate": 2.2162531231476826e-05, "loss": 2.7267, "step": 222700 }, { "epoch": 1.671681210093113, "grad_norm": 1.451205849647522, "learning_rate": 2.2150026135650278e-05, "loss": 2.5918, "step": 222800 }, { "epoch": 1.6724315158427059, "grad_norm": 1.5442097187042236, "learning_rate": 2.2137521039823732e-05, "loss": 2.7323, "step": 222900 }, { "epoch": 1.673181821592299, "grad_norm": 2.9414186477661133, "learning_rate": 2.212501594399718e-05, "loss": 2.7259, "step": 223000 }, { "epoch": 1.673932127341892, "grad_norm": 2.947091579437256, "learning_rate": 2.211251084817063e-05, "loss": 2.7027, "step": 223100 }, { "epoch": 1.6746824330914847, "grad_norm": 1.6822746992111206, "learning_rate": 2.210000575234408e-05, "loss": 2.7723, "step": 223200 }, { "epoch": 1.6754327388410777, "grad_norm": 1.76139235496521, "learning_rate": 2.2087500656517533e-05, "loss": 2.6927, "step": 223300 }, { "epoch": 1.6761830445906707, "grad_norm": 2.2572617530822754, "learning_rate": 2.2074995560690984e-05, "loss": 2.7434, "step": 223400 }, { "epoch": 1.6769333503402637, "grad_norm": 2.7565524578094482, "learning_rate": 2.206249046486443e-05, "loss": 2.7363, "step": 223500 }, { "epoch": 1.6776836560898567, "grad_norm": 1.8020607233047485, "learning_rate": 2.2049985369037886e-05, "loss": 2.7861, "step": 223600 }, { "epoch": 1.6784339618394495, "grad_norm": 1.5213381052017212, "learning_rate": 2.2037480273211333e-05, "loss": 2.6381, "step": 223700 }, { "epoch": 1.6791842675890425, "grad_norm": 1.8700028657913208, "learning_rate": 2.2024975177384784e-05, "loss": 2.6845, "step": 223800 }, { "epoch": 1.6799345733386355, "grad_norm": 2.625070571899414, "learning_rate": 2.2012595132516502e-05, "loss": 2.5852, "step": 223900 }, { "epoch": 1.6806848790882285, "grad_norm": 1.6308817863464355, "learning_rate": 2.2000090036689953e-05, "loss": 2.7936, "step": 224000 }, { "epoch": 1.6814351848378215, "grad_norm": 1.642497181892395, "learning_rate": 2.1987709991821667e-05, "loss": 2.6633, "step": 224100 }, { "epoch": 1.6821854905874143, "grad_norm": 2.1032848358154297, "learning_rate": 2.1975204895995118e-05, "loss": 2.5569, "step": 224200 }, { "epoch": 1.6829357963370073, "grad_norm": 3.364049196243286, "learning_rate": 2.1962699800168572e-05, "loss": 2.6276, "step": 224300 }, { "epoch": 1.6836861020866003, "grad_norm": 3.633951187133789, "learning_rate": 2.195019470434202e-05, "loss": 2.6793, "step": 224400 }, { "epoch": 1.684436407836193, "grad_norm": 1.9154820442199707, "learning_rate": 2.193768960851547e-05, "loss": 2.7451, "step": 224500 }, { "epoch": 1.6851867135857863, "grad_norm": 3.3942954540252686, "learning_rate": 2.1925184512688922e-05, "loss": 2.6274, "step": 224600 }, { "epoch": 1.685937019335379, "grad_norm": 4.171758651733398, "learning_rate": 2.1912679416862373e-05, "loss": 2.7057, "step": 224700 }, { "epoch": 1.686687325084972, "grad_norm": 1.5650051832199097, "learning_rate": 2.1900174321035824e-05, "loss": 2.6981, "step": 224800 }, { "epoch": 1.687437630834565, "grad_norm": 1.5548356771469116, "learning_rate": 2.188766922520927e-05, "loss": 2.6588, "step": 224900 }, { "epoch": 1.6881879365841579, "grad_norm": 2.4556992053985596, "learning_rate": 2.1875164129382726e-05, "loss": 2.6581, "step": 225000 }, { "epoch": 1.6889382423337511, "grad_norm": 1.50163733959198, "learning_rate": 2.1862659033556177e-05, "loss": 2.592, "step": 225100 }, { "epoch": 1.689688548083344, "grad_norm": 2.598390579223633, "learning_rate": 2.1850153937729625e-05, "loss": 2.471, "step": 225200 }, { "epoch": 1.690438853832937, "grad_norm": 2.3662960529327393, "learning_rate": 2.1837648841903076e-05, "loss": 2.7151, "step": 225300 }, { "epoch": 1.69118915958253, "grad_norm": 2.4257919788360596, "learning_rate": 2.1825143746076527e-05, "loss": 2.6732, "step": 225400 }, { "epoch": 1.6919394653321227, "grad_norm": 1.9842973947525024, "learning_rate": 2.1812638650249978e-05, "loss": 2.7373, "step": 225500 }, { "epoch": 1.692689771081716, "grad_norm": 2.733893394470215, "learning_rate": 2.180013355442343e-05, "loss": 2.6838, "step": 225600 }, { "epoch": 1.6934400768313087, "grad_norm": 2.408416748046875, "learning_rate": 2.178762845859688e-05, "loss": 2.6732, "step": 225700 }, { "epoch": 1.6941903825809017, "grad_norm": 1.7934565544128418, "learning_rate": 2.177512336277033e-05, "loss": 2.6603, "step": 225800 }, { "epoch": 1.6949406883304947, "grad_norm": 2.1759707927703857, "learning_rate": 2.1762618266943778e-05, "loss": 2.7078, "step": 225900 }, { "epoch": 1.6956909940800875, "grad_norm": 1.6626242399215698, "learning_rate": 2.1750113171117233e-05, "loss": 2.7746, "step": 226000 }, { "epoch": 1.6964412998296807, "grad_norm": 1.867409110069275, "learning_rate": 2.1737608075290684e-05, "loss": 2.6613, "step": 226100 }, { "epoch": 1.6971916055792735, "grad_norm": 1.5468112230300903, "learning_rate": 2.172510297946413e-05, "loss": 2.618, "step": 226200 }, { "epoch": 1.6979419113288665, "grad_norm": 1.7310351133346558, "learning_rate": 2.1712597883637582e-05, "loss": 2.7604, "step": 226300 }, { "epoch": 1.6986922170784595, "grad_norm": 2.409095048904419, "learning_rate": 2.1700092787811033e-05, "loss": 2.7381, "step": 226400 }, { "epoch": 1.6994425228280523, "grad_norm": 1.8305583000183105, "learning_rate": 2.1687587691984484e-05, "loss": 2.6427, "step": 226500 }, { "epoch": 1.7001928285776455, "grad_norm": 2.3224306106567383, "learning_rate": 2.1675082596157935e-05, "loss": 2.6901, "step": 226600 }, { "epoch": 1.7009431343272383, "grad_norm": 2.368906021118164, "learning_rate": 2.1662577500331386e-05, "loss": 2.7054, "step": 226700 }, { "epoch": 1.7016934400768313, "grad_norm": 2.6215744018554688, "learning_rate": 2.1650072404504837e-05, "loss": 2.8275, "step": 226800 }, { "epoch": 1.7024437458264243, "grad_norm": 3.798626184463501, "learning_rate": 2.163756730867829e-05, "loss": 2.5592, "step": 226900 }, { "epoch": 1.703194051576017, "grad_norm": 2.3431851863861084, "learning_rate": 2.1625062212851736e-05, "loss": 2.6765, "step": 227000 }, { "epoch": 1.7039443573256103, "grad_norm": 2.9458303451538086, "learning_rate": 2.161255711702519e-05, "loss": 2.6833, "step": 227100 }, { "epoch": 1.7046946630752031, "grad_norm": 2.046957492828369, "learning_rate": 2.1600052021198638e-05, "loss": 2.7045, "step": 227200 }, { "epoch": 1.7054449688247961, "grad_norm": 2.2607297897338867, "learning_rate": 2.158754692537209e-05, "loss": 2.6543, "step": 227300 }, { "epoch": 1.7061952745743891, "grad_norm": 2.5782697200775146, "learning_rate": 2.1575041829545544e-05, "loss": 2.6921, "step": 227400 }, { "epoch": 1.706945580323982, "grad_norm": 1.7078994512557983, "learning_rate": 2.156253673371899e-05, "loss": 2.7819, "step": 227500 }, { "epoch": 1.7076958860735751, "grad_norm": 1.4737212657928467, "learning_rate": 2.1550031637892442e-05, "loss": 2.7857, "step": 227600 }, { "epoch": 1.708446191823168, "grad_norm": 2.6936941146850586, "learning_rate": 2.1537526542065893e-05, "loss": 2.6374, "step": 227700 }, { "epoch": 1.709196497572761, "grad_norm": 2.938537836074829, "learning_rate": 2.1525021446239344e-05, "loss": 2.7262, "step": 227800 }, { "epoch": 1.709946803322354, "grad_norm": 1.4872084856033325, "learning_rate": 2.1512516350412795e-05, "loss": 2.4963, "step": 227900 }, { "epoch": 1.7106971090719467, "grad_norm": 2.0066847801208496, "learning_rate": 2.1500011254586243e-05, "loss": 2.6041, "step": 228000 }, { "epoch": 1.7114474148215397, "grad_norm": 1.7235535383224487, "learning_rate": 2.1487631209717964e-05, "loss": 2.6565, "step": 228100 }, { "epoch": 1.7121977205711327, "grad_norm": 2.493252754211426, "learning_rate": 2.147512611389141e-05, "loss": 2.6376, "step": 228200 }, { "epoch": 1.7129480263207257, "grad_norm": 1.6539744138717651, "learning_rate": 2.1462621018064862e-05, "loss": 2.5885, "step": 228300 }, { "epoch": 1.7136983320703187, "grad_norm": 1.9925140142440796, "learning_rate": 2.1450115922238313e-05, "loss": 2.6437, "step": 228400 }, { "epoch": 1.7144486378199115, "grad_norm": 1.5011165142059326, "learning_rate": 2.1437610826411764e-05, "loss": 2.6594, "step": 228500 }, { "epoch": 1.7151989435695045, "grad_norm": 1.6055272817611694, "learning_rate": 2.1425105730585215e-05, "loss": 2.6576, "step": 228600 }, { "epoch": 1.7159492493190975, "grad_norm": 1.819006323814392, "learning_rate": 2.1412600634758663e-05, "loss": 2.6283, "step": 228700 }, { "epoch": 1.7166995550686905, "grad_norm": 1.942347764968872, "learning_rate": 2.1400095538932117e-05, "loss": 2.5843, "step": 228800 }, { "epoch": 1.7174498608182835, "grad_norm": 2.1329710483551025, "learning_rate": 2.138759044310557e-05, "loss": 2.551, "step": 228900 }, { "epoch": 1.7182001665678763, "grad_norm": 5.06547737121582, "learning_rate": 2.1375085347279016e-05, "loss": 2.5696, "step": 229000 }, { "epoch": 1.7189504723174693, "grad_norm": 2.2004969120025635, "learning_rate": 2.1362580251452467e-05, "loss": 2.7105, "step": 229100 }, { "epoch": 1.7197007780670623, "grad_norm": 1.5758631229400635, "learning_rate": 2.1350075155625918e-05, "loss": 2.6337, "step": 229200 }, { "epoch": 1.7204510838166551, "grad_norm": 1.2778080701828003, "learning_rate": 2.133757005979937e-05, "loss": 2.7561, "step": 229300 }, { "epoch": 1.7212013895662484, "grad_norm": 4.094877243041992, "learning_rate": 2.132506496397282e-05, "loss": 2.693, "step": 229400 }, { "epoch": 1.7219516953158411, "grad_norm": 2.1604299545288086, "learning_rate": 2.131255986814627e-05, "loss": 2.6913, "step": 229500 }, { "epoch": 1.7227020010654341, "grad_norm": 2.644935369491577, "learning_rate": 2.1300054772319722e-05, "loss": 2.6807, "step": 229600 }, { "epoch": 1.7234523068150271, "grad_norm": 2.5642855167388916, "learning_rate": 2.128754967649317e-05, "loss": 2.6983, "step": 229700 }, { "epoch": 1.72420261256462, "grad_norm": 1.2715543508529663, "learning_rate": 2.1275044580666624e-05, "loss": 2.5651, "step": 229800 }, { "epoch": 1.7249529183142132, "grad_norm": 1.571607232093811, "learning_rate": 2.1262539484840075e-05, "loss": 2.8091, "step": 229900 }, { "epoch": 1.725703224063806, "grad_norm": 1.4893946647644043, "learning_rate": 2.1250034389013523e-05, "loss": 2.8609, "step": 230000 }, { "epoch": 1.726453529813399, "grad_norm": 1.9312849044799805, "learning_rate": 2.1237529293186974e-05, "loss": 2.7146, "step": 230100 }, { "epoch": 1.727203835562992, "grad_norm": 1.927961826324463, "learning_rate": 2.122514924831869e-05, "loss": 2.8615, "step": 230200 }, { "epoch": 1.7279541413125847, "grad_norm": 1.5116468667984009, "learning_rate": 2.1212644152492142e-05, "loss": 2.6923, "step": 230300 }, { "epoch": 1.728704447062178, "grad_norm": 2.198054552078247, "learning_rate": 2.120013905666559e-05, "loss": 2.6427, "step": 230400 }, { "epoch": 1.7294547528117707, "grad_norm": 1.5680928230285645, "learning_rate": 2.1187633960839044e-05, "loss": 2.7638, "step": 230500 }, { "epoch": 1.7302050585613638, "grad_norm": 2.5105812549591064, "learning_rate": 2.1175128865012495e-05, "loss": 2.617, "step": 230600 }, { "epoch": 1.7309553643109568, "grad_norm": 2.960282564163208, "learning_rate": 2.1162623769185943e-05, "loss": 2.6026, "step": 230700 }, { "epoch": 1.7317056700605495, "grad_norm": 2.559345006942749, "learning_rate": 2.1150118673359394e-05, "loss": 2.564, "step": 230800 }, { "epoch": 1.7324559758101428, "grad_norm": 1.8766919374465942, "learning_rate": 2.1137613577532845e-05, "loss": 2.6832, "step": 230900 }, { "epoch": 1.7332062815597356, "grad_norm": 2.5372142791748047, "learning_rate": 2.1125108481706296e-05, "loss": 2.5571, "step": 231000 }, { "epoch": 1.7339565873093286, "grad_norm": 1.7264524698257446, "learning_rate": 2.1112603385879747e-05, "loss": 2.7609, "step": 231100 }, { "epoch": 1.7347068930589216, "grad_norm": 2.0107192993164062, "learning_rate": 2.1100098290053198e-05, "loss": 2.66, "step": 231200 }, { "epoch": 1.7354571988085143, "grad_norm": 1.5663074254989624, "learning_rate": 2.108759319422665e-05, "loss": 2.7958, "step": 231300 }, { "epoch": 1.7362075045581076, "grad_norm": 1.8299221992492676, "learning_rate": 2.10750880984001e-05, "loss": 2.7036, "step": 231400 }, { "epoch": 1.7369578103077004, "grad_norm": 2.017726182937622, "learning_rate": 2.1062583002573548e-05, "loss": 2.7304, "step": 231500 }, { "epoch": 1.7377081160572934, "grad_norm": 1.7150956392288208, "learning_rate": 2.1050077906747002e-05, "loss": 2.7561, "step": 231600 }, { "epoch": 1.7384584218068864, "grad_norm": 2.350635051727295, "learning_rate": 2.103757281092045e-05, "loss": 2.7069, "step": 231700 }, { "epoch": 1.7392087275564792, "grad_norm": 1.9525022506713867, "learning_rate": 2.10250677150939e-05, "loss": 2.5654, "step": 231800 }, { "epoch": 1.7399590333060724, "grad_norm": 1.5096113681793213, "learning_rate": 2.1012562619267355e-05, "loss": 2.7868, "step": 231900 }, { "epoch": 1.7407093390556652, "grad_norm": 1.7647546529769897, "learning_rate": 2.1000057523440803e-05, "loss": 2.5488, "step": 232000 }, { "epoch": 1.7414596448052582, "grad_norm": 2.2272799015045166, "learning_rate": 2.0987552427614254e-05, "loss": 2.8153, "step": 232100 }, { "epoch": 1.7422099505548512, "grad_norm": 2.1509602069854736, "learning_rate": 2.0975172382745968e-05, "loss": 2.7133, "step": 232200 }, { "epoch": 1.742960256304444, "grad_norm": 2.7352397441864014, "learning_rate": 2.0962667286919422e-05, "loss": 2.6652, "step": 232300 }, { "epoch": 1.7437105620540372, "grad_norm": 2.1561439037323, "learning_rate": 2.095016219109287e-05, "loss": 2.6848, "step": 232400 }, { "epoch": 1.74446086780363, "grad_norm": 2.1544761657714844, "learning_rate": 2.093765709526632e-05, "loss": 2.5392, "step": 232500 }, { "epoch": 1.745211173553223, "grad_norm": 2.0134339332580566, "learning_rate": 2.0925151999439775e-05, "loss": 2.6733, "step": 232600 }, { "epoch": 1.745961479302816, "grad_norm": 2.945084810256958, "learning_rate": 2.0912646903613223e-05, "loss": 2.6183, "step": 232700 }, { "epoch": 1.7467117850524088, "grad_norm": 2.14143443107605, "learning_rate": 2.0900141807786674e-05, "loss": 2.6763, "step": 232800 }, { "epoch": 1.7474620908020018, "grad_norm": 2.036839246749878, "learning_rate": 2.0887636711960125e-05, "loss": 2.6796, "step": 232900 }, { "epoch": 1.7482123965515948, "grad_norm": 2.096057653427124, "learning_rate": 2.0875131616133576e-05, "loss": 2.6469, "step": 233000 }, { "epoch": 1.7489627023011878, "grad_norm": 1.8960082530975342, "learning_rate": 2.0862626520307027e-05, "loss": 2.7322, "step": 233100 }, { "epoch": 1.7497130080507808, "grad_norm": 2.0218799114227295, "learning_rate": 2.0850121424480475e-05, "loss": 2.6939, "step": 233200 }, { "epoch": 1.7504633138003736, "grad_norm": 1.7324466705322266, "learning_rate": 2.083761632865393e-05, "loss": 2.6151, "step": 233300 }, { "epoch": 1.7512136195499666, "grad_norm": 1.5997709035873413, "learning_rate": 2.082511123282738e-05, "loss": 2.622, "step": 233400 }, { "epoch": 1.7519639252995596, "grad_norm": 2.0385847091674805, "learning_rate": 2.0812606137000828e-05, "loss": 2.5483, "step": 233500 }, { "epoch": 1.7527142310491526, "grad_norm": 2.454127788543701, "learning_rate": 2.080010104117428e-05, "loss": 2.7893, "step": 233600 }, { "epoch": 1.7534645367987456, "grad_norm": 2.2953572273254395, "learning_rate": 2.078759594534773e-05, "loss": 2.5111, "step": 233700 }, { "epoch": 1.7542148425483384, "grad_norm": 1.743990421295166, "learning_rate": 2.077509084952118e-05, "loss": 2.6786, "step": 233800 }, { "epoch": 1.7549651482979314, "grad_norm": 2.20953369140625, "learning_rate": 2.0762585753694632e-05, "loss": 2.6552, "step": 233900 }, { "epoch": 1.7557154540475244, "grad_norm": 2.6880462169647217, "learning_rate": 2.0750080657868083e-05, "loss": 2.6636, "step": 234000 }, { "epoch": 1.7564657597971172, "grad_norm": 1.9508897066116333, "learning_rate": 2.0737575562041534e-05, "loss": 2.7365, "step": 234100 }, { "epoch": 1.7572160655467104, "grad_norm": 1.488824486732483, "learning_rate": 2.072507046621498e-05, "loss": 2.6288, "step": 234200 }, { "epoch": 1.7579663712963032, "grad_norm": 2.9083096981048584, "learning_rate": 2.0712565370388436e-05, "loss": 2.708, "step": 234300 }, { "epoch": 1.7587166770458962, "grad_norm": 2.076066255569458, "learning_rate": 2.0700060274561887e-05, "loss": 2.7346, "step": 234400 }, { "epoch": 1.7594669827954892, "grad_norm": 1.4346624612808228, "learning_rate": 2.0687555178735334e-05, "loss": 2.7802, "step": 234500 }, { "epoch": 1.760217288545082, "grad_norm": 2.576371669769287, "learning_rate": 2.0675050082908785e-05, "loss": 2.5814, "step": 234600 }, { "epoch": 1.7609675942946752, "grad_norm": 2.294673204421997, "learning_rate": 2.0662544987082237e-05, "loss": 2.7449, "step": 234700 }, { "epoch": 1.761717900044268, "grad_norm": 2.585655927658081, "learning_rate": 2.0650039891255688e-05, "loss": 2.5173, "step": 234800 }, { "epoch": 1.762468205793861, "grad_norm": 1.3728859424591064, "learning_rate": 2.063753479542914e-05, "loss": 2.6139, "step": 234900 }, { "epoch": 1.763218511543454, "grad_norm": 1.5088621377944946, "learning_rate": 2.062502969960259e-05, "loss": 2.7182, "step": 235000 }, { "epoch": 1.7639688172930468, "grad_norm": 1.725733995437622, "learning_rate": 2.061252460377604e-05, "loss": 2.6943, "step": 235100 }, { "epoch": 1.76471912304264, "grad_norm": 1.8392852544784546, "learning_rate": 2.060001950794949e-05, "loss": 2.6336, "step": 235200 }, { "epoch": 1.7654694287922328, "grad_norm": 1.5541396141052246, "learning_rate": 2.058751441212294e-05, "loss": 2.6141, "step": 235300 }, { "epoch": 1.7662197345418258, "grad_norm": 1.8344281911849976, "learning_rate": 2.0575009316296394e-05, "loss": 2.6836, "step": 235400 }, { "epoch": 1.7669700402914188, "grad_norm": 1.7275099754333496, "learning_rate": 2.056250422046984e-05, "loss": 2.7155, "step": 235500 }, { "epoch": 1.7677203460410116, "grad_norm": 1.4333891868591309, "learning_rate": 2.0549999124643292e-05, "loss": 2.4977, "step": 235600 }, { "epoch": 1.7684706517906048, "grad_norm": 1.3624305725097656, "learning_rate": 2.0537494028816747e-05, "loss": 2.6978, "step": 235700 }, { "epoch": 1.7692209575401976, "grad_norm": 1.909151554107666, "learning_rate": 2.0524988932990194e-05, "loss": 2.5561, "step": 235800 }, { "epoch": 1.7699712632897906, "grad_norm": 1.7748063802719116, "learning_rate": 2.0512483837163645e-05, "loss": 2.561, "step": 235900 }, { "epoch": 1.7707215690393836, "grad_norm": 1.907529592514038, "learning_rate": 2.0499978741337096e-05, "loss": 2.776, "step": 236000 }, { "epoch": 1.7714718747889764, "grad_norm": 1.6992416381835938, "learning_rate": 2.0487473645510547e-05, "loss": 2.6319, "step": 236100 }, { "epoch": 1.7722221805385696, "grad_norm": 2.2553610801696777, "learning_rate": 2.0474968549684e-05, "loss": 2.6462, "step": 236200 }, { "epoch": 1.7729724862881624, "grad_norm": 1.6813592910766602, "learning_rate": 2.0462588504815712e-05, "loss": 2.7214, "step": 236300 }, { "epoch": 1.7737227920377554, "grad_norm": 2.044344425201416, "learning_rate": 2.0450083408989167e-05, "loss": 2.6457, "step": 236400 }, { "epoch": 1.7744730977873484, "grad_norm": 2.1221981048583984, "learning_rate": 2.0437578313162614e-05, "loss": 2.721, "step": 236500 }, { "epoch": 1.7752234035369412, "grad_norm": 1.5901119709014893, "learning_rate": 2.0425073217336065e-05, "loss": 2.6024, "step": 236600 }, { "epoch": 1.7759737092865344, "grad_norm": 2.1145317554473877, "learning_rate": 2.0412568121509516e-05, "loss": 2.5955, "step": 236700 }, { "epoch": 1.7767240150361272, "grad_norm": 1.5674196481704712, "learning_rate": 2.0400063025682967e-05, "loss": 2.6195, "step": 236800 }, { "epoch": 1.7774743207857202, "grad_norm": 1.894516944885254, "learning_rate": 2.038755792985642e-05, "loss": 2.6952, "step": 236900 }, { "epoch": 1.7782246265353132, "grad_norm": 2.389307737350464, "learning_rate": 2.0375052834029866e-05, "loss": 2.5795, "step": 237000 }, { "epoch": 1.778974932284906, "grad_norm": 2.213615894317627, "learning_rate": 2.036254773820332e-05, "loss": 2.5488, "step": 237100 }, { "epoch": 1.7797252380344992, "grad_norm": 1.5238951444625854, "learning_rate": 2.0350042642376768e-05, "loss": 2.6718, "step": 237200 }, { "epoch": 1.780475543784092, "grad_norm": 2.5962228775024414, "learning_rate": 2.033753754655022e-05, "loss": 2.5357, "step": 237300 }, { "epoch": 1.781225849533685, "grad_norm": 1.757934331893921, "learning_rate": 2.032503245072367e-05, "loss": 2.6823, "step": 237400 }, { "epoch": 1.781976155283278, "grad_norm": 1.4847657680511475, "learning_rate": 2.031252735489712e-05, "loss": 2.4997, "step": 237500 }, { "epoch": 1.7827264610328708, "grad_norm": 2.0446112155914307, "learning_rate": 2.0300022259070572e-05, "loss": 2.6122, "step": 237600 }, { "epoch": 1.7834767667824638, "grad_norm": 2.48160719871521, "learning_rate": 2.0287517163244023e-05, "loss": 2.6758, "step": 237700 }, { "epoch": 1.7842270725320568, "grad_norm": 1.4609678983688354, "learning_rate": 2.0275012067417474e-05, "loss": 2.7812, "step": 237800 }, { "epoch": 1.7849773782816498, "grad_norm": 1.7571443319320679, "learning_rate": 2.0262506971590925e-05, "loss": 2.6689, "step": 237900 }, { "epoch": 1.7857276840312428, "grad_norm": 1.5639338493347168, "learning_rate": 2.0250001875764373e-05, "loss": 2.5407, "step": 238000 }, { "epoch": 1.7864779897808356, "grad_norm": 2.5263724327087402, "learning_rate": 2.0237496779937827e-05, "loss": 2.6687, "step": 238100 }, { "epoch": 1.7872282955304286, "grad_norm": 2.093700408935547, "learning_rate": 2.022499168411128e-05, "loss": 2.6178, "step": 238200 }, { "epoch": 1.7879786012800216, "grad_norm": 2.3037915229797363, "learning_rate": 2.0212611639242992e-05, "loss": 2.6086, "step": 238300 }, { "epoch": 1.7887289070296146, "grad_norm": 1.6837750673294067, "learning_rate": 2.0200106543416443e-05, "loss": 2.5998, "step": 238400 }, { "epoch": 1.7894792127792076, "grad_norm": 2.325162649154663, "learning_rate": 2.0187601447589894e-05, "loss": 2.6174, "step": 238500 }, { "epoch": 1.7902295185288004, "grad_norm": 3.066486120223999, "learning_rate": 2.0175096351763345e-05, "loss": 2.6814, "step": 238600 }, { "epoch": 1.7909798242783934, "grad_norm": 1.894046425819397, "learning_rate": 2.0162591255936793e-05, "loss": 2.6959, "step": 238700 }, { "epoch": 1.7917301300279864, "grad_norm": 2.9144182205200195, "learning_rate": 2.0150086160110247e-05, "loss": 2.8131, "step": 238800 }, { "epoch": 1.7924804357775792, "grad_norm": 1.9275459051132202, "learning_rate": 2.01375810642837e-05, "loss": 2.74, "step": 238900 }, { "epoch": 1.7932307415271724, "grad_norm": 2.13606858253479, "learning_rate": 2.0125075968457146e-05, "loss": 2.6587, "step": 239000 }, { "epoch": 1.7939810472767652, "grad_norm": 1.4926011562347412, "learning_rate": 2.0112570872630597e-05, "loss": 2.756, "step": 239100 }, { "epoch": 1.7947313530263582, "grad_norm": 1.3614166975021362, "learning_rate": 2.0100065776804048e-05, "loss": 2.608, "step": 239200 }, { "epoch": 1.7954816587759512, "grad_norm": 1.6563140153884888, "learning_rate": 2.00875606809775e-05, "loss": 2.6412, "step": 239300 }, { "epoch": 1.796231964525544, "grad_norm": 2.4985170364379883, "learning_rate": 2.007505558515095e-05, "loss": 2.4636, "step": 239400 }, { "epoch": 1.7969822702751372, "grad_norm": 2.3329861164093018, "learning_rate": 2.00625504893244e-05, "loss": 2.6218, "step": 239500 }, { "epoch": 1.79773257602473, "grad_norm": 1.7209120988845825, "learning_rate": 2.0050045393497852e-05, "loss": 2.7213, "step": 239600 }, { "epoch": 1.798482881774323, "grad_norm": 2.8511054515838623, "learning_rate": 2.0037540297671303e-05, "loss": 2.6857, "step": 239700 }, { "epoch": 1.799233187523916, "grad_norm": 2.1554057598114014, "learning_rate": 2.002503520184475e-05, "loss": 2.4453, "step": 239800 }, { "epoch": 1.7999834932735088, "grad_norm": 2.195219039916992, "learning_rate": 2.0012530106018205e-05, "loss": 2.6109, "step": 239900 }, { "epoch": 1.800733799023102, "grad_norm": 3.071748971939087, "learning_rate": 2.0000025010191653e-05, "loss": 2.6454, "step": 240000 }, { "epoch": 1.8014841047726948, "grad_norm": 1.779133677482605, "learning_rate": 1.9987519914365104e-05, "loss": 2.7067, "step": 240100 }, { "epoch": 1.8022344105222878, "grad_norm": 2.610356569290161, "learning_rate": 1.9975014818538555e-05, "loss": 2.7514, "step": 240200 }, { "epoch": 1.8029847162718808, "grad_norm": 1.3093671798706055, "learning_rate": 1.9962634773670272e-05, "loss": 2.6687, "step": 240300 }, { "epoch": 1.8037350220214736, "grad_norm": 1.7078965902328491, "learning_rate": 1.9950129677843723e-05, "loss": 2.6746, "step": 240400 }, { "epoch": 1.8044853277710668, "grad_norm": 2.3835387229919434, "learning_rate": 1.993762458201717e-05, "loss": 2.624, "step": 240500 }, { "epoch": 1.8052356335206596, "grad_norm": 2.9722561836242676, "learning_rate": 1.9925119486190625e-05, "loss": 2.6099, "step": 240600 }, { "epoch": 1.8059859392702526, "grad_norm": 1.6831305027008057, "learning_rate": 1.9912614390364073e-05, "loss": 2.6416, "step": 240700 }, { "epoch": 1.8067362450198456, "grad_norm": 1.5434209108352661, "learning_rate": 1.9900109294537524e-05, "loss": 2.8543, "step": 240800 }, { "epoch": 1.8074865507694384, "grad_norm": 1.762873649597168, "learning_rate": 1.988760419871098e-05, "loss": 2.7928, "step": 240900 }, { "epoch": 1.8082368565190317, "grad_norm": 1.965562343597412, "learning_rate": 1.9875099102884426e-05, "loss": 2.5908, "step": 241000 }, { "epoch": 1.8089871622686244, "grad_norm": 2.7890501022338867, "learning_rate": 1.9862594007057877e-05, "loss": 2.7518, "step": 241100 }, { "epoch": 1.8097374680182174, "grad_norm": 1.5331215858459473, "learning_rate": 1.9850088911231328e-05, "loss": 2.5709, "step": 241200 }, { "epoch": 1.8104877737678104, "grad_norm": 1.7127416133880615, "learning_rate": 1.983758381540478e-05, "loss": 2.6671, "step": 241300 }, { "epoch": 1.8112380795174032, "grad_norm": 2.720362901687622, "learning_rate": 1.982507871957823e-05, "loss": 2.7542, "step": 241400 }, { "epoch": 1.8119883852669965, "grad_norm": 1.9819942712783813, "learning_rate": 1.9812573623751678e-05, "loss": 2.664, "step": 241500 }, { "epoch": 1.8127386910165892, "grad_norm": 2.1834018230438232, "learning_rate": 1.9800068527925132e-05, "loss": 2.7261, "step": 241600 }, { "epoch": 1.8134889967661822, "grad_norm": 1.5239250659942627, "learning_rate": 1.978756343209858e-05, "loss": 2.7594, "step": 241700 }, { "epoch": 1.8142393025157753, "grad_norm": 1.6288478374481201, "learning_rate": 1.977505833627203e-05, "loss": 2.6382, "step": 241800 }, { "epoch": 1.814989608265368, "grad_norm": 1.9677284955978394, "learning_rate": 1.9762553240445482e-05, "loss": 2.855, "step": 241900 }, { "epoch": 1.815739914014961, "grad_norm": 2.1924750804901123, "learning_rate": 1.9750048144618933e-05, "loss": 2.6586, "step": 242000 }, { "epoch": 1.816490219764554, "grad_norm": 1.989654302597046, "learning_rate": 1.9737543048792384e-05, "loss": 2.6044, "step": 242100 }, { "epoch": 1.817240525514147, "grad_norm": 1.856346607208252, "learning_rate": 1.9725037952965835e-05, "loss": 2.7402, "step": 242200 }, { "epoch": 1.81799083126374, "grad_norm": 1.4387255907058716, "learning_rate": 1.9712532857139286e-05, "loss": 2.6397, "step": 242300 }, { "epoch": 1.8187411370133328, "grad_norm": 2.1290903091430664, "learning_rate": 1.9700027761312737e-05, "loss": 2.6814, "step": 242400 }, { "epoch": 1.8194914427629258, "grad_norm": 2.3373937606811523, "learning_rate": 1.968764771644445e-05, "loss": 2.666, "step": 242500 }, { "epoch": 1.8202417485125189, "grad_norm": 2.3154046535491943, "learning_rate": 1.9675142620617902e-05, "loss": 2.7108, "step": 242600 }, { "epoch": 1.8209920542621119, "grad_norm": 2.2559609413146973, "learning_rate": 1.9662637524791353e-05, "loss": 2.5171, "step": 242700 }, { "epoch": 1.8217423600117049, "grad_norm": 2.293944835662842, "learning_rate": 1.9650132428964804e-05, "loss": 2.6809, "step": 242800 }, { "epoch": 1.8224926657612976, "grad_norm": 1.6735013723373413, "learning_rate": 1.9637627333138255e-05, "loss": 2.631, "step": 242900 }, { "epoch": 1.8232429715108907, "grad_norm": 2.1979851722717285, "learning_rate": 1.9625122237311706e-05, "loss": 2.6549, "step": 243000 }, { "epoch": 1.8239932772604837, "grad_norm": 1.634627103805542, "learning_rate": 1.9612617141485157e-05, "loss": 2.6088, "step": 243100 }, { "epoch": 1.8247435830100764, "grad_norm": 1.3770557641983032, "learning_rate": 1.9600112045658605e-05, "loss": 2.6652, "step": 243200 }, { "epoch": 1.8254938887596697, "grad_norm": 2.9374234676361084, "learning_rate": 1.958760694983206e-05, "loss": 2.5717, "step": 243300 }, { "epoch": 1.8262441945092625, "grad_norm": 2.4097371101379395, "learning_rate": 1.957510185400551e-05, "loss": 2.6627, "step": 243400 }, { "epoch": 1.8269945002588555, "grad_norm": 1.9661647081375122, "learning_rate": 1.9562596758178958e-05, "loss": 2.64, "step": 243500 }, { "epoch": 1.8277448060084485, "grad_norm": 1.8697978258132935, "learning_rate": 1.955009166235241e-05, "loss": 2.4116, "step": 243600 }, { "epoch": 1.8284951117580412, "grad_norm": 2.5360474586486816, "learning_rate": 1.9537711617484126e-05, "loss": 2.484, "step": 243700 }, { "epoch": 1.8292454175076345, "grad_norm": 2.519473075866699, "learning_rate": 1.9525206521657577e-05, "loss": 2.6434, "step": 243800 }, { "epoch": 1.8299957232572273, "grad_norm": 2.0157980918884277, "learning_rate": 1.9512701425831025e-05, "loss": 2.5423, "step": 243900 }, { "epoch": 1.8307460290068203, "grad_norm": 2.1172356605529785, "learning_rate": 1.950019633000448e-05, "loss": 2.5173, "step": 244000 }, { "epoch": 1.8314963347564133, "grad_norm": 2.0660037994384766, "learning_rate": 1.948769123417793e-05, "loss": 2.6554, "step": 244100 }, { "epoch": 1.832246640506006, "grad_norm": 1.4909330606460571, "learning_rate": 1.9475186138351378e-05, "loss": 2.6931, "step": 244200 }, { "epoch": 1.8329969462555993, "grad_norm": 1.6804496049880981, "learning_rate": 1.946268104252483e-05, "loss": 2.6934, "step": 244300 }, { "epoch": 1.833747252005192, "grad_norm": 2.1285321712493896, "learning_rate": 1.945017594669828e-05, "loss": 2.5917, "step": 244400 }, { "epoch": 1.834497557754785, "grad_norm": 1.3955317735671997, "learning_rate": 1.943767085087173e-05, "loss": 2.717, "step": 244500 }, { "epoch": 1.835247863504378, "grad_norm": 1.538334846496582, "learning_rate": 1.9425165755045182e-05, "loss": 2.6446, "step": 244600 }, { "epoch": 1.8359981692539709, "grad_norm": 1.889443278312683, "learning_rate": 1.9412660659218633e-05, "loss": 2.6972, "step": 244700 }, { "epoch": 1.836748475003564, "grad_norm": 1.5855387449264526, "learning_rate": 1.9400155563392084e-05, "loss": 2.6698, "step": 244800 }, { "epoch": 1.8374987807531569, "grad_norm": 1.2815237045288086, "learning_rate": 1.9387650467565535e-05, "loss": 2.6299, "step": 244900 }, { "epoch": 1.8382490865027499, "grad_norm": 1.588518738746643, "learning_rate": 1.9375145371738983e-05, "loss": 2.7547, "step": 245000 }, { "epoch": 1.8389993922523429, "grad_norm": 1.8294564485549927, "learning_rate": 1.9362640275912437e-05, "loss": 2.5461, "step": 245100 }, { "epoch": 1.8397496980019357, "grad_norm": 1.423226237297058, "learning_rate": 1.9350135180085885e-05, "loss": 2.5486, "step": 245200 }, { "epoch": 1.840500003751529, "grad_norm": 2.348594903945923, "learning_rate": 1.9337630084259336e-05, "loss": 2.664, "step": 245300 }, { "epoch": 1.8412503095011217, "grad_norm": 2.1124086380004883, "learning_rate": 1.932512498843279e-05, "loss": 2.6063, "step": 245400 }, { "epoch": 1.8420006152507147, "grad_norm": 2.496587038040161, "learning_rate": 1.9312619892606238e-05, "loss": 2.6357, "step": 245500 }, { "epoch": 1.8427509210003077, "grad_norm": 3.135674238204956, "learning_rate": 1.930011479677969e-05, "loss": 2.7198, "step": 245600 }, { "epoch": 1.8435012267499005, "grad_norm": 2.4303200244903564, "learning_rate": 1.928760970095314e-05, "loss": 2.6895, "step": 245700 }, { "epoch": 1.8442515324994937, "grad_norm": 1.9850927591323853, "learning_rate": 1.927510460512659e-05, "loss": 2.6312, "step": 245800 }, { "epoch": 1.8450018382490865, "grad_norm": 1.3978691101074219, "learning_rate": 1.9262599509300042e-05, "loss": 2.7259, "step": 245900 }, { "epoch": 1.8457521439986795, "grad_norm": 1.4420465230941772, "learning_rate": 1.925009441347349e-05, "loss": 2.775, "step": 246000 }, { "epoch": 1.8465024497482725, "grad_norm": 1.934191107749939, "learning_rate": 1.9237589317646944e-05, "loss": 2.648, "step": 246100 }, { "epoch": 1.8472527554978653, "grad_norm": 1.7577964067459106, "learning_rate": 1.922508422182039e-05, "loss": 2.5874, "step": 246200 }, { "epoch": 1.8480030612474585, "grad_norm": 1.614180088043213, "learning_rate": 1.9212579125993842e-05, "loss": 2.7278, "step": 246300 }, { "epoch": 1.8487533669970513, "grad_norm": 2.308777093887329, "learning_rate": 1.9200074030167293e-05, "loss": 2.5389, "step": 246400 }, { "epoch": 1.8495036727466443, "grad_norm": 1.7731596231460571, "learning_rate": 1.9187568934340744e-05, "loss": 2.656, "step": 246500 }, { "epoch": 1.8502539784962373, "grad_norm": 1.4671971797943115, "learning_rate": 1.9175063838514196e-05, "loss": 2.478, "step": 246600 }, { "epoch": 1.85100428424583, "grad_norm": 2.135796070098877, "learning_rate": 1.9162558742687647e-05, "loss": 2.5894, "step": 246700 }, { "epoch": 1.851754589995423, "grad_norm": 2.2213010787963867, "learning_rate": 1.9150053646861098e-05, "loss": 2.779, "step": 246800 }, { "epoch": 1.852504895745016, "grad_norm": 2.5350582599639893, "learning_rate": 1.913754855103455e-05, "loss": 2.639, "step": 246900 }, { "epoch": 1.853255201494609, "grad_norm": 1.7705528736114502, "learning_rate": 1.9125043455207996e-05, "loss": 2.785, "step": 247000 }, { "epoch": 1.854005507244202, "grad_norm": 2.8787081241607666, "learning_rate": 1.9112538359381447e-05, "loss": 2.646, "step": 247100 }, { "epoch": 1.8547558129937949, "grad_norm": 2.0169389247894287, "learning_rate": 1.91000332635549e-05, "loss": 2.5934, "step": 247200 }, { "epoch": 1.8555061187433879, "grad_norm": 2.018906831741333, "learning_rate": 1.908752816772835e-05, "loss": 2.7106, "step": 247300 }, { "epoch": 1.856256424492981, "grad_norm": 1.372273564338684, "learning_rate": 1.90750230719018e-05, "loss": 2.6686, "step": 247400 }, { "epoch": 1.857006730242574, "grad_norm": 2.4166929721832275, "learning_rate": 1.906251797607525e-05, "loss": 2.6727, "step": 247500 }, { "epoch": 1.857757035992167, "grad_norm": 2.6054015159606934, "learning_rate": 1.9050012880248702e-05, "loss": 2.7338, "step": 247600 }, { "epoch": 1.8585073417417597, "grad_norm": 2.6403098106384277, "learning_rate": 1.9037632835380416e-05, "loss": 2.7857, "step": 247700 }, { "epoch": 1.8592576474913527, "grad_norm": 1.837641954421997, "learning_rate": 1.902512773955387e-05, "loss": 2.6815, "step": 247800 }, { "epoch": 1.8600079532409457, "grad_norm": 1.396510362625122, "learning_rate": 1.9012622643727322e-05, "loss": 2.706, "step": 247900 }, { "epoch": 1.8607582589905385, "grad_norm": 2.052950143814087, "learning_rate": 1.900011754790077e-05, "loss": 2.7879, "step": 248000 }, { "epoch": 1.8615085647401317, "grad_norm": 2.0150725841522217, "learning_rate": 1.898761245207422e-05, "loss": 2.5773, "step": 248100 }, { "epoch": 1.8622588704897245, "grad_norm": 1.6160995960235596, "learning_rate": 1.897510735624767e-05, "loss": 2.6716, "step": 248200 }, { "epoch": 1.8630091762393175, "grad_norm": 1.71308434009552, "learning_rate": 1.8962602260421122e-05, "loss": 2.5132, "step": 248300 }, { "epoch": 1.8637594819889105, "grad_norm": 1.947446584701538, "learning_rate": 1.8950097164594573e-05, "loss": 2.792, "step": 248400 }, { "epoch": 1.8645097877385033, "grad_norm": 2.748284101486206, "learning_rate": 1.8937592068768024e-05, "loss": 2.6677, "step": 248500 }, { "epoch": 1.8652600934880965, "grad_norm": 2.410334348678589, "learning_rate": 1.8925086972941475e-05, "loss": 2.6624, "step": 248600 }, { "epoch": 1.8660103992376893, "grad_norm": 2.010688066482544, "learning_rate": 1.8912581877114926e-05, "loss": 2.5974, "step": 248700 }, { "epoch": 1.8667607049872823, "grad_norm": 2.134237289428711, "learning_rate": 1.8900076781288374e-05, "loss": 2.5147, "step": 248800 }, { "epoch": 1.8675110107368753, "grad_norm": 3.0411338806152344, "learning_rate": 1.888757168546183e-05, "loss": 2.6576, "step": 248900 }, { "epoch": 1.868261316486468, "grad_norm": 1.948861837387085, "learning_rate": 1.8875066589635276e-05, "loss": 2.6721, "step": 249000 }, { "epoch": 1.8690116222360613, "grad_norm": 1.943029522895813, "learning_rate": 1.8862561493808727e-05, "loss": 2.7549, "step": 249100 }, { "epoch": 1.869761927985654, "grad_norm": 1.7768739461898804, "learning_rate": 1.8850056397982178e-05, "loss": 2.6829, "step": 249200 }, { "epoch": 1.870512233735247, "grad_norm": 1.627284049987793, "learning_rate": 1.883755130215563e-05, "loss": 2.6827, "step": 249300 }, { "epoch": 1.8712625394848401, "grad_norm": 1.7401014566421509, "learning_rate": 1.882504620632908e-05, "loss": 2.5439, "step": 249400 }, { "epoch": 1.872012845234433, "grad_norm": 2.6514625549316406, "learning_rate": 1.881254111050253e-05, "loss": 2.6269, "step": 249500 }, { "epoch": 1.8727631509840261, "grad_norm": 1.790933609008789, "learning_rate": 1.8800036014675982e-05, "loss": 2.7539, "step": 249600 }, { "epoch": 1.873513456733619, "grad_norm": 2.569087266921997, "learning_rate": 1.8787530918849433e-05, "loss": 2.8083, "step": 249700 }, { "epoch": 1.874263762483212, "grad_norm": 3.5353331565856934, "learning_rate": 1.8775150873981147e-05, "loss": 2.5799, "step": 249800 }, { "epoch": 1.875014068232805, "grad_norm": 2.4337809085845947, "learning_rate": 1.87626457781546e-05, "loss": 2.5116, "step": 249900 }, { "epoch": 1.8757643739823977, "grad_norm": 2.794623374938965, "learning_rate": 1.875014068232805e-05, "loss": 2.601, "step": 250000 }, { "epoch": 1.876514679731991, "grad_norm": 2.1222431659698486, "learning_rate": 1.87376355865015e-05, "loss": 2.739, "step": 250100 }, { "epoch": 1.8772649854815837, "grad_norm": 2.4682207107543945, "learning_rate": 1.872513049067495e-05, "loss": 2.6235, "step": 250200 }, { "epoch": 1.8780152912311767, "grad_norm": 2.7395594120025635, "learning_rate": 1.8712625394848402e-05, "loss": 2.662, "step": 250300 }, { "epoch": 1.8787655969807697, "grad_norm": 2.540083169937134, "learning_rate": 1.8700120299021853e-05, "loss": 2.6334, "step": 250400 }, { "epoch": 1.8795159027303625, "grad_norm": 1.3574124574661255, "learning_rate": 1.86876152031953e-05, "loss": 2.7299, "step": 250500 }, { "epoch": 1.8802662084799557, "grad_norm": 1.5648738145828247, "learning_rate": 1.8675110107368755e-05, "loss": 2.6686, "step": 250600 }, { "epoch": 1.8810165142295485, "grad_norm": 2.1889748573303223, "learning_rate": 1.8662605011542203e-05, "loss": 2.6535, "step": 250700 }, { "epoch": 1.8817668199791415, "grad_norm": 2.722856283187866, "learning_rate": 1.8650099915715654e-05, "loss": 2.6803, "step": 250800 }, { "epoch": 1.8825171257287345, "grad_norm": 1.9224767684936523, "learning_rate": 1.8637594819889105e-05, "loss": 2.5626, "step": 250900 }, { "epoch": 1.8832674314783273, "grad_norm": 2.5875918865203857, "learning_rate": 1.8625089724062556e-05, "loss": 2.7952, "step": 251000 }, { "epoch": 1.8840177372279205, "grad_norm": 2.1289308071136475, "learning_rate": 1.8612584628236007e-05, "loss": 2.6517, "step": 251100 }, { "epoch": 1.8847680429775133, "grad_norm": 1.7152509689331055, "learning_rate": 1.8600079532409458e-05, "loss": 2.7359, "step": 251200 }, { "epoch": 1.8855183487271063, "grad_norm": 1.4894063472747803, "learning_rate": 1.858757443658291e-05, "loss": 2.7078, "step": 251300 }, { "epoch": 1.8862686544766993, "grad_norm": 1.943729043006897, "learning_rate": 1.857506934075636e-05, "loss": 2.6524, "step": 251400 }, { "epoch": 1.8870189602262921, "grad_norm": 2.175356149673462, "learning_rate": 1.8562564244929808e-05, "loss": 2.5142, "step": 251500 }, { "epoch": 1.8877692659758851, "grad_norm": 3.3817198276519775, "learning_rate": 1.855005914910326e-05, "loss": 2.6194, "step": 251600 }, { "epoch": 1.8885195717254781, "grad_norm": 2.7424333095550537, "learning_rate": 1.8537554053276713e-05, "loss": 2.6326, "step": 251700 }, { "epoch": 1.8892698774750711, "grad_norm": 3.2029032707214355, "learning_rate": 1.8525174008408427e-05, "loss": 2.6233, "step": 251800 }, { "epoch": 1.8900201832246641, "grad_norm": 2.0599582195281982, "learning_rate": 1.8512668912581878e-05, "loss": 2.7811, "step": 251900 }, { "epoch": 1.890770488974257, "grad_norm": 2.00640606880188, "learning_rate": 1.850016381675533e-05, "loss": 2.6289, "step": 252000 }, { "epoch": 1.89152079472385, "grad_norm": 2.061690092086792, "learning_rate": 1.848765872092878e-05, "loss": 2.7014, "step": 252100 }, { "epoch": 1.892271100473443, "grad_norm": 1.4206527471542358, "learning_rate": 1.8475153625102228e-05, "loss": 2.6056, "step": 252200 }, { "epoch": 1.893021406223036, "grad_norm": 1.6741445064544678, "learning_rate": 1.8462648529275682e-05, "loss": 2.6456, "step": 252300 }, { "epoch": 1.893771711972629, "grad_norm": 1.87335205078125, "learning_rate": 1.8450143433449133e-05, "loss": 2.7571, "step": 252400 }, { "epoch": 1.8945220177222217, "grad_norm": 1.2965327501296997, "learning_rate": 1.843763833762258e-05, "loss": 2.6598, "step": 252500 }, { "epoch": 1.8952723234718147, "grad_norm": 2.39717698097229, "learning_rate": 1.8425133241796032e-05, "loss": 2.705, "step": 252600 }, { "epoch": 1.8960226292214077, "grad_norm": 2.8840994834899902, "learning_rate": 1.8412628145969483e-05, "loss": 2.6026, "step": 252700 }, { "epoch": 1.8967729349710005, "grad_norm": 1.6449042558670044, "learning_rate": 1.8400123050142934e-05, "loss": 2.6151, "step": 252800 }, { "epoch": 1.8975232407205938, "grad_norm": 3.312671661376953, "learning_rate": 1.8387617954316385e-05, "loss": 2.5755, "step": 252900 }, { "epoch": 1.8982735464701865, "grad_norm": 1.8365360498428345, "learning_rate": 1.8375112858489836e-05, "loss": 2.8042, "step": 253000 }, { "epoch": 1.8990238522197795, "grad_norm": 1.5134824514389038, "learning_rate": 1.8362607762663287e-05, "loss": 2.5865, "step": 253100 }, { "epoch": 1.8997741579693725, "grad_norm": 1.8856240510940552, "learning_rate": 1.8350102666836738e-05, "loss": 2.6654, "step": 253200 }, { "epoch": 1.9005244637189653, "grad_norm": 3.020228624343872, "learning_rate": 1.8337597571010186e-05, "loss": 2.5426, "step": 253300 }, { "epoch": 1.9012747694685586, "grad_norm": 3.8491618633270264, "learning_rate": 1.832509247518364e-05, "loss": 2.7363, "step": 253400 }, { "epoch": 1.9020250752181513, "grad_norm": 1.5367169380187988, "learning_rate": 1.8312587379357088e-05, "loss": 2.7639, "step": 253500 }, { "epoch": 1.9027753809677443, "grad_norm": 2.5799076557159424, "learning_rate": 1.830008228353054e-05, "loss": 2.6177, "step": 253600 }, { "epoch": 1.9035256867173374, "grad_norm": 2.597324848175049, "learning_rate": 1.828757718770399e-05, "loss": 2.5821, "step": 253700 }, { "epoch": 1.9042759924669301, "grad_norm": 1.5852059125900269, "learning_rate": 1.827507209187744e-05, "loss": 2.4795, "step": 253800 }, { "epoch": 1.9050262982165234, "grad_norm": 1.4749221801757812, "learning_rate": 1.8262692047009158e-05, "loss": 2.7156, "step": 253900 }, { "epoch": 1.9057766039661161, "grad_norm": 1.7689855098724365, "learning_rate": 1.8250186951182606e-05, "loss": 2.7091, "step": 254000 }, { "epoch": 1.9065269097157092, "grad_norm": 1.6710466146469116, "learning_rate": 1.823768185535606e-05, "loss": 2.8122, "step": 254100 }, { "epoch": 1.9072772154653022, "grad_norm": 1.858849287033081, "learning_rate": 1.8225176759529508e-05, "loss": 2.6536, "step": 254200 }, { "epoch": 1.908027521214895, "grad_norm": 1.9238032102584839, "learning_rate": 1.821267166370296e-05, "loss": 2.6023, "step": 254300 }, { "epoch": 1.9087778269644882, "grad_norm": 1.9961943626403809, "learning_rate": 1.820016656787641e-05, "loss": 2.5168, "step": 254400 }, { "epoch": 1.909528132714081, "grad_norm": 1.7366199493408203, "learning_rate": 1.818766147204986e-05, "loss": 2.5334, "step": 254500 }, { "epoch": 1.910278438463674, "grad_norm": 1.530107855796814, "learning_rate": 1.8175156376223312e-05, "loss": 2.6956, "step": 254600 }, { "epoch": 1.911028744213267, "grad_norm": 1.704067349433899, "learning_rate": 1.8162651280396763e-05, "loss": 2.7258, "step": 254700 }, { "epoch": 1.9117790499628597, "grad_norm": 1.9711380004882812, "learning_rate": 1.8150146184570214e-05, "loss": 2.6474, "step": 254800 }, { "epoch": 1.912529355712453, "grad_norm": 1.7507041692733765, "learning_rate": 1.8137641088743665e-05, "loss": 2.6568, "step": 254900 }, { "epoch": 1.9132796614620458, "grad_norm": 2.011385917663574, "learning_rate": 1.8125135992917113e-05, "loss": 2.6466, "step": 255000 }, { "epoch": 1.9140299672116388, "grad_norm": 2.0659637451171875, "learning_rate": 1.8112630897090567e-05, "loss": 2.5177, "step": 255100 }, { "epoch": 1.9147802729612318, "grad_norm": 1.3386443853378296, "learning_rate": 1.8100125801264015e-05, "loss": 2.5828, "step": 255200 }, { "epoch": 1.9155305787108245, "grad_norm": 2.2372944355010986, "learning_rate": 1.8087620705437466e-05, "loss": 2.8161, "step": 255300 }, { "epoch": 1.9162808844604178, "grad_norm": 2.0023882389068604, "learning_rate": 1.8075115609610917e-05, "loss": 2.5869, "step": 255400 }, { "epoch": 1.9170311902100106, "grad_norm": 2.3426153659820557, "learning_rate": 1.8062610513784368e-05, "loss": 2.5751, "step": 255500 }, { "epoch": 1.9177814959596036, "grad_norm": 1.7273321151733398, "learning_rate": 1.805010541795782e-05, "loss": 2.5647, "step": 255600 }, { "epoch": 1.9185318017091966, "grad_norm": 1.5089417695999146, "learning_rate": 1.803760032213127e-05, "loss": 2.6614, "step": 255700 }, { "epoch": 1.9192821074587894, "grad_norm": 1.675614833831787, "learning_rate": 1.802509522630472e-05, "loss": 2.5745, "step": 255800 }, { "epoch": 1.9200324132083826, "grad_norm": 3.547013759613037, "learning_rate": 1.8012715181436438e-05, "loss": 2.6485, "step": 255900 }, { "epoch": 1.9207827189579754, "grad_norm": 2.0943636894226074, "learning_rate": 1.8000210085609886e-05, "loss": 2.6482, "step": 256000 }, { "epoch": 1.9215330247075684, "grad_norm": 2.1445560455322266, "learning_rate": 1.7987704989783337e-05, "loss": 2.7354, "step": 256100 }, { "epoch": 1.9222833304571614, "grad_norm": 1.4504034519195557, "learning_rate": 1.7975199893956788e-05, "loss": 2.4891, "step": 256200 }, { "epoch": 1.9230336362067542, "grad_norm": 1.6100106239318848, "learning_rate": 1.796269479813024e-05, "loss": 2.7708, "step": 256300 }, { "epoch": 1.9237839419563472, "grad_norm": 1.4349716901779175, "learning_rate": 1.795018970230369e-05, "loss": 2.6387, "step": 256400 }, { "epoch": 1.9245342477059402, "grad_norm": 1.530157446861267, "learning_rate": 1.793768460647714e-05, "loss": 2.5951, "step": 256500 }, { "epoch": 1.9252845534555332, "grad_norm": 2.7150931358337402, "learning_rate": 1.7925179510650592e-05, "loss": 2.6458, "step": 256600 }, { "epoch": 1.9260348592051262, "grad_norm": 1.3801782131195068, "learning_rate": 1.791267441482404e-05, "loss": 2.6163, "step": 256700 }, { "epoch": 1.926785164954719, "grad_norm": 1.2852301597595215, "learning_rate": 1.790016931899749e-05, "loss": 2.5102, "step": 256800 }, { "epoch": 1.927535470704312, "grad_norm": 2.1170237064361572, "learning_rate": 1.7887664223170945e-05, "loss": 2.5522, "step": 256900 }, { "epoch": 1.928285776453905, "grad_norm": 2.742659568786621, "learning_rate": 1.7875159127344393e-05, "loss": 2.5525, "step": 257000 }, { "epoch": 1.9290360822034978, "grad_norm": 1.4303311109542847, "learning_rate": 1.7862654031517844e-05, "loss": 2.5783, "step": 257100 }, { "epoch": 1.929786387953091, "grad_norm": 1.4929662942886353, "learning_rate": 1.7850148935691295e-05, "loss": 2.7393, "step": 257200 }, { "epoch": 1.9305366937026838, "grad_norm": 1.7507370710372925, "learning_rate": 1.7837643839864746e-05, "loss": 2.6988, "step": 257300 }, { "epoch": 1.9312869994522768, "grad_norm": 1.5197575092315674, "learning_rate": 1.7825138744038197e-05, "loss": 2.5892, "step": 257400 }, { "epoch": 1.9320373052018698, "grad_norm": 2.2296142578125, "learning_rate": 1.7812633648211648e-05, "loss": 2.7295, "step": 257500 }, { "epoch": 1.9327876109514626, "grad_norm": 2.5994763374328613, "learning_rate": 1.78001285523851e-05, "loss": 2.5676, "step": 257600 }, { "epoch": 1.9335379167010558, "grad_norm": 1.5836387872695923, "learning_rate": 1.778762345655855e-05, "loss": 2.6746, "step": 257700 }, { "epoch": 1.9342882224506486, "grad_norm": 1.4261826276779175, "learning_rate": 1.7775118360731997e-05, "loss": 2.644, "step": 257800 }, { "epoch": 1.9350385282002416, "grad_norm": Infinity, "learning_rate": 1.7762613264905452e-05, "loss": 2.5037, "step": 257900 }, { "epoch": 1.9357888339498346, "grad_norm": 1.868338704109192, "learning_rate": 1.7750233220037166e-05, "loss": 2.5762, "step": 258000 }, { "epoch": 1.9365391396994274, "grad_norm": 1.9578315019607544, "learning_rate": 1.7737728124210617e-05, "loss": 2.7248, "step": 258100 }, { "epoch": 1.9372894454490206, "grad_norm": 1.333659052848816, "learning_rate": 1.7725223028384068e-05, "loss": 2.6117, "step": 258200 }, { "epoch": 1.9380397511986134, "grad_norm": 2.1805105209350586, "learning_rate": 1.771271793255752e-05, "loss": 2.6957, "step": 258300 }, { "epoch": 1.9387900569482064, "grad_norm": 1.7275646924972534, "learning_rate": 1.770021283673097e-05, "loss": 2.6193, "step": 258400 }, { "epoch": 1.9395403626977994, "grad_norm": 2.1591503620147705, "learning_rate": 1.7687707740904418e-05, "loss": 2.587, "step": 258500 }, { "epoch": 1.9402906684473922, "grad_norm": 2.4728832244873047, "learning_rate": 1.7675202645077872e-05, "loss": 2.6334, "step": 258600 }, { "epoch": 1.9410409741969854, "grad_norm": 1.7293034791946411, "learning_rate": 1.766269754925132e-05, "loss": 2.4865, "step": 258700 }, { "epoch": 1.9417912799465782, "grad_norm": 1.6920278072357178, "learning_rate": 1.765019245342477e-05, "loss": 2.6464, "step": 258800 }, { "epoch": 1.9425415856961712, "grad_norm": 1.7649983167648315, "learning_rate": 1.763768735759822e-05, "loss": 2.7817, "step": 258900 }, { "epoch": 1.9432918914457642, "grad_norm": 2.1094963550567627, "learning_rate": 1.7625182261771673e-05, "loss": 2.6338, "step": 259000 }, { "epoch": 1.944042197195357, "grad_norm": 3.053461790084839, "learning_rate": 1.7612677165945124e-05, "loss": 2.7145, "step": 259100 }, { "epoch": 1.9447925029449502, "grad_norm": 2.723445177078247, "learning_rate": 1.7600172070118575e-05, "loss": 2.6512, "step": 259200 }, { "epoch": 1.945542808694543, "grad_norm": 1.6987370252609253, "learning_rate": 1.7587666974292026e-05, "loss": 2.6971, "step": 259300 }, { "epoch": 1.946293114444136, "grad_norm": 1.473152995109558, "learning_rate": 1.7575161878465477e-05, "loss": 2.6498, "step": 259400 }, { "epoch": 1.947043420193729, "grad_norm": 1.554927945137024, "learning_rate": 1.7562656782638924e-05, "loss": 2.7004, "step": 259500 }, { "epoch": 1.9477937259433218, "grad_norm": 2.4804141521453857, "learning_rate": 1.755015168681238e-05, "loss": 2.6936, "step": 259600 }, { "epoch": 1.948544031692915, "grad_norm": 1.3431508541107178, "learning_rate": 1.7537646590985826e-05, "loss": 2.7807, "step": 259700 }, { "epoch": 1.9492943374425078, "grad_norm": 2.235924243927002, "learning_rate": 1.7525141495159277e-05, "loss": 2.6641, "step": 259800 }, { "epoch": 1.9500446431921008, "grad_norm": 1.6332472562789917, "learning_rate": 1.751263639933273e-05, "loss": 2.6787, "step": 259900 }, { "epoch": 1.9507949489416938, "grad_norm": 1.7098653316497803, "learning_rate": 1.7500256354464446e-05, "loss": 2.5077, "step": 260000 }, { "epoch": 1.9515452546912866, "grad_norm": 2.581625461578369, "learning_rate": 1.7487751258637897e-05, "loss": 2.7294, "step": 260100 }, { "epoch": 1.9522955604408798, "grad_norm": 1.8880912065505981, "learning_rate": 1.7475246162811344e-05, "loss": 2.6649, "step": 260200 }, { "epoch": 1.9530458661904726, "grad_norm": 2.279460906982422, "learning_rate": 1.74627410669848e-05, "loss": 2.745, "step": 260300 }, { "epoch": 1.9537961719400656, "grad_norm": 1.5559329986572266, "learning_rate": 1.745023597115825e-05, "loss": 2.523, "step": 260400 }, { "epoch": 1.9545464776896586, "grad_norm": 1.9194825887680054, "learning_rate": 1.7437730875331697e-05, "loss": 2.4987, "step": 260500 }, { "epoch": 1.9552967834392514, "grad_norm": 1.8490175008773804, "learning_rate": 1.742522577950515e-05, "loss": 2.5883, "step": 260600 }, { "epoch": 1.9560470891888444, "grad_norm": 2.067253828048706, "learning_rate": 1.74127206836786e-05, "loss": 2.7414, "step": 260700 }, { "epoch": 1.9567973949384374, "grad_norm": 1.5518747568130493, "learning_rate": 1.740021558785205e-05, "loss": 2.5706, "step": 260800 }, { "epoch": 1.9575477006880304, "grad_norm": 1.7387211322784424, "learning_rate": 1.7387835542983765e-05, "loss": 2.6245, "step": 260900 }, { "epoch": 1.9582980064376234, "grad_norm": 2.2671072483062744, "learning_rate": 1.737533044715722e-05, "loss": 2.7475, "step": 261000 }, { "epoch": 1.9590483121872162, "grad_norm": 1.3497689962387085, "learning_rate": 1.736282535133067e-05, "loss": 2.7124, "step": 261100 }, { "epoch": 1.9597986179368092, "grad_norm": 2.346736192703247, "learning_rate": 1.7350320255504118e-05, "loss": 2.7734, "step": 261200 }, { "epoch": 1.9605489236864022, "grad_norm": 1.556463360786438, "learning_rate": 1.733781515967757e-05, "loss": 2.6186, "step": 261300 }, { "epoch": 1.9612992294359952, "grad_norm": 1.5068199634552002, "learning_rate": 1.732531006385102e-05, "loss": 2.746, "step": 261400 }, { "epoch": 1.9620495351855882, "grad_norm": 2.7748889923095703, "learning_rate": 1.731280496802447e-05, "loss": 2.7127, "step": 261500 }, { "epoch": 1.962799840935181, "grad_norm": 1.6793774366378784, "learning_rate": 1.730029987219792e-05, "loss": 2.6464, "step": 261600 }, { "epoch": 1.963550146684774, "grad_norm": 2.07407808303833, "learning_rate": 1.7287794776371373e-05, "loss": 2.6515, "step": 261700 }, { "epoch": 1.964300452434367, "grad_norm": 1.8998160362243652, "learning_rate": 1.7275289680544824e-05, "loss": 2.7431, "step": 261800 }, { "epoch": 1.9650507581839598, "grad_norm": 1.5926882028579712, "learning_rate": 1.726278458471827e-05, "loss": 2.6904, "step": 261900 }, { "epoch": 1.965801063933553, "grad_norm": 1.4189183712005615, "learning_rate": 1.7250279488891726e-05, "loss": 2.6298, "step": 262000 }, { "epoch": 1.9665513696831458, "grad_norm": 1.603713035583496, "learning_rate": 1.7237774393065177e-05, "loss": 2.6631, "step": 262100 }, { "epoch": 1.9673016754327388, "grad_norm": 2.3907320499420166, "learning_rate": 1.7225269297238624e-05, "loss": 2.6531, "step": 262200 }, { "epoch": 1.9680519811823318, "grad_norm": 2.5791590213775635, "learning_rate": 1.7212764201412075e-05, "loss": 2.6777, "step": 262300 }, { "epoch": 1.9688022869319246, "grad_norm": 2.541947603225708, "learning_rate": 1.7200259105585526e-05, "loss": 2.6233, "step": 262400 }, { "epoch": 1.9695525926815178, "grad_norm": 2.1411702632904053, "learning_rate": 1.7187754009758977e-05, "loss": 2.7185, "step": 262500 }, { "epoch": 1.9703028984311106, "grad_norm": 1.3984342813491821, "learning_rate": 1.717524891393243e-05, "loss": 2.7806, "step": 262600 }, { "epoch": 1.9710532041807036, "grad_norm": 3.437453269958496, "learning_rate": 1.716274381810588e-05, "loss": 2.5969, "step": 262700 }, { "epoch": 1.9718035099302966, "grad_norm": 1.5812036991119385, "learning_rate": 1.715023872227933e-05, "loss": 2.7224, "step": 262800 }, { "epoch": 1.9725538156798894, "grad_norm": 1.937514305114746, "learning_rate": 1.713773362645278e-05, "loss": 2.6646, "step": 262900 }, { "epoch": 1.9733041214294826, "grad_norm": 2.475349187850952, "learning_rate": 1.712522853062623e-05, "loss": 2.501, "step": 263000 }, { "epoch": 1.9740544271790754, "grad_norm": 2.4121994972229004, "learning_rate": 1.7112723434799684e-05, "loss": 2.5797, "step": 263100 }, { "epoch": 1.9748047329286684, "grad_norm": 1.491394281387329, "learning_rate": 1.7100343389931398e-05, "loss": 2.5247, "step": 263200 }, { "epoch": 1.9755550386782614, "grad_norm": 2.6575005054473877, "learning_rate": 1.708783829410485e-05, "loss": 2.6553, "step": 263300 }, { "epoch": 1.9763053444278542, "grad_norm": 2.2641093730926514, "learning_rate": 1.70753331982783e-05, "loss": 2.5821, "step": 263400 }, { "epoch": 1.9770556501774474, "grad_norm": 1.6803584098815918, "learning_rate": 1.706282810245175e-05, "loss": 2.6141, "step": 263500 }, { "epoch": 1.9778059559270402, "grad_norm": 1.506150484085083, "learning_rate": 1.70503230066252e-05, "loss": 2.7188, "step": 263600 }, { "epoch": 1.9785562616766332, "grad_norm": 1.6019290685653687, "learning_rate": 1.703781791079865e-05, "loss": 2.5571, "step": 263700 }, { "epoch": 1.9793065674262262, "grad_norm": 1.5638952255249023, "learning_rate": 1.7025312814972104e-05, "loss": 2.6212, "step": 263800 }, { "epoch": 1.980056873175819, "grad_norm": 1.5205044746398926, "learning_rate": 1.701280771914555e-05, "loss": 2.5281, "step": 263900 }, { "epoch": 1.9808071789254122, "grad_norm": 1.29277765750885, "learning_rate": 1.7000302623319002e-05, "loss": 2.6136, "step": 264000 }, { "epoch": 1.981557484675005, "grad_norm": 1.7331585884094238, "learning_rate": 1.6987797527492453e-05, "loss": 2.6637, "step": 264100 }, { "epoch": 1.982307790424598, "grad_norm": 2.64298939704895, "learning_rate": 1.6975292431665904e-05, "loss": 2.5847, "step": 264200 }, { "epoch": 1.983058096174191, "grad_norm": 2.5457816123962402, "learning_rate": 1.6962787335839355e-05, "loss": 2.6022, "step": 264300 }, { "epoch": 1.9838084019237838, "grad_norm": 2.404686689376831, "learning_rate": 1.6950282240012806e-05, "loss": 2.6649, "step": 264400 }, { "epoch": 1.984558707673377, "grad_norm": 2.3296926021575928, "learning_rate": 1.6937777144186257e-05, "loss": 2.7805, "step": 264500 }, { "epoch": 1.9853090134229698, "grad_norm": 1.8637409210205078, "learning_rate": 1.692527204835971e-05, "loss": 2.6272, "step": 264600 }, { "epoch": 1.9860593191725628, "grad_norm": 1.9200565814971924, "learning_rate": 1.6912766952533156e-05, "loss": 2.64, "step": 264700 }, { "epoch": 1.9868096249221558, "grad_norm": 1.738950490951538, "learning_rate": 1.690026185670661e-05, "loss": 2.7733, "step": 264800 }, { "epoch": 1.9875599306717486, "grad_norm": 2.1724181175231934, "learning_rate": 1.688775676088006e-05, "loss": 2.6501, "step": 264900 }, { "epoch": 1.9883102364213419, "grad_norm": 2.8755903244018555, "learning_rate": 1.687525166505351e-05, "loss": 2.572, "step": 265000 }, { "epoch": 1.9890605421709346, "grad_norm": 1.465320110321045, "learning_rate": 1.686274656922696e-05, "loss": 2.6432, "step": 265100 }, { "epoch": 1.9898108479205276, "grad_norm": 1.6557397842407227, "learning_rate": 1.685024147340041e-05, "loss": 2.6518, "step": 265200 }, { "epoch": 1.9905611536701207, "grad_norm": 2.5539002418518066, "learning_rate": 1.6837736377573862e-05, "loss": 2.6236, "step": 265300 }, { "epoch": 1.9913114594197134, "grad_norm": 1.5264166593551636, "learning_rate": 1.6825231281747313e-05, "loss": 2.6144, "step": 265400 }, { "epoch": 1.9920617651693064, "grad_norm": 2.2079837322235107, "learning_rate": 1.6812726185920764e-05, "loss": 2.5527, "step": 265500 }, { "epoch": 1.9928120709188994, "grad_norm": 2.326749801635742, "learning_rate": 1.6800221090094215e-05, "loss": 2.6178, "step": 265600 }, { "epoch": 1.9935623766684925, "grad_norm": 1.9287779331207275, "learning_rate": 1.6787715994267663e-05, "loss": 2.5564, "step": 265700 }, { "epoch": 1.9943126824180855, "grad_norm": 1.6664142608642578, "learning_rate": 1.6775210898441114e-05, "loss": 2.6149, "step": 265800 }, { "epoch": 1.9950629881676782, "grad_norm": 1.5867160558700562, "learning_rate": 1.6762705802614568e-05, "loss": 2.7218, "step": 265900 }, { "epoch": 1.9958132939172712, "grad_norm": 1.5249080657958984, "learning_rate": 1.6750200706788016e-05, "loss": 2.6322, "step": 266000 }, { "epoch": 1.9965635996668643, "grad_norm": 1.8630748987197876, "learning_rate": 1.6737695610961467e-05, "loss": 2.6529, "step": 266100 }, { "epoch": 1.9973139054164573, "grad_norm": 1.7823160886764526, "learning_rate": 1.6725190515134918e-05, "loss": 2.6514, "step": 266200 }, { "epoch": 1.9980642111660503, "grad_norm": 1.4697669744491577, "learning_rate": 1.671268541930837e-05, "loss": 2.6114, "step": 266300 }, { "epoch": 1.998814516915643, "grad_norm": 2.2857394218444824, "learning_rate": 1.670018032348182e-05, "loss": 2.5723, "step": 266400 }, { "epoch": 1.999564822665236, "grad_norm": 1.7718755006790161, "learning_rate": 1.668767522765527e-05, "loss": 2.6927, "step": 266500 }, { "epoch": 2.000315128414829, "grad_norm": 2.2402408123016357, "learning_rate": 1.6675170131828722e-05, "loss": 2.6648, "step": 266600 }, { "epoch": 2.001065434164422, "grad_norm": 2.547269821166992, "learning_rate": 1.6662665036002173e-05, "loss": 2.5616, "step": 266700 }, { "epoch": 2.001815739914015, "grad_norm": 1.4752693176269531, "learning_rate": 1.665015994017562e-05, "loss": 2.7266, "step": 266800 }, { "epoch": 2.002566045663608, "grad_norm": 2.793300151824951, "learning_rate": 1.6637654844349075e-05, "loss": 2.5414, "step": 266900 }, { "epoch": 2.003316351413201, "grad_norm": 1.277145266532898, "learning_rate": 1.6625149748522523e-05, "loss": 2.6243, "step": 267000 }, { "epoch": 2.004066657162794, "grad_norm": 2.261084794998169, "learning_rate": 1.6612644652695974e-05, "loss": 2.527, "step": 267100 }, { "epoch": 2.0048169629123866, "grad_norm": 1.852373719215393, "learning_rate": 1.660026460782769e-05, "loss": 2.604, "step": 267200 }, { "epoch": 2.00556726866198, "grad_norm": 1.6193424463272095, "learning_rate": 1.6587759512001142e-05, "loss": 2.6073, "step": 267300 }, { "epoch": 2.0063175744115727, "grad_norm": 1.630508303642273, "learning_rate": 1.6575254416174593e-05, "loss": 2.5649, "step": 267400 }, { "epoch": 2.007067880161166, "grad_norm": 1.7177302837371826, "learning_rate": 1.656274932034804e-05, "loss": 2.4241, "step": 267500 }, { "epoch": 2.0078181859107587, "grad_norm": 1.8808315992355347, "learning_rate": 1.6550244224521495e-05, "loss": 2.6337, "step": 267600 }, { "epoch": 2.0085684916603515, "grad_norm": 1.861169457435608, "learning_rate": 1.6537739128694943e-05, "loss": 2.5579, "step": 267700 }, { "epoch": 2.0093187974099447, "grad_norm": 2.9229841232299805, "learning_rate": 1.6525234032868394e-05, "loss": 2.6212, "step": 267800 }, { "epoch": 2.0100691031595375, "grad_norm": 3.0620925426483154, "learning_rate": 1.6512728937041845e-05, "loss": 2.5829, "step": 267900 }, { "epoch": 2.0108194089091307, "grad_norm": 2.0653817653656006, "learning_rate": 1.6500223841215296e-05, "loss": 2.6461, "step": 268000 }, { "epoch": 2.0115697146587235, "grad_norm": 1.7721291780471802, "learning_rate": 1.6487718745388747e-05, "loss": 2.5019, "step": 268100 }, { "epoch": 2.0123200204083163, "grad_norm": 2.9883275032043457, "learning_rate": 1.6475213649562195e-05, "loss": 2.6026, "step": 268200 }, { "epoch": 2.0130703261579095, "grad_norm": 1.8960425853729248, "learning_rate": 1.646270855373565e-05, "loss": 2.5515, "step": 268300 }, { "epoch": 2.0138206319075023, "grad_norm": 1.6239732503890991, "learning_rate": 1.64502034579091e-05, "loss": 2.6849, "step": 268400 }, { "epoch": 2.014570937657095, "grad_norm": 1.8226354122161865, "learning_rate": 1.6437698362082548e-05, "loss": 2.7804, "step": 268500 }, { "epoch": 2.0153212434066883, "grad_norm": 3.095590353012085, "learning_rate": 1.6425193266256002e-05, "loss": 2.5062, "step": 268600 }, { "epoch": 2.016071549156281, "grad_norm": 2.5039119720458984, "learning_rate": 1.6412688170429453e-05, "loss": 2.567, "step": 268700 }, { "epoch": 2.0168218549058743, "grad_norm": 3.3203885555267334, "learning_rate": 1.64001830746029e-05, "loss": 2.6203, "step": 268800 }, { "epoch": 2.017572160655467, "grad_norm": 3.0031638145446777, "learning_rate": 1.638767797877635e-05, "loss": 2.7586, "step": 268900 }, { "epoch": 2.01832246640506, "grad_norm": 1.8770102262496948, "learning_rate": 1.6375172882949803e-05, "loss": 2.6855, "step": 269000 }, { "epoch": 2.019072772154653, "grad_norm": 1.4221240282058716, "learning_rate": 1.636279283808152e-05, "loss": 2.5157, "step": 269100 }, { "epoch": 2.019823077904246, "grad_norm": 2.9415597915649414, "learning_rate": 1.6350287742254968e-05, "loss": 2.6037, "step": 269200 }, { "epoch": 2.020573383653839, "grad_norm": 2.2768921852111816, "learning_rate": 1.6337782646428422e-05, "loss": 2.6712, "step": 269300 }, { "epoch": 2.021323689403432, "grad_norm": 2.2945632934570312, "learning_rate": 1.6325277550601873e-05, "loss": 2.747, "step": 269400 }, { "epoch": 2.0220739951530247, "grad_norm": 1.6556711196899414, "learning_rate": 1.631277245477532e-05, "loss": 2.571, "step": 269500 }, { "epoch": 2.022824300902618, "grad_norm": 1.6495393514633179, "learning_rate": 1.6300267358948772e-05, "loss": 2.5345, "step": 269600 }, { "epoch": 2.0235746066522107, "grad_norm": 1.551377296447754, "learning_rate": 1.6287762263122223e-05, "loss": 2.6586, "step": 269700 }, { "epoch": 2.024324912401804, "grad_norm": 2.1542305946350098, "learning_rate": 1.6275257167295674e-05, "loss": 2.6358, "step": 269800 }, { "epoch": 2.0250752181513967, "grad_norm": 1.7477498054504395, "learning_rate": 1.6262752071469125e-05, "loss": 2.6156, "step": 269900 }, { "epoch": 2.0258255239009895, "grad_norm": 3.1574432849884033, "learning_rate": 1.6250246975642576e-05, "loss": 2.5673, "step": 270000 }, { "epoch": 2.0265758296505827, "grad_norm": 2.975872039794922, "learning_rate": 1.6237741879816027e-05, "loss": 2.706, "step": 270100 }, { "epoch": 2.0273261354001755, "grad_norm": 2.238830089569092, "learning_rate": 1.6225236783989474e-05, "loss": 2.6655, "step": 270200 }, { "epoch": 2.0280764411497687, "grad_norm": 1.6144245862960815, "learning_rate": 1.6212731688162925e-05, "loss": 2.524, "step": 270300 }, { "epoch": 2.0288267468993615, "grad_norm": 1.1884866952896118, "learning_rate": 1.620022659233638e-05, "loss": 2.5979, "step": 270400 }, { "epoch": 2.0295770526489543, "grad_norm": 3.8817458152770996, "learning_rate": 1.6187721496509828e-05, "loss": 2.6758, "step": 270500 }, { "epoch": 2.0303273583985475, "grad_norm": 1.2338323593139648, "learning_rate": 1.617521640068328e-05, "loss": 2.654, "step": 270600 }, { "epoch": 2.0310776641481403, "grad_norm": 1.5768078565597534, "learning_rate": 1.616271130485673e-05, "loss": 2.669, "step": 270700 }, { "epoch": 2.0318279698977335, "grad_norm": 3.7660133838653564, "learning_rate": 1.615020620903018e-05, "loss": 2.7412, "step": 270800 }, { "epoch": 2.0325782756473263, "grad_norm": 1.6602234840393066, "learning_rate": 1.613770111320363e-05, "loss": 2.5335, "step": 270900 }, { "epoch": 2.033328581396919, "grad_norm": 1.9954493045806885, "learning_rate": 1.6125196017377083e-05, "loss": 2.6653, "step": 271000 }, { "epoch": 2.0340788871465123, "grad_norm": 2.781728744506836, "learning_rate": 1.6112690921550534e-05, "loss": 2.4991, "step": 271100 }, { "epoch": 2.034829192896105, "grad_norm": 1.9195590019226074, "learning_rate": 1.6100185825723985e-05, "loss": 2.5325, "step": 271200 }, { "epoch": 2.0355794986456983, "grad_norm": 1.7321584224700928, "learning_rate": 1.6087680729897432e-05, "loss": 2.5963, "step": 271300 }, { "epoch": 2.036329804395291, "grad_norm": 1.8383845090866089, "learning_rate": 1.6075175634070887e-05, "loss": 2.6913, "step": 271400 }, { "epoch": 2.037080110144884, "grad_norm": 2.932032585144043, "learning_rate": 1.6062670538244334e-05, "loss": 2.5611, "step": 271500 }, { "epoch": 2.037830415894477, "grad_norm": 1.732875108718872, "learning_rate": 1.6050165442417785e-05, "loss": 2.7102, "step": 271600 }, { "epoch": 2.03858072164407, "grad_norm": 2.32299542427063, "learning_rate": 1.6037660346591236e-05, "loss": 2.7147, "step": 271700 }, { "epoch": 2.039331027393663, "grad_norm": 1.4475300312042236, "learning_rate": 1.6025155250764687e-05, "loss": 2.6696, "step": 271800 }, { "epoch": 2.040081333143256, "grad_norm": 2.012021064758301, "learning_rate": 1.601265015493814e-05, "loss": 2.4894, "step": 271900 }, { "epoch": 2.0408316388928487, "grad_norm": 1.5530692338943481, "learning_rate": 1.6000145059111586e-05, "loss": 2.7016, "step": 272000 }, { "epoch": 2.041581944642442, "grad_norm": 3.056774854660034, "learning_rate": 1.598763996328504e-05, "loss": 2.6441, "step": 272100 }, { "epoch": 2.0423322503920347, "grad_norm": 1.5705320835113525, "learning_rate": 1.597513486745849e-05, "loss": 2.5604, "step": 272200 }, { "epoch": 2.043082556141628, "grad_norm": 2.4277994632720947, "learning_rate": 1.596262977163194e-05, "loss": 2.5746, "step": 272300 }, { "epoch": 2.0438328618912207, "grad_norm": 2.794066905975342, "learning_rate": 1.5950124675805393e-05, "loss": 2.5118, "step": 272400 }, { "epoch": 2.0445831676408135, "grad_norm": 2.049724817276001, "learning_rate": 1.5937744630937107e-05, "loss": 2.4712, "step": 272500 }, { "epoch": 2.0453334733904067, "grad_norm": 2.2150654792785645, "learning_rate": 1.592523953511056e-05, "loss": 2.8041, "step": 272600 }, { "epoch": 2.0460837791399995, "grad_norm": 2.3555567264556885, "learning_rate": 1.5912734439284006e-05, "loss": 2.6587, "step": 272700 }, { "epoch": 2.0468340848895927, "grad_norm": 3.0987696647644043, "learning_rate": 1.590022934345746e-05, "loss": 2.7353, "step": 272800 }, { "epoch": 2.0475843906391855, "grad_norm": 2.4583680629730225, "learning_rate": 1.588772424763091e-05, "loss": 2.5549, "step": 272900 }, { "epoch": 2.0483346963887783, "grad_norm": 1.687198519706726, "learning_rate": 1.587521915180436e-05, "loss": 2.5018, "step": 273000 }, { "epoch": 2.0490850021383715, "grad_norm": 1.785650372505188, "learning_rate": 1.5862714055977814e-05, "loss": 2.6685, "step": 273100 }, { "epoch": 2.0498353078879643, "grad_norm": 1.732263207435608, "learning_rate": 1.5850208960151265e-05, "loss": 2.5965, "step": 273200 }, { "epoch": 2.050585613637557, "grad_norm": 1.425388216972351, "learning_rate": 1.5837703864324712e-05, "loss": 2.7476, "step": 273300 }, { "epoch": 2.0513359193871503, "grad_norm": 1.9841063022613525, "learning_rate": 1.5825198768498163e-05, "loss": 2.6501, "step": 273400 }, { "epoch": 2.052086225136743, "grad_norm": 1.7648241519927979, "learning_rate": 1.5812693672671614e-05, "loss": 2.4575, "step": 273500 }, { "epoch": 2.0528365308863363, "grad_norm": 1.4637242555618286, "learning_rate": 1.5800188576845065e-05, "loss": 2.6032, "step": 273600 }, { "epoch": 2.053586836635929, "grad_norm": 1.6486506462097168, "learning_rate": 1.5787683481018516e-05, "loss": 2.518, "step": 273700 }, { "epoch": 2.054337142385522, "grad_norm": 2.0791258811950684, "learning_rate": 1.5775178385191967e-05, "loss": 2.6636, "step": 273800 }, { "epoch": 2.055087448135115, "grad_norm": 2.574260711669922, "learning_rate": 1.576267328936542e-05, "loss": 2.4566, "step": 273900 }, { "epoch": 2.055837753884708, "grad_norm": 2.792750835418701, "learning_rate": 1.5750168193538866e-05, "loss": 2.6949, "step": 274000 }, { "epoch": 2.056588059634301, "grad_norm": 1.3676414489746094, "learning_rate": 1.5737663097712317e-05, "loss": 2.5863, "step": 274100 }, { "epoch": 2.057338365383894, "grad_norm": 1.5195049047470093, "learning_rate": 1.572515800188577e-05, "loss": 2.6203, "step": 274200 }, { "epoch": 2.0580886711334867, "grad_norm": 1.6325689554214478, "learning_rate": 1.571265290605922e-05, "loss": 2.7858, "step": 274300 }, { "epoch": 2.05883897688308, "grad_norm": 2.055983781814575, "learning_rate": 1.570014781023267e-05, "loss": 2.3532, "step": 274400 }, { "epoch": 2.0595892826326727, "grad_norm": 2.3013532161712646, "learning_rate": 1.568764271440612e-05, "loss": 2.7134, "step": 274500 }, { "epoch": 2.060339588382266, "grad_norm": 1.8955894708633423, "learning_rate": 1.5675137618579572e-05, "loss": 2.6393, "step": 274600 }, { "epoch": 2.0610898941318587, "grad_norm": 1.3596699237823486, "learning_rate": 1.5662632522753023e-05, "loss": 2.5276, "step": 274700 }, { "epoch": 2.0618401998814515, "grad_norm": 1.5457043647766113, "learning_rate": 1.5650127426926474e-05, "loss": 2.5148, "step": 274800 }, { "epoch": 2.0625905056310447, "grad_norm": 2.5071654319763184, "learning_rate": 1.5637622331099925e-05, "loss": 2.6174, "step": 274900 }, { "epoch": 2.0633408113806375, "grad_norm": 3.012010335922241, "learning_rate": 1.5625117235273376e-05, "loss": 2.6002, "step": 275000 }, { "epoch": 2.0640911171302307, "grad_norm": 1.6617134809494019, "learning_rate": 1.5612612139446824e-05, "loss": 2.5307, "step": 275100 }, { "epoch": 2.0648414228798235, "grad_norm": 2.0190725326538086, "learning_rate": 1.5600107043620278e-05, "loss": 2.5297, "step": 275200 }, { "epoch": 2.0655917286294163, "grad_norm": 1.7301822900772095, "learning_rate": 1.5587601947793726e-05, "loss": 2.6916, "step": 275300 }, { "epoch": 2.0663420343790095, "grad_norm": 1.8581063747406006, "learning_rate": 1.5575096851967177e-05, "loss": 2.6523, "step": 275400 }, { "epoch": 2.0670923401286023, "grad_norm": 4.131937503814697, "learning_rate": 1.5562591756140628e-05, "loss": 2.7172, "step": 275500 }, { "epoch": 2.0678426458781956, "grad_norm": 1.4995851516723633, "learning_rate": 1.555008666031408e-05, "loss": 2.6425, "step": 275600 }, { "epoch": 2.0685929516277883, "grad_norm": 1.2580821514129639, "learning_rate": 1.553758156448753e-05, "loss": 2.4013, "step": 275700 }, { "epoch": 2.069343257377381, "grad_norm": 1.74099862575531, "learning_rate": 1.5525076468660977e-05, "loss": 2.5928, "step": 275800 }, { "epoch": 2.0700935631269743, "grad_norm": 2.3499698638916016, "learning_rate": 1.5512571372834432e-05, "loss": 2.5939, "step": 275900 }, { "epoch": 2.070843868876567, "grad_norm": 2.589768648147583, "learning_rate": 1.5500066277007883e-05, "loss": 2.667, "step": 276000 }, { "epoch": 2.0715941746261604, "grad_norm": 2.029981851577759, "learning_rate": 1.548756118118133e-05, "loss": 2.5283, "step": 276100 }, { "epoch": 2.072344480375753, "grad_norm": 1.9751225709915161, "learning_rate": 1.547505608535478e-05, "loss": 2.6, "step": 276200 }, { "epoch": 2.073094786125346, "grad_norm": 2.292552947998047, "learning_rate": 1.5462550989528233e-05, "loss": 2.7221, "step": 276300 }, { "epoch": 2.073845091874939, "grad_norm": 1.5877695083618164, "learning_rate": 1.5450045893701684e-05, "loss": 2.6302, "step": 276400 }, { "epoch": 2.074595397624532, "grad_norm": 1.5271410942077637, "learning_rate": 1.5437665848833398e-05, "loss": 2.6911, "step": 276500 }, { "epoch": 2.075345703374125, "grad_norm": 1.4667056798934937, "learning_rate": 1.5425160753006852e-05, "loss": 2.7396, "step": 276600 }, { "epoch": 2.076096009123718, "grad_norm": 1.8413212299346924, "learning_rate": 1.5412655657180303e-05, "loss": 2.6488, "step": 276700 }, { "epoch": 2.0768463148733107, "grad_norm": 2.2260541915893555, "learning_rate": 1.540015056135375e-05, "loss": 2.6342, "step": 276800 }, { "epoch": 2.077596620622904, "grad_norm": 3.2603626251220703, "learning_rate": 1.5387645465527205e-05, "loss": 2.676, "step": 276900 }, { "epoch": 2.0783469263724967, "grad_norm": 2.219975709915161, "learning_rate": 1.5375140369700653e-05, "loss": 2.5673, "step": 277000 }, { "epoch": 2.07909723212209, "grad_norm": 1.822508692741394, "learning_rate": 1.536276032483237e-05, "loss": 2.5401, "step": 277100 }, { "epoch": 2.0798475378716827, "grad_norm": 2.6338443756103516, "learning_rate": 1.5350255229005818e-05, "loss": 2.5823, "step": 277200 }, { "epoch": 2.0805978436212755, "grad_norm": 1.5245455503463745, "learning_rate": 1.5337750133179272e-05, "loss": 2.622, "step": 277300 }, { "epoch": 2.0813481493708688, "grad_norm": 2.5660324096679688, "learning_rate": 1.5325245037352723e-05, "loss": 2.5997, "step": 277400 }, { "epoch": 2.0820984551204615, "grad_norm": 3.9620282649993896, "learning_rate": 1.531273994152617e-05, "loss": 2.6333, "step": 277500 }, { "epoch": 2.0828487608700543, "grad_norm": 2.239569664001465, "learning_rate": 1.5300234845699625e-05, "loss": 2.5862, "step": 277600 }, { "epoch": 2.0835990666196476, "grad_norm": 2.3694260120391846, "learning_rate": 1.5287729749873076e-05, "loss": 2.6037, "step": 277700 }, { "epoch": 2.0843493723692403, "grad_norm": 2.2301387786865234, "learning_rate": 1.5275224654046524e-05, "loss": 2.4938, "step": 277800 }, { "epoch": 2.0850996781188336, "grad_norm": 3.1902756690979004, "learning_rate": 1.5262719558219975e-05, "loss": 2.6848, "step": 277900 }, { "epoch": 2.0858499838684263, "grad_norm": 1.7883789539337158, "learning_rate": 1.5250214462393426e-05, "loss": 2.5356, "step": 278000 }, { "epoch": 2.086600289618019, "grad_norm": 1.3899365663528442, "learning_rate": 1.5237709366566877e-05, "loss": 2.7439, "step": 278100 }, { "epoch": 2.0873505953676124, "grad_norm": 2.1089305877685547, "learning_rate": 1.5225204270740328e-05, "loss": 2.6056, "step": 278200 }, { "epoch": 2.088100901117205, "grad_norm": 3.484403133392334, "learning_rate": 1.5212699174913777e-05, "loss": 2.611, "step": 278300 }, { "epoch": 2.0888512068667984, "grad_norm": 1.5124826431274414, "learning_rate": 1.520019407908723e-05, "loss": 2.5701, "step": 278400 }, { "epoch": 2.089601512616391, "grad_norm": 1.7955474853515625, "learning_rate": 1.5187688983260678e-05, "loss": 2.5443, "step": 278500 }, { "epoch": 2.090351818365984, "grad_norm": 1.6321946382522583, "learning_rate": 1.517518388743413e-05, "loss": 2.759, "step": 278600 }, { "epoch": 2.091102124115577, "grad_norm": 1.6438418626785278, "learning_rate": 1.5162678791607581e-05, "loss": 2.6164, "step": 278700 }, { "epoch": 2.09185242986517, "grad_norm": 1.8935269117355347, "learning_rate": 1.515017369578103e-05, "loss": 2.6436, "step": 278800 }, { "epoch": 2.092602735614763, "grad_norm": 2.1477179527282715, "learning_rate": 1.5137668599954483e-05, "loss": 2.6412, "step": 278900 }, { "epoch": 2.093353041364356, "grad_norm": 2.21591854095459, "learning_rate": 1.5125163504127931e-05, "loss": 2.6866, "step": 279000 }, { "epoch": 2.0941033471139487, "grad_norm": 1.5368157625198364, "learning_rate": 1.5112658408301384e-05, "loss": 2.5845, "step": 279100 }, { "epoch": 2.094853652863542, "grad_norm": 2.843698024749756, "learning_rate": 1.5100153312474835e-05, "loss": 2.6065, "step": 279200 }, { "epoch": 2.0956039586131348, "grad_norm": 2.1951515674591064, "learning_rate": 1.5087648216648284e-05, "loss": 2.681, "step": 279300 }, { "epoch": 2.096354264362728, "grad_norm": 1.6951334476470947, "learning_rate": 1.5075143120821735e-05, "loss": 2.6676, "step": 279400 }, { "epoch": 2.0971045701123208, "grad_norm": 2.0418477058410645, "learning_rate": 1.5062638024995188e-05, "loss": 2.544, "step": 279500 }, { "epoch": 2.0978548758619135, "grad_norm": 2.942700147628784, "learning_rate": 1.5050132929168637e-05, "loss": 2.6277, "step": 279600 }, { "epoch": 2.0986051816115068, "grad_norm": 1.6399418115615845, "learning_rate": 1.5037627833342088e-05, "loss": 2.7145, "step": 279700 }, { "epoch": 2.0993554873610996, "grad_norm": 1.2473076581954956, "learning_rate": 1.5025122737515537e-05, "loss": 2.5276, "step": 279800 }, { "epoch": 2.100105793110693, "grad_norm": 2.64461350440979, "learning_rate": 1.5012617641688988e-05, "loss": 2.5691, "step": 279900 }, { "epoch": 2.1008560988602856, "grad_norm": 3.5007901191711426, "learning_rate": 1.5000112545862441e-05, "loss": 2.7262, "step": 280000 }, { "epoch": 2.1016064046098784, "grad_norm": 1.9394218921661377, "learning_rate": 1.498760745003589e-05, "loss": 2.6153, "step": 280100 }, { "epoch": 2.1023567103594716, "grad_norm": 1.4596965312957764, "learning_rate": 1.4975102354209342e-05, "loss": 2.714, "step": 280200 }, { "epoch": 2.1031070161090644, "grad_norm": 2.6543591022491455, "learning_rate": 1.496259725838279e-05, "loss": 2.64, "step": 280300 }, { "epoch": 2.1038573218586576, "grad_norm": 2.22940993309021, "learning_rate": 1.4950092162556242e-05, "loss": 2.6483, "step": 280400 }, { "epoch": 2.1046076276082504, "grad_norm": 1.6116981506347656, "learning_rate": 1.4937587066729695e-05, "loss": 2.6478, "step": 280500 }, { "epoch": 2.105357933357843, "grad_norm": 1.611094355583191, "learning_rate": 1.4925081970903144e-05, "loss": 2.6524, "step": 280600 }, { "epoch": 2.1061082391074364, "grad_norm": 1.6551082134246826, "learning_rate": 1.4912576875076595e-05, "loss": 2.4957, "step": 280700 }, { "epoch": 2.106858544857029, "grad_norm": 1.6871726512908936, "learning_rate": 1.4900071779250044e-05, "loss": 2.6682, "step": 280800 }, { "epoch": 2.1076088506066224, "grad_norm": 2.3850042819976807, "learning_rate": 1.4887566683423495e-05, "loss": 2.5581, "step": 280900 }, { "epoch": 2.108359156356215, "grad_norm": 2.4673516750335693, "learning_rate": 1.4875061587596948e-05, "loss": 2.7111, "step": 281000 }, { "epoch": 2.109109462105808, "grad_norm": 1.9516642093658447, "learning_rate": 1.4862681542728662e-05, "loss": 2.5769, "step": 281100 }, { "epoch": 2.109859767855401, "grad_norm": 1.5790793895721436, "learning_rate": 1.4850176446902115e-05, "loss": 2.6404, "step": 281200 }, { "epoch": 2.110610073604994, "grad_norm": 1.9544450044631958, "learning_rate": 1.4837671351075564e-05, "loss": 2.6818, "step": 281300 }, { "epoch": 2.111360379354587, "grad_norm": 1.8459813594818115, "learning_rate": 1.4825166255249015e-05, "loss": 2.5653, "step": 281400 }, { "epoch": 2.11211068510418, "grad_norm": 1.8307775259017944, "learning_rate": 1.4812661159422464e-05, "loss": 2.5393, "step": 281500 }, { "epoch": 2.1128609908537728, "grad_norm": 2.436234474182129, "learning_rate": 1.4800156063595915e-05, "loss": 2.6155, "step": 281600 }, { "epoch": 2.113611296603366, "grad_norm": 2.8782477378845215, "learning_rate": 1.4787650967769368e-05, "loss": 2.6726, "step": 281700 }, { "epoch": 2.114361602352959, "grad_norm": 1.912814974784851, "learning_rate": 1.4775145871942816e-05, "loss": 2.7204, "step": 281800 }, { "epoch": 2.115111908102552, "grad_norm": 1.7999627590179443, "learning_rate": 1.4762640776116268e-05, "loss": 2.5673, "step": 281900 }, { "epoch": 2.115862213852145, "grad_norm": 2.2847900390625, "learning_rate": 1.475013568028972e-05, "loss": 2.5685, "step": 282000 }, { "epoch": 2.1166125196017376, "grad_norm": 2.2668302059173584, "learning_rate": 1.4737755635421435e-05, "loss": 2.583, "step": 282100 }, { "epoch": 2.117362825351331, "grad_norm": 2.745530843734741, "learning_rate": 1.4725250539594886e-05, "loss": 2.5055, "step": 282200 }, { "epoch": 2.1181131311009236, "grad_norm": 2.475560426712036, "learning_rate": 1.4712745443768336e-05, "loss": 2.6842, "step": 282300 }, { "epoch": 2.118863436850517, "grad_norm": 1.5045394897460938, "learning_rate": 1.4700240347941788e-05, "loss": 2.7346, "step": 282400 }, { "epoch": 2.1196137426001096, "grad_norm": 2.8855714797973633, "learning_rate": 1.4687735252115238e-05, "loss": 2.488, "step": 282500 }, { "epoch": 2.1203640483497024, "grad_norm": 2.3939929008483887, "learning_rate": 1.4675230156288689e-05, "loss": 2.6446, "step": 282600 }, { "epoch": 2.1211143540992956, "grad_norm": 1.9670120477676392, "learning_rate": 1.466272506046214e-05, "loss": 2.6495, "step": 282700 }, { "epoch": 2.1218646598488884, "grad_norm": 3.028554677963257, "learning_rate": 1.4650219964635589e-05, "loss": 2.6171, "step": 282800 }, { "epoch": 2.122614965598481, "grad_norm": 3.151322603225708, "learning_rate": 1.4637714868809042e-05, "loss": 2.6059, "step": 282900 }, { "epoch": 2.1233652713480744, "grad_norm": 1.807918667793274, "learning_rate": 1.462520977298249e-05, "loss": 2.6275, "step": 283000 }, { "epoch": 2.124115577097667, "grad_norm": 1.5127813816070557, "learning_rate": 1.4612704677155942e-05, "loss": 2.6452, "step": 283100 }, { "epoch": 2.1248658828472604, "grad_norm": 2.076702356338501, "learning_rate": 1.4600199581329393e-05, "loss": 2.538, "step": 283200 }, { "epoch": 2.125616188596853, "grad_norm": 1.9322682619094849, "learning_rate": 1.4587694485502842e-05, "loss": 2.7115, "step": 283300 }, { "epoch": 2.126366494346446, "grad_norm": 2.186235189437866, "learning_rate": 1.4575189389676295e-05, "loss": 2.5297, "step": 283400 }, { "epoch": 2.127116800096039, "grad_norm": 1.8538247346878052, "learning_rate": 1.4562684293849743e-05, "loss": 2.6602, "step": 283500 }, { "epoch": 2.127867105845632, "grad_norm": 2.869323968887329, "learning_rate": 1.4550179198023195e-05, "loss": 2.4786, "step": 283600 }, { "epoch": 2.128617411595225, "grad_norm": 1.538003921508789, "learning_rate": 1.4537674102196646e-05, "loss": 2.571, "step": 283700 }, { "epoch": 2.129367717344818, "grad_norm": 1.5188028812408447, "learning_rate": 1.4525169006370096e-05, "loss": 2.6447, "step": 283800 }, { "epoch": 2.130118023094411, "grad_norm": 2.1261672973632812, "learning_rate": 1.4512663910543547e-05, "loss": 2.5524, "step": 283900 }, { "epoch": 2.130868328844004, "grad_norm": 2.238792896270752, "learning_rate": 1.4500158814717e-05, "loss": 2.624, "step": 284000 }, { "epoch": 2.131618634593597, "grad_norm": 2.7635505199432373, "learning_rate": 1.4487653718890449e-05, "loss": 2.7164, "step": 284100 }, { "epoch": 2.13236894034319, "grad_norm": 1.7492811679840088, "learning_rate": 1.44751486230639e-05, "loss": 2.5259, "step": 284200 }, { "epoch": 2.133119246092783, "grad_norm": 1.870453953742981, "learning_rate": 1.4462643527237349e-05, "loss": 2.6691, "step": 284300 }, { "epoch": 2.1338695518423756, "grad_norm": 1.817758560180664, "learning_rate": 1.4450263482369066e-05, "loss": 2.723, "step": 284400 }, { "epoch": 2.134619857591969, "grad_norm": 2.76562762260437, "learning_rate": 1.4437758386542516e-05, "loss": 2.694, "step": 284500 }, { "epoch": 2.1353701633415616, "grad_norm": 2.245270013809204, "learning_rate": 1.4425253290715967e-05, "loss": 2.5336, "step": 284600 }, { "epoch": 2.136120469091155, "grad_norm": 2.272625207901001, "learning_rate": 1.441274819488942e-05, "loss": 2.6537, "step": 284700 }, { "epoch": 2.1368707748407476, "grad_norm": 2.031076669692993, "learning_rate": 1.4400243099062869e-05, "loss": 2.5057, "step": 284800 }, { "epoch": 2.1376210805903404, "grad_norm": 1.7785309553146362, "learning_rate": 1.438773800323632e-05, "loss": 2.6218, "step": 284900 }, { "epoch": 2.1383713863399336, "grad_norm": 1.3273032903671265, "learning_rate": 1.437523290740977e-05, "loss": 2.7127, "step": 285000 }, { "epoch": 2.1391216920895264, "grad_norm": 2.5998315811157227, "learning_rate": 1.436272781158322e-05, "loss": 2.6141, "step": 285100 }, { "epoch": 2.1398719978391196, "grad_norm": 3.7405354976654053, "learning_rate": 1.4350222715756673e-05, "loss": 2.5989, "step": 285200 }, { "epoch": 2.1406223035887124, "grad_norm": 1.7198535203933716, "learning_rate": 1.4337717619930122e-05, "loss": 2.4351, "step": 285300 }, { "epoch": 2.141372609338305, "grad_norm": 2.0736961364746094, "learning_rate": 1.4325212524103573e-05, "loss": 2.5899, "step": 285400 }, { "epoch": 2.1421229150878984, "grad_norm": 1.7546526193618774, "learning_rate": 1.4312707428277023e-05, "loss": 2.6518, "step": 285500 }, { "epoch": 2.142873220837491, "grad_norm": 2.2827603816986084, "learning_rate": 1.4300202332450474e-05, "loss": 2.6999, "step": 285600 }, { "epoch": 2.1436235265870844, "grad_norm": 1.981650471687317, "learning_rate": 1.4287697236623926e-05, "loss": 2.7106, "step": 285700 }, { "epoch": 2.144373832336677, "grad_norm": 1.6493781805038452, "learning_rate": 1.4275192140797376e-05, "loss": 2.6648, "step": 285800 }, { "epoch": 2.14512413808627, "grad_norm": 1.8726309537887573, "learning_rate": 1.4262687044970827e-05, "loss": 2.5624, "step": 285900 }, { "epoch": 2.1458744438358632, "grad_norm": 1.7083123922348022, "learning_rate": 1.4250181949144276e-05, "loss": 2.5805, "step": 286000 }, { "epoch": 2.146624749585456, "grad_norm": 2.3406569957733154, "learning_rate": 1.4237676853317727e-05, "loss": 2.5403, "step": 286100 }, { "epoch": 2.147375055335049, "grad_norm": 1.9052152633666992, "learning_rate": 1.422517175749118e-05, "loss": 2.5069, "step": 286200 }, { "epoch": 2.148125361084642, "grad_norm": 1.5892269611358643, "learning_rate": 1.4212666661664627e-05, "loss": 2.6302, "step": 286300 }, { "epoch": 2.148875666834235, "grad_norm": 1.727994680404663, "learning_rate": 1.420016156583808e-05, "loss": 2.6041, "step": 286400 }, { "epoch": 2.149625972583828, "grad_norm": 1.8256988525390625, "learning_rate": 1.4187656470011531e-05, "loss": 2.5473, "step": 286500 }, { "epoch": 2.150376278333421, "grad_norm": 2.313018560409546, "learning_rate": 1.4175276425143247e-05, "loss": 2.5418, "step": 286600 }, { "epoch": 2.1511265840830136, "grad_norm": 1.9019219875335693, "learning_rate": 1.4162771329316698e-05, "loss": 2.6497, "step": 286700 }, { "epoch": 2.151876889832607, "grad_norm": 3.549004077911377, "learning_rate": 1.4150266233490147e-05, "loss": 2.5701, "step": 286800 }, { "epoch": 2.1526271955821996, "grad_norm": 1.627005934715271, "learning_rate": 1.41377611376636e-05, "loss": 2.5143, "step": 286900 }, { "epoch": 2.153377501331793, "grad_norm": 2.804222583770752, "learning_rate": 1.4125256041837047e-05, "loss": 2.6048, "step": 287000 }, { "epoch": 2.1541278070813856, "grad_norm": 2.246830940246582, "learning_rate": 1.41127509460105e-05, "loss": 2.5233, "step": 287100 }, { "epoch": 2.1548781128309784, "grad_norm": 2.480682134628296, "learning_rate": 1.4100245850183951e-05, "loss": 2.6487, "step": 287200 }, { "epoch": 2.1556284185805716, "grad_norm": 2.7923614978790283, "learning_rate": 1.40877407543574e-05, "loss": 2.6131, "step": 287300 }, { "epoch": 2.1563787243301644, "grad_norm": 1.4622749090194702, "learning_rate": 1.4075235658530853e-05, "loss": 2.6799, "step": 287400 }, { "epoch": 2.1571290300797576, "grad_norm": 1.9294774532318115, "learning_rate": 1.4062730562704301e-05, "loss": 2.593, "step": 287500 }, { "epoch": 2.1578793358293504, "grad_norm": 1.4590038061141968, "learning_rate": 1.4050225466877754e-05, "loss": 2.4974, "step": 287600 }, { "epoch": 2.158629641578943, "grad_norm": 1.5956857204437256, "learning_rate": 1.4037720371051205e-05, "loss": 2.5404, "step": 287700 }, { "epoch": 2.1593799473285364, "grad_norm": 2.5202929973602295, "learning_rate": 1.4025215275224654e-05, "loss": 2.5932, "step": 287800 }, { "epoch": 2.1601302530781292, "grad_norm": 2.3690176010131836, "learning_rate": 1.4012710179398107e-05, "loss": 2.7229, "step": 287900 }, { "epoch": 2.1608805588277225, "grad_norm": 2.2231907844543457, "learning_rate": 1.4000205083571554e-05, "loss": 2.516, "step": 288000 }, { "epoch": 2.1616308645773152, "grad_norm": 1.634812831878662, "learning_rate": 1.3987699987745007e-05, "loss": 2.6033, "step": 288100 }, { "epoch": 2.162381170326908, "grad_norm": 1.1950350999832153, "learning_rate": 1.3975194891918458e-05, "loss": 2.7162, "step": 288200 }, { "epoch": 2.1631314760765012, "grad_norm": 1.9139399528503418, "learning_rate": 1.3962689796091907e-05, "loss": 2.6308, "step": 288300 }, { "epoch": 2.163881781826094, "grad_norm": 3.363934278488159, "learning_rate": 1.3950184700265358e-05, "loss": 2.4763, "step": 288400 }, { "epoch": 2.1646320875756873, "grad_norm": 1.38664710521698, "learning_rate": 1.3937679604438811e-05, "loss": 2.5708, "step": 288500 }, { "epoch": 2.16538239332528, "grad_norm": 1.6060891151428223, "learning_rate": 1.392517450861226e-05, "loss": 2.6419, "step": 288600 }, { "epoch": 2.166132699074873, "grad_norm": 1.654812216758728, "learning_rate": 1.3912669412785711e-05, "loss": 2.6362, "step": 288700 }, { "epoch": 2.166883004824466, "grad_norm": 1.5254578590393066, "learning_rate": 1.390016431695916e-05, "loss": 2.5613, "step": 288800 }, { "epoch": 2.167633310574059, "grad_norm": 1.805935263633728, "learning_rate": 1.3887659221132612e-05, "loss": 2.6773, "step": 288900 }, { "epoch": 2.168383616323652, "grad_norm": 2.424960136413574, "learning_rate": 1.3875154125306064e-05, "loss": 2.6443, "step": 289000 }, { "epoch": 2.169133922073245, "grad_norm": 2.1860477924346924, "learning_rate": 1.3862649029479514e-05, "loss": 2.4326, "step": 289100 }, { "epoch": 2.1698842278228376, "grad_norm": 1.6448380947113037, "learning_rate": 1.3850143933652965e-05, "loss": 2.6376, "step": 289200 }, { "epoch": 2.170634533572431, "grad_norm": 1.8025215864181519, "learning_rate": 1.3837638837826414e-05, "loss": 2.6439, "step": 289300 }, { "epoch": 2.1713848393220236, "grad_norm": 1.273437738418579, "learning_rate": 1.3825133741999865e-05, "loss": 2.6086, "step": 289400 }, { "epoch": 2.172135145071617, "grad_norm": 1.9654873609542847, "learning_rate": 1.3812628646173318e-05, "loss": 2.511, "step": 289500 }, { "epoch": 2.1728854508212097, "grad_norm": 2.0684196949005127, "learning_rate": 1.3800123550346767e-05, "loss": 2.7387, "step": 289600 }, { "epoch": 2.1736357565708024, "grad_norm": 1.7364364862442017, "learning_rate": 1.3787618454520218e-05, "loss": 2.5796, "step": 289700 }, { "epoch": 2.1743860623203957, "grad_norm": 1.59194815158844, "learning_rate": 1.3775113358693667e-05, "loss": 2.6229, "step": 289800 }, { "epoch": 2.1751363680699884, "grad_norm": 1.9288030862808228, "learning_rate": 1.3762608262867118e-05, "loss": 2.7332, "step": 289900 }, { "epoch": 2.1758866738195817, "grad_norm": 2.0022902488708496, "learning_rate": 1.3750103167040571e-05, "loss": 2.6727, "step": 290000 }, { "epoch": 2.1766369795691745, "grad_norm": 2.8409759998321533, "learning_rate": 1.3737598071214019e-05, "loss": 2.5909, "step": 290100 }, { "epoch": 2.1773872853187672, "grad_norm": 1.6408852338790894, "learning_rate": 1.3725092975387472e-05, "loss": 2.5642, "step": 290200 }, { "epoch": 2.1781375910683605, "grad_norm": 2.671152114868164, "learning_rate": 1.3712587879560923e-05, "loss": 2.5346, "step": 290300 }, { "epoch": 2.1788878968179533, "grad_norm": 2.5028014183044434, "learning_rate": 1.3700082783734372e-05, "loss": 2.7273, "step": 290400 }, { "epoch": 2.1796382025675465, "grad_norm": 1.6614943742752075, "learning_rate": 1.3687577687907825e-05, "loss": 2.549, "step": 290500 }, { "epoch": 2.1803885083171393, "grad_norm": 1.748841643333435, "learning_rate": 1.3675197643039539e-05, "loss": 2.6243, "step": 290600 }, { "epoch": 2.181138814066732, "grad_norm": 1.9785406589508057, "learning_rate": 1.3662692547212991e-05, "loss": 2.5909, "step": 290700 }, { "epoch": 2.1818891198163253, "grad_norm": 2.268904209136963, "learning_rate": 1.3650187451386439e-05, "loss": 2.5048, "step": 290800 }, { "epoch": 2.182639425565918, "grad_norm": 1.5863887071609497, "learning_rate": 1.3637682355559892e-05, "loss": 2.6718, "step": 290900 }, { "epoch": 2.1833897313155113, "grad_norm": 2.5883846282958984, "learning_rate": 1.3625177259733343e-05, "loss": 2.5786, "step": 291000 }, { "epoch": 2.184140037065104, "grad_norm": 1.7426083087921143, "learning_rate": 1.3612672163906792e-05, "loss": 2.5041, "step": 291100 }, { "epoch": 2.184890342814697, "grad_norm": 2.7418198585510254, "learning_rate": 1.360029211903851e-05, "loss": 2.3611, "step": 291200 }, { "epoch": 2.18564064856429, "grad_norm": 1.6359509229660034, "learning_rate": 1.3587787023211959e-05, "loss": 2.7326, "step": 291300 }, { "epoch": 2.186390954313883, "grad_norm": 2.2644433975219727, "learning_rate": 1.3575281927385411e-05, "loss": 2.7717, "step": 291400 }, { "epoch": 2.187141260063476, "grad_norm": 1.9305720329284668, "learning_rate": 1.3562776831558859e-05, "loss": 2.6558, "step": 291500 }, { "epoch": 2.187891565813069, "grad_norm": 1.6004599332809448, "learning_rate": 1.3550271735732312e-05, "loss": 2.5517, "step": 291600 }, { "epoch": 2.1886418715626617, "grad_norm": 2.956298589706421, "learning_rate": 1.3537766639905763e-05, "loss": 2.6792, "step": 291700 }, { "epoch": 2.189392177312255, "grad_norm": 2.234710693359375, "learning_rate": 1.3525261544079212e-05, "loss": 2.6866, "step": 291800 }, { "epoch": 2.1901424830618477, "grad_norm": 1.4121949672698975, "learning_rate": 1.3512756448252665e-05, "loss": 2.6252, "step": 291900 }, { "epoch": 2.190892788811441, "grad_norm": 1.7344379425048828, "learning_rate": 1.3500251352426112e-05, "loss": 2.6954, "step": 292000 }, { "epoch": 2.1916430945610337, "grad_norm": 2.324492931365967, "learning_rate": 1.3487746256599565e-05, "loss": 2.5777, "step": 292100 }, { "epoch": 2.1923934003106265, "grad_norm": 2.1749119758605957, "learning_rate": 1.3475241160773016e-05, "loss": 2.5257, "step": 292200 }, { "epoch": 2.1931437060602197, "grad_norm": 1.8566464185714722, "learning_rate": 1.3462736064946466e-05, "loss": 2.6765, "step": 292300 }, { "epoch": 2.1938940118098125, "grad_norm": 1.6240934133529663, "learning_rate": 1.3450230969119918e-05, "loss": 2.6904, "step": 292400 }, { "epoch": 2.1946443175594053, "grad_norm": 2.351010799407959, "learning_rate": 1.3437725873293366e-05, "loss": 2.5091, "step": 292500 }, { "epoch": 2.1953946233089985, "grad_norm": 2.3836910724639893, "learning_rate": 1.3425220777466819e-05, "loss": 2.6367, "step": 292600 }, { "epoch": 2.1961449290585913, "grad_norm": 1.699741005897522, "learning_rate": 1.341271568164027e-05, "loss": 2.5922, "step": 292700 }, { "epoch": 2.1968952348081845, "grad_norm": 2.0146923065185547, "learning_rate": 1.3400210585813719e-05, "loss": 2.5552, "step": 292800 }, { "epoch": 2.1976455405577773, "grad_norm": 1.360029697418213, "learning_rate": 1.338770548998717e-05, "loss": 2.5538, "step": 292900 }, { "epoch": 2.19839584630737, "grad_norm": 1.6312941312789917, "learning_rate": 1.3375200394160623e-05, "loss": 2.6694, "step": 293000 }, { "epoch": 2.1991461520569633, "grad_norm": 2.5789287090301514, "learning_rate": 1.3362695298334072e-05, "loss": 2.7647, "step": 293100 }, { "epoch": 2.199896457806556, "grad_norm": 1.4148457050323486, "learning_rate": 1.3350190202507523e-05, "loss": 2.5564, "step": 293200 }, { "epoch": 2.2006467635561493, "grad_norm": 1.8406236171722412, "learning_rate": 1.3337685106680972e-05, "loss": 2.6373, "step": 293300 }, { "epoch": 2.201397069305742, "grad_norm": 1.9403413534164429, "learning_rate": 1.3325180010854423e-05, "loss": 2.6854, "step": 293400 }, { "epoch": 2.202147375055335, "grad_norm": 1.478522777557373, "learning_rate": 1.3312674915027876e-05, "loss": 2.65, "step": 293500 }, { "epoch": 2.202897680804928, "grad_norm": 2.1510112285614014, "learning_rate": 1.3300169819201325e-05, "loss": 2.6796, "step": 293600 }, { "epoch": 2.203647986554521, "grad_norm": 1.4713475704193115, "learning_rate": 1.3287664723374776e-05, "loss": 2.7162, "step": 293700 }, { "epoch": 2.204398292304114, "grad_norm": 1.498712420463562, "learning_rate": 1.3275159627548226e-05, "loss": 2.7046, "step": 293800 }, { "epoch": 2.205148598053707, "grad_norm": 2.433047294616699, "learning_rate": 1.3262654531721677e-05, "loss": 2.6571, "step": 293900 }, { "epoch": 2.2058989038032997, "grad_norm": 1.7604888677597046, "learning_rate": 1.325014943589513e-05, "loss": 2.7132, "step": 294000 }, { "epoch": 2.206649209552893, "grad_norm": 2.85129451751709, "learning_rate": 1.3237644340068577e-05, "loss": 2.6403, "step": 294100 }, { "epoch": 2.2073995153024857, "grad_norm": 3.5000088214874268, "learning_rate": 1.322513924424203e-05, "loss": 2.639, "step": 294200 }, { "epoch": 2.208149821052079, "grad_norm": 1.5208982229232788, "learning_rate": 1.3212634148415479e-05, "loss": 2.7597, "step": 294300 }, { "epoch": 2.2089001268016717, "grad_norm": 2.237107753753662, "learning_rate": 1.320012905258893e-05, "loss": 2.6704, "step": 294400 }, { "epoch": 2.2096504325512645, "grad_norm": 1.952539324760437, "learning_rate": 1.3187623956762383e-05, "loss": 2.5954, "step": 294500 }, { "epoch": 2.2104007383008577, "grad_norm": 1.7287415266036987, "learning_rate": 1.317511886093583e-05, "loss": 2.6159, "step": 294600 }, { "epoch": 2.2111510440504505, "grad_norm": 2.743396759033203, "learning_rate": 1.3162613765109283e-05, "loss": 2.6446, "step": 294700 }, { "epoch": 2.2119013498000437, "grad_norm": 1.7857221364974976, "learning_rate": 1.3150108669282734e-05, "loss": 2.5532, "step": 294800 }, { "epoch": 2.2126516555496365, "grad_norm": 1.6194130182266235, "learning_rate": 1.3137603573456184e-05, "loss": 2.6197, "step": 294900 }, { "epoch": 2.2134019612992293, "grad_norm": 2.0615811347961426, "learning_rate": 1.3125098477629636e-05, "loss": 2.5066, "step": 295000 }, { "epoch": 2.2141522670488225, "grad_norm": 2.1108345985412598, "learning_rate": 1.3112593381803084e-05, "loss": 2.6125, "step": 295100 }, { "epoch": 2.2149025727984153, "grad_norm": 1.6125072240829468, "learning_rate": 1.3100088285976537e-05, "loss": 2.6535, "step": 295200 }, { "epoch": 2.2156528785480085, "grad_norm": 2.2677385807037354, "learning_rate": 1.308770824110825e-05, "loss": 2.5807, "step": 295300 }, { "epoch": 2.2164031842976013, "grad_norm": 2.445521593093872, "learning_rate": 1.3075203145281703e-05, "loss": 2.6998, "step": 295400 }, { "epoch": 2.217153490047194, "grad_norm": 2.340204954147339, "learning_rate": 1.3062698049455154e-05, "loss": 2.4978, "step": 295500 }, { "epoch": 2.2179037957967873, "grad_norm": 3.9271156787872314, "learning_rate": 1.3050192953628604e-05, "loss": 2.5388, "step": 295600 }, { "epoch": 2.21865410154638, "grad_norm": 3.758770227432251, "learning_rate": 1.3037687857802056e-05, "loss": 2.6228, "step": 295700 }, { "epoch": 2.219404407295973, "grad_norm": 1.7568727731704712, "learning_rate": 1.3025182761975504e-05, "loss": 2.6351, "step": 295800 }, { "epoch": 2.220154713045566, "grad_norm": 1.6370376348495483, "learning_rate": 1.3012677666148957e-05, "loss": 2.5715, "step": 295900 }, { "epoch": 2.220905018795159, "grad_norm": 2.6575162410736084, "learning_rate": 1.3000172570322408e-05, "loss": 2.5858, "step": 296000 }, { "epoch": 2.221655324544752, "grad_norm": 1.7552542686462402, "learning_rate": 1.2987667474495857e-05, "loss": 2.6663, "step": 296100 }, { "epoch": 2.222405630294345, "grad_norm": 3.4580540657043457, "learning_rate": 1.2975162378669308e-05, "loss": 2.608, "step": 296200 }, { "epoch": 2.2231559360439377, "grad_norm": 3.023489236831665, "learning_rate": 1.2962657282842757e-05, "loss": 2.6656, "step": 296300 }, { "epoch": 2.223906241793531, "grad_norm": 1.7863831520080566, "learning_rate": 1.295015218701621e-05, "loss": 2.6147, "step": 296400 }, { "epoch": 2.2246565475431237, "grad_norm": 2.055001974105835, "learning_rate": 1.2937647091189661e-05, "loss": 2.6679, "step": 296500 }, { "epoch": 2.225406853292717, "grad_norm": 1.7588492631912231, "learning_rate": 1.292514199536311e-05, "loss": 2.6745, "step": 296600 }, { "epoch": 2.2261571590423097, "grad_norm": 1.5557572841644287, "learning_rate": 1.2912636899536561e-05, "loss": 2.7772, "step": 296700 }, { "epoch": 2.2269074647919025, "grad_norm": 1.9384229183197021, "learning_rate": 1.290013180371001e-05, "loss": 2.5063, "step": 296800 }, { "epoch": 2.2276577705414957, "grad_norm": 1.6481722593307495, "learning_rate": 1.2887626707883463e-05, "loss": 2.6718, "step": 296900 }, { "epoch": 2.2284080762910885, "grad_norm": 2.0039162635803223, "learning_rate": 1.2875121612056915e-05, "loss": 2.606, "step": 297000 }, { "epoch": 2.2291583820406817, "grad_norm": 1.3928109407424927, "learning_rate": 1.2862616516230364e-05, "loss": 2.7236, "step": 297100 }, { "epoch": 2.2299086877902745, "grad_norm": 1.63323175907135, "learning_rate": 1.2850111420403815e-05, "loss": 2.6338, "step": 297200 }, { "epoch": 2.2306589935398673, "grad_norm": 1.8422727584838867, "learning_rate": 1.283773137553553e-05, "loss": 2.6391, "step": 297300 }, { "epoch": 2.2314092992894605, "grad_norm": 4.690929889678955, "learning_rate": 1.2825226279708982e-05, "loss": 2.5754, "step": 297400 }, { "epoch": 2.2321596050390533, "grad_norm": 2.101475477218628, "learning_rate": 1.2812721183882434e-05, "loss": 2.5617, "step": 297500 }, { "epoch": 2.2329099107886465, "grad_norm": 1.8621457815170288, "learning_rate": 1.2800216088055884e-05, "loss": 2.7227, "step": 297600 }, { "epoch": 2.2336602165382393, "grad_norm": 3.167649269104004, "learning_rate": 1.2787710992229335e-05, "loss": 2.6268, "step": 297700 }, { "epoch": 2.234410522287832, "grad_norm": 1.5397520065307617, "learning_rate": 1.2775205896402784e-05, "loss": 2.6135, "step": 297800 }, { "epoch": 2.2351608280374253, "grad_norm": 2.279135227203369, "learning_rate": 1.2762700800576235e-05, "loss": 2.6147, "step": 297900 }, { "epoch": 2.235911133787018, "grad_norm": 2.5068156719207764, "learning_rate": 1.2750195704749688e-05, "loss": 2.5824, "step": 298000 }, { "epoch": 2.2366614395366113, "grad_norm": 3.1276493072509766, "learning_rate": 1.2737690608923137e-05, "loss": 2.6233, "step": 298100 }, { "epoch": 2.237411745286204, "grad_norm": 2.0369410514831543, "learning_rate": 1.2725185513096588e-05, "loss": 2.6467, "step": 298200 }, { "epoch": 2.238162051035797, "grad_norm": 2.539090871810913, "learning_rate": 1.2712680417270037e-05, "loss": 2.6953, "step": 298300 }, { "epoch": 2.23891235678539, "grad_norm": 1.671465277671814, "learning_rate": 1.2700175321443488e-05, "loss": 2.5573, "step": 298400 }, { "epoch": 2.239662662534983, "grad_norm": 1.872691035270691, "learning_rate": 1.2687670225616941e-05, "loss": 2.5762, "step": 298500 }, { "epoch": 2.240412968284576, "grad_norm": 2.39350962638855, "learning_rate": 1.2675165129790389e-05, "loss": 2.4697, "step": 298600 }, { "epoch": 2.241163274034169, "grad_norm": 1.588431477546692, "learning_rate": 1.2662660033963841e-05, "loss": 2.6249, "step": 298700 }, { "epoch": 2.2419135797837617, "grad_norm": 1.6604194641113281, "learning_rate": 1.265015493813729e-05, "loss": 2.5967, "step": 298800 }, { "epoch": 2.242663885533355, "grad_norm": 1.990869402885437, "learning_rate": 1.2637649842310742e-05, "loss": 2.6688, "step": 298900 }, { "epoch": 2.2434141912829477, "grad_norm": 2.1065800189971924, "learning_rate": 1.2625144746484194e-05, "loss": 2.5572, "step": 299000 }, { "epoch": 2.244164497032541, "grad_norm": 2.8430752754211426, "learning_rate": 1.2612639650657642e-05, "loss": 2.7086, "step": 299100 }, { "epoch": 2.2449148027821337, "grad_norm": 2.305123805999756, "learning_rate": 1.2600134554831095e-05, "loss": 2.7477, "step": 299200 }, { "epoch": 2.2456651085317265, "grad_norm": 1.487415075302124, "learning_rate": 1.258775450996281e-05, "loss": 2.6376, "step": 299300 }, { "epoch": 2.2464154142813197, "grad_norm": 1.451766848564148, "learning_rate": 1.2575249414136262e-05, "loss": 2.5515, "step": 299400 }, { "epoch": 2.2471657200309125, "grad_norm": 1.6690791845321655, "learning_rate": 1.2562744318309711e-05, "loss": 2.584, "step": 299500 }, { "epoch": 2.2479160257805058, "grad_norm": 1.581775426864624, "learning_rate": 1.2550239222483162e-05, "loss": 2.6092, "step": 299600 }, { "epoch": 2.2486663315300985, "grad_norm": 1.8803397417068481, "learning_rate": 1.2537734126656615e-05, "loss": 2.4555, "step": 299700 }, { "epoch": 2.2494166372796913, "grad_norm": 1.8964818716049194, "learning_rate": 1.2525229030830062e-05, "loss": 2.677, "step": 299800 }, { "epoch": 2.2501669430292845, "grad_norm": 1.8672914505004883, "learning_rate": 1.2512723935003515e-05, "loss": 2.5692, "step": 299900 }, { "epoch": 2.2509172487788773, "grad_norm": 2.684805393218994, "learning_rate": 1.2500218839176966e-05, "loss": 2.6298, "step": 300000 }, { "epoch": 2.2516675545284706, "grad_norm": 2.034085988998413, "learning_rate": 1.2487713743350415e-05, "loss": 2.569, "step": 300100 }, { "epoch": 2.2524178602780633, "grad_norm": 2.0836050510406494, "learning_rate": 1.2475208647523868e-05, "loss": 2.6224, "step": 300200 }, { "epoch": 2.253168166027656, "grad_norm": 1.7401607036590576, "learning_rate": 1.2462703551697317e-05, "loss": 2.4996, "step": 300300 }, { "epoch": 2.2539184717772494, "grad_norm": 2.091693162918091, "learning_rate": 1.2450198455870768e-05, "loss": 2.51, "step": 300400 }, { "epoch": 2.254668777526842, "grad_norm": 3.022564649581909, "learning_rate": 1.2437693360044218e-05, "loss": 2.6308, "step": 300500 }, { "epoch": 2.2554190832764354, "grad_norm": 2.123534917831421, "learning_rate": 1.2425188264217669e-05, "loss": 2.6558, "step": 300600 }, { "epoch": 2.256169389026028, "grad_norm": 2.8972952365875244, "learning_rate": 1.241268316839112e-05, "loss": 2.5305, "step": 300700 }, { "epoch": 2.256919694775621, "grad_norm": 1.8962626457214355, "learning_rate": 1.240017807256457e-05, "loss": 2.7071, "step": 300800 }, { "epoch": 2.257670000525214, "grad_norm": 1.9597917795181274, "learning_rate": 1.2387672976738022e-05, "loss": 2.664, "step": 300900 }, { "epoch": 2.258420306274807, "grad_norm": 1.9787517786026, "learning_rate": 1.2375167880911471e-05, "loss": 2.5197, "step": 301000 }, { "epoch": 2.2591706120244, "grad_norm": 1.6346402168273926, "learning_rate": 1.2362662785084924e-05, "loss": 2.5638, "step": 301100 }, { "epoch": 2.259920917773993, "grad_norm": 1.6975589990615845, "learning_rate": 1.2350157689258373e-05, "loss": 2.4871, "step": 301200 }, { "epoch": 2.2606712235235857, "grad_norm": 2.6627750396728516, "learning_rate": 1.2337652593431824e-05, "loss": 2.6465, "step": 301300 }, { "epoch": 2.261421529273179, "grad_norm": 2.604396104812622, "learning_rate": 1.232527254856354e-05, "loss": 2.6524, "step": 301400 }, { "epoch": 2.2621718350227717, "grad_norm": 2.359536647796631, "learning_rate": 1.231276745273699e-05, "loss": 2.5917, "step": 301500 }, { "epoch": 2.262922140772365, "grad_norm": 1.5069667100906372, "learning_rate": 1.2300262356910442e-05, "loss": 2.7446, "step": 301600 }, { "epoch": 2.2636724465219578, "grad_norm": 1.700989842414856, "learning_rate": 1.2287757261083891e-05, "loss": 2.629, "step": 301700 }, { "epoch": 2.2644227522715505, "grad_norm": 1.462969422340393, "learning_rate": 1.2275252165257344e-05, "loss": 2.7921, "step": 301800 }, { "epoch": 2.2651730580211438, "grad_norm": 2.2601773738861084, "learning_rate": 1.2262747069430793e-05, "loss": 2.6327, "step": 301900 }, { "epoch": 2.2659233637707366, "grad_norm": 1.5958545207977295, "learning_rate": 1.2250241973604244e-05, "loss": 2.5108, "step": 302000 }, { "epoch": 2.26667366952033, "grad_norm": 2.0187625885009766, "learning_rate": 1.2237736877777695e-05, "loss": 2.5097, "step": 302100 }, { "epoch": 2.2674239752699226, "grad_norm": 1.5698049068450928, "learning_rate": 1.2225231781951145e-05, "loss": 2.5857, "step": 302200 }, { "epoch": 2.2681742810195153, "grad_norm": 1.8938987255096436, "learning_rate": 1.2212726686124597e-05, "loss": 2.5901, "step": 302300 }, { "epoch": 2.2689245867691086, "grad_norm": 1.5847225189208984, "learning_rate": 1.2200221590298047e-05, "loss": 2.545, "step": 302400 }, { "epoch": 2.2696748925187014, "grad_norm": 1.537087321281433, "learning_rate": 1.2187716494471498e-05, "loss": 2.6579, "step": 302500 }, { "epoch": 2.270425198268294, "grad_norm": 3.0201351642608643, "learning_rate": 1.2175211398644949e-05, "loss": 2.6222, "step": 302600 }, { "epoch": 2.2711755040178874, "grad_norm": 2.5191774368286133, "learning_rate": 1.21627063028184e-05, "loss": 2.6099, "step": 302700 }, { "epoch": 2.27192580976748, "grad_norm": 3.230480194091797, "learning_rate": 1.215020120699185e-05, "loss": 2.5916, "step": 302800 }, { "epoch": 2.2726761155170734, "grad_norm": 1.5975756645202637, "learning_rate": 1.21376961111653e-05, "loss": 2.7109, "step": 302900 }, { "epoch": 2.273426421266666, "grad_norm": 1.9980818033218384, "learning_rate": 1.2125191015338751e-05, "loss": 2.7248, "step": 303000 }, { "epoch": 2.274176727016259, "grad_norm": 2.166252374649048, "learning_rate": 1.2112685919512202e-05, "loss": 2.6103, "step": 303100 }, { "epoch": 2.274927032765852, "grad_norm": 1.870823621749878, "learning_rate": 1.2100180823685653e-05, "loss": 2.6139, "step": 303200 }, { "epoch": 2.275677338515445, "grad_norm": 2.017180919647217, "learning_rate": 1.2087675727859104e-05, "loss": 2.4763, "step": 303300 }, { "epoch": 2.276427644265038, "grad_norm": 1.5123525857925415, "learning_rate": 1.2075170632032553e-05, "loss": 2.6621, "step": 303400 }, { "epoch": 2.277177950014631, "grad_norm": 1.9445242881774902, "learning_rate": 1.206279058716427e-05, "loss": 2.7238, "step": 303500 }, { "epoch": 2.2779282557642238, "grad_norm": 2.2520720958709717, "learning_rate": 1.205028549133772e-05, "loss": 2.7595, "step": 303600 }, { "epoch": 2.278678561513817, "grad_norm": 2.3102266788482666, "learning_rate": 1.2037780395511171e-05, "loss": 2.5849, "step": 303700 }, { "epoch": 2.2794288672634098, "grad_norm": 2.784663438796997, "learning_rate": 1.2025275299684622e-05, "loss": 2.4842, "step": 303800 }, { "epoch": 2.280179173013003, "grad_norm": 1.9307456016540527, "learning_rate": 1.2012770203858073e-05, "loss": 2.5286, "step": 303900 }, { "epoch": 2.2809294787625958, "grad_norm": 1.851815938949585, "learning_rate": 1.2000265108031524e-05, "loss": 2.6098, "step": 304000 }, { "epoch": 2.2816797845121886, "grad_norm": 1.9133723974227905, "learning_rate": 1.198788506316324e-05, "loss": 2.6265, "step": 304100 }, { "epoch": 2.282430090261782, "grad_norm": 3.3028433322906494, "learning_rate": 1.1975379967336691e-05, "loss": 2.5022, "step": 304200 }, { "epoch": 2.2831803960113746, "grad_norm": 1.7723267078399658, "learning_rate": 1.196287487151014e-05, "loss": 2.5951, "step": 304300 }, { "epoch": 2.2839307017609674, "grad_norm": 2.0816967487335205, "learning_rate": 1.1950369775683591e-05, "loss": 2.6606, "step": 304400 }, { "epoch": 2.2846810075105606, "grad_norm": 3.5442676544189453, "learning_rate": 1.1937864679857042e-05, "loss": 2.633, "step": 304500 }, { "epoch": 2.2854313132601534, "grad_norm": 3.5391178131103516, "learning_rate": 1.1925359584030493e-05, "loss": 2.6209, "step": 304600 }, { "epoch": 2.2861816190097466, "grad_norm": 1.7492648363113403, "learning_rate": 1.1912854488203944e-05, "loss": 2.5601, "step": 304700 }, { "epoch": 2.2869319247593394, "grad_norm": 2.0791616439819336, "learning_rate": 1.1900349392377394e-05, "loss": 2.5614, "step": 304800 }, { "epoch": 2.287682230508932, "grad_norm": 2.0234415531158447, "learning_rate": 1.1887844296550845e-05, "loss": 2.7251, "step": 304900 }, { "epoch": 2.2884325362585254, "grad_norm": 1.6914787292480469, "learning_rate": 1.1875339200724296e-05, "loss": 2.7239, "step": 305000 }, { "epoch": 2.289182842008118, "grad_norm": 1.5743906497955322, "learning_rate": 1.1862834104897747e-05, "loss": 2.5177, "step": 305100 }, { "epoch": 2.2899331477577114, "grad_norm": 2.5704233646392822, "learning_rate": 1.1850329009071196e-05, "loss": 2.5291, "step": 305200 }, { "epoch": 2.290683453507304, "grad_norm": 2.543701410293579, "learning_rate": 1.1837823913244647e-05, "loss": 2.5428, "step": 305300 }, { "epoch": 2.291433759256897, "grad_norm": 1.6592743396759033, "learning_rate": 1.18253188174181e-05, "loss": 2.5852, "step": 305400 }, { "epoch": 2.29218406500649, "grad_norm": 2.518592119216919, "learning_rate": 1.1812813721591549e-05, "loss": 2.7179, "step": 305500 }, { "epoch": 2.292934370756083, "grad_norm": 3.037134885787964, "learning_rate": 1.1800308625765e-05, "loss": 2.7938, "step": 305600 }, { "epoch": 2.293684676505676, "grad_norm": 1.313339114189148, "learning_rate": 1.178780352993845e-05, "loss": 2.632, "step": 305700 }, { "epoch": 2.294434982255269, "grad_norm": 2.1003222465515137, "learning_rate": 1.17752984341119e-05, "loss": 2.5882, "step": 305800 }, { "epoch": 2.2951852880048618, "grad_norm": 1.8596316576004028, "learning_rate": 1.1762793338285351e-05, "loss": 2.5802, "step": 305900 }, { "epoch": 2.295935593754455, "grad_norm": 1.7678228616714478, "learning_rate": 1.1750288242458802e-05, "loss": 2.5073, "step": 306000 }, { "epoch": 2.296685899504048, "grad_norm": 3.005241632461548, "learning_rate": 1.1737783146632253e-05, "loss": 2.5933, "step": 306100 }, { "epoch": 2.297436205253641, "grad_norm": 2.4216701984405518, "learning_rate": 1.1725278050805703e-05, "loss": 2.5925, "step": 306200 }, { "epoch": 2.298186511003234, "grad_norm": 2.443013906478882, "learning_rate": 1.1712772954979156e-05, "loss": 2.5043, "step": 306300 }, { "epoch": 2.2989368167528266, "grad_norm": 2.8875677585601807, "learning_rate": 1.1700267859152605e-05, "loss": 2.6705, "step": 306400 }, { "epoch": 2.29968712250242, "grad_norm": 1.6119575500488281, "learning_rate": 1.1687762763326056e-05, "loss": 2.5128, "step": 306500 }, { "epoch": 2.3004374282520126, "grad_norm": 1.8601326942443848, "learning_rate": 1.1675257667499507e-05, "loss": 2.6266, "step": 306600 }, { "epoch": 2.301187734001606, "grad_norm": 1.782347321510315, "learning_rate": 1.1662752571672956e-05, "loss": 2.6638, "step": 306700 }, { "epoch": 2.3019380397511986, "grad_norm": 3.3608527183532715, "learning_rate": 1.1650247475846409e-05, "loss": 2.5629, "step": 306800 }, { "epoch": 2.3026883455007914, "grad_norm": 2.1393258571624756, "learning_rate": 1.1637742380019858e-05, "loss": 2.6107, "step": 306900 }, { "epoch": 2.3034386512503846, "grad_norm": 1.7944916486740112, "learning_rate": 1.162523728419331e-05, "loss": 2.5955, "step": 307000 }, { "epoch": 2.3041889569999774, "grad_norm": 3.100008010864258, "learning_rate": 1.161273218836676e-05, "loss": 2.5518, "step": 307100 }, { "epoch": 2.3049392627495706, "grad_norm": 1.8633559942245483, "learning_rate": 1.1600227092540211e-05, "loss": 2.5254, "step": 307200 }, { "epoch": 2.3056895684991634, "grad_norm": 1.7337347269058228, "learning_rate": 1.1587721996713662e-05, "loss": 2.4984, "step": 307300 }, { "epoch": 2.306439874248756, "grad_norm": 2.166839838027954, "learning_rate": 1.1575216900887112e-05, "loss": 2.6004, "step": 307400 }, { "epoch": 2.3071901799983494, "grad_norm": 1.7578617334365845, "learning_rate": 1.1562711805060563e-05, "loss": 2.696, "step": 307500 }, { "epoch": 2.307940485747942, "grad_norm": 1.4477901458740234, "learning_rate": 1.1550331760192278e-05, "loss": 2.513, "step": 307600 }, { "epoch": 2.3086907914975354, "grad_norm": 2.052649974822998, "learning_rate": 1.153782666436573e-05, "loss": 2.6672, "step": 307700 }, { "epoch": 2.309441097247128, "grad_norm": 2.6492111682891846, "learning_rate": 1.152532156853918e-05, "loss": 2.5625, "step": 307800 }, { "epoch": 2.310191402996721, "grad_norm": 2.6071813106536865, "learning_rate": 1.1512816472712631e-05, "loss": 2.5295, "step": 307900 }, { "epoch": 2.310941708746314, "grad_norm": 3.0935733318328857, "learning_rate": 1.1500311376886082e-05, "loss": 2.6597, "step": 308000 }, { "epoch": 2.311692014495907, "grad_norm": 1.2925702333450317, "learning_rate": 1.1487806281059532e-05, "loss": 2.7135, "step": 308100 }, { "epoch": 2.3124423202455002, "grad_norm": 1.622313141822815, "learning_rate": 1.1475301185232983e-05, "loss": 2.6175, "step": 308200 }, { "epoch": 2.313192625995093, "grad_norm": 2.2888755798339844, "learning_rate": 1.1462796089406434e-05, "loss": 2.6372, "step": 308300 }, { "epoch": 2.313942931744686, "grad_norm": 1.9650100469589233, "learning_rate": 1.1450290993579885e-05, "loss": 2.6597, "step": 308400 }, { "epoch": 2.314693237494279, "grad_norm": 2.2392756938934326, "learning_rate": 1.1437785897753336e-05, "loss": 2.6357, "step": 308500 }, { "epoch": 2.315443543243872, "grad_norm": 2.345167398452759, "learning_rate": 1.1425280801926785e-05, "loss": 2.6845, "step": 308600 }, { "epoch": 2.316193848993465, "grad_norm": 2.471766471862793, "learning_rate": 1.1412775706100236e-05, "loss": 2.5225, "step": 308700 }, { "epoch": 2.316944154743058, "grad_norm": 1.9985299110412598, "learning_rate": 1.1400270610273687e-05, "loss": 2.5306, "step": 308800 }, { "epoch": 2.3176944604926506, "grad_norm": 2.273674726486206, "learning_rate": 1.1387765514447138e-05, "loss": 2.5238, "step": 308900 }, { "epoch": 2.318444766242244, "grad_norm": 1.6446832418441772, "learning_rate": 1.1375260418620588e-05, "loss": 2.6675, "step": 309000 }, { "epoch": 2.3191950719918366, "grad_norm": 3.227836847305298, "learning_rate": 1.1362755322794039e-05, "loss": 2.5749, "step": 309100 }, { "epoch": 2.31994537774143, "grad_norm": 1.6124382019042969, "learning_rate": 1.135025022696749e-05, "loss": 2.609, "step": 309200 }, { "epoch": 2.3206956834910226, "grad_norm": 3.256087064743042, "learning_rate": 1.133774513114094e-05, "loss": 2.4961, "step": 309300 }, { "epoch": 2.3214459892406154, "grad_norm": 1.7087422609329224, "learning_rate": 1.1325240035314392e-05, "loss": 2.64, "step": 309400 }, { "epoch": 2.3221962949902086, "grad_norm": 1.809277057647705, "learning_rate": 1.1312734939487841e-05, "loss": 2.5408, "step": 309500 }, { "epoch": 2.3229466007398014, "grad_norm": 1.711296558380127, "learning_rate": 1.1300229843661292e-05, "loss": 2.6279, "step": 309600 }, { "epoch": 2.3236969064893946, "grad_norm": 1.4268347024917603, "learning_rate": 1.1287724747834743e-05, "loss": 2.635, "step": 309700 }, { "epoch": 2.3244472122389874, "grad_norm": 2.084550619125366, "learning_rate": 1.1275219652008194e-05, "loss": 2.6429, "step": 309800 }, { "epoch": 2.32519751798858, "grad_norm": 2.6850364208221436, "learning_rate": 1.1262714556181645e-05, "loss": 2.4754, "step": 309900 }, { "epoch": 2.3259478237381734, "grad_norm": 2.999711275100708, "learning_rate": 1.1250209460355094e-05, "loss": 2.6244, "step": 310000 }, { "epoch": 2.326698129487766, "grad_norm": 2.5941669940948486, "learning_rate": 1.1237704364528547e-05, "loss": 2.7156, "step": 310100 }, { "epoch": 2.3274484352373594, "grad_norm": 1.4064921140670776, "learning_rate": 1.1225199268701996e-05, "loss": 2.7301, "step": 310200 }, { "epoch": 2.3281987409869522, "grad_norm": 2.7909114360809326, "learning_rate": 1.1212694172875447e-05, "loss": 2.5069, "step": 310300 }, { "epoch": 2.328949046736545, "grad_norm": 1.2735257148742676, "learning_rate": 1.1200189077048898e-05, "loss": 2.6982, "step": 310400 }, { "epoch": 2.3296993524861382, "grad_norm": 1.473177433013916, "learning_rate": 1.1187683981222348e-05, "loss": 2.5943, "step": 310500 }, { "epoch": 2.330449658235731, "grad_norm": 1.535162329673767, "learning_rate": 1.11751788853958e-05, "loss": 2.5749, "step": 310600 }, { "epoch": 2.3311999639853243, "grad_norm": 1.6159747838974, "learning_rate": 1.116267378956925e-05, "loss": 2.6014, "step": 310700 }, { "epoch": 2.331950269734917, "grad_norm": 2.3892390727996826, "learning_rate": 1.11501686937427e-05, "loss": 2.7038, "step": 310800 }, { "epoch": 2.33270057548451, "grad_norm": 2.0302209854125977, "learning_rate": 1.113766359791615e-05, "loss": 2.6265, "step": 310900 }, { "epoch": 2.333450881234103, "grad_norm": 2.473446846008301, "learning_rate": 1.1125158502089603e-05, "loss": 2.6006, "step": 311000 }, { "epoch": 2.334201186983696, "grad_norm": 2.0377793312072754, "learning_rate": 1.1112653406263054e-05, "loss": 2.4549, "step": 311100 }, { "epoch": 2.334951492733289, "grad_norm": 2.907668113708496, "learning_rate": 1.1100148310436503e-05, "loss": 2.6454, "step": 311200 }, { "epoch": 2.335701798482882, "grad_norm": 1.9321383237838745, "learning_rate": 1.1087643214609954e-05, "loss": 2.61, "step": 311300 }, { "epoch": 2.3364521042324746, "grad_norm": 1.4112290143966675, "learning_rate": 1.1075138118783403e-05, "loss": 2.72, "step": 311400 }, { "epoch": 2.337202409982068, "grad_norm": 1.7696913480758667, "learning_rate": 1.1062633022956856e-05, "loss": 2.5608, "step": 311500 }, { "epoch": 2.3379527157316606, "grad_norm": 4.3406596183776855, "learning_rate": 1.1050252978088572e-05, "loss": 2.6666, "step": 311600 }, { "epoch": 2.3387030214812534, "grad_norm": 1.722280740737915, "learning_rate": 1.1037747882262023e-05, "loss": 2.4157, "step": 311700 }, { "epoch": 2.3394533272308466, "grad_norm": 1.7642699480056763, "learning_rate": 1.1025242786435474e-05, "loss": 2.7634, "step": 311800 }, { "epoch": 2.3402036329804394, "grad_norm": 1.8987239599227905, "learning_rate": 1.1012737690608923e-05, "loss": 2.6567, "step": 311900 }, { "epoch": 2.3409539387300327, "grad_norm": 2.18400239944458, "learning_rate": 1.1000232594782374e-05, "loss": 2.632, "step": 312000 }, { "epoch": 2.3417042444796254, "grad_norm": 1.9449148178100586, "learning_rate": 1.0987727498955825e-05, "loss": 2.5673, "step": 312100 }, { "epoch": 2.3424545502292182, "grad_norm": 1.3710496425628662, "learning_rate": 1.0975222403129276e-05, "loss": 2.629, "step": 312200 }, { "epoch": 2.3432048559788115, "grad_norm": 1.861373782157898, "learning_rate": 1.0962717307302726e-05, "loss": 2.6866, "step": 312300 }, { "epoch": 2.3439551617284042, "grad_norm": 1.7189323902130127, "learning_rate": 1.0950212211476177e-05, "loss": 2.4808, "step": 312400 }, { "epoch": 2.3447054674779975, "grad_norm": 2.0357396602630615, "learning_rate": 1.0937707115649628e-05, "loss": 2.5849, "step": 312500 }, { "epoch": 2.3454557732275902, "grad_norm": 2.6605656147003174, "learning_rate": 1.0925202019823079e-05, "loss": 2.6064, "step": 312600 }, { "epoch": 2.346206078977183, "grad_norm": 2.6388258934020996, "learning_rate": 1.091269692399653e-05, "loss": 2.6467, "step": 312700 }, { "epoch": 2.3469563847267763, "grad_norm": 2.6703643798828125, "learning_rate": 1.0900191828169979e-05, "loss": 2.6629, "step": 312800 }, { "epoch": 2.347706690476369, "grad_norm": 1.3593746423721313, "learning_rate": 1.088768673234343e-05, "loss": 2.5505, "step": 312900 }, { "epoch": 2.3484569962259623, "grad_norm": 2.5177085399627686, "learning_rate": 1.0875181636516881e-05, "loss": 2.7833, "step": 313000 }, { "epoch": 2.349207301975555, "grad_norm": 2.328702211380005, "learning_rate": 1.0862676540690332e-05, "loss": 2.6107, "step": 313100 }, { "epoch": 2.349957607725148, "grad_norm": 1.7837660312652588, "learning_rate": 1.0850171444863783e-05, "loss": 2.5744, "step": 313200 }, { "epoch": 2.350707913474741, "grad_norm": 1.7953094244003296, "learning_rate": 1.0837666349037232e-05, "loss": 2.6293, "step": 313300 }, { "epoch": 2.351458219224334, "grad_norm": 2.0160934925079346, "learning_rate": 1.0825161253210683e-05, "loss": 2.574, "step": 313400 }, { "epoch": 2.3522085249739266, "grad_norm": 1.8639044761657715, "learning_rate": 1.0812656157384134e-05, "loss": 2.5549, "step": 313500 }, { "epoch": 2.35295883072352, "grad_norm": 2.1086339950561523, "learning_rate": 1.0800151061557585e-05, "loss": 2.6386, "step": 313600 }, { "epoch": 2.3537091364731126, "grad_norm": 2.0798845291137695, "learning_rate": 1.0787771016689301e-05, "loss": 2.5692, "step": 313700 }, { "epoch": 2.354459442222706, "grad_norm": 3.0810914039611816, "learning_rate": 1.0775265920862752e-05, "loss": 2.6206, "step": 313800 }, { "epoch": 2.3552097479722987, "grad_norm": 2.1423470973968506, "learning_rate": 1.0762760825036203e-05, "loss": 2.6756, "step": 313900 }, { "epoch": 2.3559600537218914, "grad_norm": 2.195258855819702, "learning_rate": 1.0750255729209653e-05, "loss": 2.5925, "step": 314000 }, { "epoch": 2.3567103594714847, "grad_norm": 1.6781907081604004, "learning_rate": 1.0737750633383104e-05, "loss": 2.7171, "step": 314100 }, { "epoch": 2.3574606652210774, "grad_norm": 2.309427499771118, "learning_rate": 1.0725245537556555e-05, "loss": 2.5496, "step": 314200 }, { "epoch": 2.3582109709706707, "grad_norm": 1.5668365955352783, "learning_rate": 1.0712740441730006e-05, "loss": 2.5436, "step": 314300 }, { "epoch": 2.3589612767202635, "grad_norm": 2.1599910259246826, "learning_rate": 1.0700235345903457e-05, "loss": 2.5933, "step": 314400 }, { "epoch": 2.3597115824698562, "grad_norm": 1.7549711465835571, "learning_rate": 1.0687730250076906e-05, "loss": 2.4812, "step": 314500 }, { "epoch": 2.3604618882194495, "grad_norm": 2.1905171871185303, "learning_rate": 1.0675225154250359e-05, "loss": 2.5872, "step": 314600 }, { "epoch": 2.3612121939690423, "grad_norm": 1.5285897254943848, "learning_rate": 1.0662720058423808e-05, "loss": 2.5672, "step": 314700 }, { "epoch": 2.3619624997186355, "grad_norm": 1.3582463264465332, "learning_rate": 1.0650340013555525e-05, "loss": 2.4719, "step": 314800 }, { "epoch": 2.3627128054682283, "grad_norm": 1.515625, "learning_rate": 1.0637834917728975e-05, "loss": 2.7023, "step": 314900 }, { "epoch": 2.363463111217821, "grad_norm": 4.054426670074463, "learning_rate": 1.0625329821902426e-05, "loss": 2.5787, "step": 315000 }, { "epoch": 2.3642134169674143, "grad_norm": 1.9147390127182007, "learning_rate": 1.0612824726075877e-05, "loss": 2.503, "step": 315100 }, { "epoch": 2.364963722717007, "grad_norm": 2.682779550552368, "learning_rate": 1.0600319630249326e-05, "loss": 2.5371, "step": 315200 }, { "epoch": 2.3657140284666003, "grad_norm": 1.747098445892334, "learning_rate": 1.0587814534422779e-05, "loss": 2.5876, "step": 315300 }, { "epoch": 2.366464334216193, "grad_norm": 2.306715488433838, "learning_rate": 1.0575309438596228e-05, "loss": 2.6292, "step": 315400 }, { "epoch": 2.367214639965786, "grad_norm": 2.619098663330078, "learning_rate": 1.0562804342769679e-05, "loss": 2.5698, "step": 315500 }, { "epoch": 2.367964945715379, "grad_norm": 1.8151512145996094, "learning_rate": 1.055029924694313e-05, "loss": 2.4601, "step": 315600 }, { "epoch": 2.368715251464972, "grad_norm": 1.5597753524780273, "learning_rate": 1.0537794151116581e-05, "loss": 2.6578, "step": 315700 }, { "epoch": 2.369465557214565, "grad_norm": 1.7222017049789429, "learning_rate": 1.0525289055290032e-05, "loss": 2.4889, "step": 315800 }, { "epoch": 2.370215862964158, "grad_norm": 1.3403700590133667, "learning_rate": 1.0512783959463482e-05, "loss": 2.6473, "step": 315900 }, { "epoch": 2.3709661687137507, "grad_norm": 1.6437480449676514, "learning_rate": 1.0500278863636933e-05, "loss": 2.6423, "step": 316000 }, { "epoch": 2.371716474463344, "grad_norm": 3.012457847595215, "learning_rate": 1.0487773767810382e-05, "loss": 2.6162, "step": 316100 }, { "epoch": 2.3724667802129367, "grad_norm": 1.9253898859024048, "learning_rate": 1.0475268671983835e-05, "loss": 2.7247, "step": 316200 }, { "epoch": 2.37321708596253, "grad_norm": 2.1919641494750977, "learning_rate": 1.0462763576157286e-05, "loss": 2.582, "step": 316300 }, { "epoch": 2.3739673917121227, "grad_norm": 2.4128499031066895, "learning_rate": 1.0450258480330735e-05, "loss": 2.6642, "step": 316400 }, { "epoch": 2.3747176974617155, "grad_norm": 2.009474515914917, "learning_rate": 1.0437753384504186e-05, "loss": 2.6198, "step": 316500 }, { "epoch": 2.3754680032113087, "grad_norm": 1.684191346168518, "learning_rate": 1.0425248288677637e-05, "loss": 2.5403, "step": 316600 }, { "epoch": 2.3762183089609015, "grad_norm": 3.0007104873657227, "learning_rate": 1.0412743192851088e-05, "loss": 2.4764, "step": 316700 }, { "epoch": 2.3769686147104947, "grad_norm": 1.4402117729187012, "learning_rate": 1.0400238097024537e-05, "loss": 2.6935, "step": 316800 }, { "epoch": 2.3777189204600875, "grad_norm": 2.04482102394104, "learning_rate": 1.0387733001197988e-05, "loss": 2.6368, "step": 316900 }, { "epoch": 2.3784692262096803, "grad_norm": 2.1710877418518066, "learning_rate": 1.037522790537144e-05, "loss": 2.5595, "step": 317000 }, { "epoch": 2.3792195319592735, "grad_norm": 2.6785764694213867, "learning_rate": 1.036272280954489e-05, "loss": 2.6599, "step": 317100 }, { "epoch": 2.3799698377088663, "grad_norm": 1.6122616529464722, "learning_rate": 1.0350217713718341e-05, "loss": 2.4207, "step": 317200 }, { "epoch": 2.3807201434584595, "grad_norm": 1.8433833122253418, "learning_rate": 1.033771261789179e-05, "loss": 2.6141, "step": 317300 }, { "epoch": 2.3814704492080523, "grad_norm": 1.8683782815933228, "learning_rate": 1.0325207522065242e-05, "loss": 2.5304, "step": 317400 }, { "epoch": 2.382220754957645, "grad_norm": 1.797879695892334, "learning_rate": 1.0312702426238693e-05, "loss": 2.5968, "step": 317500 }, { "epoch": 2.3829710607072383, "grad_norm": 1.8420451879501343, "learning_rate": 1.0300197330412144e-05, "loss": 2.5873, "step": 317600 }, { "epoch": 2.383721366456831, "grad_norm": 1.7424778938293457, "learning_rate": 1.0287692234585595e-05, "loss": 2.654, "step": 317700 }, { "epoch": 2.3844716722064243, "grad_norm": 1.817161202430725, "learning_rate": 1.0275187138759044e-05, "loss": 2.6504, "step": 317800 }, { "epoch": 2.385221977956017, "grad_norm": 2.067371129989624, "learning_rate": 1.0262682042932495e-05, "loss": 2.7746, "step": 317900 }, { "epoch": 2.38597228370561, "grad_norm": 2.2672338485717773, "learning_rate": 1.0250176947105946e-05, "loss": 2.5645, "step": 318000 }, { "epoch": 2.386722589455203, "grad_norm": 1.9817509651184082, "learning_rate": 1.0237671851279397e-05, "loss": 2.575, "step": 318100 }, { "epoch": 2.387472895204796, "grad_norm": 2.0503695011138916, "learning_rate": 1.0225166755452848e-05, "loss": 2.6448, "step": 318200 }, { "epoch": 2.388223200954389, "grad_norm": 1.509717583656311, "learning_rate": 1.0212661659626297e-05, "loss": 2.6184, "step": 318300 }, { "epoch": 2.388973506703982, "grad_norm": 2.783141613006592, "learning_rate": 1.020015656379975e-05, "loss": 2.6331, "step": 318400 }, { "epoch": 2.3897238124535747, "grad_norm": 1.9050569534301758, "learning_rate": 1.01876514679732e-05, "loss": 2.5423, "step": 318500 }, { "epoch": 2.390474118203168, "grad_norm": 2.4140539169311523, "learning_rate": 1.017514637214665e-05, "loss": 2.5264, "step": 318600 }, { "epoch": 2.3912244239527607, "grad_norm": 2.683797597885132, "learning_rate": 1.01626412763201e-05, "loss": 2.6899, "step": 318700 }, { "epoch": 2.391974729702354, "grad_norm": 2.068958044052124, "learning_rate": 1.0150261231451817e-05, "loss": 2.6171, "step": 318800 }, { "epoch": 2.3927250354519467, "grad_norm": 1.2885767221450806, "learning_rate": 1.0137756135625268e-05, "loss": 2.5518, "step": 318900 }, { "epoch": 2.3934753412015395, "grad_norm": 2.7548437118530273, "learning_rate": 1.0125251039798718e-05, "loss": 2.7116, "step": 319000 }, { "epoch": 2.3942256469511327, "grad_norm": 1.6709879636764526, "learning_rate": 1.011274594397217e-05, "loss": 2.6304, "step": 319100 }, { "epoch": 2.3949759527007255, "grad_norm": 2.08390474319458, "learning_rate": 1.010024084814562e-05, "loss": 2.4779, "step": 319200 }, { "epoch": 2.3957262584503187, "grad_norm": 1.5995687246322632, "learning_rate": 1.008773575231907e-05, "loss": 2.4793, "step": 319300 }, { "epoch": 2.3964765641999115, "grad_norm": 3.4295847415924072, "learning_rate": 1.0075230656492522e-05, "loss": 2.5624, "step": 319400 }, { "epoch": 2.3972268699495043, "grad_norm": 1.5456715822219849, "learning_rate": 1.0062725560665971e-05, "loss": 2.7019, "step": 319500 }, { "epoch": 2.3979771756990975, "grad_norm": 1.5681673288345337, "learning_rate": 1.0050220464839424e-05, "loss": 2.6333, "step": 319600 }, { "epoch": 2.3987274814486903, "grad_norm": 3.707585096359253, "learning_rate": 1.0037715369012873e-05, "loss": 2.7234, "step": 319700 }, { "epoch": 2.3994777871982835, "grad_norm": 2.726243734359741, "learning_rate": 1.0025210273186324e-05, "loss": 2.3536, "step": 319800 }, { "epoch": 2.4002280929478763, "grad_norm": 2.334177255630493, "learning_rate": 1.0012705177359773e-05, "loss": 2.5854, "step": 319900 }, { "epoch": 2.400978398697469, "grad_norm": 2.41328763961792, "learning_rate": 1.0000200081533226e-05, "loss": 2.5711, "step": 320000 }, { "epoch": 2.4017287044470623, "grad_norm": 1.8288795948028564, "learning_rate": 9.987694985706677e-06, "loss": 2.6339, "step": 320100 }, { "epoch": 2.402479010196655, "grad_norm": 2.9753873348236084, "learning_rate": 9.975189889880126e-06, "loss": 2.5085, "step": 320200 }, { "epoch": 2.4032293159462483, "grad_norm": 1.6203949451446533, "learning_rate": 9.962684794053577e-06, "loss": 2.6533, "step": 320300 }, { "epoch": 2.403979621695841, "grad_norm": 1.2577120065689087, "learning_rate": 9.950179698227027e-06, "loss": 2.7429, "step": 320400 }, { "epoch": 2.404729927445434, "grad_norm": 2.842869997024536, "learning_rate": 9.93767460240048e-06, "loss": 2.6009, "step": 320500 }, { "epoch": 2.405480233195027, "grad_norm": 2.5965681076049805, "learning_rate": 9.925169506573929e-06, "loss": 2.6606, "step": 320600 }, { "epoch": 2.40623053894462, "grad_norm": 1.50444495677948, "learning_rate": 9.91266441074738e-06, "loss": 2.5942, "step": 320700 }, { "epoch": 2.406980844694213, "grad_norm": 2.196021795272827, "learning_rate": 9.900284365879097e-06, "loss": 2.7234, "step": 320800 }, { "epoch": 2.407731150443806, "grad_norm": 3.708512544631958, "learning_rate": 9.887779270052547e-06, "loss": 2.6571, "step": 320900 }, { "epoch": 2.4084814561933987, "grad_norm": 1.9676363468170166, "learning_rate": 9.875274174225998e-06, "loss": 2.5355, "step": 321000 }, { "epoch": 2.409231761942992, "grad_norm": 3.169390916824341, "learning_rate": 9.862769078399449e-06, "loss": 2.5688, "step": 321100 }, { "epoch": 2.4099820676925847, "grad_norm": 2.380173921585083, "learning_rate": 9.8502639825729e-06, "loss": 2.5646, "step": 321200 }, { "epoch": 2.4107323734421775, "grad_norm": 2.982416868209839, "learning_rate": 9.837758886746349e-06, "loss": 2.69, "step": 321300 }, { "epoch": 2.4114826791917707, "grad_norm": 2.5773870944976807, "learning_rate": 9.8252537909198e-06, "loss": 2.7343, "step": 321400 }, { "epoch": 2.4122329849413635, "grad_norm": 2.378453254699707, "learning_rate": 9.812748695093251e-06, "loss": 2.5248, "step": 321500 }, { "epoch": 2.4129832906909567, "grad_norm": 1.5384963750839233, "learning_rate": 9.800243599266702e-06, "loss": 2.696, "step": 321600 }, { "epoch": 2.4137335964405495, "grad_norm": 1.8065714836120605, "learning_rate": 9.787738503440153e-06, "loss": 2.6199, "step": 321700 }, { "epoch": 2.4144839021901423, "grad_norm": 1.5026440620422363, "learning_rate": 9.775233407613602e-06, "loss": 2.5846, "step": 321800 }, { "epoch": 2.4152342079397355, "grad_norm": 2.7265825271606445, "learning_rate": 9.762728311787053e-06, "loss": 2.552, "step": 321900 }, { "epoch": 2.4159845136893283, "grad_norm": 1.6912727355957031, "learning_rate": 9.750223215960504e-06, "loss": 2.7806, "step": 322000 }, { "epoch": 2.4167348194389215, "grad_norm": 1.4995813369750977, "learning_rate": 9.737718120133955e-06, "loss": 2.6377, "step": 322100 }, { "epoch": 2.4174851251885143, "grad_norm": 1.7775238752365112, "learning_rate": 9.725213024307406e-06, "loss": 2.535, "step": 322200 }, { "epoch": 2.418235430938107, "grad_norm": 1.5035706758499146, "learning_rate": 9.712832979439122e-06, "loss": 2.5608, "step": 322300 }, { "epoch": 2.4189857366877003, "grad_norm": 2.28379487991333, "learning_rate": 9.700327883612573e-06, "loss": 2.6776, "step": 322400 }, { "epoch": 2.419736042437293, "grad_norm": 2.0236008167266846, "learning_rate": 9.687822787786022e-06, "loss": 2.6198, "step": 322500 }, { "epoch": 2.4204863481868863, "grad_norm": 2.747851848602295, "learning_rate": 9.675317691959473e-06, "loss": 2.6357, "step": 322600 }, { "epoch": 2.421236653936479, "grad_norm": 2.354316473007202, "learning_rate": 9.662812596132924e-06, "loss": 2.6415, "step": 322700 }, { "epoch": 2.421986959686072, "grad_norm": 2.828082323074341, "learning_rate": 9.650307500306375e-06, "loss": 2.4734, "step": 322800 }, { "epoch": 2.422737265435665, "grad_norm": 1.7954586744308472, "learning_rate": 9.637802404479826e-06, "loss": 2.6701, "step": 322900 }, { "epoch": 2.423487571185258, "grad_norm": 2.489516496658325, "learning_rate": 9.625297308653276e-06, "loss": 2.6368, "step": 323000 }, { "epoch": 2.4242378769348507, "grad_norm": 3.4772844314575195, "learning_rate": 9.612792212826727e-06, "loss": 2.5857, "step": 323100 }, { "epoch": 2.424988182684444, "grad_norm": 2.8526694774627686, "learning_rate": 9.600287117000178e-06, "loss": 2.7009, "step": 323200 }, { "epoch": 2.4257384884340367, "grad_norm": 2.5367507934570312, "learning_rate": 9.587782021173629e-06, "loss": 2.5238, "step": 323300 }, { "epoch": 2.42648879418363, "grad_norm": 2.3718349933624268, "learning_rate": 9.57527692534708e-06, "loss": 2.6126, "step": 323400 }, { "epoch": 2.4272390999332227, "grad_norm": 2.5304007530212402, "learning_rate": 9.56277182952053e-06, "loss": 2.5729, "step": 323500 }, { "epoch": 2.4279894056828155, "grad_norm": 2.352348804473877, "learning_rate": 9.550266733693982e-06, "loss": 2.5002, "step": 323600 }, { "epoch": 2.4287397114324087, "grad_norm": 2.1964478492736816, "learning_rate": 9.537761637867431e-06, "loss": 2.5001, "step": 323700 }, { "epoch": 2.4294900171820015, "grad_norm": 1.5854220390319824, "learning_rate": 9.525256542040882e-06, "loss": 2.641, "step": 323800 }, { "epoch": 2.4302403229315948, "grad_norm": 1.7194136381149292, "learning_rate": 9.512751446214333e-06, "loss": 2.5524, "step": 323900 }, { "epoch": 2.4309906286811875, "grad_norm": 1.920021891593933, "learning_rate": 9.500246350387783e-06, "loss": 2.6865, "step": 324000 }, { "epoch": 2.4317409344307803, "grad_norm": 2.313459634780884, "learning_rate": 9.487741254561235e-06, "loss": 2.5875, "step": 324100 }, { "epoch": 2.4324912401803735, "grad_norm": 2.209449052810669, "learning_rate": 9.475236158734685e-06, "loss": 2.667, "step": 324200 }, { "epoch": 2.4332415459299663, "grad_norm": 2.4453959465026855, "learning_rate": 9.462731062908136e-06, "loss": 2.6081, "step": 324300 }, { "epoch": 2.4339918516795596, "grad_norm": 1.9108469486236572, "learning_rate": 9.450225967081585e-06, "loss": 2.3927, "step": 324400 }, { "epoch": 2.4347421574291523, "grad_norm": 2.2558658123016357, "learning_rate": 9.437720871255038e-06, "loss": 2.6519, "step": 324500 }, { "epoch": 2.435492463178745, "grad_norm": 2.7128031253814697, "learning_rate": 9.425215775428487e-06, "loss": 2.6122, "step": 324600 }, { "epoch": 2.4362427689283384, "grad_norm": 1.595797061920166, "learning_rate": 9.412710679601938e-06, "loss": 2.5577, "step": 324700 }, { "epoch": 2.436993074677931, "grad_norm": 1.8155691623687744, "learning_rate": 9.400205583775389e-06, "loss": 2.4858, "step": 324800 }, { "epoch": 2.4377433804275244, "grad_norm": 1.8783681392669678, "learning_rate": 9.387700487948838e-06, "loss": 2.4247, "step": 324900 }, { "epoch": 2.438493686177117, "grad_norm": 1.7750028371810913, "learning_rate": 9.375195392122291e-06, "loss": 2.6249, "step": 325000 }, { "epoch": 2.43924399192671, "grad_norm": 2.895244836807251, "learning_rate": 9.36269029629574e-06, "loss": 2.5591, "step": 325100 }, { "epoch": 2.439994297676303, "grad_norm": 3.1671745777130127, "learning_rate": 9.350185200469191e-06, "loss": 2.4846, "step": 325200 }, { "epoch": 2.440744603425896, "grad_norm": 2.0725669860839844, "learning_rate": 9.337680104642642e-06, "loss": 2.5451, "step": 325300 }, { "epoch": 2.441494909175489, "grad_norm": 1.5684928894042969, "learning_rate": 9.325175008816093e-06, "loss": 2.5838, "step": 325400 }, { "epoch": 2.442245214925082, "grad_norm": 2.4977684020996094, "learning_rate": 9.312669912989544e-06, "loss": 2.7665, "step": 325500 }, { "epoch": 2.4429955206746747, "grad_norm": 1.8156465291976929, "learning_rate": 9.300164817162994e-06, "loss": 2.5335, "step": 325600 }, { "epoch": 2.443745826424268, "grad_norm": 1.476993441581726, "learning_rate": 9.287659721336445e-06, "loss": 2.6467, "step": 325700 }, { "epoch": 2.4444961321738607, "grad_norm": 2.8897271156311035, "learning_rate": 9.275154625509896e-06, "loss": 2.6771, "step": 325800 }, { "epoch": 2.445246437923454, "grad_norm": 1.86719810962677, "learning_rate": 9.262649529683347e-06, "loss": 2.4475, "step": 325900 }, { "epoch": 2.4459967436730468, "grad_norm": 2.418515920639038, "learning_rate": 9.250144433856798e-06, "loss": 2.6206, "step": 326000 }, { "epoch": 2.4467470494226395, "grad_norm": 3.1310105323791504, "learning_rate": 9.237639338030247e-06, "loss": 2.7265, "step": 326100 }, { "epoch": 2.4474973551722328, "grad_norm": 1.589293122291565, "learning_rate": 9.225134242203698e-06, "loss": 2.5102, "step": 326200 }, { "epoch": 2.4482476609218256, "grad_norm": 1.445005178451538, "learning_rate": 9.212754197335414e-06, "loss": 2.7664, "step": 326300 }, { "epoch": 2.448997966671419, "grad_norm": 2.2900209426879883, "learning_rate": 9.200249101508865e-06, "loss": 2.6065, "step": 326400 }, { "epoch": 2.4497482724210116, "grad_norm": 1.644031047821045, "learning_rate": 9.187744005682316e-06, "loss": 2.5746, "step": 326500 }, { "epoch": 2.4504985781706043, "grad_norm": 2.536602735519409, "learning_rate": 9.175238909855767e-06, "loss": 2.6126, "step": 326600 }, { "epoch": 2.4512488839201976, "grad_norm": 1.600121021270752, "learning_rate": 9.162733814029218e-06, "loss": 2.6499, "step": 326700 }, { "epoch": 2.4519991896697904, "grad_norm": 1.8909085988998413, "learning_rate": 9.150228718202667e-06, "loss": 2.6509, "step": 326800 }, { "epoch": 2.4527494954193836, "grad_norm": 1.8307281732559204, "learning_rate": 9.137723622376118e-06, "loss": 2.5496, "step": 326900 }, { "epoch": 2.4534998011689764, "grad_norm": 2.5986745357513428, "learning_rate": 9.12521852654957e-06, "loss": 2.6829, "step": 327000 }, { "epoch": 2.454250106918569, "grad_norm": 2.5019965171813965, "learning_rate": 9.11271343072302e-06, "loss": 2.6081, "step": 327100 }, { "epoch": 2.4550004126681624, "grad_norm": 3.5042240619659424, "learning_rate": 9.100208334896471e-06, "loss": 2.6941, "step": 327200 }, { "epoch": 2.455750718417755, "grad_norm": 2.1198794841766357, "learning_rate": 9.08770323906992e-06, "loss": 2.7039, "step": 327300 }, { "epoch": 2.4565010241673484, "grad_norm": 1.6794726848602295, "learning_rate": 9.075198143243373e-06, "loss": 2.682, "step": 327400 }, { "epoch": 2.457251329916941, "grad_norm": 2.110032320022583, "learning_rate": 9.062693047416823e-06, "loss": 2.5836, "step": 327500 }, { "epoch": 2.458001635666534, "grad_norm": 2.892002820968628, "learning_rate": 9.050313002548538e-06, "loss": 2.6262, "step": 327600 }, { "epoch": 2.458751941416127, "grad_norm": 1.5849529504776, "learning_rate": 9.03780790672199e-06, "loss": 2.7047, "step": 327700 }, { "epoch": 2.45950224716572, "grad_norm": 1.4462875127792358, "learning_rate": 9.02530281089544e-06, "loss": 2.6265, "step": 327800 }, { "epoch": 2.460252552915313, "grad_norm": 2.518441915512085, "learning_rate": 9.012797715068892e-06, "loss": 2.7073, "step": 327900 }, { "epoch": 2.461002858664906, "grad_norm": 2.7888994216918945, "learning_rate": 9.00029261924234e-06, "loss": 2.5055, "step": 328000 }, { "epoch": 2.4617531644144988, "grad_norm": 1.7096757888793945, "learning_rate": 8.987787523415794e-06, "loss": 2.6336, "step": 328100 }, { "epoch": 2.462503470164092, "grad_norm": 2.292510986328125, "learning_rate": 8.975282427589243e-06, "loss": 2.6159, "step": 328200 }, { "epoch": 2.4632537759136848, "grad_norm": 1.8179641962051392, "learning_rate": 8.962777331762694e-06, "loss": 2.4539, "step": 328300 }, { "epoch": 2.464004081663278, "grad_norm": 1.8188307285308838, "learning_rate": 8.950272235936143e-06, "loss": 2.422, "step": 328400 }, { "epoch": 2.464754387412871, "grad_norm": 2.165984869003296, "learning_rate": 8.937767140109594e-06, "loss": 2.6217, "step": 328500 }, { "epoch": 2.4655046931624636, "grad_norm": 1.6129826307296753, "learning_rate": 8.925262044283047e-06, "loss": 2.8213, "step": 328600 }, { "epoch": 2.466254998912057, "grad_norm": 1.8243364095687866, "learning_rate": 8.912756948456496e-06, "loss": 2.5934, "step": 328700 }, { "epoch": 2.4670053046616496, "grad_norm": 1.5155155658721924, "learning_rate": 8.900251852629947e-06, "loss": 2.6696, "step": 328800 }, { "epoch": 2.467755610411243, "grad_norm": 1.6866010427474976, "learning_rate": 8.887746756803397e-06, "loss": 2.5775, "step": 328900 }, { "epoch": 2.4685059161608356, "grad_norm": 2.33107590675354, "learning_rate": 8.87524166097685e-06, "loss": 2.5839, "step": 329000 }, { "epoch": 2.4692562219104284, "grad_norm": 2.6548943519592285, "learning_rate": 8.862736565150299e-06, "loss": 2.5645, "step": 329100 }, { "epoch": 2.4700065276600216, "grad_norm": 1.9178476333618164, "learning_rate": 8.85023146932375e-06, "loss": 2.6193, "step": 329200 }, { "epoch": 2.4707568334096144, "grad_norm": 2.820767402648926, "learning_rate": 8.8377263734972e-06, "loss": 2.5356, "step": 329300 }, { "epoch": 2.4715071391592076, "grad_norm": 2.1721036434173584, "learning_rate": 8.82522127767065e-06, "loss": 2.5912, "step": 329400 }, { "epoch": 2.4722574449088004, "grad_norm": 1.9252573251724243, "learning_rate": 8.812716181844103e-06, "loss": 2.535, "step": 329500 }, { "epoch": 2.473007750658393, "grad_norm": 1.633926510810852, "learning_rate": 8.800336136975817e-06, "loss": 2.5619, "step": 329600 }, { "epoch": 2.4737580564079864, "grad_norm": 2.4749765396118164, "learning_rate": 8.78783104114927e-06, "loss": 2.7468, "step": 329700 }, { "epoch": 2.474508362157579, "grad_norm": 2.213408946990967, "learning_rate": 8.77532594532272e-06, "loss": 2.6946, "step": 329800 }, { "epoch": 2.4752586679071724, "grad_norm": 2.5451297760009766, "learning_rate": 8.76282084949617e-06, "loss": 2.6473, "step": 329900 }, { "epoch": 2.476008973656765, "grad_norm": 2.0401804447174072, "learning_rate": 8.75031575366962e-06, "loss": 2.7235, "step": 330000 }, { "epoch": 2.476759279406358, "grad_norm": 2.4314186573028564, "learning_rate": 8.737810657843072e-06, "loss": 2.5946, "step": 330100 }, { "epoch": 2.477509585155951, "grad_norm": 2.242508888244629, "learning_rate": 8.725305562016523e-06, "loss": 2.526, "step": 330200 }, { "epoch": 2.478259890905544, "grad_norm": 2.0018928050994873, "learning_rate": 8.712800466189972e-06, "loss": 2.6575, "step": 330300 }, { "epoch": 2.4790101966551368, "grad_norm": 2.1769182682037354, "learning_rate": 8.700295370363423e-06, "loss": 2.648, "step": 330400 }, { "epoch": 2.47976050240473, "grad_norm": 2.1352710723876953, "learning_rate": 8.687790274536874e-06, "loss": 2.7342, "step": 330500 }, { "epoch": 2.480510808154323, "grad_norm": 2.4450430870056152, "learning_rate": 8.675285178710325e-06, "loss": 2.6344, "step": 330600 }, { "epoch": 2.481261113903916, "grad_norm": 2.134237766265869, "learning_rate": 8.662780082883776e-06, "loss": 2.465, "step": 330700 }, { "epoch": 2.482011419653509, "grad_norm": 4.443004608154297, "learning_rate": 8.650274987057226e-06, "loss": 2.6331, "step": 330800 }, { "epoch": 2.4827617254031016, "grad_norm": 1.5482871532440186, "learning_rate": 8.637769891230677e-06, "loss": 2.5594, "step": 330900 }, { "epoch": 2.483512031152695, "grad_norm": 3.357581615447998, "learning_rate": 8.625264795404128e-06, "loss": 2.6489, "step": 331000 }, { "epoch": 2.4842623369022876, "grad_norm": 2.006734848022461, "learning_rate": 8.612759699577579e-06, "loss": 2.5641, "step": 331100 }, { "epoch": 2.485012642651881, "grad_norm": 3.1221957206726074, "learning_rate": 8.60025460375103e-06, "loss": 2.664, "step": 331200 }, { "epoch": 2.4857629484014736, "grad_norm": 2.2118375301361084, "learning_rate": 8.587749507924479e-06, "loss": 2.6522, "step": 331300 }, { "epoch": 2.4865132541510664, "grad_norm": 2.2800331115722656, "learning_rate": 8.57524441209793e-06, "loss": 2.7514, "step": 331400 }, { "epoch": 2.4872635599006596, "grad_norm": 2.147138833999634, "learning_rate": 8.562739316271381e-06, "loss": 2.6457, "step": 331500 }, { "epoch": 2.4880138656502524, "grad_norm": 1.5632050037384033, "learning_rate": 8.550234220444832e-06, "loss": 2.6785, "step": 331600 }, { "epoch": 2.4887641713998456, "grad_norm": 2.044084072113037, "learning_rate": 8.537729124618283e-06, "loss": 2.4706, "step": 331700 }, { "epoch": 2.4895144771494384, "grad_norm": 1.8024667501449585, "learning_rate": 8.525224028791732e-06, "loss": 2.6191, "step": 331800 }, { "epoch": 2.490264782899031, "grad_norm": 1.6234326362609863, "learning_rate": 8.512718932965185e-06, "loss": 2.544, "step": 331900 }, { "epoch": 2.4910150886486244, "grad_norm": 1.303628921508789, "learning_rate": 8.500213837138634e-06, "loss": 2.7237, "step": 332000 }, { "epoch": 2.491765394398217, "grad_norm": 2.0385546684265137, "learning_rate": 8.487708741312085e-06, "loss": 2.6047, "step": 332100 }, { "epoch": 2.49251570014781, "grad_norm": 1.6391701698303223, "learning_rate": 8.475203645485535e-06, "loss": 2.7369, "step": 332200 }, { "epoch": 2.493266005897403, "grad_norm": 1.4800525903701782, "learning_rate": 8.462698549658986e-06, "loss": 2.576, "step": 332300 }, { "epoch": 2.494016311646996, "grad_norm": 2.3062684535980225, "learning_rate": 8.450193453832438e-06, "loss": 2.7355, "step": 332400 }, { "epoch": 2.4947666173965892, "grad_norm": 2.2018158435821533, "learning_rate": 8.437688358005888e-06, "loss": 2.696, "step": 332500 }, { "epoch": 2.495516923146182, "grad_norm": 2.302565097808838, "learning_rate": 8.425183262179339e-06, "loss": 2.6224, "step": 332600 }, { "epoch": 2.496267228895775, "grad_norm": 2.0073564052581787, "learning_rate": 8.412678166352788e-06, "loss": 2.579, "step": 332700 }, { "epoch": 2.497017534645368, "grad_norm": 2.6570277214050293, "learning_rate": 8.40017307052624e-06, "loss": 2.7488, "step": 332800 }, { "epoch": 2.497767840394961, "grad_norm": 1.4606101512908936, "learning_rate": 8.38766797469969e-06, "loss": 2.7064, "step": 332900 }, { "epoch": 2.498518146144554, "grad_norm": 2.0754003524780273, "learning_rate": 8.375162878873141e-06, "loss": 2.6253, "step": 333000 }, { "epoch": 2.499268451894147, "grad_norm": 2.11366605758667, "learning_rate": 8.362657783046592e-06, "loss": 2.6814, "step": 333100 }, { "epoch": 2.5000187576437396, "grad_norm": 1.917872428894043, "learning_rate": 8.350152687220041e-06, "loss": 2.5356, "step": 333200 }, { "epoch": 2.500769063393333, "grad_norm": 1.698380947113037, "learning_rate": 8.337647591393494e-06, "loss": 2.4523, "step": 333300 }, { "epoch": 2.5015193691429256, "grad_norm": 2.4579780101776123, "learning_rate": 8.325142495566944e-06, "loss": 2.5362, "step": 333400 }, { "epoch": 2.502269674892519, "grad_norm": 1.9996274709701538, "learning_rate": 8.312637399740395e-06, "loss": 2.6604, "step": 333500 }, { "epoch": 2.5030199806421116, "grad_norm": 1.4296164512634277, "learning_rate": 8.30025735487211e-06, "loss": 2.5964, "step": 333600 }, { "epoch": 2.5037702863917044, "grad_norm": 2.5872793197631836, "learning_rate": 8.287752259045561e-06, "loss": 2.6514, "step": 333700 }, { "epoch": 2.5045205921412976, "grad_norm": 3.0169572830200195, "learning_rate": 8.275247163219012e-06, "loss": 2.758, "step": 333800 }, { "epoch": 2.5052708978908904, "grad_norm": 2.4743967056274414, "learning_rate": 8.262742067392462e-06, "loss": 2.5892, "step": 333900 }, { "epoch": 2.5060212036404836, "grad_norm": 3.348289728164673, "learning_rate": 8.250236971565914e-06, "loss": 2.5166, "step": 334000 }, { "epoch": 2.5067715093900764, "grad_norm": 1.9004870653152466, "learning_rate": 8.237731875739364e-06, "loss": 2.6382, "step": 334100 }, { "epoch": 2.507521815139669, "grad_norm": 3.6000423431396484, "learning_rate": 8.225226779912815e-06, "loss": 2.5865, "step": 334200 }, { "epoch": 2.5082721208892624, "grad_norm": 3.054227828979492, "learning_rate": 8.212721684086266e-06, "loss": 2.5984, "step": 334300 }, { "epoch": 2.509022426638855, "grad_norm": 1.868584394454956, "learning_rate": 8.200216588259717e-06, "loss": 2.6086, "step": 334400 }, { "epoch": 2.5097727323884484, "grad_norm": 1.5333629846572876, "learning_rate": 8.187711492433168e-06, "loss": 2.7192, "step": 334500 }, { "epoch": 2.5105230381380412, "grad_norm": 2.493169069290161, "learning_rate": 8.175206396606617e-06, "loss": 2.5758, "step": 334600 }, { "epoch": 2.511273343887634, "grad_norm": 2.8925745487213135, "learning_rate": 8.162701300780068e-06, "loss": 2.5101, "step": 334700 }, { "epoch": 2.5120236496372272, "grad_norm": 2.4412386417388916, "learning_rate": 8.150196204953519e-06, "loss": 2.5808, "step": 334800 }, { "epoch": 2.51277395538682, "grad_norm": 1.744142770767212, "learning_rate": 8.13769110912697e-06, "loss": 2.6812, "step": 334900 }, { "epoch": 2.5135242611364133, "grad_norm": 2.8063182830810547, "learning_rate": 8.125186013300421e-06, "loss": 2.7311, "step": 335000 }, { "epoch": 2.514274566886006, "grad_norm": 1.5492258071899414, "learning_rate": 8.11268091747387e-06, "loss": 2.4349, "step": 335100 }, { "epoch": 2.515024872635599, "grad_norm": 1.4118491411209106, "learning_rate": 8.100175821647321e-06, "loss": 2.6178, "step": 335200 }, { "epoch": 2.515775178385192, "grad_norm": 1.7479945421218872, "learning_rate": 8.087670725820772e-06, "loss": 2.6101, "step": 335300 }, { "epoch": 2.516525484134785, "grad_norm": 1.7041529417037964, "learning_rate": 8.075165629994223e-06, "loss": 2.4828, "step": 335400 }, { "epoch": 2.517275789884378, "grad_norm": 1.989111065864563, "learning_rate": 8.062660534167673e-06, "loss": 2.4584, "step": 335500 }, { "epoch": 2.518026095633971, "grad_norm": 1.7306188344955444, "learning_rate": 8.050155438341124e-06, "loss": 2.6552, "step": 335600 }, { "epoch": 2.5187764013835636, "grad_norm": 2.5682766437530518, "learning_rate": 8.037775393472841e-06, "loss": 2.6447, "step": 335700 }, { "epoch": 2.519526707133157, "grad_norm": 1.6468662023544312, "learning_rate": 8.02527029764629e-06, "loss": 2.6255, "step": 335800 }, { "epoch": 2.5202770128827496, "grad_norm": 2.0624780654907227, "learning_rate": 8.012765201819742e-06, "loss": 2.5309, "step": 335900 }, { "epoch": 2.521027318632343, "grad_norm": 1.931422472000122, "learning_rate": 8.000260105993193e-06, "loss": 2.4957, "step": 336000 }, { "epoch": 2.5217776243819356, "grad_norm": 1.642999529838562, "learning_rate": 7.987755010166644e-06, "loss": 2.5572, "step": 336100 }, { "epoch": 2.5225279301315284, "grad_norm": 2.182037830352783, "learning_rate": 7.975249914340095e-06, "loss": 2.5586, "step": 336200 }, { "epoch": 2.5232782358811217, "grad_norm": 1.5089385509490967, "learning_rate": 7.962744818513544e-06, "loss": 2.6607, "step": 336300 }, { "epoch": 2.5240285416307144, "grad_norm": 2.710645914077759, "learning_rate": 7.950239722686997e-06, "loss": 2.5628, "step": 336400 }, { "epoch": 2.5247788473803077, "grad_norm": 1.570879578590393, "learning_rate": 7.937734626860446e-06, "loss": 2.5777, "step": 336500 }, { "epoch": 2.5255291531299005, "grad_norm": 1.8518353700637817, "learning_rate": 7.925229531033897e-06, "loss": 2.6314, "step": 336600 }, { "epoch": 2.5262794588794932, "grad_norm": 1.8629765510559082, "learning_rate": 7.912724435207346e-06, "loss": 2.6434, "step": 336700 }, { "epoch": 2.5270297646290865, "grad_norm": 3.0152058601379395, "learning_rate": 7.900219339380797e-06, "loss": 2.5133, "step": 336800 }, { "epoch": 2.5277800703786792, "grad_norm": 1.7443206310272217, "learning_rate": 7.887714243554248e-06, "loss": 2.4859, "step": 336900 }, { "epoch": 2.5285303761282725, "grad_norm": 2.03421688079834, "learning_rate": 7.8752091477277e-06, "loss": 2.6534, "step": 337000 }, { "epoch": 2.5292806818778653, "grad_norm": 2.537003755569458, "learning_rate": 7.86270405190115e-06, "loss": 2.5285, "step": 337100 }, { "epoch": 2.530030987627458, "grad_norm": 1.9931659698486328, "learning_rate": 7.8501989560746e-06, "loss": 2.5313, "step": 337200 }, { "epoch": 2.5307812933770513, "grad_norm": 1.994364857673645, "learning_rate": 7.837693860248052e-06, "loss": 2.5342, "step": 337300 }, { "epoch": 2.531531599126644, "grad_norm": 1.7115684747695923, "learning_rate": 7.825188764421502e-06, "loss": 2.6166, "step": 337400 }, { "epoch": 2.5322819048762373, "grad_norm": 1.6577633619308472, "learning_rate": 7.812683668594953e-06, "loss": 2.6414, "step": 337500 }, { "epoch": 2.53303221062583, "grad_norm": 2.349367141723633, "learning_rate": 7.800178572768404e-06, "loss": 2.5838, "step": 337600 }, { "epoch": 2.533782516375423, "grad_norm": 1.5493122339248657, "learning_rate": 7.787673476941853e-06, "loss": 2.5208, "step": 337700 }, { "epoch": 2.534532822125016, "grad_norm": 3.604438066482544, "learning_rate": 7.77529343207357e-06, "loss": 2.6024, "step": 337800 }, { "epoch": 2.535283127874609, "grad_norm": 1.4478806257247925, "learning_rate": 7.76278833624702e-06, "loss": 2.7491, "step": 337900 }, { "epoch": 2.536033433624202, "grad_norm": 2.922895669937134, "learning_rate": 7.750283240420473e-06, "loss": 2.7098, "step": 338000 }, { "epoch": 2.536783739373795, "grad_norm": 2.4405345916748047, "learning_rate": 7.737778144593922e-06, "loss": 2.6477, "step": 338100 }, { "epoch": 2.5375340451233876, "grad_norm": 2.275482416152954, "learning_rate": 7.725273048767373e-06, "loss": 2.6201, "step": 338200 }, { "epoch": 2.538284350872981, "grad_norm": 1.8202006816864014, "learning_rate": 7.712767952940824e-06, "loss": 2.3979, "step": 338300 }, { "epoch": 2.5390346566225737, "grad_norm": 3.0519165992736816, "learning_rate": 7.700262857114273e-06, "loss": 2.5813, "step": 338400 }, { "epoch": 2.539784962372167, "grad_norm": 2.047832489013672, "learning_rate": 7.687757761287726e-06, "loss": 2.6632, "step": 338500 }, { "epoch": 2.5405352681217597, "grad_norm": 1.9051623344421387, "learning_rate": 7.67537771641944e-06, "loss": 2.7544, "step": 338600 }, { "epoch": 2.5412855738713525, "grad_norm": 1.5043554306030273, "learning_rate": 7.662872620592893e-06, "loss": 2.5512, "step": 338700 }, { "epoch": 2.5420358796209457, "grad_norm": 2.1277756690979004, "learning_rate": 7.650367524766342e-06, "loss": 2.5365, "step": 338800 }, { "epoch": 2.5427861853705385, "grad_norm": 2.748286724090576, "learning_rate": 7.637862428939793e-06, "loss": 2.6363, "step": 338900 }, { "epoch": 2.5435364911201317, "grad_norm": 2.324490785598755, "learning_rate": 7.625357333113243e-06, "loss": 2.6376, "step": 339000 }, { "epoch": 2.5442867968697245, "grad_norm": 1.99073326587677, "learning_rate": 7.612852237286695e-06, "loss": 2.4678, "step": 339100 }, { "epoch": 2.5450371026193173, "grad_norm": 2.4580142498016357, "learning_rate": 7.600347141460146e-06, "loss": 2.6134, "step": 339200 }, { "epoch": 2.5457874083689105, "grad_norm": 2.3560922145843506, "learning_rate": 7.587842045633596e-06, "loss": 2.6689, "step": 339300 }, { "epoch": 2.5465377141185033, "grad_norm": 1.404917597770691, "learning_rate": 7.5753369498070464e-06, "loss": 2.5663, "step": 339400 }, { "epoch": 2.5472880198680965, "grad_norm": 1.8341381549835205, "learning_rate": 7.562831853980497e-06, "loss": 2.4548, "step": 339500 }, { "epoch": 2.5480383256176893, "grad_norm": 1.8727697134017944, "learning_rate": 7.5503267581539485e-06, "loss": 2.595, "step": 339600 }, { "epoch": 2.548788631367282, "grad_norm": 2.732870578765869, "learning_rate": 7.537821662327399e-06, "loss": 2.6373, "step": 339700 }, { "epoch": 2.5495389371168753, "grad_norm": 2.929840087890625, "learning_rate": 7.52531656650085e-06, "loss": 2.5925, "step": 339800 }, { "epoch": 2.550289242866468, "grad_norm": 2.005859136581421, "learning_rate": 7.5128114706743e-06, "loss": 2.6229, "step": 339900 }, { "epoch": 2.5510395486160613, "grad_norm": 2.0198705196380615, "learning_rate": 7.500306374847752e-06, "loss": 2.5806, "step": 340000 }, { "epoch": 2.551789854365654, "grad_norm": 1.6483122110366821, "learning_rate": 7.487801279021202e-06, "loss": 2.6313, "step": 340100 }, { "epoch": 2.552540160115247, "grad_norm": 1.8223135471343994, "learning_rate": 7.475296183194652e-06, "loss": 2.6488, "step": 340200 }, { "epoch": 2.5532904658648397, "grad_norm": 2.3268048763275146, "learning_rate": 7.462791087368102e-06, "loss": 2.6599, "step": 340300 }, { "epoch": 2.554040771614433, "grad_norm": 1.7521164417266846, "learning_rate": 7.450285991541553e-06, "loss": 2.5366, "step": 340400 }, { "epoch": 2.554791077364026, "grad_norm": 2.2516255378723145, "learning_rate": 7.437780895715005e-06, "loss": 2.8009, "step": 340500 }, { "epoch": 2.555541383113619, "grad_norm": 1.8112910985946655, "learning_rate": 7.425275799888455e-06, "loss": 2.5363, "step": 340600 }, { "epoch": 2.5562916888632117, "grad_norm": 1.9161149263381958, "learning_rate": 7.4127707040619054e-06, "loss": 2.5427, "step": 340700 }, { "epoch": 2.5570419946128045, "grad_norm": 1.6606091260910034, "learning_rate": 7.400265608235356e-06, "loss": 2.5663, "step": 340800 }, { "epoch": 2.5577923003623977, "grad_norm": 1.6055129766464233, "learning_rate": 7.3877605124088075e-06, "loss": 2.5563, "step": 340900 }, { "epoch": 2.558542606111991, "grad_norm": 1.9305312633514404, "learning_rate": 7.375255416582258e-06, "loss": 2.4536, "step": 341000 }, { "epoch": 2.5592929118615837, "grad_norm": 2.1383469104766846, "learning_rate": 7.362750320755709e-06, "loss": 2.4863, "step": 341100 }, { "epoch": 2.5600432176111765, "grad_norm": 2.752330780029297, "learning_rate": 7.350245224929159e-06, "loss": 2.5704, "step": 341200 }, { "epoch": 2.5607935233607693, "grad_norm": 1.7061856985092163, "learning_rate": 7.337740129102609e-06, "loss": 2.5721, "step": 341300 }, { "epoch": 2.5615438291103625, "grad_norm": 2.245173454284668, "learning_rate": 7.325235033276061e-06, "loss": 2.6623, "step": 341400 }, { "epoch": 2.5622941348599553, "grad_norm": 1.3698896169662476, "learning_rate": 7.312729937449511e-06, "loss": 2.6807, "step": 341500 }, { "epoch": 2.5630444406095485, "grad_norm": 3.056504964828491, "learning_rate": 7.300224841622961e-06, "loss": 2.464, "step": 341600 }, { "epoch": 2.5637947463591413, "grad_norm": 2.047161102294922, "learning_rate": 7.287719745796412e-06, "loss": 2.6793, "step": 341700 }, { "epoch": 2.564545052108734, "grad_norm": 1.856803297996521, "learning_rate": 7.275214649969864e-06, "loss": 2.6017, "step": 341800 }, { "epoch": 2.5652953578583273, "grad_norm": 1.9524445533752441, "learning_rate": 7.262709554143314e-06, "loss": 2.5967, "step": 341900 }, { "epoch": 2.56604566360792, "grad_norm": 1.6474368572235107, "learning_rate": 7.250204458316764e-06, "loss": 2.696, "step": 342000 }, { "epoch": 2.5667959693575133, "grad_norm": 2.05098032951355, "learning_rate": 7.237699362490215e-06, "loss": 2.577, "step": 342100 }, { "epoch": 2.567546275107106, "grad_norm": 1.4670623540878296, "learning_rate": 7.225194266663665e-06, "loss": 2.6754, "step": 342200 }, { "epoch": 2.568296580856699, "grad_norm": 2.285264730453491, "learning_rate": 7.212689170837117e-06, "loss": 2.7318, "step": 342300 }, { "epoch": 2.569046886606292, "grad_norm": 1.4632030725479126, "learning_rate": 7.200184075010568e-06, "loss": 2.6087, "step": 342400 }, { "epoch": 2.569797192355885, "grad_norm": 2.258666753768921, "learning_rate": 7.187678979184018e-06, "loss": 2.5722, "step": 342500 }, { "epoch": 2.570547498105478, "grad_norm": 2.709440231323242, "learning_rate": 7.175298934315734e-06, "loss": 2.2378, "step": 342600 }, { "epoch": 2.571297803855071, "grad_norm": 2.1037185192108154, "learning_rate": 7.1627938384891845e-06, "loss": 2.5592, "step": 342700 }, { "epoch": 2.5720481096046637, "grad_norm": 2.2382352352142334, "learning_rate": 7.150288742662635e-06, "loss": 2.6187, "step": 342800 }, { "epoch": 2.572798415354257, "grad_norm": 2.4388909339904785, "learning_rate": 7.137783646836086e-06, "loss": 2.6939, "step": 342900 }, { "epoch": 2.5735487211038497, "grad_norm": 1.5464428663253784, "learning_rate": 7.125278551009537e-06, "loss": 2.4875, "step": 343000 }, { "epoch": 2.574299026853443, "grad_norm": 2.671454429626465, "learning_rate": 7.112773455182988e-06, "loss": 2.7333, "step": 343100 }, { "epoch": 2.5750493326030357, "grad_norm": 3.9727165699005127, "learning_rate": 7.100268359356438e-06, "loss": 2.5716, "step": 343200 }, { "epoch": 2.5757996383526285, "grad_norm": 1.4055310487747192, "learning_rate": 7.087763263529888e-06, "loss": 2.5529, "step": 343300 }, { "epoch": 2.5765499441022217, "grad_norm": 2.146747350692749, "learning_rate": 7.07525816770334e-06, "loss": 2.5163, "step": 343400 }, { "epoch": 2.5773002498518145, "grad_norm": 1.2915709018707275, "learning_rate": 7.06275307187679e-06, "loss": 2.5944, "step": 343500 }, { "epoch": 2.5780505556014077, "grad_norm": 2.2657463550567627, "learning_rate": 7.05024797605024e-06, "loss": 2.5398, "step": 343600 }, { "epoch": 2.5788008613510005, "grad_norm": 1.5447255373001099, "learning_rate": 7.037867931181957e-06, "loss": 2.5666, "step": 343700 }, { "epoch": 2.5795511671005933, "grad_norm": 2.634927749633789, "learning_rate": 7.025362835355408e-06, "loss": 2.5445, "step": 343800 }, { "epoch": 2.5803014728501865, "grad_norm": 2.009474992752075, "learning_rate": 7.012857739528858e-06, "loss": 2.5232, "step": 343900 }, { "epoch": 2.5810517785997793, "grad_norm": 3.1027610301971436, "learning_rate": 7.000352643702308e-06, "loss": 2.6554, "step": 344000 }, { "epoch": 2.5818020843493725, "grad_norm": 2.5384840965270996, "learning_rate": 6.98784754787576e-06, "loss": 2.7012, "step": 344100 }, { "epoch": 2.5825523900989653, "grad_norm": 1.4438155889511108, "learning_rate": 6.97534245204921e-06, "loss": 2.4915, "step": 344200 }, { "epoch": 2.583302695848558, "grad_norm": 2.2438268661499023, "learning_rate": 6.962837356222661e-06, "loss": 2.7121, "step": 344300 }, { "epoch": 2.5840530015981513, "grad_norm": 2.017545223236084, "learning_rate": 6.9503322603961115e-06, "loss": 2.6051, "step": 344400 }, { "epoch": 2.584803307347744, "grad_norm": 1.650408387184143, "learning_rate": 6.937827164569563e-06, "loss": 2.6065, "step": 344500 }, { "epoch": 2.5855536130973373, "grad_norm": 2.4685916900634766, "learning_rate": 6.9253220687430135e-06, "loss": 2.6325, "step": 344600 }, { "epoch": 2.58630391884693, "grad_norm": 2.122819423675537, "learning_rate": 6.912816972916464e-06, "loss": 2.5105, "step": 344700 }, { "epoch": 2.587054224596523, "grad_norm": 1.662254810333252, "learning_rate": 6.900311877089914e-06, "loss": 2.812, "step": 344800 }, { "epoch": 2.587804530346116, "grad_norm": 1.9483011960983276, "learning_rate": 6.887806781263365e-06, "loss": 2.6202, "step": 344900 }, { "epoch": 2.588554836095709, "grad_norm": 1.5198951959609985, "learning_rate": 6.875301685436817e-06, "loss": 2.5931, "step": 345000 }, { "epoch": 2.589305141845302, "grad_norm": 3.7698116302490234, "learning_rate": 6.862796589610267e-06, "loss": 2.5403, "step": 345100 }, { "epoch": 2.590055447594895, "grad_norm": 2.623826265335083, "learning_rate": 6.850291493783717e-06, "loss": 2.7051, "step": 345200 }, { "epoch": 2.5908057533444877, "grad_norm": 2.1330373287200928, "learning_rate": 6.837786397957167e-06, "loss": 2.6212, "step": 345300 }, { "epoch": 2.591556059094081, "grad_norm": 1.833808183670044, "learning_rate": 6.825281302130619e-06, "loss": 2.733, "step": 345400 }, { "epoch": 2.5923063648436737, "grad_norm": 1.563490390777588, "learning_rate": 6.812776206304069e-06, "loss": 2.6375, "step": 345500 }, { "epoch": 2.593056670593267, "grad_norm": 1.5851026773452759, "learning_rate": 6.80027111047752e-06, "loss": 2.5903, "step": 345600 }, { "epoch": 2.5938069763428597, "grad_norm": 2.207085132598877, "learning_rate": 6.7877660146509704e-06, "loss": 2.7099, "step": 345700 }, { "epoch": 2.5945572820924525, "grad_norm": 2.172098159790039, "learning_rate": 6.775260918824421e-06, "loss": 2.6275, "step": 345800 }, { "epoch": 2.5953075878420457, "grad_norm": 1.8356029987335205, "learning_rate": 6.7627558229978725e-06, "loss": 2.4992, "step": 345900 }, { "epoch": 2.5960578935916385, "grad_norm": 1.9420874118804932, "learning_rate": 6.750250727171323e-06, "loss": 2.5898, "step": 346000 }, { "epoch": 2.5968081993412317, "grad_norm": 1.5789530277252197, "learning_rate": 6.737745631344773e-06, "loss": 2.4952, "step": 346100 }, { "epoch": 2.5975585050908245, "grad_norm": 1.436226725578308, "learning_rate": 6.725240535518224e-06, "loss": 2.5277, "step": 346200 }, { "epoch": 2.5983088108404173, "grad_norm": 1.5282238721847534, "learning_rate": 6.712735439691676e-06, "loss": 2.6086, "step": 346300 }, { "epoch": 2.5990591165900105, "grad_norm": 1.3646161556243896, "learning_rate": 6.700230343865126e-06, "loss": 2.5349, "step": 346400 }, { "epoch": 2.5998094223396033, "grad_norm": 1.8200621604919434, "learning_rate": 6.687725248038576e-06, "loss": 2.5028, "step": 346500 }, { "epoch": 2.6005597280891966, "grad_norm": 3.5772039890289307, "learning_rate": 6.675220152212026e-06, "loss": 2.543, "step": 346600 }, { "epoch": 2.6013100338387893, "grad_norm": 2.2443201541900635, "learning_rate": 6.662715056385476e-06, "loss": 2.3939, "step": 346700 }, { "epoch": 2.602060339588382, "grad_norm": 1.6778925657272339, "learning_rate": 6.650209960558928e-06, "loss": 2.6609, "step": 346800 }, { "epoch": 2.6028106453379753, "grad_norm": 2.9567599296569824, "learning_rate": 6.637704864732379e-06, "loss": 2.5633, "step": 346900 }, { "epoch": 2.603560951087568, "grad_norm": 2.076625347137451, "learning_rate": 6.6251997689058294e-06, "loss": 2.6977, "step": 347000 }, { "epoch": 2.6043112568371614, "grad_norm": 1.55661141872406, "learning_rate": 6.61269467307928e-06, "loss": 2.4742, "step": 347100 }, { "epoch": 2.605061562586754, "grad_norm": 2.26289701461792, "learning_rate": 6.6001895772527315e-06, "loss": 2.5059, "step": 347200 }, { "epoch": 2.605811868336347, "grad_norm": 3.1053037643432617, "learning_rate": 6.587684481426182e-06, "loss": 2.6079, "step": 347300 }, { "epoch": 2.60656217408594, "grad_norm": 2.1305930614471436, "learning_rate": 6.575179385599632e-06, "loss": 2.6056, "step": 347400 }, { "epoch": 2.607312479835533, "grad_norm": 1.6984187364578247, "learning_rate": 6.562674289773083e-06, "loss": 2.7504, "step": 347500 }, { "epoch": 2.608062785585126, "grad_norm": 3.19179105758667, "learning_rate": 6.550169193946533e-06, "loss": 2.5429, "step": 347600 }, { "epoch": 2.608813091334719, "grad_norm": 1.5836104154586792, "learning_rate": 6.5377891490782496e-06, "loss": 2.5581, "step": 347700 }, { "epoch": 2.6095633970843117, "grad_norm": 2.6001293659210205, "learning_rate": 6.5252840532517e-06, "loss": 2.576, "step": 347800 }, { "epoch": 2.610313702833905, "grad_norm": 2.6567318439483643, "learning_rate": 6.512778957425152e-06, "loss": 2.4815, "step": 347900 }, { "epoch": 2.6110640085834977, "grad_norm": 1.6779398918151855, "learning_rate": 6.500273861598602e-06, "loss": 2.5781, "step": 348000 }, { "epoch": 2.611814314333091, "grad_norm": 1.835142970085144, "learning_rate": 6.487768765772052e-06, "loss": 2.6808, "step": 348100 }, { "epoch": 2.6125646200826838, "grad_norm": 2.0114054679870605, "learning_rate": 6.475263669945503e-06, "loss": 2.6808, "step": 348200 }, { "epoch": 2.6133149258322765, "grad_norm": 5.315013885498047, "learning_rate": 6.462758574118955e-06, "loss": 2.6101, "step": 348300 }, { "epoch": 2.6140652315818698, "grad_norm": 1.5334457159042358, "learning_rate": 6.450253478292405e-06, "loss": 2.6538, "step": 348400 }, { "epoch": 2.6148155373314625, "grad_norm": 2.8877389430999756, "learning_rate": 6.437748382465855e-06, "loss": 2.6341, "step": 348500 }, { "epoch": 2.6155658430810558, "grad_norm": 2.628593683242798, "learning_rate": 6.425243286639305e-06, "loss": 2.6854, "step": 348600 }, { "epoch": 2.6163161488306486, "grad_norm": 2.6816608905792236, "learning_rate": 6.412738190812756e-06, "loss": 2.5389, "step": 348700 }, { "epoch": 2.6170664545802413, "grad_norm": 1.9977065324783325, "learning_rate": 6.400233094986207e-06, "loss": 2.6626, "step": 348800 }, { "epoch": 2.6178167603298346, "grad_norm": 2.17655086517334, "learning_rate": 6.387727999159658e-06, "loss": 2.6182, "step": 348900 }, { "epoch": 2.6185670660794274, "grad_norm": 2.6067800521850586, "learning_rate": 6.3752229033331086e-06, "loss": 2.5383, "step": 349000 }, { "epoch": 2.6193173718290206, "grad_norm": 2.2063779830932617, "learning_rate": 6.362717807506559e-06, "loss": 2.5978, "step": 349100 }, { "epoch": 2.6200676775786134, "grad_norm": 2.264939785003662, "learning_rate": 6.350212711680011e-06, "loss": 2.6981, "step": 349200 }, { "epoch": 2.620817983328206, "grad_norm": 1.6933406591415405, "learning_rate": 6.337707615853461e-06, "loss": 2.6195, "step": 349300 }, { "epoch": 2.621568289077799, "grad_norm": 1.4628052711486816, "learning_rate": 6.325202520026911e-06, "loss": 2.6746, "step": 349400 }, { "epoch": 2.622318594827392, "grad_norm": 1.7390116453170776, "learning_rate": 6.312697424200362e-06, "loss": 2.5788, "step": 349500 }, { "epoch": 2.6230689005769854, "grad_norm": 2.0898962020874023, "learning_rate": 6.300192328373812e-06, "loss": 2.6238, "step": 349600 }, { "epoch": 2.623819206326578, "grad_norm": 2.157350778579712, "learning_rate": 6.287812283505529e-06, "loss": 2.4053, "step": 349700 }, { "epoch": 2.624569512076171, "grad_norm": 3.8957016468048096, "learning_rate": 6.275307187678979e-06, "loss": 2.5807, "step": 349800 }, { "epoch": 2.6253198178257637, "grad_norm": 1.685300350189209, "learning_rate": 6.262802091852431e-06, "loss": 2.5453, "step": 349900 }, { "epoch": 2.626070123575357, "grad_norm": 3.0033140182495117, "learning_rate": 6.250296996025881e-06, "loss": 2.4797, "step": 350000 }, { "epoch": 2.62682042932495, "grad_norm": 2.4521448612213135, "learning_rate": 6.237791900199332e-06, "loss": 2.5522, "step": 350100 }, { "epoch": 2.627570735074543, "grad_norm": 3.310697317123413, "learning_rate": 6.225286804372782e-06, "loss": 2.8371, "step": 350200 }, { "epoch": 2.6283210408241358, "grad_norm": 1.910348653793335, "learning_rate": 6.212781708546233e-06, "loss": 2.641, "step": 350300 }, { "epoch": 2.6290713465737285, "grad_norm": 2.228285789489746, "learning_rate": 6.200276612719683e-06, "loss": 2.5755, "step": 350400 }, { "epoch": 2.6298216523233218, "grad_norm": 2.1396589279174805, "learning_rate": 6.187771516893134e-06, "loss": 2.5622, "step": 350500 }, { "epoch": 2.6305719580729146, "grad_norm": 1.6816312074661255, "learning_rate": 6.17539147202485e-06, "loss": 2.4388, "step": 350600 }, { "epoch": 2.631322263822508, "grad_norm": 1.9014140367507935, "learning_rate": 6.162886376198301e-06, "loss": 2.5255, "step": 350700 }, { "epoch": 2.6320725695721006, "grad_norm": 2.2106807231903076, "learning_rate": 6.150381280371752e-06, "loss": 2.6294, "step": 350800 }, { "epoch": 2.6328228753216933, "grad_norm": 2.1610264778137207, "learning_rate": 6.137876184545203e-06, "loss": 2.6624, "step": 350900 }, { "epoch": 2.6335731810712866, "grad_norm": 2.0146477222442627, "learning_rate": 6.125371088718653e-06, "loss": 2.5036, "step": 351000 }, { "epoch": 2.6343234868208794, "grad_norm": 1.778427004814148, "learning_rate": 6.112865992892103e-06, "loss": 2.6647, "step": 351100 }, { "epoch": 2.6350737925704726, "grad_norm": 1.5104138851165771, "learning_rate": 6.100360897065554e-06, "loss": 2.6927, "step": 351200 }, { "epoch": 2.6358240983200654, "grad_norm": 2.1402580738067627, "learning_rate": 6.087855801239005e-06, "loss": 2.715, "step": 351300 }, { "epoch": 2.636574404069658, "grad_norm": 2.0048305988311768, "learning_rate": 6.075350705412456e-06, "loss": 2.6136, "step": 351400 }, { "epoch": 2.6373247098192514, "grad_norm": 3.6146621704101562, "learning_rate": 6.062845609585907e-06, "loss": 2.5475, "step": 351500 }, { "epoch": 2.638075015568844, "grad_norm": 2.3486762046813965, "learning_rate": 6.050340513759358e-06, "loss": 2.4832, "step": 351600 }, { "epoch": 2.6388253213184374, "grad_norm": 2.162766933441162, "learning_rate": 6.037835417932808e-06, "loss": 2.5602, "step": 351700 }, { "epoch": 2.63957562706803, "grad_norm": 1.5309759378433228, "learning_rate": 6.025330322106259e-06, "loss": 2.6125, "step": 351800 }, { "epoch": 2.640325932817623, "grad_norm": 2.1594014167785645, "learning_rate": 6.012825226279709e-06, "loss": 2.4565, "step": 351900 }, { "epoch": 2.641076238567216, "grad_norm": 2.4794704914093018, "learning_rate": 6.00032013045316e-06, "loss": 2.4781, "step": 352000 }, { "epoch": 2.641826544316809, "grad_norm": 4.1486592292785645, "learning_rate": 5.987815034626611e-06, "loss": 2.6018, "step": 352100 }, { "epoch": 2.642576850066402, "grad_norm": 2.6703028678894043, "learning_rate": 5.975309938800061e-06, "loss": 2.4227, "step": 352200 }, { "epoch": 2.643327155815995, "grad_norm": 2.062568426132202, "learning_rate": 5.962804842973512e-06, "loss": 2.5298, "step": 352300 }, { "epoch": 2.6440774615655878, "grad_norm": 1.8262780904769897, "learning_rate": 5.950299747146962e-06, "loss": 2.7354, "step": 352400 }, { "epoch": 2.644827767315181, "grad_norm": 1.971304178237915, "learning_rate": 5.937794651320413e-06, "loss": 2.757, "step": 352500 }, { "epoch": 2.6455780730647738, "grad_norm": 4.046659469604492, "learning_rate": 5.9252895554938636e-06, "loss": 2.5223, "step": 352600 }, { "epoch": 2.646328378814367, "grad_norm": 1.9898048639297485, "learning_rate": 5.912784459667315e-06, "loss": 2.7885, "step": 352700 }, { "epoch": 2.64707868456396, "grad_norm": 1.960715889930725, "learning_rate": 5.900279363840766e-06, "loss": 2.6245, "step": 352800 }, { "epoch": 2.6478289903135526, "grad_norm": 3.0842037200927734, "learning_rate": 5.887774268014216e-06, "loss": 2.5229, "step": 352900 }, { "epoch": 2.648579296063146, "grad_norm": 2.1859943866729736, "learning_rate": 5.875269172187667e-06, "loss": 2.679, "step": 353000 }, { "epoch": 2.6493296018127386, "grad_norm": 1.8382549285888672, "learning_rate": 5.862764076361117e-06, "loss": 2.5531, "step": 353100 }, { "epoch": 2.650079907562332, "grad_norm": 2.346933364868164, "learning_rate": 5.850258980534568e-06, "loss": 2.6233, "step": 353200 }, { "epoch": 2.6508302133119246, "grad_norm": 2.190685510635376, "learning_rate": 5.837753884708019e-06, "loss": 2.6702, "step": 353300 }, { "epoch": 2.6515805190615174, "grad_norm": 2.2028472423553467, "learning_rate": 5.8253738398397356e-06, "loss": 2.5411, "step": 353400 }, { "epoch": 2.6523308248111106, "grad_norm": 1.9298349618911743, "learning_rate": 5.812868744013186e-06, "loss": 2.4384, "step": 353500 }, { "epoch": 2.6530811305607034, "grad_norm": 2.238983392715454, "learning_rate": 5.800363648186637e-06, "loss": 2.6607, "step": 353600 }, { "epoch": 2.6538314363102966, "grad_norm": 1.4921610355377197, "learning_rate": 5.787858552360087e-06, "loss": 2.679, "step": 353700 }, { "epoch": 2.6545817420598894, "grad_norm": 2.5303447246551514, "learning_rate": 5.775353456533537e-06, "loss": 2.5078, "step": 353800 }, { "epoch": 2.655332047809482, "grad_norm": 1.8276262283325195, "learning_rate": 5.762848360706988e-06, "loss": 2.6849, "step": 353900 }, { "epoch": 2.6560823535590754, "grad_norm": 2.062143325805664, "learning_rate": 5.750343264880439e-06, "loss": 2.6042, "step": 354000 }, { "epoch": 2.656832659308668, "grad_norm": 2.615635395050049, "learning_rate": 5.73783816905389e-06, "loss": 2.7392, "step": 354100 }, { "epoch": 2.6575829650582614, "grad_norm": 1.9050719738006592, "learning_rate": 5.72533307322734e-06, "loss": 2.6445, "step": 354200 }, { "epoch": 2.658333270807854, "grad_norm": 2.592411756515503, "learning_rate": 5.712827977400791e-06, "loss": 2.6104, "step": 354300 }, { "epoch": 2.659083576557447, "grad_norm": 2.5355255603790283, "learning_rate": 5.7003228815742415e-06, "loss": 2.5226, "step": 354400 }, { "epoch": 2.65983388230704, "grad_norm": 1.7968924045562744, "learning_rate": 5.6878177857476925e-06, "loss": 2.6407, "step": 354500 }, { "epoch": 2.660584188056633, "grad_norm": 2.411778450012207, "learning_rate": 5.6753126899211435e-06, "loss": 2.6728, "step": 354600 }, { "epoch": 2.661334493806226, "grad_norm": 1.6319868564605713, "learning_rate": 5.662807594094594e-06, "loss": 2.6494, "step": 354700 }, { "epoch": 2.662084799555819, "grad_norm": 2.5121476650238037, "learning_rate": 5.650302498268045e-06, "loss": 2.6387, "step": 354800 }, { "epoch": 2.662835105305412, "grad_norm": 1.7053611278533936, "learning_rate": 5.637797402441495e-06, "loss": 2.5937, "step": 354900 }, { "epoch": 2.663585411055005, "grad_norm": 3.4513907432556152, "learning_rate": 5.625292306614946e-06, "loss": 2.6464, "step": 355000 }, { "epoch": 2.664335716804598, "grad_norm": 2.8060545921325684, "learning_rate": 5.612787210788396e-06, "loss": 2.7216, "step": 355100 }, { "epoch": 2.665086022554191, "grad_norm": 2.0192863941192627, "learning_rate": 5.600282114961847e-06, "loss": 2.5809, "step": 355200 }, { "epoch": 2.665836328303784, "grad_norm": 1.752548098564148, "learning_rate": 5.587777019135298e-06, "loss": 2.5791, "step": 355300 }, { "epoch": 2.6665866340533766, "grad_norm": 1.9476202726364136, "learning_rate": 5.575271923308749e-06, "loss": 2.6583, "step": 355400 }, { "epoch": 2.66733693980297, "grad_norm": 3.9260270595550537, "learning_rate": 5.562766827482199e-06, "loss": 2.5446, "step": 355500 }, { "epoch": 2.6680872455525626, "grad_norm": 3.0735867023468018, "learning_rate": 5.5502617316556495e-06, "loss": 2.4505, "step": 355600 }, { "epoch": 2.668837551302156, "grad_norm": 1.89884614944458, "learning_rate": 5.5377566358291005e-06, "loss": 2.6527, "step": 355700 }, { "epoch": 2.6695878570517486, "grad_norm": 1.8676130771636963, "learning_rate": 5.525251540002551e-06, "loss": 2.5937, "step": 355800 }, { "epoch": 2.6703381628013414, "grad_norm": 1.6196540594100952, "learning_rate": 5.5127464441760025e-06, "loss": 2.6023, "step": 355900 }, { "epoch": 2.6710884685509346, "grad_norm": 2.5568621158599854, "learning_rate": 5.500241348349453e-06, "loss": 2.7071, "step": 356000 }, { "epoch": 2.6718387743005274, "grad_norm": 1.7157033681869507, "learning_rate": 5.487736252522904e-06, "loss": 2.7845, "step": 356100 }, { "epoch": 2.6725890800501206, "grad_norm": 2.3365726470947266, "learning_rate": 5.475231156696354e-06, "loss": 2.5265, "step": 356200 }, { "epoch": 2.6733393857997134, "grad_norm": 2.2917847633361816, "learning_rate": 5.462726060869805e-06, "loss": 2.6632, "step": 356300 }, { "epoch": 2.674089691549306, "grad_norm": 1.3997043371200562, "learning_rate": 5.450220965043255e-06, "loss": 2.7329, "step": 356400 }, { "epoch": 2.6748399972988994, "grad_norm": 1.8763389587402344, "learning_rate": 5.437715869216706e-06, "loss": 2.4823, "step": 356500 }, { "epoch": 2.675590303048492, "grad_norm": 2.244072198867798, "learning_rate": 5.425210773390157e-06, "loss": 2.457, "step": 356600 }, { "epoch": 2.6763406087980854, "grad_norm": 1.6799561977386475, "learning_rate": 5.412705677563607e-06, "loss": 2.805, "step": 356700 }, { "epoch": 2.6770909145476782, "grad_norm": 2.3979103565216064, "learning_rate": 5.400200581737058e-06, "loss": 2.557, "step": 356800 }, { "epoch": 2.677841220297271, "grad_norm": 3.0641000270843506, "learning_rate": 5.3876954859105085e-06, "loss": 2.6294, "step": 356900 }, { "epoch": 2.6785915260468642, "grad_norm": 1.8645331859588623, "learning_rate": 5.3751903900839595e-06, "loss": 2.6237, "step": 357000 }, { "epoch": 2.679341831796457, "grad_norm": 2.0037920475006104, "learning_rate": 5.36268529425741e-06, "loss": 2.5722, "step": 357100 }, { "epoch": 2.6800921375460502, "grad_norm": 1.8396400213241577, "learning_rate": 5.3501801984308615e-06, "loss": 2.5972, "step": 357200 }, { "epoch": 2.680842443295643, "grad_norm": 1.7444285154342651, "learning_rate": 5.337675102604312e-06, "loss": 2.5372, "step": 357300 }, { "epoch": 2.681592749045236, "grad_norm": 4.0078444480896, "learning_rate": 5.325295057736027e-06, "loss": 2.6437, "step": 357400 }, { "epoch": 2.682343054794829, "grad_norm": 2.5500075817108154, "learning_rate": 5.3127899619094784e-06, "loss": 2.6318, "step": 357500 }, { "epoch": 2.683093360544422, "grad_norm": 2.0509278774261475, "learning_rate": 5.300284866082929e-06, "loss": 2.5762, "step": 357600 }, { "epoch": 2.683843666294015, "grad_norm": 2.553086280822754, "learning_rate": 5.28777977025638e-06, "loss": 2.5428, "step": 357700 }, { "epoch": 2.684593972043608, "grad_norm": 2.26973819732666, "learning_rate": 5.275274674429831e-06, "loss": 2.6552, "step": 357800 }, { "epoch": 2.6853442777932006, "grad_norm": 1.9227548837661743, "learning_rate": 5.262769578603282e-06, "loss": 2.592, "step": 357900 }, { "epoch": 2.686094583542794, "grad_norm": 1.5981769561767578, "learning_rate": 5.250264482776732e-06, "loss": 2.4877, "step": 358000 }, { "epoch": 2.6868448892923866, "grad_norm": 3.161842107772827, "learning_rate": 5.237759386950183e-06, "loss": 2.4097, "step": 358100 }, { "epoch": 2.68759519504198, "grad_norm": 2.6494064331054688, "learning_rate": 5.2253793420818986e-06, "loss": 2.6209, "step": 358200 }, { "epoch": 2.6883455007915726, "grad_norm": 1.9512072801589966, "learning_rate": 5.212874246255349e-06, "loss": 2.5864, "step": 358300 }, { "epoch": 2.6890958065411654, "grad_norm": 1.960808515548706, "learning_rate": 5.2003691504288e-06, "loss": 2.687, "step": 358400 }, { "epoch": 2.689846112290758, "grad_norm": 1.6314973831176758, "learning_rate": 5.187864054602251e-06, "loss": 2.5622, "step": 358500 }, { "epoch": 2.6905964180403514, "grad_norm": 1.3951847553253174, "learning_rate": 5.175358958775702e-06, "loss": 2.6006, "step": 358600 }, { "epoch": 2.6913467237899447, "grad_norm": 1.626577377319336, "learning_rate": 5.162853862949152e-06, "loss": 2.576, "step": 358700 }, { "epoch": 2.6920970295395374, "grad_norm": 1.681369662284851, "learning_rate": 5.150348767122603e-06, "loss": 2.3427, "step": 358800 }, { "epoch": 2.6928473352891302, "grad_norm": 1.8352446556091309, "learning_rate": 5.137843671296053e-06, "loss": 2.655, "step": 358900 }, { "epoch": 2.693597641038723, "grad_norm": 1.7884840965270996, "learning_rate": 5.125338575469504e-06, "loss": 2.4629, "step": 359000 }, { "epoch": 2.6943479467883162, "grad_norm": 1.7948821783065796, "learning_rate": 5.112833479642954e-06, "loss": 2.7104, "step": 359100 }, { "epoch": 2.6950982525379095, "grad_norm": 3.2723934650421143, "learning_rate": 5.100328383816405e-06, "loss": 2.4649, "step": 359200 }, { "epoch": 2.6958485582875023, "grad_norm": 2.426009178161621, "learning_rate": 5.087823287989856e-06, "loss": 2.6023, "step": 359300 }, { "epoch": 2.696598864037095, "grad_norm": 1.455514907836914, "learning_rate": 5.0753181921633065e-06, "loss": 2.6026, "step": 359400 }, { "epoch": 2.697349169786688, "grad_norm": 2.32313871383667, "learning_rate": 5.0628130963367575e-06, "loss": 2.4438, "step": 359500 }, { "epoch": 2.698099475536281, "grad_norm": 2.0478949546813965, "learning_rate": 5.050308000510208e-06, "loss": 2.6377, "step": 359600 }, { "epoch": 2.698849781285874, "grad_norm": 2.287689208984375, "learning_rate": 5.037802904683659e-06, "loss": 2.7266, "step": 359700 }, { "epoch": 2.699600087035467, "grad_norm": 1.7533799409866333, "learning_rate": 5.02529780885711e-06, "loss": 2.6095, "step": 359800 }, { "epoch": 2.70035039278506, "grad_norm": 1.7358663082122803, "learning_rate": 5.012792713030561e-06, "loss": 2.4346, "step": 359900 }, { "epoch": 2.7011006985346526, "grad_norm": 2.5408456325531006, "learning_rate": 5.000287617204011e-06, "loss": 2.4344, "step": 360000 }, { "epoch": 2.701851004284246, "grad_norm": 2.5869903564453125, "learning_rate": 4.987782521377461e-06, "loss": 2.4738, "step": 360100 }, { "epoch": 2.7026013100338386, "grad_norm": 1.65389084815979, "learning_rate": 4.975277425550912e-06, "loss": 2.4842, "step": 360200 }, { "epoch": 2.703351615783432, "grad_norm": 2.008599281311035, "learning_rate": 4.962772329724362e-06, "loss": 2.6869, "step": 360300 }, { "epoch": 2.7041019215330246, "grad_norm": 2.978081703186035, "learning_rate": 4.950267233897813e-06, "loss": 2.744, "step": 360400 }, { "epoch": 2.7048522272826174, "grad_norm": 2.7859420776367188, "learning_rate": 4.93788718902953e-06, "loss": 2.5288, "step": 360500 }, { "epoch": 2.7056025330322107, "grad_norm": 1.680914282798767, "learning_rate": 4.925382093202981e-06, "loss": 2.6401, "step": 360600 }, { "epoch": 2.7063528387818034, "grad_norm": 1.6114667654037476, "learning_rate": 4.912876997376431e-06, "loss": 2.6157, "step": 360700 }, { "epoch": 2.7071031445313967, "grad_norm": 1.7679351568222046, "learning_rate": 4.900371901549882e-06, "loss": 2.5719, "step": 360800 }, { "epoch": 2.7078534502809894, "grad_norm": 1.520399808883667, "learning_rate": 4.887866805723332e-06, "loss": 2.6316, "step": 360900 }, { "epoch": 2.7086037560305822, "grad_norm": 2.263160467147827, "learning_rate": 4.8753617098967824e-06, "loss": 2.6101, "step": 361000 }, { "epoch": 2.7093540617801755, "grad_norm": 1.7819262742996216, "learning_rate": 4.862856614070234e-06, "loss": 2.6537, "step": 361100 }, { "epoch": 2.7101043675297682, "grad_norm": 1.7128369808197021, "learning_rate": 4.8503515182436845e-06, "loss": 2.5434, "step": 361200 }, { "epoch": 2.7108546732793615, "grad_norm": 2.324150800704956, "learning_rate": 4.8378464224171355e-06, "loss": 2.5238, "step": 361300 }, { "epoch": 2.7116049790289543, "grad_norm": 2.3219072818756104, "learning_rate": 4.825341326590586e-06, "loss": 2.5778, "step": 361400 }, { "epoch": 2.712355284778547, "grad_norm": 1.6519358158111572, "learning_rate": 4.812836230764037e-06, "loss": 2.7263, "step": 361500 }, { "epoch": 2.7131055905281403, "grad_norm": 2.867141008377075, "learning_rate": 4.800331134937487e-06, "loss": 2.4985, "step": 361600 }, { "epoch": 2.713855896277733, "grad_norm": 2.053107738494873, "learning_rate": 4.787826039110938e-06, "loss": 2.6044, "step": 361700 }, { "epoch": 2.7146062020273263, "grad_norm": 3.106074810028076, "learning_rate": 4.775320943284389e-06, "loss": 2.4682, "step": 361800 }, { "epoch": 2.715356507776919, "grad_norm": 1.5324020385742188, "learning_rate": 4.762815847457839e-06, "loss": 2.6186, "step": 361900 }, { "epoch": 2.716106813526512, "grad_norm": 2.337327718734741, "learning_rate": 4.75031075163129e-06, "loss": 2.7778, "step": 362000 }, { "epoch": 2.716857119276105, "grad_norm": 2.4133830070495605, "learning_rate": 4.73780565580474e-06, "loss": 2.5784, "step": 362100 }, { "epoch": 2.717607425025698, "grad_norm": 1.6150598526000977, "learning_rate": 4.725300559978191e-06, "loss": 2.6715, "step": 362200 }, { "epoch": 2.718357730775291, "grad_norm": 2.7139618396759033, "learning_rate": 4.712795464151641e-06, "loss": 2.5443, "step": 362300 }, { "epoch": 2.719108036524884, "grad_norm": 1.7415693998336792, "learning_rate": 4.700290368325093e-06, "loss": 2.6519, "step": 362400 }, { "epoch": 2.7198583422744766, "grad_norm": 1.9416621923446655, "learning_rate": 4.6877852724985434e-06, "loss": 2.6413, "step": 362500 }, { "epoch": 2.72060864802407, "grad_norm": 2.4205000400543213, "learning_rate": 4.67540522763026e-06, "loss": 2.5765, "step": 362600 }, { "epoch": 2.7213589537736627, "grad_norm": 2.13726806640625, "learning_rate": 4.66290013180371e-06, "loss": 2.5045, "step": 362700 }, { "epoch": 2.722109259523256, "grad_norm": 1.913934350013733, "learning_rate": 4.65039503597716e-06, "loss": 2.4664, "step": 362800 }, { "epoch": 2.7228595652728487, "grad_norm": 1.7394511699676514, "learning_rate": 4.637889940150611e-06, "loss": 2.5141, "step": 362900 }, { "epoch": 2.7236098710224415, "grad_norm": 1.6200562715530396, "learning_rate": 4.625384844324062e-06, "loss": 2.6199, "step": 363000 }, { "epoch": 2.7243601767720347, "grad_norm": 1.625698208808899, "learning_rate": 4.612879748497513e-06, "loss": 2.7641, "step": 363100 }, { "epoch": 2.7251104825216275, "grad_norm": 1.458992600440979, "learning_rate": 4.600374652670964e-06, "loss": 2.572, "step": 363200 }, { "epoch": 2.7258607882712207, "grad_norm": 2.5031352043151855, "learning_rate": 4.587869556844415e-06, "loss": 2.6019, "step": 363300 }, { "epoch": 2.7266110940208135, "grad_norm": 1.972806692123413, "learning_rate": 4.575364461017865e-06, "loss": 2.5336, "step": 363400 }, { "epoch": 2.7273613997704063, "grad_norm": 2.4421892166137695, "learning_rate": 4.562859365191316e-06, "loss": 2.6582, "step": 363500 }, { "epoch": 2.7281117055199995, "grad_norm": 1.8256570100784302, "learning_rate": 4.550354269364766e-06, "loss": 2.632, "step": 363600 }, { "epoch": 2.7288620112695923, "grad_norm": 2.011327028274536, "learning_rate": 4.537849173538217e-06, "loss": 2.5023, "step": 363700 }, { "epoch": 2.7296123170191855, "grad_norm": 2.231536865234375, "learning_rate": 4.525344077711668e-06, "loss": 2.4886, "step": 363800 }, { "epoch": 2.7303626227687783, "grad_norm": 2.4770963191986084, "learning_rate": 4.512838981885118e-06, "loss": 2.6397, "step": 363900 }, { "epoch": 2.731112928518371, "grad_norm": 1.7517036199569702, "learning_rate": 4.500333886058569e-06, "loss": 2.5169, "step": 364000 }, { "epoch": 2.7318632342679643, "grad_norm": 2.722395896911621, "learning_rate": 4.487828790232019e-06, "loss": 2.5433, "step": 364100 }, { "epoch": 2.732613540017557, "grad_norm": 2.2222886085510254, "learning_rate": 4.47532369440547e-06, "loss": 2.6774, "step": 364200 }, { "epoch": 2.7333638457671503, "grad_norm": 1.7316468954086304, "learning_rate": 4.462818598578921e-06, "loss": 2.5761, "step": 364300 }, { "epoch": 2.734114151516743, "grad_norm": 1.7708454132080078, "learning_rate": 4.450313502752372e-06, "loss": 2.6043, "step": 364400 }, { "epoch": 2.734864457266336, "grad_norm": 3.0199925899505615, "learning_rate": 4.4378084069258226e-06, "loss": 2.7094, "step": 364500 }, { "epoch": 2.735614763015929, "grad_norm": 2.4026551246643066, "learning_rate": 4.425303311099273e-06, "loss": 2.6474, "step": 364600 }, { "epoch": 2.736365068765522, "grad_norm": 2.543175458908081, "learning_rate": 4.412798215272724e-06, "loss": 2.5658, "step": 364700 }, { "epoch": 2.737115374515115, "grad_norm": 2.3438799381256104, "learning_rate": 4.400293119446174e-06, "loss": 2.6498, "step": 364800 }, { "epoch": 2.737865680264708, "grad_norm": 1.7845457792282104, "learning_rate": 4.387788023619625e-06, "loss": 2.6329, "step": 364900 }, { "epoch": 2.7386159860143007, "grad_norm": 2.2968509197235107, "learning_rate": 4.375282927793076e-06, "loss": 2.6507, "step": 365000 }, { "epoch": 2.739366291763894, "grad_norm": 2.140889883041382, "learning_rate": 4.362777831966527e-06, "loss": 2.6209, "step": 365100 }, { "epoch": 2.7401165975134867, "grad_norm": 3.3901896476745605, "learning_rate": 4.350272736139977e-06, "loss": 2.5373, "step": 365200 }, { "epoch": 2.74086690326308, "grad_norm": 1.940739393234253, "learning_rate": 4.337767640313428e-06, "loss": 2.541, "step": 365300 }, { "epoch": 2.7416172090126727, "grad_norm": 1.9092193841934204, "learning_rate": 4.325262544486878e-06, "loss": 2.6683, "step": 365400 }, { "epoch": 2.7423675147622655, "grad_norm": 3.874051332473755, "learning_rate": 4.312757448660329e-06, "loss": 2.5379, "step": 365500 }, { "epoch": 2.7431178205118587, "grad_norm": 1.8659422397613525, "learning_rate": 4.30025235283378e-06, "loss": 2.6338, "step": 365600 }, { "epoch": 2.7438681262614515, "grad_norm": 2.9669270515441895, "learning_rate": 4.2877472570072305e-06, "loss": 2.5963, "step": 365700 }, { "epoch": 2.7446184320110447, "grad_norm": 1.954716444015503, "learning_rate": 4.2752421611806816e-06, "loss": 2.4263, "step": 365800 }, { "epoch": 2.7453687377606375, "grad_norm": 1.579888939857483, "learning_rate": 4.262737065354132e-06, "loss": 2.5004, "step": 365900 }, { "epoch": 2.7461190435102303, "grad_norm": 3.1502599716186523, "learning_rate": 4.250231969527583e-06, "loss": 2.6059, "step": 366000 }, { "epoch": 2.7468693492598235, "grad_norm": 2.499486207962036, "learning_rate": 4.237726873701033e-06, "loss": 2.6884, "step": 366100 }, { "epoch": 2.7476196550094163, "grad_norm": 1.634782075881958, "learning_rate": 4.225221777874484e-06, "loss": 2.6054, "step": 366200 }, { "epoch": 2.7483699607590095, "grad_norm": 2.252899646759033, "learning_rate": 4.212716682047935e-06, "loss": 2.7246, "step": 366300 }, { "epoch": 2.7491202665086023, "grad_norm": 1.6400189399719238, "learning_rate": 4.200211586221386e-06, "loss": 2.6688, "step": 366400 }, { "epoch": 2.749870572258195, "grad_norm": 1.8423357009887695, "learning_rate": 4.187706490394836e-06, "loss": 2.6685, "step": 366500 }, { "epoch": 2.7506208780077883, "grad_norm": 2.2574868202209473, "learning_rate": 4.175201394568286e-06, "loss": 2.5009, "step": 366600 }, { "epoch": 2.751371183757381, "grad_norm": 1.7580839395523071, "learning_rate": 4.162821349700003e-06, "loss": 2.5708, "step": 366700 }, { "epoch": 2.7521214895069743, "grad_norm": 2.367560863494873, "learning_rate": 4.150316253873453e-06, "loss": 2.6023, "step": 366800 }, { "epoch": 2.752871795256567, "grad_norm": 1.413507103919983, "learning_rate": 4.137811158046905e-06, "loss": 2.5793, "step": 366900 }, { "epoch": 2.75362210100616, "grad_norm": 2.0538790225982666, "learning_rate": 4.125306062220355e-06, "loss": 2.7004, "step": 367000 }, { "epoch": 2.754372406755753, "grad_norm": 1.9186277389526367, "learning_rate": 4.112800966393806e-06, "loss": 2.5998, "step": 367100 }, { "epoch": 2.755122712505346, "grad_norm": 2.5047926902770996, "learning_rate": 4.100295870567256e-06, "loss": 2.4512, "step": 367200 }, { "epoch": 2.755873018254939, "grad_norm": 2.645082473754883, "learning_rate": 4.087790774740707e-06, "loss": 2.6588, "step": 367300 }, { "epoch": 2.756623324004532, "grad_norm": 2.0614640712738037, "learning_rate": 4.0752856789141575e-06, "loss": 2.4623, "step": 367400 }, { "epoch": 2.7573736297541247, "grad_norm": 2.437154531478882, "learning_rate": 4.0627805830876085e-06, "loss": 2.6247, "step": 367500 }, { "epoch": 2.758123935503718, "grad_norm": 2.5125656127929688, "learning_rate": 4.0502754872610595e-06, "loss": 2.5876, "step": 367600 }, { "epoch": 2.7588742412533107, "grad_norm": 1.9980062246322632, "learning_rate": 4.03777039143451e-06, "loss": 2.6373, "step": 367700 }, { "epoch": 2.759624547002904, "grad_norm": 2.23891019821167, "learning_rate": 4.025265295607961e-06, "loss": 2.6036, "step": 367800 }, { "epoch": 2.7603748527524967, "grad_norm": 2.6541731357574463, "learning_rate": 4.012760199781411e-06, "loss": 2.6269, "step": 367900 }, { "epoch": 2.7611251585020895, "grad_norm": 1.5899639129638672, "learning_rate": 4.000255103954862e-06, "loss": 2.7124, "step": 368000 }, { "epoch": 2.7618754642516823, "grad_norm": 1.5905603170394897, "learning_rate": 3.987750008128312e-06, "loss": 2.5437, "step": 368100 }, { "epoch": 2.7626257700012755, "grad_norm": 1.429392695426941, "learning_rate": 3.975244912301764e-06, "loss": 2.7117, "step": 368200 }, { "epoch": 2.7633760757508687, "grad_norm": 1.9740855693817139, "learning_rate": 3.962739816475214e-06, "loss": 2.6348, "step": 368300 }, { "epoch": 2.7641263815004615, "grad_norm": 2.3016018867492676, "learning_rate": 3.950234720648664e-06, "loss": 2.5094, "step": 368400 }, { "epoch": 2.7648766872500543, "grad_norm": 1.4428989887237549, "learning_rate": 3.937729624822115e-06, "loss": 2.6658, "step": 368500 }, { "epoch": 2.765626992999647, "grad_norm": 2.0367777347564697, "learning_rate": 3.925224528995565e-06, "loss": 2.7026, "step": 368600 }, { "epoch": 2.7663772987492403, "grad_norm": 2.37367844581604, "learning_rate": 3.912844484127282e-06, "loss": 2.6464, "step": 368700 }, { "epoch": 2.7671276044988335, "grad_norm": 1.6731842756271362, "learning_rate": 3.900339388300733e-06, "loss": 2.6475, "step": 368800 }, { "epoch": 2.7678779102484263, "grad_norm": 2.033116102218628, "learning_rate": 3.887834292474184e-06, "loss": 2.6015, "step": 368900 }, { "epoch": 2.768628215998019, "grad_norm": 2.197063446044922, "learning_rate": 3.875329196647634e-06, "loss": 2.66, "step": 369000 }, { "epoch": 2.769378521747612, "grad_norm": 2.063412666320801, "learning_rate": 3.862824100821085e-06, "loss": 2.6142, "step": 369100 }, { "epoch": 2.770128827497205, "grad_norm": 1.9627147912979126, "learning_rate": 3.850319004994535e-06, "loss": 2.6601, "step": 369200 }, { "epoch": 2.770879133246798, "grad_norm": 1.5814809799194336, "learning_rate": 3.8378139091679856e-06, "loss": 2.6364, "step": 369300 }, { "epoch": 2.771629438996391, "grad_norm": 1.8003391027450562, "learning_rate": 3.8253088133414366e-06, "loss": 2.5709, "step": 369400 }, { "epoch": 2.772379744745984, "grad_norm": 2.9619009494781494, "learning_rate": 3.812803717514887e-06, "loss": 2.5856, "step": 369500 }, { "epoch": 2.7731300504955767, "grad_norm": 1.7599934339523315, "learning_rate": 3.800298621688338e-06, "loss": 2.5061, "step": 369600 }, { "epoch": 2.77388035624517, "grad_norm": 1.7115685939788818, "learning_rate": 3.7877935258617888e-06, "loss": 2.6284, "step": 369700 }, { "epoch": 2.7746306619947627, "grad_norm": 1.943713903427124, "learning_rate": 3.77528843003524e-06, "loss": 2.5697, "step": 369800 }, { "epoch": 2.775380967744356, "grad_norm": 1.7505131959915161, "learning_rate": 3.76278333420869e-06, "loss": 2.5975, "step": 369900 }, { "epoch": 2.7761312734939487, "grad_norm": 1.780674934387207, "learning_rate": 3.7502782383821414e-06, "loss": 2.5862, "step": 370000 }, { "epoch": 2.7768815792435415, "grad_norm": 1.837540626525879, "learning_rate": 3.7377731425555916e-06, "loss": 2.6222, "step": 370100 }, { "epoch": 2.7776318849931347, "grad_norm": 2.446751594543457, "learning_rate": 3.725268046729042e-06, "loss": 2.611, "step": 370200 }, { "epoch": 2.7783821907427275, "grad_norm": 2.3905112743377686, "learning_rate": 3.712762950902493e-06, "loss": 2.6546, "step": 370300 }, { "epoch": 2.7791324964923207, "grad_norm": 1.7717081308364868, "learning_rate": 3.7002578550759434e-06, "loss": 2.6539, "step": 370400 }, { "epoch": 2.7798828022419135, "grad_norm": 3.37977933883667, "learning_rate": 3.6877527592493944e-06, "loss": 2.6653, "step": 370500 }, { "epoch": 2.7806331079915063, "grad_norm": 2.707179307937622, "learning_rate": 3.675247663422845e-06, "loss": 2.6165, "step": 370600 }, { "epoch": 2.7813834137410995, "grad_norm": 1.4421894550323486, "learning_rate": 3.662742567596296e-06, "loss": 2.6426, "step": 370700 }, { "epoch": 2.7821337194906923, "grad_norm": 2.290649652481079, "learning_rate": 3.6503625227280117e-06, "loss": 2.5957, "step": 370800 }, { "epoch": 2.7828840252402856, "grad_norm": 2.172128438949585, "learning_rate": 3.6378574269014627e-06, "loss": 2.5749, "step": 370900 }, { "epoch": 2.7836343309898783, "grad_norm": 1.8519526720046997, "learning_rate": 3.6253523310749133e-06, "loss": 2.4881, "step": 371000 }, { "epoch": 2.784384636739471, "grad_norm": 2.210249662399292, "learning_rate": 3.6128472352483635e-06, "loss": 2.4848, "step": 371100 }, { "epoch": 2.7851349424890643, "grad_norm": 2.1184005737304688, "learning_rate": 3.6003421394218145e-06, "loss": 2.6149, "step": 371200 }, { "epoch": 2.785885248238657, "grad_norm": 1.9629889726638794, "learning_rate": 3.587837043595265e-06, "loss": 2.3995, "step": 371300 }, { "epoch": 2.7866355539882504, "grad_norm": 2.0180890560150146, "learning_rate": 3.575331947768716e-06, "loss": 2.537, "step": 371400 }, { "epoch": 2.787385859737843, "grad_norm": 1.57570481300354, "learning_rate": 3.5628268519421663e-06, "loss": 2.6213, "step": 371500 }, { "epoch": 2.788136165487436, "grad_norm": 1.8034864664077759, "learning_rate": 3.5503217561156177e-06, "loss": 2.6003, "step": 371600 }, { "epoch": 2.788886471237029, "grad_norm": 2.6936264038085938, "learning_rate": 3.537816660289068e-06, "loss": 2.7067, "step": 371700 }, { "epoch": 2.789636776986622, "grad_norm": 2.6864986419677734, "learning_rate": 3.525311564462519e-06, "loss": 2.6247, "step": 371800 }, { "epoch": 2.790387082736215, "grad_norm": 2.029170513153076, "learning_rate": 3.5128064686359695e-06, "loss": 2.6569, "step": 371900 }, { "epoch": 2.791137388485808, "grad_norm": 2.5010197162628174, "learning_rate": 3.5003013728094197e-06, "loss": 2.5319, "step": 372000 }, { "epoch": 2.7918876942354007, "grad_norm": 2.1413414478302, "learning_rate": 3.4877962769828707e-06, "loss": 2.5367, "step": 372100 }, { "epoch": 2.792637999984994, "grad_norm": 2.3574554920196533, "learning_rate": 3.475416232114587e-06, "loss": 2.4927, "step": 372200 }, { "epoch": 2.7933883057345867, "grad_norm": 1.675987958908081, "learning_rate": 3.462911136288038e-06, "loss": 2.6745, "step": 372300 }, { "epoch": 2.79413861148418, "grad_norm": 1.8746140003204346, "learning_rate": 3.450406040461488e-06, "loss": 2.5936, "step": 372400 }, { "epoch": 2.7948889172337728, "grad_norm": 2.06672739982605, "learning_rate": 3.437900944634939e-06, "loss": 2.6488, "step": 372500 }, { "epoch": 2.7956392229833655, "grad_norm": 6.925310134887695, "learning_rate": 3.4253958488083896e-06, "loss": 2.5446, "step": 372600 }, { "epoch": 2.7963895287329588, "grad_norm": 2.020282030105591, "learning_rate": 3.4128907529818407e-06, "loss": 2.6912, "step": 372700 }, { "epoch": 2.7971398344825515, "grad_norm": 2.815974473953247, "learning_rate": 3.400385657155291e-06, "loss": 2.3939, "step": 372800 }, { "epoch": 2.7978901402321448, "grad_norm": 1.5518447160720825, "learning_rate": 3.3878805613287414e-06, "loss": 2.6357, "step": 372900 }, { "epoch": 2.7986404459817376, "grad_norm": 1.9564472436904907, "learning_rate": 3.3753754655021924e-06, "loss": 2.5065, "step": 373000 }, { "epoch": 2.7993907517313303, "grad_norm": 1.6460965871810913, "learning_rate": 3.3628703696756426e-06, "loss": 2.5926, "step": 373100 }, { "epoch": 2.8001410574809236, "grad_norm": 1.445528268814087, "learning_rate": 3.350365273849094e-06, "loss": 2.5733, "step": 373200 }, { "epoch": 2.8008913632305164, "grad_norm": 1.8347655534744263, "learning_rate": 3.3378601780225442e-06, "loss": 2.6856, "step": 373300 }, { "epoch": 2.8016416689801096, "grad_norm": 2.612912654876709, "learning_rate": 3.3253550821959952e-06, "loss": 2.6115, "step": 373400 }, { "epoch": 2.8023919747297024, "grad_norm": 1.566575050354004, "learning_rate": 3.312849986369446e-06, "loss": 2.6865, "step": 373500 }, { "epoch": 2.803142280479295, "grad_norm": 1.891832709312439, "learning_rate": 3.300344890542897e-06, "loss": 2.6253, "step": 373600 }, { "epoch": 2.8038925862288884, "grad_norm": 1.8060983419418335, "learning_rate": 3.287839794716347e-06, "loss": 2.6103, "step": 373700 }, { "epoch": 2.804642891978481, "grad_norm": 2.1914401054382324, "learning_rate": 3.2753346988897976e-06, "loss": 2.554, "step": 373800 }, { "epoch": 2.8053931977280744, "grad_norm": 1.9983303546905518, "learning_rate": 3.2628296030632486e-06, "loss": 2.5766, "step": 373900 }, { "epoch": 2.806143503477667, "grad_norm": 1.827908992767334, "learning_rate": 3.250324507236699e-06, "loss": 2.5607, "step": 374000 }, { "epoch": 2.80689380922726, "grad_norm": 2.0494143962860107, "learning_rate": 3.23781941141015e-06, "loss": 2.6921, "step": 374100 }, { "epoch": 2.807644114976853, "grad_norm": 1.6216812133789062, "learning_rate": 3.2253143155836004e-06, "loss": 2.5815, "step": 374200 }, { "epoch": 2.808394420726446, "grad_norm": 1.8752127885818481, "learning_rate": 3.2128092197570514e-06, "loss": 2.6975, "step": 374300 }, { "epoch": 2.809144726476039, "grad_norm": 2.6517438888549805, "learning_rate": 3.2003041239305016e-06, "loss": 2.5177, "step": 374400 }, { "epoch": 2.809895032225632, "grad_norm": 1.8380937576293945, "learning_rate": 3.187799028103953e-06, "loss": 2.5407, "step": 374500 }, { "epoch": 2.8106453379752248, "grad_norm": 1.9118543863296509, "learning_rate": 3.175293932277403e-06, "loss": 2.4783, "step": 374600 }, { "epoch": 2.811395643724818, "grad_norm": 1.9306488037109375, "learning_rate": 3.1627888364508534e-06, "loss": 2.5661, "step": 374700 }, { "epoch": 2.8121459494744108, "grad_norm": 1.370684027671814, "learning_rate": 3.15040879158257e-06, "loss": 2.476, "step": 374800 }, { "epoch": 2.812896255224004, "grad_norm": 2.1756105422973633, "learning_rate": 3.1379036957560205e-06, "loss": 2.573, "step": 374900 }, { "epoch": 2.8136465609735968, "grad_norm": 1.633949637413025, "learning_rate": 3.1253985999294716e-06, "loss": 2.6139, "step": 375000 }, { "epoch": 2.8143968667231896, "grad_norm": 1.8031585216522217, "learning_rate": 3.112893504102922e-06, "loss": 2.5145, "step": 375100 }, { "epoch": 2.815147172472783, "grad_norm": 1.761385440826416, "learning_rate": 3.1003884082763727e-06, "loss": 2.6318, "step": 375200 }, { "epoch": 2.8158974782223756, "grad_norm": 2.427542209625244, "learning_rate": 3.0878833124498233e-06, "loss": 2.6462, "step": 375300 }, { "epoch": 2.816647783971969, "grad_norm": 2.281066656112671, "learning_rate": 3.075378216623274e-06, "loss": 2.5939, "step": 375400 }, { "epoch": 2.8173980897215616, "grad_norm": 2.482724189758301, "learning_rate": 3.062873120796725e-06, "loss": 2.5367, "step": 375500 }, { "epoch": 2.8181483954711544, "grad_norm": 1.7763923406600952, "learning_rate": 3.0503680249701755e-06, "loss": 2.678, "step": 375600 }, { "epoch": 2.8188987012207476, "grad_norm": 1.9145292043685913, "learning_rate": 3.037862929143626e-06, "loss": 2.7134, "step": 375700 }, { "epoch": 2.8196490069703404, "grad_norm": 2.1454126834869385, "learning_rate": 3.025357833317077e-06, "loss": 2.6525, "step": 375800 }, { "epoch": 2.8203993127199336, "grad_norm": 1.9757317304611206, "learning_rate": 3.0128527374905273e-06, "loss": 2.6165, "step": 375900 }, { "epoch": 2.8211496184695264, "grad_norm": 2.6684036254882812, "learning_rate": 3.000347641663978e-06, "loss": 2.5558, "step": 376000 }, { "epoch": 2.821899924219119, "grad_norm": 2.30035662651062, "learning_rate": 2.987842545837429e-06, "loss": 2.5305, "step": 376100 }, { "epoch": 2.8226502299687124, "grad_norm": 1.6929515600204468, "learning_rate": 2.9753374500108795e-06, "loss": 2.5814, "step": 376200 }, { "epoch": 2.823400535718305, "grad_norm": 1.94644033908844, "learning_rate": 2.96283235418433e-06, "loss": 2.5493, "step": 376300 }, { "epoch": 2.8241508414678984, "grad_norm": 2.458526372909546, "learning_rate": 2.950327258357781e-06, "loss": 2.5117, "step": 376400 }, { "epoch": 2.824901147217491, "grad_norm": 2.3397929668426514, "learning_rate": 2.9378221625312317e-06, "loss": 2.4159, "step": 376500 }, { "epoch": 2.825651452967084, "grad_norm": 1.6349704265594482, "learning_rate": 2.9253170667046823e-06, "loss": 2.722, "step": 376600 }, { "epoch": 2.826401758716677, "grad_norm": 2.339352607727051, "learning_rate": 2.912811970878133e-06, "loss": 2.5652, "step": 376700 }, { "epoch": 2.82715206446627, "grad_norm": 1.7890764474868774, "learning_rate": 2.900306875051584e-06, "loss": 2.5799, "step": 376800 }, { "epoch": 2.827902370215863, "grad_norm": 2.3709723949432373, "learning_rate": 2.887801779225034e-06, "loss": 2.6537, "step": 376900 }, { "epoch": 2.828652675965456, "grad_norm": 1.791680097579956, "learning_rate": 2.875296683398485e-06, "loss": 2.5664, "step": 377000 }, { "epoch": 2.829402981715049, "grad_norm": 2.083951950073242, "learning_rate": 2.8627915875719357e-06, "loss": 2.4868, "step": 377100 }, { "epoch": 2.8301532874646416, "grad_norm": 1.6566107273101807, "learning_rate": 2.8502864917453863e-06, "loss": 2.5982, "step": 377200 }, { "epoch": 2.830903593214235, "grad_norm": 2.2383785247802734, "learning_rate": 2.837781395918837e-06, "loss": 2.4948, "step": 377300 }, { "epoch": 2.831653898963828, "grad_norm": 1.9785301685333252, "learning_rate": 2.825276300092288e-06, "loss": 2.5941, "step": 377400 }, { "epoch": 2.832404204713421, "grad_norm": 1.9732385873794556, "learning_rate": 2.8127712042657385e-06, "loss": 2.6799, "step": 377500 }, { "epoch": 2.8331545104630136, "grad_norm": 1.806898832321167, "learning_rate": 2.800266108439189e-06, "loss": 2.5956, "step": 377600 }, { "epoch": 2.8339048162126064, "grad_norm": 2.042236328125, "learning_rate": 2.78776101261264e-06, "loss": 2.7791, "step": 377700 }, { "epoch": 2.8346551219621996, "grad_norm": 1.7272120714187622, "learning_rate": 2.7752559167860903e-06, "loss": 2.675, "step": 377800 }, { "epoch": 2.835405427711793, "grad_norm": 2.3832032680511475, "learning_rate": 2.762750820959541e-06, "loss": 2.6739, "step": 377900 }, { "epoch": 2.8361557334613856, "grad_norm": 3.899729013442993, "learning_rate": 2.7503707760912575e-06, "loss": 2.4712, "step": 378000 }, { "epoch": 2.8369060392109784, "grad_norm": 2.0574769973754883, "learning_rate": 2.737865680264708e-06, "loss": 2.6458, "step": 378100 }, { "epoch": 2.837656344960571, "grad_norm": 1.8501521348953247, "learning_rate": 2.7253605844381587e-06, "loss": 2.5873, "step": 378200 }, { "epoch": 2.8384066507101644, "grad_norm": 1.779913306236267, "learning_rate": 2.7128554886116092e-06, "loss": 2.7612, "step": 378300 }, { "epoch": 2.839156956459757, "grad_norm": 2.3205032348632812, "learning_rate": 2.7003503927850603e-06, "loss": 2.5898, "step": 378400 }, { "epoch": 2.8399072622093504, "grad_norm": 3.4975647926330566, "learning_rate": 2.687845296958511e-06, "loss": 2.5615, "step": 378500 }, { "epoch": 2.840657567958943, "grad_norm": 1.7648327350616455, "learning_rate": 2.6753402011319614e-06, "loss": 2.4994, "step": 378600 }, { "epoch": 2.841407873708536, "grad_norm": 2.535836935043335, "learning_rate": 2.662835105305412e-06, "loss": 2.6022, "step": 378700 }, { "epoch": 2.842158179458129, "grad_norm": 2.2830305099487305, "learning_rate": 2.6503300094788626e-06, "loss": 2.6372, "step": 378800 }, { "epoch": 2.842908485207722, "grad_norm": 1.9025962352752686, "learning_rate": 2.6378249136523132e-06, "loss": 2.6871, "step": 378900 }, { "epoch": 2.843658790957315, "grad_norm": 1.9495879411697388, "learning_rate": 2.6253198178257642e-06, "loss": 2.5694, "step": 379000 }, { "epoch": 2.844409096706908, "grad_norm": 2.844428062438965, "learning_rate": 2.612814721999215e-06, "loss": 2.7391, "step": 379100 }, { "epoch": 2.845159402456501, "grad_norm": 1.9427083730697632, "learning_rate": 2.6003096261726654e-06, "loss": 2.6337, "step": 379200 }, { "epoch": 2.845909708206094, "grad_norm": 2.200645923614502, "learning_rate": 2.5878045303461164e-06, "loss": 2.5147, "step": 379300 }, { "epoch": 2.846660013955687, "grad_norm": 2.7773449420928955, "learning_rate": 2.575299434519567e-06, "loss": 2.5596, "step": 379400 }, { "epoch": 2.84741031970528, "grad_norm": 1.8649303913116455, "learning_rate": 2.5627943386930176e-06, "loss": 2.6648, "step": 379500 }, { "epoch": 2.848160625454873, "grad_norm": 1.9988199472427368, "learning_rate": 2.5502892428664682e-06, "loss": 2.5294, "step": 379600 }, { "epoch": 2.8489109312044656, "grad_norm": 3.5162243843078613, "learning_rate": 2.537784147039919e-06, "loss": 2.4597, "step": 379700 }, { "epoch": 2.849661236954059, "grad_norm": 1.474135398864746, "learning_rate": 2.5252790512133694e-06, "loss": 2.7109, "step": 379800 }, { "epoch": 2.8504115427036516, "grad_norm": 2.6334829330444336, "learning_rate": 2.51277395538682e-06, "loss": 2.5895, "step": 379900 }, { "epoch": 2.851161848453245, "grad_norm": 2.280531883239746, "learning_rate": 2.500268859560271e-06, "loss": 2.6656, "step": 380000 }, { "epoch": 2.8519121542028376, "grad_norm": 2.0040626525878906, "learning_rate": 2.4877637637337216e-06, "loss": 2.4999, "step": 380100 }, { "epoch": 2.8526624599524304, "grad_norm": 3.445302724838257, "learning_rate": 2.4752586679071722e-06, "loss": 2.5092, "step": 380200 }, { "epoch": 2.8534127657020236, "grad_norm": 2.8367371559143066, "learning_rate": 2.4627535720806232e-06, "loss": 2.4987, "step": 380300 }, { "epoch": 2.8541630714516164, "grad_norm": 2.1648447513580322, "learning_rate": 2.450248476254074e-06, "loss": 2.6298, "step": 380400 }, { "epoch": 2.8549133772012096, "grad_norm": 2.187962055206299, "learning_rate": 2.437743380427524e-06, "loss": 2.5912, "step": 380500 }, { "epoch": 2.8556636829508024, "grad_norm": 1.6468254327774048, "learning_rate": 2.425238284600975e-06, "loss": 2.5701, "step": 380600 }, { "epoch": 2.856413988700395, "grad_norm": 2.638075351715088, "learning_rate": 2.4127331887744256e-06, "loss": 2.5341, "step": 380700 }, { "epoch": 2.8571642944499884, "grad_norm": 1.7863880395889282, "learning_rate": 2.400228092947876e-06, "loss": 2.645, "step": 380800 }, { "epoch": 2.857914600199581, "grad_norm": 3.1836862564086914, "learning_rate": 2.3877229971213272e-06, "loss": 2.6566, "step": 380900 }, { "epoch": 2.8586649059491744, "grad_norm": 1.736457109451294, "learning_rate": 2.375217901294778e-06, "loss": 2.65, "step": 381000 }, { "epoch": 2.8594152116987672, "grad_norm": 2.330167770385742, "learning_rate": 2.3627128054682284e-06, "loss": 2.6778, "step": 381100 }, { "epoch": 2.86016551744836, "grad_norm": 1.810409426689148, "learning_rate": 2.350207709641679e-06, "loss": 2.5521, "step": 381200 }, { "epoch": 2.8609158231979532, "grad_norm": 1.994168996810913, "learning_rate": 2.33770261381513e-06, "loss": 2.6625, "step": 381300 }, { "epoch": 2.861666128947546, "grad_norm": 2.397773027420044, "learning_rate": 2.32519751798858e-06, "loss": 2.5921, "step": 381400 }, { "epoch": 2.8624164346971392, "grad_norm": 2.72331166267395, "learning_rate": 2.312692422162031e-06, "loss": 2.6427, "step": 381500 }, { "epoch": 2.863166740446732, "grad_norm": 2.894571304321289, "learning_rate": 2.300187326335482e-06, "loss": 2.5368, "step": 381600 }, { "epoch": 2.863917046196325, "grad_norm": 2.095959424972534, "learning_rate": 2.2876822305089324e-06, "loss": 2.6735, "step": 381700 }, { "epoch": 2.864667351945918, "grad_norm": 2.3911800384521484, "learning_rate": 2.275177134682383e-06, "loss": 2.6243, "step": 381800 }, { "epoch": 2.865417657695511, "grad_norm": 1.5862504243850708, "learning_rate": 2.262672038855834e-06, "loss": 2.5517, "step": 381900 }, { "epoch": 2.866167963445104, "grad_norm": 1.8291475772857666, "learning_rate": 2.25029199398755e-06, "loss": 2.5582, "step": 382000 }, { "epoch": 2.866918269194697, "grad_norm": 3.6166903972625732, "learning_rate": 2.2377868981610007e-06, "loss": 2.7138, "step": 382100 }, { "epoch": 2.8676685749442896, "grad_norm": 2.795863628387451, "learning_rate": 2.225406853292717e-06, "loss": 2.572, "step": 382200 }, { "epoch": 2.868418880693883, "grad_norm": 1.8899908065795898, "learning_rate": 2.2129017574661675e-06, "loss": 2.6371, "step": 382300 }, { "epoch": 2.8691691864434756, "grad_norm": 1.656887173652649, "learning_rate": 2.200396661639618e-06, "loss": 2.4171, "step": 382400 }, { "epoch": 2.869919492193069, "grad_norm": 1.4669989347457886, "learning_rate": 2.1878915658130687e-06, "loss": 2.5764, "step": 382500 }, { "epoch": 2.8706697979426616, "grad_norm": 2.617932081222534, "learning_rate": 2.1753864699865197e-06, "loss": 2.5341, "step": 382600 }, { "epoch": 2.8714201036922544, "grad_norm": 1.8708443641662598, "learning_rate": 2.1628813741599703e-06, "loss": 2.6526, "step": 382700 }, { "epoch": 2.8721704094418476, "grad_norm": 1.824487566947937, "learning_rate": 2.150376278333421e-06, "loss": 2.5665, "step": 382800 }, { "epoch": 2.8729207151914404, "grad_norm": 1.8914059400558472, "learning_rate": 2.137871182506872e-06, "loss": 2.5447, "step": 382900 }, { "epoch": 2.8736710209410337, "grad_norm": 1.7847994565963745, "learning_rate": 2.1253660866803225e-06, "loss": 2.6445, "step": 383000 }, { "epoch": 2.8744213266906264, "grad_norm": 2.5227315425872803, "learning_rate": 2.112860990853773e-06, "loss": 2.5657, "step": 383100 }, { "epoch": 2.8751716324402192, "grad_norm": 2.1747264862060547, "learning_rate": 2.1003558950272237e-06, "loss": 2.5054, "step": 383200 }, { "epoch": 2.8759219381898125, "grad_norm": 1.4939942359924316, "learning_rate": 2.0878507992006743e-06, "loss": 2.5621, "step": 383300 }, { "epoch": 2.8766722439394052, "grad_norm": 1.943249225616455, "learning_rate": 2.075345703374125e-06, "loss": 2.4312, "step": 383400 }, { "epoch": 2.8774225496889985, "grad_norm": 2.1415445804595947, "learning_rate": 2.062840607547576e-06, "loss": 2.4514, "step": 383500 }, { "epoch": 2.8781728554385912, "grad_norm": 2.079357147216797, "learning_rate": 2.0503355117210265e-06, "loss": 2.595, "step": 383600 }, { "epoch": 2.878923161188184, "grad_norm": 2.7664432525634766, "learning_rate": 2.037830415894477e-06, "loss": 2.6387, "step": 383700 }, { "epoch": 2.8796734669377773, "grad_norm": 1.3531365394592285, "learning_rate": 2.0253253200679277e-06, "loss": 2.6034, "step": 383800 }, { "epoch": 2.88042377268737, "grad_norm": 2.334254026412964, "learning_rate": 2.0128202242413787e-06, "loss": 2.5383, "step": 383900 }, { "epoch": 2.8811740784369633, "grad_norm": 2.123295307159424, "learning_rate": 2.0003151284148293e-06, "loss": 2.6491, "step": 384000 }, { "epoch": 2.881924384186556, "grad_norm": 2.473393440246582, "learning_rate": 1.98781003258828e-06, "loss": 2.6018, "step": 384100 }, { "epoch": 2.882674689936149, "grad_norm": 1.941599726676941, "learning_rate": 1.9753049367617305e-06, "loss": 2.6759, "step": 384200 }, { "epoch": 2.883424995685742, "grad_norm": 1.9638973474502563, "learning_rate": 1.962799840935181e-06, "loss": 2.5474, "step": 384300 }, { "epoch": 2.884175301435335, "grad_norm": 3.36242938041687, "learning_rate": 1.9502947451086316e-06, "loss": 2.5348, "step": 384400 }, { "epoch": 2.884925607184928, "grad_norm": 2.90741229057312, "learning_rate": 1.9377896492820827e-06, "loss": 2.6053, "step": 384500 }, { "epoch": 2.885675912934521, "grad_norm": 2.0190701484680176, "learning_rate": 1.9252845534555333e-06, "loss": 2.6679, "step": 384600 }, { "epoch": 2.8864262186841136, "grad_norm": 2.6110951900482178, "learning_rate": 1.912779457628984e-06, "loss": 2.6961, "step": 384700 }, { "epoch": 2.887176524433707, "grad_norm": 2.225430488586426, "learning_rate": 1.9002743618024346e-06, "loss": 2.6158, "step": 384800 }, { "epoch": 2.8879268301832997, "grad_norm": 3.8653688430786133, "learning_rate": 1.8877692659758855e-06, "loss": 2.589, "step": 384900 }, { "epoch": 2.888677135932893, "grad_norm": 4.752131462097168, "learning_rate": 1.8752641701493358e-06, "loss": 2.6462, "step": 385000 }, { "epoch": 2.8894274416824857, "grad_norm": 3.345918655395508, "learning_rate": 1.8627590743227864e-06, "loss": 2.5922, "step": 385100 }, { "epoch": 2.8901777474320784, "grad_norm": 1.702400803565979, "learning_rate": 1.8502539784962372e-06, "loss": 2.6072, "step": 385200 }, { "epoch": 2.8909280531816717, "grad_norm": 1.536049723625183, "learning_rate": 1.837748882669688e-06, "loss": 2.5529, "step": 385300 }, { "epoch": 2.8916783589312645, "grad_norm": 1.2182538509368896, "learning_rate": 1.8252437868431386e-06, "loss": 2.4602, "step": 385400 }, { "epoch": 2.8924286646808577, "grad_norm": 3.043809652328491, "learning_rate": 1.8127386910165894e-06, "loss": 2.5987, "step": 385500 }, { "epoch": 2.8931789704304505, "grad_norm": 2.4860177040100098, "learning_rate": 1.80023359519004e-06, "loss": 2.5097, "step": 385600 }, { "epoch": 2.8939292761800433, "grad_norm": 1.6197898387908936, "learning_rate": 1.7877284993634908e-06, "loss": 2.5525, "step": 385700 }, { "epoch": 2.8946795819296365, "grad_norm": 1.4882330894470215, "learning_rate": 1.7752234035369414e-06, "loss": 2.5959, "step": 385800 }, { "epoch": 2.8954298876792293, "grad_norm": 1.7036482095718384, "learning_rate": 1.7627183077103918e-06, "loss": 2.6597, "step": 385900 }, { "epoch": 2.8961801934288225, "grad_norm": 2.793865442276001, "learning_rate": 1.7502132118838426e-06, "loss": 2.5663, "step": 386000 }, { "epoch": 2.8969304991784153, "grad_norm": 1.7878729104995728, "learning_rate": 1.7377081160572934e-06, "loss": 2.4283, "step": 386100 }, { "epoch": 2.897680804928008, "grad_norm": 1.7107988595962524, "learning_rate": 1.7253280711890096e-06, "loss": 2.554, "step": 386200 }, { "epoch": 2.8984311106776013, "grad_norm": 1.940669298171997, "learning_rate": 1.7129480263207257e-06, "loss": 2.5751, "step": 386300 }, { "epoch": 2.899181416427194, "grad_norm": 1.598775029182434, "learning_rate": 1.7004429304941765e-06, "loss": 2.5094, "step": 386400 }, { "epoch": 2.8999317221767873, "grad_norm": 2.5570945739746094, "learning_rate": 1.6879378346676271e-06, "loss": 2.5604, "step": 386500 }, { "epoch": 2.90068202792638, "grad_norm": 1.5548298358917236, "learning_rate": 1.675432738841078e-06, "loss": 2.5964, "step": 386600 }, { "epoch": 2.901432333675973, "grad_norm": 2.9374773502349854, "learning_rate": 1.6629276430145287e-06, "loss": 2.6052, "step": 386700 }, { "epoch": 2.9021826394255656, "grad_norm": 1.8140839338302612, "learning_rate": 1.6504225471879791e-06, "loss": 2.5542, "step": 386800 }, { "epoch": 2.902932945175159, "grad_norm": 2.061464786529541, "learning_rate": 1.6379174513614297e-06, "loss": 2.6897, "step": 386900 }, { "epoch": 2.903683250924752, "grad_norm": 2.0674686431884766, "learning_rate": 1.6254123555348805e-06, "loss": 2.7398, "step": 387000 }, { "epoch": 2.904433556674345, "grad_norm": 1.8845494985580444, "learning_rate": 1.6129072597083311e-06, "loss": 2.5612, "step": 387100 }, { "epoch": 2.9051838624239377, "grad_norm": 2.0465009212493896, "learning_rate": 1.600402163881782e-06, "loss": 2.7302, "step": 387200 }, { "epoch": 2.9059341681735305, "grad_norm": 2.2436370849609375, "learning_rate": 1.5878970680552327e-06, "loss": 2.5564, "step": 387300 }, { "epoch": 2.9066844739231237, "grad_norm": 1.8065285682678223, "learning_rate": 1.5753919722286833e-06, "loss": 2.6578, "step": 387400 }, { "epoch": 2.907434779672717, "grad_norm": 1.9475536346435547, "learning_rate": 1.5628868764021341e-06, "loss": 2.4264, "step": 387500 }, { "epoch": 2.9081850854223097, "grad_norm": 2.14984130859375, "learning_rate": 1.5503817805755847e-06, "loss": 2.6611, "step": 387600 }, { "epoch": 2.9089353911719025, "grad_norm": 2.020871639251709, "learning_rate": 1.5378766847490353e-06, "loss": 2.606, "step": 387700 }, { "epoch": 2.9096856969214953, "grad_norm": 2.7595577239990234, "learning_rate": 1.5253715889224861e-06, "loss": 2.5949, "step": 387800 }, { "epoch": 2.9104360026710885, "grad_norm": 1.7816346883773804, "learning_rate": 1.5128664930959365e-06, "loss": 2.6774, "step": 387900 }, { "epoch": 2.9111863084206813, "grad_norm": 2.1705400943756104, "learning_rate": 1.5003613972693873e-06, "loss": 2.5794, "step": 388000 }, { "epoch": 2.9119366141702745, "grad_norm": 3.8287248611450195, "learning_rate": 1.487856301442838e-06, "loss": 2.4305, "step": 388100 }, { "epoch": 2.9126869199198673, "grad_norm": 3.023650646209717, "learning_rate": 1.4753512056162887e-06, "loss": 2.4777, "step": 388200 }, { "epoch": 2.91343722566946, "grad_norm": 1.8146389722824097, "learning_rate": 1.4628461097897395e-06, "loss": 2.6632, "step": 388300 }, { "epoch": 2.9141875314190533, "grad_norm": 1.872207522392273, "learning_rate": 1.45034101396319e-06, "loss": 2.5323, "step": 388400 }, { "epoch": 2.914937837168646, "grad_norm": 2.4085588455200195, "learning_rate": 1.4379609690949062e-06, "loss": 2.6722, "step": 388500 }, { "epoch": 2.9156881429182393, "grad_norm": 1.805689811706543, "learning_rate": 1.425455873268357e-06, "loss": 2.4333, "step": 388600 }, { "epoch": 2.916438448667832, "grad_norm": 2.7809383869171143, "learning_rate": 1.4129507774418076e-06, "loss": 2.671, "step": 388700 }, { "epoch": 2.917188754417425, "grad_norm": 2.5384788513183594, "learning_rate": 1.4004456816152582e-06, "loss": 2.5208, "step": 388800 }, { "epoch": 2.917939060167018, "grad_norm": 1.9367709159851074, "learning_rate": 1.3879405857887088e-06, "loss": 2.6545, "step": 388900 }, { "epoch": 2.918689365916611, "grad_norm": 1.5118560791015625, "learning_rate": 1.3754354899621596e-06, "loss": 2.4555, "step": 389000 }, { "epoch": 2.919439671666204, "grad_norm": 1.659996509552002, "learning_rate": 1.3629303941356104e-06, "loss": 2.3617, "step": 389100 }, { "epoch": 2.920189977415797, "grad_norm": 1.943641185760498, "learning_rate": 1.350425298309061e-06, "loss": 2.5015, "step": 389200 }, { "epoch": 2.9209402831653897, "grad_norm": 2.0973548889160156, "learning_rate": 1.3379202024825116e-06, "loss": 2.685, "step": 389300 }, { "epoch": 2.921690588914983, "grad_norm": 2.136996269226074, "learning_rate": 1.3254151066559624e-06, "loss": 2.6789, "step": 389400 }, { "epoch": 2.9224408946645757, "grad_norm": 1.6953818798065186, "learning_rate": 1.312910010829413e-06, "loss": 2.6646, "step": 389500 }, { "epoch": 2.923191200414169, "grad_norm": 2.841609477996826, "learning_rate": 1.3004049150028638e-06, "loss": 2.6672, "step": 389600 }, { "epoch": 2.9239415061637617, "grad_norm": 1.3768996000289917, "learning_rate": 1.2878998191763144e-06, "loss": 2.6781, "step": 389700 }, { "epoch": 2.9246918119133545, "grad_norm": 1.8095844984054565, "learning_rate": 1.275394723349765e-06, "loss": 2.5138, "step": 389800 }, { "epoch": 2.9254421176629477, "grad_norm": 2.098127603530884, "learning_rate": 1.2628896275232158e-06, "loss": 2.5604, "step": 389900 }, { "epoch": 2.9261924234125405, "grad_norm": 1.8579503297805786, "learning_rate": 1.2503845316966664e-06, "loss": 2.7466, "step": 390000 }, { "epoch": 2.9269427291621337, "grad_norm": 2.636667013168335, "learning_rate": 1.2378794358701172e-06, "loss": 2.4719, "step": 390100 }, { "epoch": 2.9276930349117265, "grad_norm": 2.3550992012023926, "learning_rate": 1.2253743400435678e-06, "loss": 2.5235, "step": 390200 }, { "epoch": 2.9284433406613193, "grad_norm": 1.6980268955230713, "learning_rate": 1.2128692442170184e-06, "loss": 2.6953, "step": 390300 }, { "epoch": 2.9291936464109125, "grad_norm": 2.1095213890075684, "learning_rate": 1.2003641483904692e-06, "loss": 2.5943, "step": 390400 }, { "epoch": 2.9299439521605053, "grad_norm": 1.5535529851913452, "learning_rate": 1.1878590525639198e-06, "loss": 2.5236, "step": 390500 }, { "epoch": 2.9306942579100985, "grad_norm": 1.8415498733520508, "learning_rate": 1.1753539567373704e-06, "loss": 2.5513, "step": 390600 }, { "epoch": 2.9314445636596913, "grad_norm": 1.8522553443908691, "learning_rate": 1.1628488609108212e-06, "loss": 2.6077, "step": 390700 }, { "epoch": 2.932194869409284, "grad_norm": 2.3113105297088623, "learning_rate": 1.1503437650842718e-06, "loss": 2.4412, "step": 390800 }, { "epoch": 2.9329451751588773, "grad_norm": 2.1428894996643066, "learning_rate": 1.1378386692577226e-06, "loss": 2.7069, "step": 390900 }, { "epoch": 2.93369548090847, "grad_norm": 2.1996378898620605, "learning_rate": 1.1253335734311734e-06, "loss": 2.4915, "step": 391000 }, { "epoch": 2.9344457866580633, "grad_norm": 2.197658061981201, "learning_rate": 1.1128284776046238e-06, "loss": 2.5208, "step": 391100 }, { "epoch": 2.935196092407656, "grad_norm": 1.342636227607727, "learning_rate": 1.1003233817780746e-06, "loss": 2.6263, "step": 391200 }, { "epoch": 2.935946398157249, "grad_norm": 2.1825311183929443, "learning_rate": 1.0878182859515254e-06, "loss": 2.5843, "step": 391300 }, { "epoch": 2.936696703906842, "grad_norm": 2.454946279525757, "learning_rate": 1.075313190124976e-06, "loss": 2.5923, "step": 391400 }, { "epoch": 2.937447009656435, "grad_norm": 1.734416127204895, "learning_rate": 1.0628080942984268e-06, "loss": 2.5855, "step": 391500 }, { "epoch": 2.938197315406028, "grad_norm": 1.6744393110275269, "learning_rate": 1.0503029984718772e-06, "loss": 2.7006, "step": 391600 }, { "epoch": 2.938947621155621, "grad_norm": 2.7215471267700195, "learning_rate": 1.037797902645328e-06, "loss": 2.6523, "step": 391700 }, { "epoch": 2.9396979269052137, "grad_norm": 3.144425868988037, "learning_rate": 1.0252928068187788e-06, "loss": 2.606, "step": 391800 }, { "epoch": 2.940448232654807, "grad_norm": 1.8308464288711548, "learning_rate": 1.0127877109922294e-06, "loss": 2.7023, "step": 391900 }, { "epoch": 2.9411985384043997, "grad_norm": 2.573479652404785, "learning_rate": 1.00028261516568e-06, "loss": 2.5179, "step": 392000 }, { "epoch": 2.941948844153993, "grad_norm": 2.535738945007324, "learning_rate": 9.877775193391308e-07, "loss": 2.7346, "step": 392100 }, { "epoch": 2.9426991499035857, "grad_norm": 1.4177404642105103, "learning_rate": 9.752724235125814e-07, "loss": 2.5836, "step": 392200 }, { "epoch": 2.9434494556531785, "grad_norm": 1.4833168983459473, "learning_rate": 9.627673276860322e-07, "loss": 2.6647, "step": 392300 }, { "epoch": 2.9441997614027717, "grad_norm": 1.7907729148864746, "learning_rate": 9.502622318594829e-07, "loss": 2.6378, "step": 392400 }, { "epoch": 2.9449500671523645, "grad_norm": 1.965946912765503, "learning_rate": 9.378821869911989e-07, "loss": 2.4836, "step": 392500 }, { "epoch": 2.9457003729019577, "grad_norm": 2.4709551334381104, "learning_rate": 9.253770911646496e-07, "loss": 2.693, "step": 392600 }, { "epoch": 2.9464506786515505, "grad_norm": 2.6375977993011475, "learning_rate": 9.128719953381003e-07, "loss": 2.6786, "step": 392700 }, { "epoch": 2.9472009844011433, "grad_norm": 2.4155969619750977, "learning_rate": 9.00366899511551e-07, "loss": 2.5676, "step": 392800 }, { "epoch": 2.9479512901507365, "grad_norm": 1.6862118244171143, "learning_rate": 8.878618036850016e-07, "loss": 2.3867, "step": 392900 }, { "epoch": 2.9487015959003293, "grad_norm": 1.7810701131820679, "learning_rate": 8.753567078584523e-07, "loss": 2.5473, "step": 393000 }, { "epoch": 2.9494519016499225, "grad_norm": 1.8795430660247803, "learning_rate": 8.62851612031903e-07, "loss": 2.4928, "step": 393100 }, { "epoch": 2.9502022073995153, "grad_norm": 2.1721715927124023, "learning_rate": 8.503465162053537e-07, "loss": 2.531, "step": 393200 }, { "epoch": 2.950952513149108, "grad_norm": 2.910233974456787, "learning_rate": 8.378414203788045e-07, "loss": 2.7082, "step": 393300 }, { "epoch": 2.9517028188987013, "grad_norm": 1.8398337364196777, "learning_rate": 8.25336324552255e-07, "loss": 2.597, "step": 393400 }, { "epoch": 2.952453124648294, "grad_norm": 2.1391775608062744, "learning_rate": 8.129562796839713e-07, "loss": 2.5187, "step": 393500 }, { "epoch": 2.9532034303978874, "grad_norm": 2.668010711669922, "learning_rate": 8.00451183857422e-07, "loss": 2.531, "step": 393600 }, { "epoch": 2.95395373614748, "grad_norm": 1.8840651512145996, "learning_rate": 7.879460880308727e-07, "loss": 2.6064, "step": 393700 }, { "epoch": 2.954704041897073, "grad_norm": 2.0809106826782227, "learning_rate": 7.754409922043234e-07, "loss": 2.6148, "step": 393800 }, { "epoch": 2.955454347646666, "grad_norm": 1.6464370489120483, "learning_rate": 7.62935896377774e-07, "loss": 2.5391, "step": 393900 }, { "epoch": 2.956204653396259, "grad_norm": 1.4903913736343384, "learning_rate": 7.504308005512247e-07, "loss": 2.4529, "step": 394000 }, { "epoch": 2.956954959145852, "grad_norm": 2.1271581649780273, "learning_rate": 7.379257047246754e-07, "loss": 2.5656, "step": 394100 }, { "epoch": 2.957705264895445, "grad_norm": 1.6282991170883179, "learning_rate": 7.254206088981261e-07, "loss": 2.6073, "step": 394200 }, { "epoch": 2.9584555706450377, "grad_norm": 4.052754878997803, "learning_rate": 7.129155130715768e-07, "loss": 2.5825, "step": 394300 }, { "epoch": 2.959205876394631, "grad_norm": 2.626290798187256, "learning_rate": 7.004104172450273e-07, "loss": 2.6156, "step": 394400 }, { "epoch": 2.9599561821442237, "grad_norm": 2.1757078170776367, "learning_rate": 6.87905321418478e-07, "loss": 2.6192, "step": 394500 }, { "epoch": 2.960706487893817, "grad_norm": 1.9374918937683105, "learning_rate": 6.754002255919287e-07, "loss": 2.5176, "step": 394600 }, { "epoch": 2.9614567936434097, "grad_norm": 1.6955287456512451, "learning_rate": 6.628951297653794e-07, "loss": 2.5569, "step": 394700 }, { "epoch": 2.9622070993930025, "grad_norm": 2.471172571182251, "learning_rate": 6.5039003393883e-07, "loss": 2.5452, "step": 394800 }, { "epoch": 2.9629574051425958, "grad_norm": 1.7333520650863647, "learning_rate": 6.378849381122807e-07, "loss": 2.6859, "step": 394900 }, { "epoch": 2.9637077108921885, "grad_norm": 2.594331741333008, "learning_rate": 6.253798422857315e-07, "loss": 2.4664, "step": 395000 }, { "epoch": 2.9644580166417818, "grad_norm": 2.192216634750366, "learning_rate": 6.128747464591821e-07, "loss": 2.7278, "step": 395100 }, { "epoch": 2.9652083223913746, "grad_norm": 2.125816822052002, "learning_rate": 6.003696506326328e-07, "loss": 2.5821, "step": 395200 }, { "epoch": 2.9659586281409673, "grad_norm": 1.8153510093688965, "learning_rate": 5.878645548060835e-07, "loss": 2.6731, "step": 395300 }, { "epoch": 2.9667089338905606, "grad_norm": 1.8855352401733398, "learning_rate": 5.753594589795342e-07, "loss": 2.5923, "step": 395400 }, { "epoch": 2.9674592396401533, "grad_norm": 2.0466699600219727, "learning_rate": 5.628543631529848e-07, "loss": 2.5534, "step": 395500 }, { "epoch": 2.9682095453897466, "grad_norm": 2.3601109981536865, "learning_rate": 5.503492673264355e-07, "loss": 2.5681, "step": 395600 }, { "epoch": 2.9689598511393394, "grad_norm": 2.1077232360839844, "learning_rate": 5.378441714998862e-07, "loss": 2.5347, "step": 395700 }, { "epoch": 2.969710156888932, "grad_norm": 2.7479407787323, "learning_rate": 5.253390756733369e-07, "loss": 2.5603, "step": 395800 }, { "epoch": 2.970460462638525, "grad_norm": 2.0573081970214844, "learning_rate": 5.128339798467876e-07, "loss": 2.5652, "step": 395900 }, { "epoch": 2.971210768388118, "grad_norm": 2.1057260036468506, "learning_rate": 5.004539349785038e-07, "loss": 2.5087, "step": 396000 }, { "epoch": 2.9719610741377114, "grad_norm": 3.5929861068725586, "learning_rate": 4.879488391519545e-07, "loss": 2.64, "step": 396100 }, { "epoch": 2.972711379887304, "grad_norm": 1.8528203964233398, "learning_rate": 4.754437433254051e-07, "loss": 2.4942, "step": 396200 }, { "epoch": 2.973461685636897, "grad_norm": 1.5527862310409546, "learning_rate": 4.629386474988558e-07, "loss": 2.5991, "step": 396300 }, { "epoch": 2.9742119913864897, "grad_norm": 1.561394453048706, "learning_rate": 4.5043355167230647e-07, "loss": 2.6193, "step": 396400 }, { "epoch": 2.974962297136083, "grad_norm": 1.8670674562454224, "learning_rate": 4.3792845584575717e-07, "loss": 2.597, "step": 396500 }, { "epoch": 2.975712602885676, "grad_norm": 1.4492853879928589, "learning_rate": 4.254233600192078e-07, "loss": 2.6952, "step": 396600 }, { "epoch": 2.976462908635269, "grad_norm": 2.333540678024292, "learning_rate": 4.129182641926585e-07, "loss": 2.5964, "step": 396700 }, { "epoch": 2.9772132143848618, "grad_norm": 1.6717228889465332, "learning_rate": 4.0041316836610916e-07, "loss": 2.6108, "step": 396800 }, { "epoch": 2.9779635201344545, "grad_norm": 2.111940622329712, "learning_rate": 3.8790807253955986e-07, "loss": 2.4849, "step": 396900 }, { "epoch": 2.9787138258840478, "grad_norm": 1.7377349138259888, "learning_rate": 3.7540297671301056e-07, "loss": 2.6912, "step": 397000 }, { "epoch": 2.9794641316336405, "grad_norm": 2.1681129932403564, "learning_rate": 3.6289788088646126e-07, "loss": 2.619, "step": 397100 }, { "epoch": 2.9802144373832338, "grad_norm": 2.583889961242676, "learning_rate": 3.5039278505991196e-07, "loss": 2.4646, "step": 397200 }, { "epoch": 2.9809647431328266, "grad_norm": 1.6595933437347412, "learning_rate": 3.378876892333626e-07, "loss": 2.3257, "step": 397300 }, { "epoch": 2.9817150488824193, "grad_norm": 1.9701926708221436, "learning_rate": 3.253825934068133e-07, "loss": 2.7629, "step": 397400 }, { "epoch": 2.9824653546320126, "grad_norm": 2.2998785972595215, "learning_rate": 3.1287749758026395e-07, "loss": 2.5776, "step": 397500 }, { "epoch": 2.9832156603816053, "grad_norm": 2.1334009170532227, "learning_rate": 3.0037240175371465e-07, "loss": 2.5869, "step": 397600 }, { "epoch": 2.9839659661311986, "grad_norm": 2.41184139251709, "learning_rate": 2.8786730592716535e-07, "loss": 2.6082, "step": 397700 }, { "epoch": 2.9847162718807914, "grad_norm": 1.7991142272949219, "learning_rate": 2.75362210100616e-07, "loss": 2.5876, "step": 397800 }, { "epoch": 2.985466577630384, "grad_norm": 2.585958242416382, "learning_rate": 2.628571142740667e-07, "loss": 2.6049, "step": 397900 }, { "epoch": 2.9862168833799774, "grad_norm": 2.0497772693634033, "learning_rate": 2.503520184475174e-07, "loss": 2.6146, "step": 398000 }, { "epoch": 2.98696718912957, "grad_norm": 2.566714286804199, "learning_rate": 2.3784692262096807e-07, "loss": 2.6358, "step": 398100 }, { "epoch": 2.9877174948791634, "grad_norm": 1.9682271480560303, "learning_rate": 2.2534182679441874e-07, "loss": 2.4838, "step": 398200 }, { "epoch": 2.988467800628756, "grad_norm": 2.609741687774658, "learning_rate": 2.128367309678694e-07, "loss": 2.7225, "step": 398300 }, { "epoch": 2.989218106378349, "grad_norm": 2.018561363220215, "learning_rate": 2.0033163514132008e-07, "loss": 2.7099, "step": 398400 }, { "epoch": 2.989968412127942, "grad_norm": 1.6763876676559448, "learning_rate": 1.8782653931477076e-07, "loss": 2.5537, "step": 398500 }, { "epoch": 2.990718717877535, "grad_norm": 1.9421207904815674, "learning_rate": 1.7532144348822146e-07, "loss": 2.5968, "step": 398600 }, { "epoch": 2.991469023627128, "grad_norm": 2.096930742263794, "learning_rate": 1.6294139861993763e-07, "loss": 2.4783, "step": 398700 }, { "epoch": 2.992219329376721, "grad_norm": 2.7762465476989746, "learning_rate": 1.504363027933883e-07, "loss": 2.5422, "step": 398800 }, { "epoch": 2.9929696351263138, "grad_norm": 1.7911304235458374, "learning_rate": 1.37931206966839e-07, "loss": 2.5522, "step": 398900 }, { "epoch": 2.993719940875907, "grad_norm": 2.6297237873077393, "learning_rate": 1.2542611114028968e-07, "loss": 2.491, "step": 399000 }, { "epoch": 2.9944702466254998, "grad_norm": 2.6798365116119385, "learning_rate": 1.1292101531374035e-07, "loss": 2.5916, "step": 399100 }, { "epoch": 2.995220552375093, "grad_norm": 2.592543601989746, "learning_rate": 1.0041591948719104e-07, "loss": 2.455, "step": 399200 }, { "epoch": 2.9959708581246858, "grad_norm": 2.7518293857574463, "learning_rate": 8.791082366064172e-08, "loss": 2.4738, "step": 399300 }, { "epoch": 2.9967211638742786, "grad_norm": 2.082054376602173, "learning_rate": 7.54057278340924e-08, "loss": 2.6166, "step": 399400 }, { "epoch": 2.997471469623872, "grad_norm": 1.852839708328247, "learning_rate": 6.290063200754308e-08, "loss": 2.6089, "step": 399500 }, { "epoch": 2.9982217753734646, "grad_norm": 2.495514392852783, "learning_rate": 5.0395536180993756e-08, "loss": 2.597, "step": 399600 }, { "epoch": 2.998972081123058, "grad_norm": 1.880475640296936, "learning_rate": 3.7890440354444436e-08, "loss": 2.4503, "step": 399700 }, { "epoch": 2.9997223868726506, "grad_norm": 2.012917995452881, "learning_rate": 2.5385344527895115e-08, "loss": 2.7263, "step": 399800 } ], "logging_steps": 100, "max_steps": 399837, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0894805245952e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }