{ "best_metric": 1.00823629, "best_model_checkpoint": "/mnt/bn/haiyang-dataset-lq/medical/outputclass/qwen2-vl-2b-instruct/v5-20241113-121646/checkpoint-9430", "epoch": 10.0, "eval_steps": 10000, "global_step": 9430, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.91473788, "epoch": 0.0010604453870625664, "grad_norm": 13.708544731140137, "learning_rate": 0.0, "loss": 0.25200555, "memory(GiB)": 22.83, "step": 1, "train_speed(iter/s)": 0.097186 }, { "acc": 0.91393471, "epoch": 0.005302226935312832, "grad_norm": 11.364119529724121, "learning_rate": 2.6140058561220443e-06, "loss": 0.25148368, "memory(GiB)": 26.31, "step": 5, "train_speed(iter/s)": 0.291554 }, { "acc": 0.93851728, "epoch": 0.010604453870625663, "grad_norm": 8.12688159942627, "learning_rate": 3.73979690102038e-06, "loss": 0.1808401, "memory(GiB)": 26.31, "step": 10, "train_speed(iter/s)": 0.389217 }, { "acc": 0.95571146, "epoch": 0.015906680805938492, "grad_norm": 4.948174476623535, "learning_rate": 4.398342445933593e-06, "loss": 0.13787798, "memory(GiB)": 26.31, "step": 15, "train_speed(iter/s)": 0.438702 }, { "acc": 0.96221857, "epoch": 0.021208907741251327, "grad_norm": 2.2628302574157715, "learning_rate": 4.865587945918714e-06, "loss": 0.11728274, "memory(GiB)": 26.31, "step": 20, "train_speed(iter/s)": 0.46779 }, { "acc": 0.96878824, "epoch": 0.026511134676564158, "grad_norm": 1.461674451828003, "learning_rate": 5.2280117122440885e-06, "loss": 0.08844265, "memory(GiB)": 26.31, "step": 25, "train_speed(iter/s)": 0.4881 }, { "acc": 0.97463093, "epoch": 0.031813361611876985, "grad_norm": 1.4498542547225952, "learning_rate": 5.524133490831929e-06, "loss": 0.07447982, "memory(GiB)": 26.31, "step": 30, "train_speed(iter/s)": 0.501732 }, { "acc": 0.97848167, "epoch": 0.03711558854718982, "grad_norm": 1.6475602388381958, "learning_rate": 5.774500887225759e-06, "loss": 0.06788331, "memory(GiB)": 26.31, "step": 35, "train_speed(iter/s)": 0.512 }, { "acc": 0.97805767, "epoch": 0.042417815482502653, "grad_norm": 1.8446458578109741, "learning_rate": 5.99137899081705e-06, "loss": 0.06491201, "memory(GiB)": 26.31, "step": 40, "train_speed(iter/s)": 0.520488 }, { "acc": 0.97699718, "epoch": 0.04772004241781548, "grad_norm": 1.640552043914795, "learning_rate": 6.182679035745141e-06, "loss": 0.06084052, "memory(GiB)": 26.31, "step": 45, "train_speed(iter/s)": 0.527455 }, { "acc": 0.98118534, "epoch": 0.053022269353128315, "grad_norm": 1.425261378288269, "learning_rate": 6.353802757142424e-06, "loss": 0.05228539, "memory(GiB)": 26.31, "step": 50, "train_speed(iter/s)": 0.532614 }, { "acc": 0.98107777, "epoch": 0.05832449628844114, "grad_norm": 1.407269835472107, "learning_rate": 6.508602992822067e-06, "loss": 0.05452412, "memory(GiB)": 26.31, "step": 55, "train_speed(iter/s)": 0.536881 }, { "acc": 0.98615017, "epoch": 0.06362672322375397, "grad_norm": 1.4538545608520508, "learning_rate": 6.649924535730263e-06, "loss": 0.0399454, "memory(GiB)": 26.31, "step": 60, "train_speed(iter/s)": 0.541033 }, { "acc": 0.98120651, "epoch": 0.0689289501590668, "grad_norm": 2.002455711364746, "learning_rate": 6.7799277529914054e-06, "loss": 0.05406591, "memory(GiB)": 26.31, "step": 65, "train_speed(iter/s)": 0.544033 }, { "acc": 0.98403168, "epoch": 0.07423117709437964, "grad_norm": 1.1853930950164795, "learning_rate": 6.900291932124094e-06, "loss": 0.04809659, "memory(GiB)": 26.31, "step": 70, "train_speed(iter/s)": 0.546646 }, { "acc": 0.98352365, "epoch": 0.07953340402969247, "grad_norm": 1.3366847038269043, "learning_rate": 7.012348302055637e-06, "loss": 0.05014178, "memory(GiB)": 26.31, "step": 75, "train_speed(iter/s)": 0.549242 }, { "acc": 0.98384953, "epoch": 0.08483563096500531, "grad_norm": 1.5215312242507935, "learning_rate": 7.117170035715385e-06, "loss": 0.04720557, "memory(GiB)": 26.31, "step": 80, "train_speed(iter/s)": 0.551303 }, { "acc": 0.98184252, "epoch": 0.09013785790031813, "grad_norm": 1.1831401586532593, "learning_rate": 7.215634919156383e-06, "loss": 0.04520521, "memory(GiB)": 26.31, "step": 85, "train_speed(iter/s)": 0.55303 }, { "acc": 0.98633242, "epoch": 0.09544008483563096, "grad_norm": 0.7181859612464905, "learning_rate": 7.308470080643477e-06, "loss": 0.03698838, "memory(GiB)": 26.31, "step": 90, "train_speed(iter/s)": 0.554866 }, { "acc": 0.98487539, "epoch": 0.1007423117709438, "grad_norm": 0.8339945077896118, "learning_rate": 7.396284610134086e-06, "loss": 0.04118965, "memory(GiB)": 26.31, "step": 95, "train_speed(iter/s)": 0.55628 }, { "acc": 0.98523464, "epoch": 0.10604453870625663, "grad_norm": 1.019827961921692, "learning_rate": 7.47959380204076e-06, "loss": 0.04264031, "memory(GiB)": 26.31, "step": 100, "train_speed(iter/s)": 0.557577 }, { "acc": 0.98460913, "epoch": 0.11134676564156946, "grad_norm": 1.2514287233352661, "learning_rate": 7.558837477037307e-06, "loss": 0.04326624, "memory(GiB)": 26.31, "step": 105, "train_speed(iter/s)": 0.558938 }, { "acc": 0.98482342, "epoch": 0.11664899257688228, "grad_norm": 1.3463575839996338, "learning_rate": 7.634394037720401e-06, "loss": 0.04483083, "memory(GiB)": 26.31, "step": 110, "train_speed(iter/s)": 0.560167 }, { "acc": 0.98519001, "epoch": 0.12195121951219512, "grad_norm": 1.130424976348877, "learning_rate": 7.706591397293826e-06, "loss": 0.04231756, "memory(GiB)": 26.31, "step": 115, "train_speed(iter/s)": 0.561101 }, { "acc": 0.98767662, "epoch": 0.12725344644750794, "grad_norm": 1.0245696306228638, "learning_rate": 7.775715580628599e-06, "loss": 0.03614664, "memory(GiB)": 26.31, "step": 120, "train_speed(iter/s)": 0.561979 }, { "acc": 0.98472729, "epoch": 0.1325556733828208, "grad_norm": 1.5159035921096802, "learning_rate": 7.842017568366133e-06, "loss": 0.04319248, "memory(GiB)": 26.31, "step": 125, "train_speed(iter/s)": 0.562763 }, { "acc": 0.98606205, "epoch": 0.1378579003181336, "grad_norm": 1.2546530961990356, "learning_rate": 7.90571879788974e-06, "loss": 0.04082851, "memory(GiB)": 26.31, "step": 130, "train_speed(iter/s)": 0.56355 }, { "acc": 0.98801918, "epoch": 0.14316012725344646, "grad_norm": 0.9412282109260559, "learning_rate": 7.96701562555669e-06, "loss": 0.03775344, "memory(GiB)": 26.31, "step": 135, "train_speed(iter/s)": 0.56423 }, { "acc": 0.98443604, "epoch": 0.14846235418875928, "grad_norm": 1.3335416316986084, "learning_rate": 8.02608297702243e-06, "loss": 0.04015646, "memory(GiB)": 26.31, "step": 140, "train_speed(iter/s)": 0.564868 }, { "acc": 0.98708563, "epoch": 0.1537645811240721, "grad_norm": 1.6027027368545532, "learning_rate": 8.083077356722968e-06, "loss": 0.04044313, "memory(GiB)": 26.31, "step": 145, "train_speed(iter/s)": 0.565465 }, { "acc": 0.98780003, "epoch": 0.15906680805938495, "grad_norm": 1.1246064901351929, "learning_rate": 8.138139346953973e-06, "loss": 0.03837634, "memory(GiB)": 26.31, "step": 150, "train_speed(iter/s)": 0.566141 }, { "acc": 0.98864136, "epoch": 0.16436903499469777, "grad_norm": 1.0349862575531006, "learning_rate": 8.191395697023962e-06, "loss": 0.03336571, "memory(GiB)": 26.31, "step": 155, "train_speed(iter/s)": 0.566841 }, { "acc": 0.98549376, "epoch": 0.16967126193001061, "grad_norm": 1.4518648386001587, "learning_rate": 8.24296108061372e-06, "loss": 0.04370693, "memory(GiB)": 26.31, "step": 160, "train_speed(iter/s)": 0.567431 }, { "acc": 0.98450546, "epoch": 0.17497348886532343, "grad_norm": 1.8398545980453491, "learning_rate": 8.292939582633615e-06, "loss": 0.04571437, "memory(GiB)": 26.31, "step": 165, "train_speed(iter/s)": 0.567932 }, { "acc": 0.98669834, "epoch": 0.18027571580063625, "grad_norm": 1.194319248199463, "learning_rate": 8.341425964054718e-06, "loss": 0.03776242, "memory(GiB)": 26.31, "step": 170, "train_speed(iter/s)": 0.568358 }, { "acc": 0.9872467, "epoch": 0.1855779427359491, "grad_norm": 1.2717541456222534, "learning_rate": 8.388506743347804e-06, "loss": 0.03852434, "memory(GiB)": 26.31, "step": 175, "train_speed(iter/s)": 0.568766 }, { "acc": 0.98648386, "epoch": 0.19088016967126192, "grad_norm": 1.1042882204055786, "learning_rate": 8.434261125541812e-06, "loss": 0.0422026, "memory(GiB)": 26.31, "step": 180, "train_speed(iter/s)": 0.569335 }, { "acc": 0.98854618, "epoch": 0.19618239660657477, "grad_norm": 1.5004727840423584, "learning_rate": 8.478761803962609e-06, "loss": 0.03735459, "memory(GiB)": 26.31, "step": 185, "train_speed(iter/s)": 0.56973 }, { "acc": 0.98710842, "epoch": 0.2014846235418876, "grad_norm": 0.9797543883323669, "learning_rate": 8.52207565503242e-06, "loss": 0.03711139, "memory(GiB)": 26.31, "step": 190, "train_speed(iter/s)": 0.570088 }, { "acc": 0.98806124, "epoch": 0.2067868504772004, "grad_norm": 0.6779474020004272, "learning_rate": 8.564264342802955e-06, "loss": 0.03549838, "memory(GiB)": 26.31, "step": 195, "train_speed(iter/s)": 0.570432 }, { "acc": 0.9885643, "epoch": 0.21208907741251326, "grad_norm": 0.9963794350624084, "learning_rate": 8.605384846939095e-06, "loss": 0.0340589, "memory(GiB)": 26.31, "step": 200, "train_speed(iter/s)": 0.570722 }, { "acc": 0.98800755, "epoch": 0.21739130434782608, "grad_norm": 0.8053348660469055, "learning_rate": 8.645489925498207e-06, "loss": 0.03213402, "memory(GiB)": 26.31, "step": 205, "train_speed(iter/s)": 0.571029 }, { "acc": 0.98844814, "epoch": 0.22269353128313893, "grad_norm": 0.9053013920783997, "learning_rate": 8.684628521935642e-06, "loss": 0.03458236, "memory(GiB)": 26.31, "step": 210, "train_speed(iter/s)": 0.571289 }, { "acc": 0.98876429, "epoch": 0.22799575821845175, "grad_norm": 0.9718145728111267, "learning_rate": 8.722846124213128e-06, "loss": 0.03245977, "memory(GiB)": 26.31, "step": 215, "train_speed(iter/s)": 0.571635 }, { "acc": 0.9877038, "epoch": 0.23329798515376457, "grad_norm": 0.8481668829917908, "learning_rate": 8.760185082618737e-06, "loss": 0.03817012, "memory(GiB)": 26.31, "step": 220, "train_speed(iter/s)": 0.571873 }, { "acc": 0.98829918, "epoch": 0.23860021208907742, "grad_norm": 0.8947212100028992, "learning_rate": 8.796684891867187e-06, "loss": 0.03497087, "memory(GiB)": 26.31, "step": 225, "train_speed(iter/s)": 0.572187 }, { "acc": 0.98882122, "epoch": 0.24390243902439024, "grad_norm": 0.9148950576782227, "learning_rate": 8.83238244219216e-06, "loss": 0.03352001, "memory(GiB)": 26.31, "step": 230, "train_speed(iter/s)": 0.572426 }, { "acc": 0.9899559, "epoch": 0.2492046659597031, "grad_norm": 0.8286107182502747, "learning_rate": 8.867312243432854e-06, "loss": 0.02834421, "memory(GiB)": 26.31, "step": 235, "train_speed(iter/s)": 0.572807 }, { "acc": 0.98945627, "epoch": 0.2545068928950159, "grad_norm": 0.7633819580078125, "learning_rate": 8.901506625526934e-06, "loss": 0.03042876, "memory(GiB)": 26.31, "step": 240, "train_speed(iter/s)": 0.573113 }, { "acc": 0.98813782, "epoch": 0.2598091198303287, "grad_norm": 1.1705169677734375, "learning_rate": 8.934995918329474e-06, "loss": 0.03317465, "memory(GiB)": 26.31, "step": 245, "train_speed(iter/s)": 0.573356 }, { "acc": 0.98917551, "epoch": 0.2651113467656416, "grad_norm": 0.8647633194923401, "learning_rate": 8.967808613264469e-06, "loss": 0.03340689, "memory(GiB)": 26.31, "step": 250, "train_speed(iter/s)": 0.573591 }, { "acc": 0.98857431, "epoch": 0.2704135737009544, "grad_norm": 1.1097129583358765, "learning_rate": 8.999971508967931e-06, "loss": 0.03664353, "memory(GiB)": 26.31, "step": 255, "train_speed(iter/s)": 0.573787 }, { "acc": 0.98884392, "epoch": 0.2757158006362672, "grad_norm": 0.6874321103096008, "learning_rate": 9.031509842788075e-06, "loss": 0.02958034, "memory(GiB)": 26.31, "step": 260, "train_speed(iter/s)": 0.573963 }, { "acc": 0.9894619, "epoch": 0.28101802757158006, "grad_norm": 1.0628961324691772, "learning_rate": 9.062447409759295e-06, "loss": 0.02991056, "memory(GiB)": 26.31, "step": 265, "train_speed(iter/s)": 0.574138 }, { "acc": 0.98855505, "epoch": 0.2863202545068929, "grad_norm": 1.0344219207763672, "learning_rate": 9.092806670455026e-06, "loss": 0.03874294, "memory(GiB)": 26.31, "step": 270, "train_speed(iter/s)": 0.574331 }, { "acc": 0.98721523, "epoch": 0.2916224814422057, "grad_norm": 1.0998539924621582, "learning_rate": 9.122608848944111e-06, "loss": 0.03713042, "memory(GiB)": 26.31, "step": 275, "train_speed(iter/s)": 0.574549 }, { "acc": 0.98913212, "epoch": 0.29692470837751855, "grad_norm": 0.7507591247558594, "learning_rate": 9.151874021920764e-06, "loss": 0.03179551, "memory(GiB)": 26.31, "step": 280, "train_speed(iter/s)": 0.57478 }, { "acc": 0.98866091, "epoch": 0.3022269353128314, "grad_norm": 0.7731961607933044, "learning_rate": 9.180621199945635e-06, "loss": 0.03505718, "memory(GiB)": 26.31, "step": 285, "train_speed(iter/s)": 0.574926 }, { "acc": 0.9891468, "epoch": 0.3075291622481442, "grad_norm": 1.2755072116851807, "learning_rate": 9.208868401621302e-06, "loss": 0.0319423, "memory(GiB)": 26.31, "step": 290, "train_speed(iter/s)": 0.575064 }, { "acc": 0.98575668, "epoch": 0.31283138918345704, "grad_norm": 1.1947649717330933, "learning_rate": 9.236632721427041e-06, "loss": 0.04200953, "memory(GiB)": 26.31, "step": 295, "train_speed(iter/s)": 0.575254 }, { "acc": 0.98714046, "epoch": 0.3181336161187699, "grad_norm": 0.9724829196929932, "learning_rate": 9.263930391852308e-06, "loss": 0.03815084, "memory(GiB)": 26.31, "step": 300, "train_speed(iter/s)": 0.575407 }, { "acc": 0.99066544, "epoch": 0.32343584305408274, "grad_norm": 1.0233286619186401, "learning_rate": 9.290776840394537e-06, "loss": 0.0282487, "memory(GiB)": 26.31, "step": 305, "train_speed(iter/s)": 0.575641 }, { "acc": 0.989046, "epoch": 0.32873806998939553, "grad_norm": 0.9235794544219971, "learning_rate": 9.317186741922297e-06, "loss": 0.02938327, "memory(GiB)": 26.31, "step": 310, "train_speed(iter/s)": 0.575756 }, { "acc": 0.98994274, "epoch": 0.3340402969247084, "grad_norm": 0.9956262707710266, "learning_rate": 9.343174066848856e-06, "loss": 0.02921431, "memory(GiB)": 26.31, "step": 315, "train_speed(iter/s)": 0.575954 }, { "acc": 0.98916187, "epoch": 0.33934252386002123, "grad_norm": 1.35161292552948, "learning_rate": 9.368752125512056e-06, "loss": 0.0287899, "memory(GiB)": 26.31, "step": 320, "train_speed(iter/s)": 0.57606 }, { "acc": 0.98746319, "epoch": 0.344644750795334, "grad_norm": 1.062467098236084, "learning_rate": 9.39393360911345e-06, "loss": 0.03575181, "memory(GiB)": 26.31, "step": 325, "train_speed(iter/s)": 0.576176 }, { "acc": 0.98824463, "epoch": 0.34994697773064687, "grad_norm": 0.9505136609077454, "learning_rate": 9.41873062753195e-06, "loss": 0.03047816, "memory(GiB)": 26.31, "step": 330, "train_speed(iter/s)": 0.576314 }, { "acc": 0.9914609, "epoch": 0.3552492046659597, "grad_norm": 0.632502019405365, "learning_rate": 9.443154744293996e-06, "loss": 0.0260011, "memory(GiB)": 26.31, "step": 335, "train_speed(iter/s)": 0.576483 }, { "acc": 0.98916855, "epoch": 0.3605514316012725, "grad_norm": 1.034669280052185, "learning_rate": 9.467217008953053e-06, "loss": 0.03403499, "memory(GiB)": 26.31, "step": 340, "train_speed(iter/s)": 0.57658 }, { "acc": 0.98895273, "epoch": 0.36585365853658536, "grad_norm": 0.952576756477356, "learning_rate": 9.490927987105374e-06, "loss": 0.0317608, "memory(GiB)": 26.31, "step": 345, "train_speed(iter/s)": 0.576739 }, { "acc": 0.98866873, "epoch": 0.3711558854718982, "grad_norm": 0.8197150230407715, "learning_rate": 9.514297788246138e-06, "loss": 0.03127484, "memory(GiB)": 26.31, "step": 350, "train_speed(iter/s)": 0.576816 }, { "acc": 0.99059963, "epoch": 0.37645811240721105, "grad_norm": 0.825374960899353, "learning_rate": 9.537336091649749e-06, "loss": 0.02805768, "memory(GiB)": 26.31, "step": 355, "train_speed(iter/s)": 0.576892 }, { "acc": 0.98838682, "epoch": 0.38176033934252385, "grad_norm": 0.8780854940414429, "learning_rate": 9.560052170440148e-06, "loss": 0.03393146, "memory(GiB)": 26.31, "step": 360, "train_speed(iter/s)": 0.576976 }, { "acc": 0.99057932, "epoch": 0.3870625662778367, "grad_norm": 0.5594467520713806, "learning_rate": 9.582454914000955e-06, "loss": 0.02473587, "memory(GiB)": 26.31, "step": 365, "train_speed(iter/s)": 0.577044 }, { "acc": 0.99060621, "epoch": 0.39236479321314954, "grad_norm": 0.9039445519447327, "learning_rate": 9.604552848860942e-06, "loss": 0.02671352, "memory(GiB)": 26.31, "step": 370, "train_speed(iter/s)": 0.577136 }, { "acc": 0.98958836, "epoch": 0.39766702014846234, "grad_norm": 1.2218246459960938, "learning_rate": 9.626354158177683e-06, "loss": 0.03268912, "memory(GiB)": 26.31, "step": 375, "train_speed(iter/s)": 0.577268 }, { "acc": 0.98990612, "epoch": 0.4029692470837752, "grad_norm": 1.1614571809768677, "learning_rate": 9.647866699930756e-06, "loss": 0.02865369, "memory(GiB)": 26.31, "step": 380, "train_speed(iter/s)": 0.57737 }, { "acc": 0.99105186, "epoch": 0.40827147401908803, "grad_norm": 0.7430466413497925, "learning_rate": 9.669098023925782e-06, "loss": 0.02913875, "memory(GiB)": 26.31, "step": 385, "train_speed(iter/s)": 0.577492 }, { "acc": 0.9908083, "epoch": 0.4135737009544008, "grad_norm": 0.6815354228019714, "learning_rate": 9.690055387701289e-06, "loss": 0.02702239, "memory(GiB)": 26.31, "step": 390, "train_speed(iter/s)": 0.577623 }, { "acc": 0.98966265, "epoch": 0.4188759278897137, "grad_norm": 0.7089868783950806, "learning_rate": 9.710745771422355e-06, "loss": 0.02905208, "memory(GiB)": 26.31, "step": 395, "train_speed(iter/s)": 0.577742 }, { "acc": 0.99098186, "epoch": 0.4241781548250265, "grad_norm": 0.6910232901573181, "learning_rate": 9.731175891837428e-06, "loss": 0.02598582, "memory(GiB)": 26.31, "step": 400, "train_speed(iter/s)": 0.577853 }, { "acc": 0.98882151, "epoch": 0.42948038176033937, "grad_norm": 1.0936506986618042, "learning_rate": 9.751352215368239e-06, "loss": 0.03153276, "memory(GiB)": 26.31, "step": 405, "train_speed(iter/s)": 0.577953 }, { "acc": 0.99092979, "epoch": 0.43478260869565216, "grad_norm": 0.7421805262565613, "learning_rate": 9.771280970396543e-06, "loss": 0.02831423, "memory(GiB)": 26.31, "step": 410, "train_speed(iter/s)": 0.57803 }, { "acc": 0.98938055, "epoch": 0.440084835630965, "grad_norm": 0.9013299345970154, "learning_rate": 9.790968158806186e-06, "loss": 0.02966608, "memory(GiB)": 26.31, "step": 415, "train_speed(iter/s)": 0.578126 }, { "acc": 0.98750973, "epoch": 0.44538706256627786, "grad_norm": 0.8295337557792664, "learning_rate": 9.810419566833978e-06, "loss": 0.0346707, "memory(GiB)": 26.31, "step": 420, "train_speed(iter/s)": 0.578189 }, { "acc": 0.98876076, "epoch": 0.45068928950159065, "grad_norm": 0.7757723927497864, "learning_rate": 9.829640775278427e-06, "loss": 0.0301866, "memory(GiB)": 26.31, "step": 425, "train_speed(iter/s)": 0.578228 }, { "acc": 0.99074221, "epoch": 0.4559915164369035, "grad_norm": 0.9902310967445374, "learning_rate": 9.848637169111462e-06, "loss": 0.02895352, "memory(GiB)": 26.31, "step": 430, "train_speed(iter/s)": 0.578321 }, { "acc": 0.98980846, "epoch": 0.46129374337221635, "grad_norm": 0.916560709476471, "learning_rate": 9.867413946534518e-06, "loss": 0.030442, "memory(GiB)": 26.31, "step": 435, "train_speed(iter/s)": 0.578384 }, { "acc": 0.9902401, "epoch": 0.46659597030752914, "grad_norm": 0.7796283960342407, "learning_rate": 9.885976127517072e-06, "loss": 0.02879334, "memory(GiB)": 26.31, "step": 440, "train_speed(iter/s)": 0.578533 }, { "acc": 0.99136429, "epoch": 0.471898197242842, "grad_norm": 0.7088432908058167, "learning_rate": 9.904328561852786e-06, "loss": 0.0259961, "memory(GiB)": 26.31, "step": 445, "train_speed(iter/s)": 0.578598 }, { "acc": 0.99091301, "epoch": 0.47720042417815484, "grad_norm": 0.6826531887054443, "learning_rate": 9.922475936765522e-06, "loss": 0.02690938, "memory(GiB)": 26.31, "step": 450, "train_speed(iter/s)": 0.578648 }, { "acc": 0.99094028, "epoch": 0.48250265111346763, "grad_norm": 0.753305971622467, "learning_rate": 9.94042278409512e-06, "loss": 0.0271632, "memory(GiB)": 26.31, "step": 455, "train_speed(iter/s)": 0.578709 }, { "acc": 0.99208088, "epoch": 0.4878048780487805, "grad_norm": 0.7565575838088989, "learning_rate": 9.958173487090496e-06, "loss": 0.02474675, "memory(GiB)": 26.31, "step": 460, "train_speed(iter/s)": 0.578762 }, { "acc": 0.9922308, "epoch": 0.4931071049840933, "grad_norm": 0.5841274261474609, "learning_rate": 9.97573228683551e-06, "loss": 0.02355065, "memory(GiB)": 26.31, "step": 465, "train_speed(iter/s)": 0.578809 }, { "acc": 0.99055576, "epoch": 0.4984093319194062, "grad_norm": 0.6141323447227478, "learning_rate": 9.99310328833119e-06, "loss": 0.02669106, "memory(GiB)": 26.31, "step": 470, "train_speed(iter/s)": 0.578891 }, { "acc": 0.98941097, "epoch": 0.503711558854719, "grad_norm": 0.8238556385040283, "learning_rate": 9.99999723295211e-06, "loss": 0.02945265, "memory(GiB)": 26.31, "step": 475, "train_speed(iter/s)": 0.578983 }, { "acc": 0.99244938, "epoch": 0.5090137857900318, "grad_norm": 0.6722385287284851, "learning_rate": 9.999980323226098e-06, "loss": 0.02124254, "memory(GiB)": 26.31, "step": 480, "train_speed(iter/s)": 0.579061 }, { "acc": 0.99193668, "epoch": 0.5143160127253447, "grad_norm": 1.1237002611160278, "learning_rate": 9.999948041074835e-06, "loss": 0.02350252, "memory(GiB)": 26.31, "step": 485, "train_speed(iter/s)": 0.579118 }, { "acc": 0.99161243, "epoch": 0.5196182396606575, "grad_norm": 0.8993910551071167, "learning_rate": 9.99990038659758e-06, "loss": 0.02440767, "memory(GiB)": 26.31, "step": 490, "train_speed(iter/s)": 0.579193 }, { "acc": 0.99120369, "epoch": 0.5249204665959704, "grad_norm": 0.9947673678398132, "learning_rate": 9.999837359940859e-06, "loss": 0.02465511, "memory(GiB)": 26.31, "step": 495, "train_speed(iter/s)": 0.579226 }, { "acc": 0.99091749, "epoch": 0.5302226935312832, "grad_norm": 1.0027878284454346, "learning_rate": 9.999758961298472e-06, "loss": 0.02779818, "memory(GiB)": 26.31, "step": 500, "train_speed(iter/s)": 0.579296 }, { "acc": 0.99098568, "epoch": 0.5355249204665959, "grad_norm": 0.8108927607536316, "learning_rate": 9.999665190911476e-06, "loss": 0.02721919, "memory(GiB)": 26.31, "step": 505, "train_speed(iter/s)": 0.579374 }, { "acc": 0.99209585, "epoch": 0.5408271474019088, "grad_norm": 0.4550963044166565, "learning_rate": 9.999556049068198e-06, "loss": 0.02487307, "memory(GiB)": 26.31, "step": 510, "train_speed(iter/s)": 0.579448 }, { "acc": 0.99234419, "epoch": 0.5461293743372216, "grad_norm": 0.5903543829917908, "learning_rate": 9.999431536104226e-06, "loss": 0.02234256, "memory(GiB)": 26.31, "step": 515, "train_speed(iter/s)": 0.579509 }, { "acc": 0.99038715, "epoch": 0.5514316012725344, "grad_norm": 0.7893358469009399, "learning_rate": 9.999291652402414e-06, "loss": 0.02926616, "memory(GiB)": 26.31, "step": 520, "train_speed(iter/s)": 0.579546 }, { "acc": 0.99203892, "epoch": 0.5567338282078473, "grad_norm": 0.8482871055603027, "learning_rate": 9.999136398392877e-06, "loss": 0.02198004, "memory(GiB)": 26.31, "step": 525, "train_speed(iter/s)": 0.579597 }, { "acc": 0.99342957, "epoch": 0.5620360551431601, "grad_norm": 0.5417046546936035, "learning_rate": 9.998965774552995e-06, "loss": 0.02237912, "memory(GiB)": 26.31, "step": 530, "train_speed(iter/s)": 0.579673 }, { "acc": 0.99193487, "epoch": 0.5673382820784729, "grad_norm": 0.5985880494117737, "learning_rate": 9.998779781407395e-06, "loss": 0.02462097, "memory(GiB)": 26.31, "step": 535, "train_speed(iter/s)": 0.579695 }, { "acc": 0.99162283, "epoch": 0.5726405090137858, "grad_norm": 0.7096518278121948, "learning_rate": 9.998578419527974e-06, "loss": 0.02340071, "memory(GiB)": 26.31, "step": 540, "train_speed(iter/s)": 0.579769 }, { "acc": 0.98985367, "epoch": 0.5779427359490986, "grad_norm": 1.0494897365570068, "learning_rate": 9.998361689533882e-06, "loss": 0.02991374, "memory(GiB)": 26.31, "step": 545, "train_speed(iter/s)": 0.57983 }, { "acc": 0.9917799, "epoch": 0.5832449628844114, "grad_norm": 0.8652428388595581, "learning_rate": 9.998129592091518e-06, "loss": 0.02398509, "memory(GiB)": 26.31, "step": 550, "train_speed(iter/s)": 0.579848 }, { "acc": 0.99160995, "epoch": 0.5885471898197243, "grad_norm": 0.9238954186439514, "learning_rate": 9.99788212791454e-06, "loss": 0.026893, "memory(GiB)": 26.31, "step": 555, "train_speed(iter/s)": 0.57992 }, { "acc": 0.98957453, "epoch": 0.5938494167550371, "grad_norm": 1.0519745349884033, "learning_rate": 9.997619297763849e-06, "loss": 0.03056088, "memory(GiB)": 26.31, "step": 560, "train_speed(iter/s)": 0.579957 }, { "acc": 0.99082956, "epoch": 0.5991516436903499, "grad_norm": 0.6974701285362244, "learning_rate": 9.997341102447595e-06, "loss": 0.02576918, "memory(GiB)": 26.31, "step": 565, "train_speed(iter/s)": 0.580013 }, { "acc": 0.99181623, "epoch": 0.6044538706256628, "grad_norm": 0.857319176197052, "learning_rate": 9.997047542821179e-06, "loss": 0.02497994, "memory(GiB)": 26.31, "step": 570, "train_speed(iter/s)": 0.58011 }, { "acc": 0.99321194, "epoch": 0.6097560975609756, "grad_norm": 0.6084680557250977, "learning_rate": 9.996738619787236e-06, "loss": 0.02231183, "memory(GiB)": 26.31, "step": 575, "train_speed(iter/s)": 0.580164 }, { "acc": 0.99226446, "epoch": 0.6150583244962884, "grad_norm": 0.6827586889266968, "learning_rate": 9.996414334295644e-06, "loss": 0.02310744, "memory(GiB)": 26.31, "step": 580, "train_speed(iter/s)": 0.58019 }, { "acc": 0.9927886, "epoch": 0.6203605514316013, "grad_norm": 0.4883098006248474, "learning_rate": 9.99607468734352e-06, "loss": 0.02214252, "memory(GiB)": 26.31, "step": 585, "train_speed(iter/s)": 0.580207 }, { "acc": 0.99254446, "epoch": 0.6256627783669141, "grad_norm": 0.8108164668083191, "learning_rate": 9.995719679975209e-06, "loss": 0.0238646, "memory(GiB)": 26.31, "step": 590, "train_speed(iter/s)": 0.580266 }, { "acc": 0.99094467, "epoch": 0.630965005302227, "grad_norm": 0.665600061416626, "learning_rate": 9.995349313282291e-06, "loss": 0.02722621, "memory(GiB)": 26.31, "step": 595, "train_speed(iter/s)": 0.5803 }, { "acc": 0.99339981, "epoch": 0.6362672322375398, "grad_norm": 0.5218392014503479, "learning_rate": 9.994963588403572e-06, "loss": 0.02030847, "memory(GiB)": 26.31, "step": 600, "train_speed(iter/s)": 0.580317 }, { "acc": 0.99143753, "epoch": 0.6415694591728526, "grad_norm": 0.6035894155502319, "learning_rate": 9.99456250652508e-06, "loss": 0.02466365, "memory(GiB)": 26.31, "step": 605, "train_speed(iter/s)": 0.580325 }, { "acc": 0.99280758, "epoch": 0.6468716861081655, "grad_norm": 0.6592385768890381, "learning_rate": 9.994146068880061e-06, "loss": 0.02078036, "memory(GiB)": 26.31, "step": 610, "train_speed(iter/s)": 0.580376 }, { "acc": 0.99108791, "epoch": 0.6521739130434783, "grad_norm": 0.6236558556556702, "learning_rate": 9.993714276748982e-06, "loss": 0.02436172, "memory(GiB)": 26.31, "step": 615, "train_speed(iter/s)": 0.580431 }, { "acc": 0.9920083, "epoch": 0.6574761399787911, "grad_norm": 0.47285163402557373, "learning_rate": 9.993267131459518e-06, "loss": 0.02266829, "memory(GiB)": 26.31, "step": 620, "train_speed(iter/s)": 0.580493 }, { "acc": 0.99268894, "epoch": 0.662778366914104, "grad_norm": 0.7881173491477966, "learning_rate": 9.992804634386555e-06, "loss": 0.01983478, "memory(GiB)": 26.31, "step": 625, "train_speed(iter/s)": 0.580548 }, { "acc": 0.99176426, "epoch": 0.6680805938494168, "grad_norm": 0.8369566798210144, "learning_rate": 9.992326786952182e-06, "loss": 0.0240339, "memory(GiB)": 26.31, "step": 630, "train_speed(iter/s)": 0.58057 }, { "acc": 0.99282837, "epoch": 0.6733828207847296, "grad_norm": 0.9245002269744873, "learning_rate": 9.991833590625683e-06, "loss": 0.02064004, "memory(GiB)": 26.31, "step": 635, "train_speed(iter/s)": 0.58059 }, { "acc": 0.99282904, "epoch": 0.6786850477200425, "grad_norm": 0.6197252869606018, "learning_rate": 9.991325046923544e-06, "loss": 0.02151935, "memory(GiB)": 26.31, "step": 640, "train_speed(iter/s)": 0.580615 }, { "acc": 0.99258642, "epoch": 0.6839872746553552, "grad_norm": 0.6860659122467041, "learning_rate": 9.990801157409434e-06, "loss": 0.02141871, "memory(GiB)": 26.31, "step": 645, "train_speed(iter/s)": 0.580643 }, { "acc": 0.99289961, "epoch": 0.689289501590668, "grad_norm": 0.6437894105911255, "learning_rate": 9.990261923694215e-06, "loss": 0.02073102, "memory(GiB)": 26.31, "step": 650, "train_speed(iter/s)": 0.58073 }, { "acc": 0.99290047, "epoch": 0.694591728525981, "grad_norm": 0.5864006876945496, "learning_rate": 9.989707347435921e-06, "loss": 0.01984381, "memory(GiB)": 26.31, "step": 655, "train_speed(iter/s)": 0.580734 }, { "acc": 0.99156342, "epoch": 0.6998939554612937, "grad_norm": 0.7833142280578613, "learning_rate": 9.98913743033977e-06, "loss": 0.02350827, "memory(GiB)": 26.31, "step": 660, "train_speed(iter/s)": 0.58077 }, { "acc": 0.99425297, "epoch": 0.7051961823966065, "grad_norm": 0.7044891119003296, "learning_rate": 9.988552174158141e-06, "loss": 0.01863719, "memory(GiB)": 26.31, "step": 665, "train_speed(iter/s)": 0.580791 }, { "acc": 0.99157066, "epoch": 0.7104984093319194, "grad_norm": 0.7697265148162842, "learning_rate": 9.987951580690585e-06, "loss": 0.02315737, "memory(GiB)": 26.31, "step": 670, "train_speed(iter/s)": 0.580838 }, { "acc": 0.99336653, "epoch": 0.7158006362672322, "grad_norm": 0.626921534538269, "learning_rate": 9.987335651783809e-06, "loss": 0.01875867, "memory(GiB)": 26.31, "step": 675, "train_speed(iter/s)": 0.580889 }, { "acc": 0.99421749, "epoch": 0.721102863202545, "grad_norm": 0.6633844375610352, "learning_rate": 9.986704389331675e-06, "loss": 0.01869861, "memory(GiB)": 26.31, "step": 680, "train_speed(iter/s)": 0.58092 }, { "acc": 0.99101582, "epoch": 0.7264050901378579, "grad_norm": 0.7727330327033997, "learning_rate": 9.986057795275192e-06, "loss": 0.02501288, "memory(GiB)": 26.31, "step": 685, "train_speed(iter/s)": 0.580941 }, { "acc": 0.99241905, "epoch": 0.7317073170731707, "grad_norm": 0.5293135643005371, "learning_rate": 9.98539587160251e-06, "loss": 0.02406199, "memory(GiB)": 26.31, "step": 690, "train_speed(iter/s)": 0.580949 }, { "acc": 0.99096756, "epoch": 0.7370095440084835, "grad_norm": 0.7288177013397217, "learning_rate": 9.984718620348913e-06, "loss": 0.0244829, "memory(GiB)": 26.31, "step": 695, "train_speed(iter/s)": 0.580994 }, { "acc": 0.99141579, "epoch": 0.7423117709437964, "grad_norm": 0.7569153308868408, "learning_rate": 9.984026043596819e-06, "loss": 0.02278415, "memory(GiB)": 26.31, "step": 700, "train_speed(iter/s)": 0.581045 }, { "acc": 0.9932703, "epoch": 0.7476139978791092, "grad_norm": 0.6304537057876587, "learning_rate": 9.983318143475762e-06, "loss": 0.01978764, "memory(GiB)": 26.31, "step": 705, "train_speed(iter/s)": 0.58107 }, { "acc": 0.99194145, "epoch": 0.7529162248144221, "grad_norm": 0.5985568165779114, "learning_rate": 9.982594922162403e-06, "loss": 0.02452516, "memory(GiB)": 26.31, "step": 710, "train_speed(iter/s)": 0.581118 }, { "acc": 0.99371014, "epoch": 0.7582184517497349, "grad_norm": 0.6595653891563416, "learning_rate": 9.981856381880504e-06, "loss": 0.01932598, "memory(GiB)": 26.31, "step": 715, "train_speed(iter/s)": 0.581125 }, { "acc": 0.99341488, "epoch": 0.7635206786850477, "grad_norm": 0.7101230025291443, "learning_rate": 9.981102524900929e-06, "loss": 0.02069145, "memory(GiB)": 26.31, "step": 720, "train_speed(iter/s)": 0.581142 }, { "acc": 0.99183006, "epoch": 0.7688229056203606, "grad_norm": 0.6250572800636292, "learning_rate": 9.98033335354164e-06, "loss": 0.02185034, "memory(GiB)": 26.31, "step": 725, "train_speed(iter/s)": 0.58116 }, { "acc": 0.99280148, "epoch": 0.7741251325556734, "grad_norm": 0.6395146250724792, "learning_rate": 9.979548870167695e-06, "loss": 0.02104771, "memory(GiB)": 26.31, "step": 730, "train_speed(iter/s)": 0.581173 }, { "acc": 0.99531593, "epoch": 0.7794273594909862, "grad_norm": 0.5320297479629517, "learning_rate": 9.978749077191223e-06, "loss": 0.01694808, "memory(GiB)": 26.31, "step": 735, "train_speed(iter/s)": 0.581213 }, { "acc": 0.99354649, "epoch": 0.7847295864262991, "grad_norm": 0.6778517961502075, "learning_rate": 9.977933977071433e-06, "loss": 0.01976324, "memory(GiB)": 26.31, "step": 740, "train_speed(iter/s)": 0.581244 }, { "acc": 0.99129896, "epoch": 0.7900318133616119, "grad_norm": 0.8144289255142212, "learning_rate": 9.977103572314595e-06, "loss": 0.02646843, "memory(GiB)": 26.31, "step": 745, "train_speed(iter/s)": 0.581258 }, { "acc": 0.99141903, "epoch": 0.7953340402969247, "grad_norm": 0.6856284141540527, "learning_rate": 9.976257865474044e-06, "loss": 0.02530013, "memory(GiB)": 26.31, "step": 750, "train_speed(iter/s)": 0.581283 }, { "acc": 0.99295807, "epoch": 0.8006362672322376, "grad_norm": 0.5783557295799255, "learning_rate": 9.975396859150165e-06, "loss": 0.02089001, "memory(GiB)": 26.31, "step": 755, "train_speed(iter/s)": 0.581298 }, { "acc": 0.99298162, "epoch": 0.8059384941675504, "grad_norm": 0.681499719619751, "learning_rate": 9.97452055599038e-06, "loss": 0.02061177, "memory(GiB)": 26.31, "step": 760, "train_speed(iter/s)": 0.581304 }, { "acc": 0.99296398, "epoch": 0.8112407211028632, "grad_norm": 0.7560820579528809, "learning_rate": 9.973628958689153e-06, "loss": 0.0210947, "memory(GiB)": 26.31, "step": 765, "train_speed(iter/s)": 0.581369 }, { "acc": 0.99206982, "epoch": 0.8165429480381761, "grad_norm": 1.0038448572158813, "learning_rate": 9.972722069987973e-06, "loss": 0.02418317, "memory(GiB)": 26.31, "step": 770, "train_speed(iter/s)": 0.581388 }, { "acc": 0.99350224, "epoch": 0.8218451749734889, "grad_norm": 0.678901195526123, "learning_rate": 9.971799892675342e-06, "loss": 0.0195865, "memory(GiB)": 26.31, "step": 775, "train_speed(iter/s)": 0.581427 }, { "acc": 0.99451637, "epoch": 0.8271474019088016, "grad_norm": 0.688715934753418, "learning_rate": 9.970862429586775e-06, "loss": 0.01755214, "memory(GiB)": 26.31, "step": 780, "train_speed(iter/s)": 0.581463 }, { "acc": 0.99366951, "epoch": 0.8324496288441146, "grad_norm": 0.7547446489334106, "learning_rate": 9.969909683604791e-06, "loss": 0.01924557, "memory(GiB)": 26.31, "step": 785, "train_speed(iter/s)": 0.581476 }, { "acc": 0.99350204, "epoch": 0.8377518557794273, "grad_norm": 0.5072101950645447, "learning_rate": 9.968941657658897e-06, "loss": 0.01881273, "memory(GiB)": 26.31, "step": 790, "train_speed(iter/s)": 0.581485 }, { "acc": 0.99326372, "epoch": 0.8430540827147401, "grad_norm": 0.5862351059913635, "learning_rate": 9.96795835472558e-06, "loss": 0.01942059, "memory(GiB)": 26.31, "step": 795, "train_speed(iter/s)": 0.581506 }, { "acc": 0.99427719, "epoch": 0.848356309650053, "grad_norm": 0.6744386553764343, "learning_rate": 9.96695977782831e-06, "loss": 0.01748537, "memory(GiB)": 26.31, "step": 800, "train_speed(iter/s)": 0.581514 }, { "acc": 0.99367943, "epoch": 0.8536585365853658, "grad_norm": 0.45461753010749817, "learning_rate": 9.965945930037511e-06, "loss": 0.01755228, "memory(GiB)": 26.31, "step": 805, "train_speed(iter/s)": 0.581548 }, { "acc": 0.9931777, "epoch": 0.8589607635206787, "grad_norm": 0.6794818043708801, "learning_rate": 9.96491681447057e-06, "loss": 0.02256282, "memory(GiB)": 26.31, "step": 810, "train_speed(iter/s)": 0.581579 }, { "acc": 0.99288502, "epoch": 0.8642629904559915, "grad_norm": 0.6164844036102295, "learning_rate": 9.963872434291817e-06, "loss": 0.02206081, "memory(GiB)": 26.31, "step": 815, "train_speed(iter/s)": 0.581636 }, { "acc": 0.99430141, "epoch": 0.8695652173913043, "grad_norm": 0.5261210799217224, "learning_rate": 9.962812792712513e-06, "loss": 0.01857843, "memory(GiB)": 26.31, "step": 820, "train_speed(iter/s)": 0.581676 }, { "acc": 0.99293432, "epoch": 0.8748674443266172, "grad_norm": 0.610313892364502, "learning_rate": 9.96173789299085e-06, "loss": 0.01938998, "memory(GiB)": 26.31, "step": 825, "train_speed(iter/s)": 0.581676 }, { "acc": 0.9944231, "epoch": 0.88016967126193, "grad_norm": 0.6256194710731506, "learning_rate": 9.960647738431939e-06, "loss": 0.01734181, "memory(GiB)": 26.31, "step": 830, "train_speed(iter/s)": 0.581682 }, { "acc": 0.99155655, "epoch": 0.8854718981972428, "grad_norm": 1.561480164527893, "learning_rate": 9.959542332387785e-06, "loss": 0.02328325, "memory(GiB)": 26.31, "step": 835, "train_speed(iter/s)": 0.581692 }, { "acc": 0.99236937, "epoch": 0.8907741251325557, "grad_norm": 0.8149193525314331, "learning_rate": 9.958421678257304e-06, "loss": 0.02260731, "memory(GiB)": 26.31, "step": 840, "train_speed(iter/s)": 0.581707 }, { "acc": 0.99417439, "epoch": 0.8960763520678685, "grad_norm": 0.5183711647987366, "learning_rate": 9.95728577948628e-06, "loss": 0.01935203, "memory(GiB)": 26.31, "step": 845, "train_speed(iter/s)": 0.581739 }, { "acc": 0.99458427, "epoch": 0.9013785790031813, "grad_norm": 0.5166410207748413, "learning_rate": 9.956134639567388e-06, "loss": 0.01900076, "memory(GiB)": 26.31, "step": 850, "train_speed(iter/s)": 0.581748 }, { "acc": 0.99384928, "epoch": 0.9066808059384942, "grad_norm": 0.7617852687835693, "learning_rate": 9.954968262040152e-06, "loss": 0.02062955, "memory(GiB)": 26.31, "step": 855, "train_speed(iter/s)": 0.581761 }, { "acc": 0.99329472, "epoch": 0.911983032873807, "grad_norm": 0.5276218056678772, "learning_rate": 9.953786650490957e-06, "loss": 0.01935146, "memory(GiB)": 26.31, "step": 860, "train_speed(iter/s)": 0.581771 }, { "acc": 0.99199495, "epoch": 0.9172852598091198, "grad_norm": 1.0328503847122192, "learning_rate": 9.952589808553028e-06, "loss": 0.02057788, "memory(GiB)": 26.31, "step": 865, "train_speed(iter/s)": 0.5818 }, { "acc": 0.99147606, "epoch": 0.9225874867444327, "grad_norm": 0.6767083406448364, "learning_rate": 9.951377739906422e-06, "loss": 0.02531357, "memory(GiB)": 26.31, "step": 870, "train_speed(iter/s)": 0.581807 }, { "acc": 0.99108963, "epoch": 0.9278897136797455, "grad_norm": 0.8326674103736877, "learning_rate": 9.95015044827801e-06, "loss": 0.02297047, "memory(GiB)": 26.31, "step": 875, "train_speed(iter/s)": 0.581807 }, { "acc": 0.99454174, "epoch": 0.9331919406150583, "grad_norm": 0.5634021162986755, "learning_rate": 9.948907937441476e-06, "loss": 0.01865164, "memory(GiB)": 26.31, "step": 880, "train_speed(iter/s)": 0.581816 }, { "acc": 0.99318018, "epoch": 0.9384941675503712, "grad_norm": 0.5036848783493042, "learning_rate": 9.947650211217297e-06, "loss": 0.02226395, "memory(GiB)": 26.31, "step": 885, "train_speed(iter/s)": 0.581851 }, { "acc": 0.99398918, "epoch": 0.943796394485684, "grad_norm": 0.5968469977378845, "learning_rate": 9.946377273472736e-06, "loss": 0.01695271, "memory(GiB)": 26.31, "step": 890, "train_speed(iter/s)": 0.581867 }, { "acc": 0.99425325, "epoch": 0.9490986214209968, "grad_norm": 0.7831963300704956, "learning_rate": 9.945089128121828e-06, "loss": 0.01738757, "memory(GiB)": 26.31, "step": 895, "train_speed(iter/s)": 0.581866 }, { "acc": 0.99221354, "epoch": 0.9544008483563097, "grad_norm": 0.950468122959137, "learning_rate": 9.943785779125367e-06, "loss": 0.02397622, "memory(GiB)": 26.31, "step": 900, "train_speed(iter/s)": 0.581868 }, { "acc": 0.99435406, "epoch": 0.9597030752916225, "grad_norm": 0.6189500093460083, "learning_rate": 9.942467230490899e-06, "loss": 0.01756911, "memory(GiB)": 26.31, "step": 905, "train_speed(iter/s)": 0.581878 }, { "acc": 0.99479847, "epoch": 0.9650053022269353, "grad_norm": 0.6108078956604004, "learning_rate": 9.941133486272702e-06, "loss": 0.01728452, "memory(GiB)": 26.31, "step": 910, "train_speed(iter/s)": 0.581885 }, { "acc": 0.99450102, "epoch": 0.9703075291622482, "grad_norm": 0.6411594748497009, "learning_rate": 9.939784550571779e-06, "loss": 0.017101, "memory(GiB)": 26.31, "step": 915, "train_speed(iter/s)": 0.581895 }, { "acc": 0.99435349, "epoch": 0.975609756097561, "grad_norm": 0.9291826486587524, "learning_rate": 9.938420427535842e-06, "loss": 0.01866589, "memory(GiB)": 26.31, "step": 920, "train_speed(iter/s)": 0.581901 }, { "acc": 0.99345455, "epoch": 0.9809119830328739, "grad_norm": 0.9071804881095886, "learning_rate": 9.937041121359307e-06, "loss": 0.01875674, "memory(GiB)": 26.31, "step": 925, "train_speed(iter/s)": 0.581908 }, { "acc": 0.99373541, "epoch": 0.9862142099681867, "grad_norm": 0.8860246539115906, "learning_rate": 9.935646636283267e-06, "loss": 0.01871233, "memory(GiB)": 26.31, "step": 930, "train_speed(iter/s)": 0.581913 }, { "acc": 0.99323978, "epoch": 0.9915164369034994, "grad_norm": 0.9965890645980835, "learning_rate": 9.934236976595492e-06, "loss": 0.01928719, "memory(GiB)": 26.31, "step": 935, "train_speed(iter/s)": 0.581928 }, { "acc": 0.99340267, "epoch": 0.9968186638388123, "grad_norm": 0.6819751858711243, "learning_rate": 9.932812146630413e-06, "loss": 0.02095537, "memory(GiB)": 26.31, "step": 940, "train_speed(iter/s)": 0.581931 }, { "acc": 0.99613075, "epoch": 1.002120890774125, "grad_norm": 0.4798663854598999, "learning_rate": 9.9313721507691e-06, "loss": 0.0142905, "memory(GiB)": 26.31, "step": 945, "train_speed(iter/s)": 0.581588 }, { "acc": 0.99497509, "epoch": 1.007423117709438, "grad_norm": 0.6852810382843018, "learning_rate": 9.92991699343926e-06, "loss": 0.01566033, "memory(GiB)": 26.31, "step": 950, "train_speed(iter/s)": 0.581615 }, { "acc": 0.99398909, "epoch": 1.0127253446447508, "grad_norm": 0.5741665959358215, "learning_rate": 9.92844667911522e-06, "loss": 0.02031869, "memory(GiB)": 26.31, "step": 955, "train_speed(iter/s)": 0.58163 }, { "acc": 0.99311733, "epoch": 1.0180275715800637, "grad_norm": 0.5430195331573486, "learning_rate": 9.926961212317905e-06, "loss": 0.01968945, "memory(GiB)": 26.31, "step": 960, "train_speed(iter/s)": 0.581642 }, { "acc": 0.99392338, "epoch": 1.0233297985153764, "grad_norm": 0.8401893973350525, "learning_rate": 9.92546059761484e-06, "loss": 0.01952956, "memory(GiB)": 26.31, "step": 965, "train_speed(iter/s)": 0.581654 }, { "acc": 0.99450817, "epoch": 1.0286320254506893, "grad_norm": 0.4842678904533386, "learning_rate": 9.923944839620118e-06, "loss": 0.01760198, "memory(GiB)": 26.31, "step": 970, "train_speed(iter/s)": 0.581667 }, { "acc": 0.99244471, "epoch": 1.0339342523860022, "grad_norm": 0.6450461745262146, "learning_rate": 9.922413942994401e-06, "loss": 0.02271796, "memory(GiB)": 26.31, "step": 975, "train_speed(iter/s)": 0.581675 }, { "acc": 0.99532566, "epoch": 1.039236479321315, "grad_norm": 0.4986313581466675, "learning_rate": 9.920867912444895e-06, "loss": 0.01437508, "memory(GiB)": 26.31, "step": 980, "train_speed(iter/s)": 0.581702 }, { "acc": 0.99303761, "epoch": 1.0445387062566278, "grad_norm": 0.6543801426887512, "learning_rate": 9.919306752725346e-06, "loss": 0.02089486, "memory(GiB)": 26.31, "step": 985, "train_speed(iter/s)": 0.581733 }, { "acc": 0.9940731, "epoch": 1.0498409331919407, "grad_norm": 0.6764442920684814, "learning_rate": 9.917730468636012e-06, "loss": 0.02023424, "memory(GiB)": 26.31, "step": 990, "train_speed(iter/s)": 0.58174 }, { "acc": 0.99587164, "epoch": 1.0551431601272534, "grad_norm": 0.44897225499153137, "learning_rate": 9.916139065023656e-06, "loss": 0.0132737, "memory(GiB)": 26.31, "step": 995, "train_speed(iter/s)": 0.581747 }, { "acc": 0.99454727, "epoch": 1.0604453870625663, "grad_norm": 0.6791054010391235, "learning_rate": 9.914532546781538e-06, "loss": 0.01728434, "memory(GiB)": 26.31, "step": 1000, "train_speed(iter/s)": 0.581774 }, { "acc": 0.99382992, "epoch": 1.0657476139978792, "grad_norm": 0.4228080213069916, "learning_rate": 9.912910918849386e-06, "loss": 0.01803771, "memory(GiB)": 26.31, "step": 1005, "train_speed(iter/s)": 0.58178 }, { "acc": 0.99441757, "epoch": 1.0710498409331919, "grad_norm": 0.5636529326438904, "learning_rate": 9.911274186213388e-06, "loss": 0.0167443, "memory(GiB)": 26.31, "step": 1010, "train_speed(iter/s)": 0.581806 }, { "acc": 0.99366875, "epoch": 1.0763520678685048, "grad_norm": 0.6800065636634827, "learning_rate": 9.909622353906179e-06, "loss": 0.01885929, "memory(GiB)": 26.31, "step": 1015, "train_speed(iter/s)": 0.581833 }, { "acc": 0.99449482, "epoch": 1.0816542948038177, "grad_norm": 0.5581928491592407, "learning_rate": 9.90795542700682e-06, "loss": 0.01485443, "memory(GiB)": 26.31, "step": 1020, "train_speed(iter/s)": 0.581848 }, { "acc": 0.99461403, "epoch": 1.0869565217391304, "grad_norm": 0.7231691479682922, "learning_rate": 9.906273410640785e-06, "loss": 0.01806219, "memory(GiB)": 26.31, "step": 1025, "train_speed(iter/s)": 0.581856 }, { "acc": 0.99244728, "epoch": 1.0922587486744433, "grad_norm": 0.9682743549346924, "learning_rate": 9.904576309979945e-06, "loss": 0.02174127, "memory(GiB)": 26.31, "step": 1030, "train_speed(iter/s)": 0.581873 }, { "acc": 0.9933219, "epoch": 1.0975609756097562, "grad_norm": 0.752882719039917, "learning_rate": 9.902864130242557e-06, "loss": 0.02086806, "memory(GiB)": 26.31, "step": 1035, "train_speed(iter/s)": 0.581897 }, { "acc": 0.99312601, "epoch": 1.1028632025450689, "grad_norm": 0.7803872227668762, "learning_rate": 9.901136876693233e-06, "loss": 0.02002691, "memory(GiB)": 26.31, "step": 1040, "train_speed(iter/s)": 0.581919 }, { "acc": 0.99472342, "epoch": 1.1081654294803818, "grad_norm": 0.6384313106536865, "learning_rate": 9.899394554642943e-06, "loss": 0.01663477, "memory(GiB)": 26.31, "step": 1045, "train_speed(iter/s)": 0.581947 }, { "acc": 0.99375353, "epoch": 1.1134676564156947, "grad_norm": 0.68864506483078, "learning_rate": 9.897637169448988e-06, "loss": 0.01638348, "memory(GiB)": 26.31, "step": 1050, "train_speed(iter/s)": 0.581979 }, { "acc": 0.99457264, "epoch": 1.1187698833510074, "grad_norm": 0.4632731080055237, "learning_rate": 9.895864726514983e-06, "loss": 0.0175137, "memory(GiB)": 26.31, "step": 1055, "train_speed(iter/s)": 0.58199 }, { "acc": 0.99362888, "epoch": 1.1240721102863203, "grad_norm": 0.6945028305053711, "learning_rate": 9.894077231290846e-06, "loss": 0.01872665, "memory(GiB)": 26.31, "step": 1060, "train_speed(iter/s)": 0.582022 }, { "acc": 0.9930809, "epoch": 1.1293743372216332, "grad_norm": 0.6852729320526123, "learning_rate": 9.892274689272772e-06, "loss": 0.01868084, "memory(GiB)": 26.31, "step": 1065, "train_speed(iter/s)": 0.582048 }, { "acc": 0.99468307, "epoch": 1.1346765641569458, "grad_norm": 0.6329808831214905, "learning_rate": 9.890457106003228e-06, "loss": 0.01552589, "memory(GiB)": 26.31, "step": 1070, "train_speed(iter/s)": 0.582056 }, { "acc": 0.9942112, "epoch": 1.1399787910922587, "grad_norm": 0.587793231010437, "learning_rate": 9.888624487070926e-06, "loss": 0.0155957, "memory(GiB)": 26.31, "step": 1075, "train_speed(iter/s)": 0.58206 }, { "acc": 0.9943779, "epoch": 1.1452810180275717, "grad_norm": 0.8524949550628662, "learning_rate": 9.886776838110811e-06, "loss": 0.01897177, "memory(GiB)": 26.31, "step": 1080, "train_speed(iter/s)": 0.582084 }, { "acc": 0.99492826, "epoch": 1.1505832449628843, "grad_norm": 0.48728036880493164, "learning_rate": 9.884914164804047e-06, "loss": 0.01589368, "memory(GiB)": 26.31, "step": 1085, "train_speed(iter/s)": 0.582097 }, { "acc": 0.99354877, "epoch": 1.1558854718981972, "grad_norm": 0.4839894771575928, "learning_rate": 9.883036472877983e-06, "loss": 0.01641577, "memory(GiB)": 26.31, "step": 1090, "train_speed(iter/s)": 0.5821 }, { "acc": 0.9963026, "epoch": 1.1611876988335101, "grad_norm": 0.568143367767334, "learning_rate": 9.881143768106162e-06, "loss": 0.01254884, "memory(GiB)": 26.31, "step": 1095, "train_speed(iter/s)": 0.582105 }, { "acc": 0.99448452, "epoch": 1.1664899257688228, "grad_norm": 0.46727454662323, "learning_rate": 9.879236056308277e-06, "loss": 0.01576549, "memory(GiB)": 26.31, "step": 1100, "train_speed(iter/s)": 0.582132 }, { "acc": 0.99468699, "epoch": 1.1717921527041357, "grad_norm": 0.6475072503089905, "learning_rate": 9.877313343350169e-06, "loss": 0.01640804, "memory(GiB)": 26.31, "step": 1105, "train_speed(iter/s)": 0.582153 }, { "acc": 0.99498844, "epoch": 1.1770943796394486, "grad_norm": 0.7032859921455383, "learning_rate": 9.875375635143809e-06, "loss": 0.01677711, "memory(GiB)": 26.31, "step": 1110, "train_speed(iter/s)": 0.582153 }, { "acc": 0.99531984, "epoch": 1.1823966065747613, "grad_norm": 0.5358213782310486, "learning_rate": 9.873422937647266e-06, "loss": 0.0163477, "memory(GiB)": 26.31, "step": 1115, "train_speed(iter/s)": 0.582182 }, { "acc": 0.99534531, "epoch": 1.1876988335100742, "grad_norm": 0.7569906711578369, "learning_rate": 9.871455256864705e-06, "loss": 0.01311936, "memory(GiB)": 26.31, "step": 1120, "train_speed(iter/s)": 0.58219 }, { "acc": 0.99419842, "epoch": 1.1930010604453871, "grad_norm": 0.8755477666854858, "learning_rate": 9.869472598846362e-06, "loss": 0.01503182, "memory(GiB)": 26.31, "step": 1125, "train_speed(iter/s)": 0.582198 }, { "acc": 0.99561043, "epoch": 1.1983032873806998, "grad_norm": 0.6605581641197205, "learning_rate": 9.86747496968852e-06, "loss": 0.0120577, "memory(GiB)": 26.31, "step": 1130, "train_speed(iter/s)": 0.58222 }, { "acc": 0.99585924, "epoch": 1.2036055143160127, "grad_norm": 0.3325146436691284, "learning_rate": 9.8654623755335e-06, "loss": 0.01220306, "memory(GiB)": 26.31, "step": 1135, "train_speed(iter/s)": 0.582233 }, { "acc": 0.99469805, "epoch": 1.2089077412513256, "grad_norm": 0.49594518542289734, "learning_rate": 9.863434822569637e-06, "loss": 0.0143767, "memory(GiB)": 26.31, "step": 1140, "train_speed(iter/s)": 0.582272 }, { "acc": 0.99370193, "epoch": 1.2142099681866383, "grad_norm": 1.0842325687408447, "learning_rate": 9.861392317031256e-06, "loss": 0.01859829, "memory(GiB)": 26.31, "step": 1145, "train_speed(iter/s)": 0.582293 }, { "acc": 0.99477673, "epoch": 1.2195121951219512, "grad_norm": 0.6422792673110962, "learning_rate": 9.85933486519867e-06, "loss": 0.01558067, "memory(GiB)": 26.31, "step": 1150, "train_speed(iter/s)": 0.582306 }, { "acc": 0.9952467, "epoch": 1.224814422057264, "grad_norm": 0.3615686297416687, "learning_rate": 9.857262473398134e-06, "loss": 0.01394599, "memory(GiB)": 26.31, "step": 1155, "train_speed(iter/s)": 0.582312 }, { "acc": 0.99529438, "epoch": 1.2301166489925768, "grad_norm": 0.6838765740394592, "learning_rate": 9.855175148001852e-06, "loss": 0.01169058, "memory(GiB)": 26.31, "step": 1160, "train_speed(iter/s)": 0.582335 }, { "acc": 0.99526882, "epoch": 1.2354188759278897, "grad_norm": 0.6930041313171387, "learning_rate": 9.853072895427938e-06, "loss": 0.01289481, "memory(GiB)": 26.31, "step": 1165, "train_speed(iter/s)": 0.582338 }, { "acc": 0.99560337, "epoch": 1.2407211028632026, "grad_norm": 0.8965883255004883, "learning_rate": 9.850955722140412e-06, "loss": 0.01668053, "memory(GiB)": 26.31, "step": 1170, "train_speed(iter/s)": 0.582371 }, { "acc": 0.99516726, "epoch": 1.2460233297985153, "grad_norm": 0.8131476640701294, "learning_rate": 9.848823634649169e-06, "loss": 0.01300157, "memory(GiB)": 26.31, "step": 1175, "train_speed(iter/s)": 0.582381 }, { "acc": 0.99540739, "epoch": 1.2513255567338282, "grad_norm": 0.5490795373916626, "learning_rate": 9.846676639509958e-06, "loss": 0.01537903, "memory(GiB)": 26.31, "step": 1180, "train_speed(iter/s)": 0.582374 }, { "acc": 0.99454231, "epoch": 1.256627783669141, "grad_norm": 0.6122965812683105, "learning_rate": 9.84451474332437e-06, "loss": 0.01733764, "memory(GiB)": 26.31, "step": 1185, "train_speed(iter/s)": 0.582379 }, { "acc": 0.99378347, "epoch": 1.2619300106044538, "grad_norm": 0.6917413473129272, "learning_rate": 9.842337952739813e-06, "loss": 0.01898844, "memory(GiB)": 26.31, "step": 1190, "train_speed(iter/s)": 0.58238 }, { "acc": 0.99473305, "epoch": 1.2672322375397667, "grad_norm": 0.9137494564056396, "learning_rate": 9.840146274449497e-06, "loss": 0.01613488, "memory(GiB)": 26.31, "step": 1195, "train_speed(iter/s)": 0.582386 }, { "acc": 0.99495955, "epoch": 1.2725344644750796, "grad_norm": 0.492419570684433, "learning_rate": 9.8379397151924e-06, "loss": 0.01467134, "memory(GiB)": 26.31, "step": 1200, "train_speed(iter/s)": 0.58241 }, { "acc": 0.99571934, "epoch": 1.2778366914103922, "grad_norm": 0.6594977378845215, "learning_rate": 9.835718281753262e-06, "loss": 0.01280947, "memory(GiB)": 26.31, "step": 1205, "train_speed(iter/s)": 0.58241 }, { "acc": 0.99385624, "epoch": 1.2831389183457051, "grad_norm": 0.6085644364356995, "learning_rate": 9.833481980962557e-06, "loss": 0.01779164, "memory(GiB)": 26.31, "step": 1210, "train_speed(iter/s)": 0.582412 }, { "acc": 0.99450674, "epoch": 1.288441145281018, "grad_norm": 0.9028270840644836, "learning_rate": 9.831230819696474e-06, "loss": 0.01454319, "memory(GiB)": 26.31, "step": 1215, "train_speed(iter/s)": 0.582434 }, { "acc": 0.99494953, "epoch": 1.2937433722163307, "grad_norm": 0.6843631863594055, "learning_rate": 9.828964804876893e-06, "loss": 0.01487939, "memory(GiB)": 26.31, "step": 1220, "train_speed(iter/s)": 0.582442 }, { "acc": 0.99503832, "epoch": 1.2990455991516436, "grad_norm": 0.6850240230560303, "learning_rate": 9.826683943471366e-06, "loss": 0.01760384, "memory(GiB)": 26.31, "step": 1225, "train_speed(iter/s)": 0.582446 }, { "acc": 0.99546528, "epoch": 1.3043478260869565, "grad_norm": 0.7692059874534607, "learning_rate": 9.824388242493098e-06, "loss": 0.01505646, "memory(GiB)": 26.31, "step": 1230, "train_speed(iter/s)": 0.582466 }, { "acc": 0.99555531, "epoch": 1.3096500530222692, "grad_norm": 0.767185389995575, "learning_rate": 9.82207770900092e-06, "loss": 0.01339123, "memory(GiB)": 26.31, "step": 1235, "train_speed(iter/s)": 0.582466 }, { "acc": 0.99524298, "epoch": 1.3149522799575821, "grad_norm": 0.6795545220375061, "learning_rate": 9.81975235009927e-06, "loss": 0.01468817, "memory(GiB)": 26.31, "step": 1240, "train_speed(iter/s)": 0.582484 }, { "acc": 0.99549789, "epoch": 1.320254506892895, "grad_norm": 0.7110230922698975, "learning_rate": 9.817412172938176e-06, "loss": 0.01409793, "memory(GiB)": 26.31, "step": 1245, "train_speed(iter/s)": 0.582489 }, { "acc": 0.99565172, "epoch": 1.3255567338282077, "grad_norm": 0.5482975840568542, "learning_rate": 9.815057184713223e-06, "loss": 0.01393813, "memory(GiB)": 26.31, "step": 1250, "train_speed(iter/s)": 0.582524 }, { "acc": 0.99554529, "epoch": 1.3308589607635206, "grad_norm": 0.7029886245727539, "learning_rate": 9.81268739266554e-06, "loss": 0.01505921, "memory(GiB)": 26.31, "step": 1255, "train_speed(iter/s)": 0.582526 }, { "acc": 0.99403896, "epoch": 1.3361611876988335, "grad_norm": 0.9024369716644287, "learning_rate": 9.810302804081772e-06, "loss": 0.01528788, "memory(GiB)": 26.31, "step": 1260, "train_speed(iter/s)": 0.582528 }, { "acc": 0.99561853, "epoch": 1.3414634146341464, "grad_norm": 0.6582544445991516, "learning_rate": 9.807903426294067e-06, "loss": 0.01325701, "memory(GiB)": 26.31, "step": 1265, "train_speed(iter/s)": 0.58253 }, { "acc": 0.99610195, "epoch": 1.346765641569459, "grad_norm": 0.7300844192504883, "learning_rate": 9.805489266680042e-06, "loss": 0.01429899, "memory(GiB)": 26.31, "step": 1270, "train_speed(iter/s)": 0.582526 }, { "acc": 0.9962471, "epoch": 1.352067868504772, "grad_norm": 0.7025005221366882, "learning_rate": 9.803060332662764e-06, "loss": 0.01404113, "memory(GiB)": 26.31, "step": 1275, "train_speed(iter/s)": 0.582544 }, { "acc": 0.9944252, "epoch": 1.357370095440085, "grad_norm": 0.6766318678855896, "learning_rate": 9.800616631710734e-06, "loss": 0.01653153, "memory(GiB)": 26.31, "step": 1280, "train_speed(iter/s)": 0.582545 }, { "acc": 0.99595785, "epoch": 1.3626723223753976, "grad_norm": 0.738859236240387, "learning_rate": 9.798158171337852e-06, "loss": 0.01370402, "memory(GiB)": 26.31, "step": 1285, "train_speed(iter/s)": 0.58255 }, { "acc": 0.99550438, "epoch": 1.3679745493107105, "grad_norm": 0.5170459747314453, "learning_rate": 9.795684959103405e-06, "loss": 0.01245071, "memory(GiB)": 26.31, "step": 1290, "train_speed(iter/s)": 0.582555 }, { "acc": 0.99478798, "epoch": 1.3732767762460234, "grad_norm": 0.719942569732666, "learning_rate": 9.793197002612038e-06, "loss": 0.01572569, "memory(GiB)": 26.31, "step": 1295, "train_speed(iter/s)": 0.582558 }, { "acc": 0.9956728, "epoch": 1.378579003181336, "grad_norm": 0.553420901298523, "learning_rate": 9.790694309513728e-06, "loss": 0.0132667, "memory(GiB)": 26.31, "step": 1300, "train_speed(iter/s)": 0.582572 }, { "acc": 0.99500141, "epoch": 1.383881230116649, "grad_norm": 0.6300088763237, "learning_rate": 9.788176887503771e-06, "loss": 0.0131365, "memory(GiB)": 26.31, "step": 1305, "train_speed(iter/s)": 0.582576 }, { "acc": 0.99426498, "epoch": 1.389183457051962, "grad_norm": 0.5392923951148987, "learning_rate": 9.785644744322745e-06, "loss": 0.0165678, "memory(GiB)": 26.31, "step": 1310, "train_speed(iter/s)": 0.582594 }, { "acc": 0.99493856, "epoch": 1.3944856839872748, "grad_norm": 0.5953570604324341, "learning_rate": 9.783097887756497e-06, "loss": 0.01508854, "memory(GiB)": 26.31, "step": 1315, "train_speed(iter/s)": 0.582621 }, { "acc": 0.99446144, "epoch": 1.3997879109225875, "grad_norm": 0.6170870065689087, "learning_rate": 9.780536325636113e-06, "loss": 0.01575189, "memory(GiB)": 26.31, "step": 1320, "train_speed(iter/s)": 0.582638 }, { "acc": 0.99507885, "epoch": 1.4050901378579004, "grad_norm": 0.5795525908470154, "learning_rate": 9.777960065837898e-06, "loss": 0.01650278, "memory(GiB)": 26.31, "step": 1325, "train_speed(iter/s)": 0.582647 }, { "acc": 0.9949317, "epoch": 1.4103923647932133, "grad_norm": 0.986248254776001, "learning_rate": 9.775369116283346e-06, "loss": 0.01463662, "memory(GiB)": 26.31, "step": 1330, "train_speed(iter/s)": 0.582653 }, { "acc": 0.99627342, "epoch": 1.415694591728526, "grad_norm": 0.671968400478363, "learning_rate": 9.772763484939118e-06, "loss": 0.01474574, "memory(GiB)": 26.31, "step": 1335, "train_speed(iter/s)": 0.582666 }, { "acc": 0.99520779, "epoch": 1.4209968186638389, "grad_norm": 0.8155642151832581, "learning_rate": 9.770143179817025e-06, "loss": 0.01464276, "memory(GiB)": 26.31, "step": 1340, "train_speed(iter/s)": 0.582666 }, { "acc": 0.99607487, "epoch": 1.4262990455991518, "grad_norm": 0.46176499128341675, "learning_rate": 9.767508208973993e-06, "loss": 0.01274185, "memory(GiB)": 26.31, "step": 1345, "train_speed(iter/s)": 0.582676 }, { "acc": 0.99446783, "epoch": 1.4316012725344645, "grad_norm": 1.2167030572891235, "learning_rate": 9.76485858051204e-06, "loss": 0.01575762, "memory(GiB)": 26.31, "step": 1350, "train_speed(iter/s)": 0.582677 }, { "acc": 0.99542255, "epoch": 1.4369034994697774, "grad_norm": 0.3225151300430298, "learning_rate": 9.762194302578258e-06, "loss": 0.01336584, "memory(GiB)": 26.31, "step": 1355, "train_speed(iter/s)": 0.58268 }, { "acc": 0.99579563, "epoch": 1.4422057264050903, "grad_norm": 0.6517772078514099, "learning_rate": 9.759515383364782e-06, "loss": 0.01390352, "memory(GiB)": 26.31, "step": 1360, "train_speed(iter/s)": 0.582687 }, { "acc": 0.99631348, "epoch": 1.447507953340403, "grad_norm": 0.5232001543045044, "learning_rate": 9.756821831108764e-06, "loss": 0.01418541, "memory(GiB)": 26.31, "step": 1365, "train_speed(iter/s)": 0.582704 }, { "acc": 0.99409437, "epoch": 1.4528101802757158, "grad_norm": 0.4395295977592468, "learning_rate": 9.75411365409235e-06, "loss": 0.01683905, "memory(GiB)": 26.31, "step": 1370, "train_speed(iter/s)": 0.582716 }, { "acc": 0.99526634, "epoch": 1.4581124072110287, "grad_norm": 0.6830112338066101, "learning_rate": 9.751390860642655e-06, "loss": 0.01343714, "memory(GiB)": 26.31, "step": 1375, "train_speed(iter/s)": 0.58273 }, { "acc": 0.99617195, "epoch": 1.4634146341463414, "grad_norm": 0.3659854829311371, "learning_rate": 9.748653459131741e-06, "loss": 0.01204016, "memory(GiB)": 26.31, "step": 1380, "train_speed(iter/s)": 0.582732 }, { "acc": 0.99550266, "epoch": 1.4687168610816543, "grad_norm": 0.6669169068336487, "learning_rate": 9.745901457976578e-06, "loss": 0.01441504, "memory(GiB)": 26.31, "step": 1385, "train_speed(iter/s)": 0.582737 }, { "acc": 0.99465714, "epoch": 1.4740190880169672, "grad_norm": 0.7347469925880432, "learning_rate": 9.743134865639034e-06, "loss": 0.01655078, "memory(GiB)": 26.31, "step": 1390, "train_speed(iter/s)": 0.582751 }, { "acc": 0.99624844, "epoch": 1.47932131495228, "grad_norm": 0.9294884204864502, "learning_rate": 9.74035369062584e-06, "loss": 0.01310423, "memory(GiB)": 26.31, "step": 1395, "train_speed(iter/s)": 0.582783 }, { "acc": 0.99600592, "epoch": 1.4846235418875928, "grad_norm": 0.4734921455383301, "learning_rate": 9.737557941488565e-06, "loss": 0.01109402, "memory(GiB)": 26.31, "step": 1400, "train_speed(iter/s)": 0.582791 }, { "acc": 0.99517355, "epoch": 1.4899257688229057, "grad_norm": 0.7194894552230835, "learning_rate": 9.73474762682359e-06, "loss": 0.01321665, "memory(GiB)": 26.31, "step": 1405, "train_speed(iter/s)": 0.582798 }, { "acc": 0.99582138, "epoch": 1.4952279957582184, "grad_norm": 0.8972120881080627, "learning_rate": 9.73192275527209e-06, "loss": 0.01335516, "memory(GiB)": 26.31, "step": 1410, "train_speed(iter/s)": 0.5828 }, { "acc": 0.99488239, "epoch": 1.5005302226935313, "grad_norm": 0.7436479330062866, "learning_rate": 9.729083335519984e-06, "loss": 0.01636965, "memory(GiB)": 26.31, "step": 1415, "train_speed(iter/s)": 0.582804 }, { "acc": 0.99691572, "epoch": 1.5058324496288442, "grad_norm": 0.36048823595046997, "learning_rate": 9.72622937629794e-06, "loss": 0.01027709, "memory(GiB)": 26.31, "step": 1420, "train_speed(iter/s)": 0.582826 }, { "acc": 0.99606276, "epoch": 1.511134676564157, "grad_norm": 0.4885866641998291, "learning_rate": 9.723360886381322e-06, "loss": 0.01266043, "memory(GiB)": 26.31, "step": 1425, "train_speed(iter/s)": 0.582831 }, { "acc": 0.99566917, "epoch": 1.5164369034994698, "grad_norm": 0.5471593141555786, "learning_rate": 9.720477874590176e-06, "loss": 0.01241938, "memory(GiB)": 26.31, "step": 1430, "train_speed(iter/s)": 0.582847 }, { "acc": 0.99473476, "epoch": 1.5217391304347827, "grad_norm": 0.8035010099411011, "learning_rate": 9.717580349789203e-06, "loss": 0.01485032, "memory(GiB)": 26.31, "step": 1435, "train_speed(iter/s)": 0.582848 }, { "acc": 0.9948205, "epoch": 1.5270413573700954, "grad_norm": 0.811253547668457, "learning_rate": 9.714668320887722e-06, "loss": 0.01394672, "memory(GiB)": 26.31, "step": 1440, "train_speed(iter/s)": 0.582855 }, { "acc": 0.99572573, "epoch": 1.5323435843054083, "grad_norm": 0.8021854162216187, "learning_rate": 9.711741796839656e-06, "loss": 0.01230425, "memory(GiB)": 26.31, "step": 1445, "train_speed(iter/s)": 0.582857 }, { "acc": 0.99498787, "epoch": 1.5376458112407212, "grad_norm": 0.7353776097297668, "learning_rate": 9.70880078664349e-06, "loss": 0.01455704, "memory(GiB)": 26.31, "step": 1450, "train_speed(iter/s)": 0.582861 }, { "acc": 0.99538994, "epoch": 1.5429480381760339, "grad_norm": 0.5929186344146729, "learning_rate": 9.705845299342261e-06, "loss": 0.01419341, "memory(GiB)": 26.31, "step": 1455, "train_speed(iter/s)": 0.582874 }, { "acc": 0.99502916, "epoch": 1.5482502651113468, "grad_norm": 0.7457852959632874, "learning_rate": 9.70287534402351e-06, "loss": 0.01404118, "memory(GiB)": 26.31, "step": 1460, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99454365, "epoch": 1.5535524920466597, "grad_norm": 0.7059531211853027, "learning_rate": 9.699890929819277e-06, "loss": 0.01576061, "memory(GiB)": 26.31, "step": 1465, "train_speed(iter/s)": 0.582906 }, { "acc": 0.99530487, "epoch": 1.5588547189819724, "grad_norm": 0.594176709651947, "learning_rate": 9.696892065906045e-06, "loss": 0.01272766, "memory(GiB)": 26.31, "step": 1470, "train_speed(iter/s)": 0.582924 }, { "acc": 0.99706202, "epoch": 1.5641569459172853, "grad_norm": 0.45875370502471924, "learning_rate": 9.693878761504738e-06, "loss": 0.01053843, "memory(GiB)": 26.31, "step": 1475, "train_speed(iter/s)": 0.58292 }, { "acc": 0.99492579, "epoch": 1.5694591728525982, "grad_norm": 0.6784182786941528, "learning_rate": 9.690851025880677e-06, "loss": 0.01531172, "memory(GiB)": 26.31, "step": 1480, "train_speed(iter/s)": 0.582924 }, { "acc": 0.99470387, "epoch": 1.5747613997879109, "grad_norm": 0.7049443125724792, "learning_rate": 9.687808868343558e-06, "loss": 0.01651053, "memory(GiB)": 26.31, "step": 1485, "train_speed(iter/s)": 0.582923 }, { "acc": 0.99574337, "epoch": 1.5800636267232238, "grad_norm": 0.8059265613555908, "learning_rate": 9.684752298247424e-06, "loss": 0.01243878, "memory(GiB)": 26.31, "step": 1490, "train_speed(iter/s)": 0.582924 }, { "acc": 0.99399509, "epoch": 1.5853658536585367, "grad_norm": 0.9278422594070435, "learning_rate": 9.681681324990627e-06, "loss": 0.01400151, "memory(GiB)": 26.31, "step": 1495, "train_speed(iter/s)": 0.582942 }, { "acc": 0.99648762, "epoch": 1.5906680805938493, "grad_norm": 0.5391660928726196, "learning_rate": 9.678595958015809e-06, "loss": 0.01022374, "memory(GiB)": 26.31, "step": 1500, "train_speed(iter/s)": 0.582956 }, { "acc": 0.99651642, "epoch": 1.5959703075291622, "grad_norm": 0.7209609746932983, "learning_rate": 9.675496206809875e-06, "loss": 0.01008506, "memory(GiB)": 26.31, "step": 1505, "train_speed(iter/s)": 0.582961 }, { "acc": 0.99613867, "epoch": 1.6012725344644752, "grad_norm": 0.7365092039108276, "learning_rate": 9.672382080903952e-06, "loss": 0.01454763, "memory(GiB)": 26.31, "step": 1510, "train_speed(iter/s)": 0.582962 }, { "acc": 0.99595547, "epoch": 1.6065747613997878, "grad_norm": 0.7338438630104065, "learning_rate": 9.669253589873369e-06, "loss": 0.01647771, "memory(GiB)": 26.31, "step": 1515, "train_speed(iter/s)": 0.582973 }, { "acc": 0.99618711, "epoch": 1.6118769883351007, "grad_norm": 0.26463937759399414, "learning_rate": 9.666110743337625e-06, "loss": 0.01331544, "memory(GiB)": 26.31, "step": 1520, "train_speed(iter/s)": 0.582974 }, { "acc": 0.99598131, "epoch": 1.6171792152704136, "grad_norm": 0.49993637204170227, "learning_rate": 9.662953550960357e-06, "loss": 0.01459299, "memory(GiB)": 26.31, "step": 1525, "train_speed(iter/s)": 0.582978 }, { "acc": 0.99672394, "epoch": 1.6224814422057263, "grad_norm": 0.5060617923736572, "learning_rate": 9.659782022449317e-06, "loss": 0.01016317, "memory(GiB)": 26.31, "step": 1530, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99469109, "epoch": 1.6277836691410392, "grad_norm": 0.4732325077056885, "learning_rate": 9.656596167556335e-06, "loss": 0.01715919, "memory(GiB)": 26.31, "step": 1535, "train_speed(iter/s)": 0.583017 }, { "acc": 0.99556866, "epoch": 1.6330858960763521, "grad_norm": 0.7603917717933655, "learning_rate": 9.653395996077293e-06, "loss": 0.01328594, "memory(GiB)": 26.31, "step": 1540, "train_speed(iter/s)": 0.583006 }, { "acc": 0.99626541, "epoch": 1.6383881230116648, "grad_norm": 0.781606912612915, "learning_rate": 9.650181517852092e-06, "loss": 0.0128001, "memory(GiB)": 26.31, "step": 1545, "train_speed(iter/s)": 0.582998 }, { "acc": 0.99559555, "epoch": 1.6436903499469777, "grad_norm": 0.42572420835494995, "learning_rate": 9.646952742764624e-06, "loss": 0.01503129, "memory(GiB)": 26.31, "step": 1550, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99584656, "epoch": 1.6489925768822906, "grad_norm": 0.5383741855621338, "learning_rate": 9.643709680742746e-06, "loss": 0.01389532, "memory(GiB)": 26.31, "step": 1555, "train_speed(iter/s)": 0.582991 }, { "acc": 0.99579649, "epoch": 1.6542948038176033, "grad_norm": 0.6257635951042175, "learning_rate": 9.640452341758233e-06, "loss": 0.01323799, "memory(GiB)": 26.31, "step": 1560, "train_speed(iter/s)": 0.582982 }, { "acc": 0.99618187, "epoch": 1.6595970307529162, "grad_norm": 0.5762199759483337, "learning_rate": 9.637180735826771e-06, "loss": 0.01221062, "memory(GiB)": 26.31, "step": 1565, "train_speed(iter/s)": 0.582976 }, { "acc": 0.99726114, "epoch": 1.664899257688229, "grad_norm": 0.4736613929271698, "learning_rate": 9.633894873007907e-06, "loss": 0.00984205, "memory(GiB)": 26.31, "step": 1570, "train_speed(iter/s)": 0.58297 }, { "acc": 0.99648628, "epoch": 1.6702014846235418, "grad_norm": 0.7545337080955505, "learning_rate": 9.63059476340503e-06, "loss": 0.01090518, "memory(GiB)": 26.31, "step": 1575, "train_speed(iter/s)": 0.582973 }, { "acc": 0.99693813, "epoch": 1.6755037115588547, "grad_norm": 0.45600301027297974, "learning_rate": 9.62728041716533e-06, "loss": 0.0102604, "memory(GiB)": 26.31, "step": 1580, "train_speed(iter/s)": 0.582965 }, { "acc": 0.99703884, "epoch": 1.6808059384941676, "grad_norm": 0.5730128884315491, "learning_rate": 9.623951844479772e-06, "loss": 0.00968609, "memory(GiB)": 26.31, "step": 1585, "train_speed(iter/s)": 0.582958 }, { "acc": 0.99681854, "epoch": 1.6861081654294803, "grad_norm": 0.6165497303009033, "learning_rate": 9.620609055583071e-06, "loss": 0.01058234, "memory(GiB)": 26.31, "step": 1590, "train_speed(iter/s)": 0.582963 }, { "acc": 0.99705038, "epoch": 1.6914103923647932, "grad_norm": 0.5090743899345398, "learning_rate": 9.617252060753647e-06, "loss": 0.01027542, "memory(GiB)": 26.31, "step": 1595, "train_speed(iter/s)": 0.582956 }, { "acc": 0.99618645, "epoch": 1.696712619300106, "grad_norm": 0.7163627743721008, "learning_rate": 9.613880870313604e-06, "loss": 0.01235223, "memory(GiB)": 26.31, "step": 1600, "train_speed(iter/s)": 0.582941 }, { "acc": 0.997223, "epoch": 1.7020148462354188, "grad_norm": 0.5650729537010193, "learning_rate": 9.610495494628696e-06, "loss": 0.00963061, "memory(GiB)": 26.31, "step": 1605, "train_speed(iter/s)": 0.582941 }, { "acc": 0.99562531, "epoch": 1.7073170731707317, "grad_norm": 0.64713054895401, "learning_rate": 9.60709594410829e-06, "loss": 0.01414991, "memory(GiB)": 26.31, "step": 1610, "train_speed(iter/s)": 0.582936 }, { "acc": 0.996521, "epoch": 1.7126193001060446, "grad_norm": 0.7104651927947998, "learning_rate": 9.603682229205338e-06, "loss": 0.01142905, "memory(GiB)": 26.31, "step": 1615, "train_speed(iter/s)": 0.582941 }, { "acc": 0.99511213, "epoch": 1.7179215270413573, "grad_norm": 1.0390896797180176, "learning_rate": 9.600254360416347e-06, "loss": 0.0135205, "memory(GiB)": 26.31, "step": 1620, "train_speed(iter/s)": 0.582944 }, { "acc": 0.99557877, "epoch": 1.7232237539766702, "grad_norm": 0.7820794582366943, "learning_rate": 9.596812348281348e-06, "loss": 0.01177459, "memory(GiB)": 26.31, "step": 1625, "train_speed(iter/s)": 0.582937 }, { "acc": 0.99583693, "epoch": 1.728525980911983, "grad_norm": 0.8768166899681091, "learning_rate": 9.59335620338385e-06, "loss": 0.01301484, "memory(GiB)": 26.31, "step": 1630, "train_speed(iter/s)": 0.582927 }, { "acc": 0.99763937, "epoch": 1.7338282078472957, "grad_norm": 0.7672966718673706, "learning_rate": 9.589885936350828e-06, "loss": 0.00735577, "memory(GiB)": 26.31, "step": 1635, "train_speed(iter/s)": 0.582922 }, { "acc": 0.99700527, "epoch": 1.7391304347826086, "grad_norm": 0.5951688289642334, "learning_rate": 9.586401557852673e-06, "loss": 0.00939723, "memory(GiB)": 26.31, "step": 1640, "train_speed(iter/s)": 0.582916 }, { "acc": 0.99769516, "epoch": 1.7444326617179216, "grad_norm": 0.5226224660873413, "learning_rate": 9.58290307860317e-06, "loss": 0.00775344, "memory(GiB)": 26.31, "step": 1645, "train_speed(iter/s)": 0.582912 }, { "acc": 0.99687271, "epoch": 1.7497348886532342, "grad_norm": 0.6737238168716431, "learning_rate": 9.579390509359456e-06, "loss": 0.00943484, "memory(GiB)": 26.31, "step": 1650, "train_speed(iter/s)": 0.582911 }, { "acc": 0.99588261, "epoch": 1.7550371155885471, "grad_norm": 0.8879348635673523, "learning_rate": 9.575863860921995e-06, "loss": 0.01161455, "memory(GiB)": 26.31, "step": 1655, "train_speed(iter/s)": 0.582904 }, { "acc": 0.99591579, "epoch": 1.76033934252386, "grad_norm": 0.8889970183372498, "learning_rate": 9.572323144134546e-06, "loss": 0.01313305, "memory(GiB)": 26.31, "step": 1660, "train_speed(iter/s)": 0.582896 }, { "acc": 0.99647074, "epoch": 1.7656415694591727, "grad_norm": 0.691448986530304, "learning_rate": 9.568768369884119e-06, "loss": 0.01091637, "memory(GiB)": 26.31, "step": 1665, "train_speed(iter/s)": 0.58289 }, { "acc": 0.9959466, "epoch": 1.7709437963944858, "grad_norm": 0.752933144569397, "learning_rate": 9.565199549100948e-06, "loss": 0.01170037, "memory(GiB)": 26.31, "step": 1670, "train_speed(iter/s)": 0.58289 }, { "acc": 0.99625635, "epoch": 1.7762460233297985, "grad_norm": 0.7446765899658203, "learning_rate": 9.561616692758463e-06, "loss": 0.01202907, "memory(GiB)": 26.31, "step": 1675, "train_speed(iter/s)": 0.582879 }, { "acc": 0.996422, "epoch": 1.7815482502651112, "grad_norm": 0.6502768397331238, "learning_rate": 9.558019811873248e-06, "loss": 0.01007521, "memory(GiB)": 26.31, "step": 1680, "train_speed(iter/s)": 0.582874 }, { "acc": 0.99729595, "epoch": 1.7868504772004243, "grad_norm": 0.6278144121170044, "learning_rate": 9.554408917505007e-06, "loss": 0.00926385, "memory(GiB)": 26.31, "step": 1685, "train_speed(iter/s)": 0.582863 }, { "acc": 0.99589462, "epoch": 1.792152704135737, "grad_norm": 0.8441500663757324, "learning_rate": 9.550784020756535e-06, "loss": 0.01101158, "memory(GiB)": 26.31, "step": 1690, "train_speed(iter/s)": 0.582853 }, { "acc": 0.99660549, "epoch": 1.7974549310710497, "grad_norm": 0.6982602477073669, "learning_rate": 9.54714513277368e-06, "loss": 0.00878714, "memory(GiB)": 26.31, "step": 1695, "train_speed(iter/s)": 0.582859 }, { "acc": 0.99487324, "epoch": 1.8027571580063628, "grad_norm": 0.4580039083957672, "learning_rate": 9.543492264745314e-06, "loss": 0.0145345, "memory(GiB)": 26.31, "step": 1700, "train_speed(iter/s)": 0.582852 }, { "acc": 0.9955162, "epoch": 1.8080593849416755, "grad_norm": 0.47468751668930054, "learning_rate": 9.539825427903293e-06, "loss": 0.01137172, "memory(GiB)": 26.31, "step": 1705, "train_speed(iter/s)": 0.582862 }, { "acc": 0.99595766, "epoch": 1.8133616118769882, "grad_norm": 0.8541509509086609, "learning_rate": 9.536144633522422e-06, "loss": 0.01074421, "memory(GiB)": 26.31, "step": 1710, "train_speed(iter/s)": 0.582867 }, { "acc": 0.99637671, "epoch": 1.8186638388123013, "grad_norm": 0.4724918305873871, "learning_rate": 9.532449892920423e-06, "loss": 0.00988533, "memory(GiB)": 26.31, "step": 1715, "train_speed(iter/s)": 0.582875 }, { "acc": 0.99641857, "epoch": 1.823966065747614, "grad_norm": 0.584002673625946, "learning_rate": 9.528741217457906e-06, "loss": 0.0138078, "memory(GiB)": 26.31, "step": 1720, "train_speed(iter/s)": 0.582865 }, { "acc": 0.99653225, "epoch": 1.8292682926829267, "grad_norm": 0.477457195520401, "learning_rate": 9.525018618538319e-06, "loss": 0.00891152, "memory(GiB)": 26.31, "step": 1725, "train_speed(iter/s)": 0.582862 }, { "acc": 0.99620132, "epoch": 1.8345705196182398, "grad_norm": 0.57017982006073, "learning_rate": 9.52128210760793e-06, "loss": 0.0105824, "memory(GiB)": 26.31, "step": 1730, "train_speed(iter/s)": 0.582852 }, { "acc": 0.99704952, "epoch": 1.8398727465535525, "grad_norm": 0.32321634888648987, "learning_rate": 9.51753169615578e-06, "loss": 0.00954672, "memory(GiB)": 26.31, "step": 1735, "train_speed(iter/s)": 0.582854 }, { "acc": 0.99546967, "epoch": 1.8451749734888652, "grad_norm": 0.6538419127464294, "learning_rate": 9.513767395713647e-06, "loss": 0.01392893, "memory(GiB)": 26.31, "step": 1740, "train_speed(iter/s)": 0.582844 }, { "acc": 0.99725323, "epoch": 1.8504772004241783, "grad_norm": 0.5553358793258667, "learning_rate": 9.509989217856022e-06, "loss": 0.00883438, "memory(GiB)": 26.31, "step": 1745, "train_speed(iter/s)": 0.582835 }, { "acc": 0.99713745, "epoch": 1.855779427359491, "grad_norm": 0.5093122124671936, "learning_rate": 9.506197174200066e-06, "loss": 0.01078411, "memory(GiB)": 26.31, "step": 1750, "train_speed(iter/s)": 0.582837 }, { "acc": 0.99583797, "epoch": 1.8610816542948037, "grad_norm": 0.6216493844985962, "learning_rate": 9.502391276405571e-06, "loss": 0.01271906, "memory(GiB)": 26.31, "step": 1755, "train_speed(iter/s)": 0.582826 }, { "acc": 0.99700794, "epoch": 1.8663838812301168, "grad_norm": 0.9486095309257507, "learning_rate": 9.498571536174927e-06, "loss": 0.01122559, "memory(GiB)": 26.31, "step": 1760, "train_speed(iter/s)": 0.582819 }, { "acc": 0.99467316, "epoch": 1.8716861081654295, "grad_norm": 0.5736285448074341, "learning_rate": 9.494737965253094e-06, "loss": 0.01514575, "memory(GiB)": 26.31, "step": 1765, "train_speed(iter/s)": 0.582812 }, { "acc": 0.99623184, "epoch": 1.8769883351007424, "grad_norm": 0.7306117415428162, "learning_rate": 9.490890575427549e-06, "loss": 0.01168928, "memory(GiB)": 26.31, "step": 1770, "train_speed(iter/s)": 0.582805 }, { "acc": 0.99540997, "epoch": 1.8822905620360553, "grad_norm": 0.7581856846809387, "learning_rate": 9.487029378528265e-06, "loss": 0.01336176, "memory(GiB)": 26.31, "step": 1775, "train_speed(iter/s)": 0.582793 }, { "acc": 0.99567566, "epoch": 1.887592788971368, "grad_norm": 0.6911209225654602, "learning_rate": 9.483154386427669e-06, "loss": 0.01130526, "memory(GiB)": 26.31, "step": 1780, "train_speed(iter/s)": 0.582791 }, { "acc": 0.99613075, "epoch": 1.8928950159066809, "grad_norm": 0.535045325756073, "learning_rate": 9.479265611040605e-06, "loss": 0.01026231, "memory(GiB)": 26.31, "step": 1785, "train_speed(iter/s)": 0.582792 }, { "acc": 0.99551563, "epoch": 1.8981972428419938, "grad_norm": 0.626434326171875, "learning_rate": 9.475363064324295e-06, "loss": 0.01337331, "memory(GiB)": 26.31, "step": 1790, "train_speed(iter/s)": 0.582783 }, { "acc": 0.99523849, "epoch": 1.9034994697773064, "grad_norm": 0.62039715051651, "learning_rate": 9.47144675827831e-06, "loss": 0.01312898, "memory(GiB)": 26.31, "step": 1795, "train_speed(iter/s)": 0.582786 }, { "acc": 0.99659777, "epoch": 1.9088016967126193, "grad_norm": 0.5358000993728638, "learning_rate": 9.467516704944526e-06, "loss": 0.01025775, "memory(GiB)": 26.31, "step": 1800, "train_speed(iter/s)": 0.582775 }, { "acc": 0.99596386, "epoch": 1.9141039236479322, "grad_norm": 0.5743092894554138, "learning_rate": 9.46357291640709e-06, "loss": 0.01595644, "memory(GiB)": 26.31, "step": 1805, "train_speed(iter/s)": 0.582773 }, { "acc": 0.99663105, "epoch": 1.919406150583245, "grad_norm": 0.8163368105888367, "learning_rate": 9.459615404792381e-06, "loss": 0.01093026, "memory(GiB)": 26.31, "step": 1810, "train_speed(iter/s)": 0.582761 }, { "acc": 0.99598207, "epoch": 1.9247083775185578, "grad_norm": 0.8356859087944031, "learning_rate": 9.45564418226897e-06, "loss": 0.0133225, "memory(GiB)": 26.31, "step": 1815, "train_speed(iter/s)": 0.582743 }, { "acc": 0.99689674, "epoch": 1.9300106044538707, "grad_norm": 0.6908077597618103, "learning_rate": 9.451659261047595e-06, "loss": 0.00967747, "memory(GiB)": 26.31, "step": 1820, "train_speed(iter/s)": 0.582732 }, { "acc": 0.99649096, "epoch": 1.9353128313891834, "grad_norm": 0.3842003345489502, "learning_rate": 9.447660653381107e-06, "loss": 0.01140719, "memory(GiB)": 26.31, "step": 1825, "train_speed(iter/s)": 0.582738 }, { "acc": 0.99603977, "epoch": 1.9406150583244963, "grad_norm": 0.95283043384552, "learning_rate": 9.443648371564445e-06, "loss": 0.01253378, "memory(GiB)": 26.31, "step": 1830, "train_speed(iter/s)": 0.582727 }, { "acc": 0.99565907, "epoch": 1.9459172852598092, "grad_norm": 0.5776175856590271, "learning_rate": 9.439622427934594e-06, "loss": 0.01431614, "memory(GiB)": 26.31, "step": 1835, "train_speed(iter/s)": 0.582718 }, { "acc": 0.99552002, "epoch": 1.951219512195122, "grad_norm": 0.516409158706665, "learning_rate": 9.435582834870539e-06, "loss": 0.01414626, "memory(GiB)": 26.31, "step": 1840, "train_speed(iter/s)": 0.582723 }, { "acc": 0.99604492, "epoch": 1.9565217391304348, "grad_norm": 0.5727446675300598, "learning_rate": 9.431529604793246e-06, "loss": 0.01338579, "memory(GiB)": 26.31, "step": 1845, "train_speed(iter/s)": 0.582705 }, { "acc": 0.99671364, "epoch": 1.9618239660657477, "grad_norm": 0.5135867595672607, "learning_rate": 9.4274627501656e-06, "loss": 0.01212858, "memory(GiB)": 26.31, "step": 1850, "train_speed(iter/s)": 0.582702 }, { "acc": 0.99795332, "epoch": 1.9671261930010604, "grad_norm": 0.27841368317604065, "learning_rate": 9.423382283492386e-06, "loss": 0.00771015, "memory(GiB)": 26.31, "step": 1855, "train_speed(iter/s)": 0.582688 }, { "acc": 0.99660807, "epoch": 1.9724284199363733, "grad_norm": 0.5688284039497375, "learning_rate": 9.419288217320243e-06, "loss": 0.01114456, "memory(GiB)": 26.31, "step": 1860, "train_speed(iter/s)": 0.582679 }, { "acc": 0.99710732, "epoch": 1.9777306468716862, "grad_norm": 0.4894907772541046, "learning_rate": 9.415180564237623e-06, "loss": 0.00835708, "memory(GiB)": 26.31, "step": 1865, "train_speed(iter/s)": 0.582673 }, { "acc": 0.99726276, "epoch": 1.9830328738069989, "grad_norm": 0.8209954500198364, "learning_rate": 9.411059336874757e-06, "loss": 0.01041235, "memory(GiB)": 26.31, "step": 1870, "train_speed(iter/s)": 0.582657 }, { "acc": 0.99661264, "epoch": 1.9883351007423118, "grad_norm": 0.5984225869178772, "learning_rate": 9.406924547903615e-06, "loss": 0.01018832, "memory(GiB)": 26.31, "step": 1875, "train_speed(iter/s)": 0.582656 }, { "acc": 0.99723349, "epoch": 1.9936373276776247, "grad_norm": 0.7700032591819763, "learning_rate": 9.402776210037867e-06, "loss": 0.00970466, "memory(GiB)": 26.31, "step": 1880, "train_speed(iter/s)": 0.582653 }, { "acc": 0.99703884, "epoch": 1.9989395546129374, "grad_norm": 0.49653443694114685, "learning_rate": 9.398614336032837e-06, "loss": 0.00890055, "memory(GiB)": 26.31, "step": 1885, "train_speed(iter/s)": 0.582645 }, { "acc": 0.9972353, "epoch": 2.00424178154825, "grad_norm": 0.6007420420646667, "learning_rate": 9.394438938685476e-06, "loss": 0.00938985, "memory(GiB)": 26.31, "step": 1890, "train_speed(iter/s)": 0.582456 }, { "acc": 0.99653854, "epoch": 2.009544008483563, "grad_norm": 0.6522446274757385, "learning_rate": 9.390250030834319e-06, "loss": 0.0111994, "memory(GiB)": 26.31, "step": 1895, "train_speed(iter/s)": 0.582457 }, { "acc": 0.99756927, "epoch": 2.014846235418876, "grad_norm": 0.6940723061561584, "learning_rate": 9.386047625359436e-06, "loss": 0.00796779, "memory(GiB)": 26.31, "step": 1900, "train_speed(iter/s)": 0.582464 }, { "acc": 0.99708109, "epoch": 2.0201484623541885, "grad_norm": 0.5657925009727478, "learning_rate": 9.3818317351824e-06, "loss": 0.00889682, "memory(GiB)": 26.31, "step": 1905, "train_speed(iter/s)": 0.582458 }, { "acc": 0.99729185, "epoch": 2.0254506892895017, "grad_norm": 0.793789267539978, "learning_rate": 9.377602373266253e-06, "loss": 0.01040011, "memory(GiB)": 26.31, "step": 1910, "train_speed(iter/s)": 0.582452 }, { "acc": 0.99636059, "epoch": 2.0307529162248144, "grad_norm": 0.7435290813446045, "learning_rate": 9.373359552615459e-06, "loss": 0.009646, "memory(GiB)": 26.31, "step": 1915, "train_speed(iter/s)": 0.582452 }, { "acc": 0.99647522, "epoch": 2.0360551431601275, "grad_norm": 1.047269344329834, "learning_rate": 9.369103286275857e-06, "loss": 0.00896999, "memory(GiB)": 26.31, "step": 1920, "train_speed(iter/s)": 0.582465 }, { "acc": 0.99617004, "epoch": 2.04135737009544, "grad_norm": 0.503805935382843, "learning_rate": 9.36483358733464e-06, "loss": 0.01182005, "memory(GiB)": 26.31, "step": 1925, "train_speed(iter/s)": 0.582473 }, { "acc": 0.99728832, "epoch": 2.046659597030753, "grad_norm": 0.503113865852356, "learning_rate": 9.360550468920292e-06, "loss": 0.0090973, "memory(GiB)": 26.31, "step": 1930, "train_speed(iter/s)": 0.582476 }, { "acc": 0.99628162, "epoch": 2.0519618239660655, "grad_norm": 0.5395364761352539, "learning_rate": 9.35625394420257e-06, "loss": 0.01315861, "memory(GiB)": 26.31, "step": 1935, "train_speed(iter/s)": 0.582478 }, { "acc": 0.99678946, "epoch": 2.0572640509013786, "grad_norm": 0.8396658301353455, "learning_rate": 9.351944026392447e-06, "loss": 0.00879585, "memory(GiB)": 26.31, "step": 1940, "train_speed(iter/s)": 0.582479 }, { "acc": 0.99789181, "epoch": 2.0625662778366913, "grad_norm": 0.5394271016120911, "learning_rate": 9.347620728742079e-06, "loss": 0.00796552, "memory(GiB)": 26.31, "step": 1945, "train_speed(iter/s)": 0.582487 }, { "acc": 0.99663525, "epoch": 2.0678685047720045, "grad_norm": 0.47398996353149414, "learning_rate": 9.343284064544758e-06, "loss": 0.00746697, "memory(GiB)": 26.31, "step": 1950, "train_speed(iter/s)": 0.582499 }, { "acc": 0.99564629, "epoch": 2.073170731707317, "grad_norm": 0.7614686489105225, "learning_rate": 9.338934047134884e-06, "loss": 0.01439559, "memory(GiB)": 26.31, "step": 1955, "train_speed(iter/s)": 0.582505 }, { "acc": 0.99717464, "epoch": 2.07847295864263, "grad_norm": 0.4342188537120819, "learning_rate": 9.334570689887908e-06, "loss": 0.00976835, "memory(GiB)": 26.31, "step": 1960, "train_speed(iter/s)": 0.582507 }, { "acc": 0.99687204, "epoch": 2.083775185577943, "grad_norm": 0.42526519298553467, "learning_rate": 9.330194006220301e-06, "loss": 0.00783934, "memory(GiB)": 26.31, "step": 1965, "train_speed(iter/s)": 0.582505 }, { "acc": 0.99659252, "epoch": 2.0890774125132556, "grad_norm": 0.707332968711853, "learning_rate": 9.325804009589513e-06, "loss": 0.0122927, "memory(GiB)": 26.31, "step": 1970, "train_speed(iter/s)": 0.582508 }, { "acc": 0.9970993, "epoch": 2.0943796394485683, "grad_norm": 0.43426114320755005, "learning_rate": 9.32140071349392e-06, "loss": 0.00998023, "memory(GiB)": 26.31, "step": 1975, "train_speed(iter/s)": 0.58251 }, { "acc": 0.99600496, "epoch": 2.0996818663838814, "grad_norm": 0.5607444047927856, "learning_rate": 9.316984131472804e-06, "loss": 0.01285059, "memory(GiB)": 26.31, "step": 1980, "train_speed(iter/s)": 0.58253 }, { "acc": 0.99762526, "epoch": 2.104984093319194, "grad_norm": 0.4278039336204529, "learning_rate": 9.312554277106289e-06, "loss": 0.00749103, "memory(GiB)": 26.31, "step": 1985, "train_speed(iter/s)": 0.582541 }, { "acc": 0.99604216, "epoch": 2.110286320254507, "grad_norm": 0.5797408223152161, "learning_rate": 9.30811116401531e-06, "loss": 0.01161444, "memory(GiB)": 26.31, "step": 1990, "train_speed(iter/s)": 0.582544 }, { "acc": 0.9973856, "epoch": 2.11558854718982, "grad_norm": 0.6868969202041626, "learning_rate": 9.303654805861572e-06, "loss": 0.00661921, "memory(GiB)": 26.31, "step": 1995, "train_speed(iter/s)": 0.582553 }, { "acc": 0.99702368, "epoch": 2.1208907741251326, "grad_norm": 0.42404189705848694, "learning_rate": 9.29918521634751e-06, "loss": 0.008734, "memory(GiB)": 26.31, "step": 2000, "train_speed(iter/s)": 0.582562 }, { "acc": 0.99814854, "epoch": 2.1261930010604453, "grad_norm": 0.5553742051124573, "learning_rate": 9.294702409216235e-06, "loss": 0.00553756, "memory(GiB)": 26.31, "step": 2005, "train_speed(iter/s)": 0.582563 }, { "acc": 0.99660997, "epoch": 2.1314952279957584, "grad_norm": 0.7405137419700623, "learning_rate": 9.290206398251501e-06, "loss": 0.01015682, "memory(GiB)": 26.31, "step": 2010, "train_speed(iter/s)": 0.582562 }, { "acc": 0.99767761, "epoch": 2.136797454931071, "grad_norm": 1.012142300605774, "learning_rate": 9.285697197277668e-06, "loss": 0.00823362, "memory(GiB)": 26.31, "step": 2015, "train_speed(iter/s)": 0.582563 }, { "acc": 0.99828796, "epoch": 2.1420996818663838, "grad_norm": 0.6939220428466797, "learning_rate": 9.281174820159645e-06, "loss": 0.00619173, "memory(GiB)": 26.31, "step": 2020, "train_speed(iter/s)": 0.582563 }, { "acc": 0.99645348, "epoch": 2.147401908801697, "grad_norm": 0.8721736073493958, "learning_rate": 9.276639280802859e-06, "loss": 0.01071875, "memory(GiB)": 26.31, "step": 2025, "train_speed(iter/s)": 0.582584 }, { "acc": 0.9964325, "epoch": 2.1527041357370096, "grad_norm": 0.6826593279838562, "learning_rate": 9.272090593153209e-06, "loss": 0.01154263, "memory(GiB)": 26.31, "step": 2030, "train_speed(iter/s)": 0.582597 }, { "acc": 0.99698286, "epoch": 2.1580063626723223, "grad_norm": 0.5387014150619507, "learning_rate": 9.267528771197019e-06, "loss": 0.01052478, "memory(GiB)": 26.31, "step": 2035, "train_speed(iter/s)": 0.582604 }, { "acc": 0.99636288, "epoch": 2.1633085896076354, "grad_norm": 0.8106936812400818, "learning_rate": 9.262953828961e-06, "loss": 0.01369351, "memory(GiB)": 26.31, "step": 2040, "train_speed(iter/s)": 0.582603 }, { "acc": 0.99709806, "epoch": 2.168610816542948, "grad_norm": 0.48297539353370667, "learning_rate": 9.258365780512206e-06, "loss": 0.00937909, "memory(GiB)": 26.31, "step": 2045, "train_speed(iter/s)": 0.582612 }, { "acc": 0.99722157, "epoch": 2.1739130434782608, "grad_norm": 0.6017871499061584, "learning_rate": 9.25376463995799e-06, "loss": 0.00824606, "memory(GiB)": 26.31, "step": 2050, "train_speed(iter/s)": 0.58261 }, { "acc": 0.99659395, "epoch": 2.179215270413574, "grad_norm": 0.6640163064002991, "learning_rate": 9.249150421445962e-06, "loss": 0.01057127, "memory(GiB)": 26.31, "step": 2055, "train_speed(iter/s)": 0.582614 }, { "acc": 0.99606657, "epoch": 2.1845174973488866, "grad_norm": 0.718140184879303, "learning_rate": 9.24452313916394e-06, "loss": 0.01253057, "memory(GiB)": 26.31, "step": 2060, "train_speed(iter/s)": 0.582622 }, { "acc": 0.9973835, "epoch": 2.1898197242841992, "grad_norm": 0.20768648386001587, "learning_rate": 9.239882807339911e-06, "loss": 0.00806569, "memory(GiB)": 26.31, "step": 2065, "train_speed(iter/s)": 0.582621 }, { "acc": 0.9973218, "epoch": 2.1951219512195124, "grad_norm": 0.37041446566581726, "learning_rate": 9.235229440241993e-06, "loss": 0.00846872, "memory(GiB)": 26.31, "step": 2070, "train_speed(iter/s)": 0.582633 }, { "acc": 0.99718323, "epoch": 2.200424178154825, "grad_norm": 0.31240540742874146, "learning_rate": 9.23056305217838e-06, "loss": 0.00764921, "memory(GiB)": 26.31, "step": 2075, "train_speed(iter/s)": 0.582634 }, { "acc": 0.99602003, "epoch": 2.2057264050901377, "grad_norm": 0.5981764197349548, "learning_rate": 9.2258836574973e-06, "loss": 0.01258934, "memory(GiB)": 26.31, "step": 2080, "train_speed(iter/s)": 0.582646 }, { "acc": 0.99759808, "epoch": 2.211028632025451, "grad_norm": 0.5735823512077332, "learning_rate": 9.221191270586983e-06, "loss": 0.00761086, "memory(GiB)": 26.31, "step": 2085, "train_speed(iter/s)": 0.582657 }, { "acc": 0.99688892, "epoch": 2.2163308589607635, "grad_norm": 0.4943540692329407, "learning_rate": 9.216485905875599e-06, "loss": 0.00940942, "memory(GiB)": 26.31, "step": 2090, "train_speed(iter/s)": 0.582656 }, { "acc": 0.99643354, "epoch": 2.221633085896076, "grad_norm": 0.5726579427719116, "learning_rate": 9.211767577831225e-06, "loss": 0.01135425, "memory(GiB)": 26.31, "step": 2095, "train_speed(iter/s)": 0.582674 }, { "acc": 0.99656467, "epoch": 2.2269353128313893, "grad_norm": 0.6457775831222534, "learning_rate": 9.207036300961796e-06, "loss": 0.0108216, "memory(GiB)": 26.31, "step": 2100, "train_speed(iter/s)": 0.582683 }, { "acc": 0.99748325, "epoch": 2.232237539766702, "grad_norm": 0.8732754588127136, "learning_rate": 9.202292089815069e-06, "loss": 0.00855574, "memory(GiB)": 26.31, "step": 2105, "train_speed(iter/s)": 0.582682 }, { "acc": 0.99805489, "epoch": 2.2375397667020147, "grad_norm": 0.48590216040611267, "learning_rate": 9.197534958978562e-06, "loss": 0.00908014, "memory(GiB)": 26.31, "step": 2110, "train_speed(iter/s)": 0.582684 }, { "acc": 0.99843454, "epoch": 2.242841993637328, "grad_norm": 0.4169938564300537, "learning_rate": 9.192764923079526e-06, "loss": 0.0069149, "memory(GiB)": 26.31, "step": 2115, "train_speed(iter/s)": 0.582693 }, { "acc": 0.99805841, "epoch": 2.2481442205726405, "grad_norm": 0.6880154609680176, "learning_rate": 9.187981996784886e-06, "loss": 0.00760795, "memory(GiB)": 26.31, "step": 2120, "train_speed(iter/s)": 0.582693 }, { "acc": 0.99637814, "epoch": 2.253446447507953, "grad_norm": 0.6809533834457397, "learning_rate": 9.18318619480121e-06, "loss": 0.00879649, "memory(GiB)": 26.31, "step": 2125, "train_speed(iter/s)": 0.582695 }, { "acc": 0.99643812, "epoch": 2.2587486744432663, "grad_norm": 0.7628903388977051, "learning_rate": 9.17837753187465e-06, "loss": 0.0110072, "memory(GiB)": 26.31, "step": 2130, "train_speed(iter/s)": 0.582694 }, { "acc": 0.99702005, "epoch": 2.264050901378579, "grad_norm": 0.49134159088134766, "learning_rate": 9.173556022790905e-06, "loss": 0.01013456, "memory(GiB)": 26.31, "step": 2135, "train_speed(iter/s)": 0.582705 }, { "acc": 0.99699507, "epoch": 2.2693531283138917, "grad_norm": 0.553747832775116, "learning_rate": 9.168721682375173e-06, "loss": 0.00931663, "memory(GiB)": 26.31, "step": 2140, "train_speed(iter/s)": 0.582705 }, { "acc": 0.99674129, "epoch": 2.274655355249205, "grad_norm": 0.662503719329834, "learning_rate": 9.163874525492108e-06, "loss": 0.00786655, "memory(GiB)": 26.31, "step": 2145, "train_speed(iter/s)": 0.582705 }, { "acc": 0.9980854, "epoch": 2.2799575821845175, "grad_norm": 0.28207457065582275, "learning_rate": 9.15901456704577e-06, "loss": 0.00622527, "memory(GiB)": 26.31, "step": 2150, "train_speed(iter/s)": 0.582702 }, { "acc": 0.99700918, "epoch": 2.28525980911983, "grad_norm": 0.6629775166511536, "learning_rate": 9.15414182197958e-06, "loss": 0.00998119, "memory(GiB)": 26.31, "step": 2155, "train_speed(iter/s)": 0.582698 }, { "acc": 0.9979825, "epoch": 2.2905620360551433, "grad_norm": 0.43215715885162354, "learning_rate": 9.149256305276277e-06, "loss": 0.00786603, "memory(GiB)": 26.31, "step": 2160, "train_speed(iter/s)": 0.582692 }, { "acc": 0.99758244, "epoch": 2.295864262990456, "grad_norm": 0.487210750579834, "learning_rate": 9.144358031957872e-06, "loss": 0.00788484, "memory(GiB)": 26.31, "step": 2165, "train_speed(iter/s)": 0.582701 }, { "acc": 0.99730835, "epoch": 2.3011664899257687, "grad_norm": 0.3774445652961731, "learning_rate": 9.1394470170856e-06, "loss": 0.00904835, "memory(GiB)": 26.31, "step": 2170, "train_speed(iter/s)": 0.5827 }, { "acc": 0.99750452, "epoch": 2.306468716861082, "grad_norm": 0.7599917650222778, "learning_rate": 9.134523275759872e-06, "loss": 0.00717712, "memory(GiB)": 26.31, "step": 2175, "train_speed(iter/s)": 0.582711 }, { "acc": 0.99696083, "epoch": 2.3117709437963945, "grad_norm": 0.6776115894317627, "learning_rate": 9.129586823120224e-06, "loss": 0.00911687, "memory(GiB)": 26.31, "step": 2180, "train_speed(iter/s)": 0.582718 }, { "acc": 0.99689713, "epoch": 2.317073170731707, "grad_norm": 0.5783346891403198, "learning_rate": 9.124637674345291e-06, "loss": 0.00991484, "memory(GiB)": 26.31, "step": 2185, "train_speed(iter/s)": 0.582721 }, { "acc": 0.99779062, "epoch": 2.3223753976670203, "grad_norm": 0.6449788212776184, "learning_rate": 9.119675844652737e-06, "loss": 0.0080985, "memory(GiB)": 26.31, "step": 2190, "train_speed(iter/s)": 0.582731 }, { "acc": 0.99645386, "epoch": 2.327677624602333, "grad_norm": 0.7729847431182861, "learning_rate": 9.114701349299217e-06, "loss": 0.01042995, "memory(GiB)": 26.31, "step": 2195, "train_speed(iter/s)": 0.582731 }, { "acc": 0.99811287, "epoch": 2.3329798515376456, "grad_norm": 0.5190842747688293, "learning_rate": 9.109714203580334e-06, "loss": 0.00516622, "memory(GiB)": 26.31, "step": 2200, "train_speed(iter/s)": 0.582741 }, { "acc": 0.99637775, "epoch": 2.3382820784729588, "grad_norm": 0.48969125747680664, "learning_rate": 9.104714422830586e-06, "loss": 0.00895324, "memory(GiB)": 26.31, "step": 2205, "train_speed(iter/s)": 0.582743 }, { "acc": 0.99677792, "epoch": 2.3435843054082715, "grad_norm": 0.6211754083633423, "learning_rate": 9.09970202242332e-06, "loss": 0.0104282, "memory(GiB)": 26.31, "step": 2210, "train_speed(iter/s)": 0.582743 }, { "acc": 0.99684505, "epoch": 2.348886532343584, "grad_norm": 0.6650230884552002, "learning_rate": 9.094677017770692e-06, "loss": 0.01132647, "memory(GiB)": 26.31, "step": 2215, "train_speed(iter/s)": 0.58276 }, { "acc": 0.99717751, "epoch": 2.3541887592788973, "grad_norm": 0.4855519235134125, "learning_rate": 9.089639424323608e-06, "loss": 0.00853827, "memory(GiB)": 26.31, "step": 2220, "train_speed(iter/s)": 0.582764 }, { "acc": 0.99725456, "epoch": 2.35949098621421, "grad_norm": 0.8926551938056946, "learning_rate": 9.084589257571682e-06, "loss": 0.00890769, "memory(GiB)": 26.31, "step": 2225, "train_speed(iter/s)": 0.582764 }, { "acc": 0.99793701, "epoch": 2.3647932131495226, "grad_norm": 0.8140351176261902, "learning_rate": 9.079526533043192e-06, "loss": 0.00827439, "memory(GiB)": 26.31, "step": 2230, "train_speed(iter/s)": 0.582764 }, { "acc": 0.99684944, "epoch": 2.3700954400848357, "grad_norm": 0.7877880334854126, "learning_rate": 9.074451266305026e-06, "loss": 0.00902928, "memory(GiB)": 26.31, "step": 2235, "train_speed(iter/s)": 0.582768 }, { "acc": 0.99745779, "epoch": 2.3753976670201484, "grad_norm": 0.4703558087348938, "learning_rate": 9.069363472962639e-06, "loss": 0.00938336, "memory(GiB)": 26.31, "step": 2240, "train_speed(iter/s)": 0.582771 }, { "acc": 0.99804792, "epoch": 2.380699893955461, "grad_norm": 0.49165722727775574, "learning_rate": 9.064263168659999e-06, "loss": 0.00661491, "memory(GiB)": 26.31, "step": 2245, "train_speed(iter/s)": 0.582767 }, { "acc": 0.99777336, "epoch": 2.3860021208907742, "grad_norm": 0.34549641609191895, "learning_rate": 9.059150369079548e-06, "loss": 0.00687485, "memory(GiB)": 26.31, "step": 2250, "train_speed(iter/s)": 0.582768 }, { "acc": 0.99720001, "epoch": 2.391304347826087, "grad_norm": 0.45710110664367676, "learning_rate": 9.054025089942143e-06, "loss": 0.00874545, "memory(GiB)": 26.31, "step": 2255, "train_speed(iter/s)": 0.582772 }, { "acc": 0.99818325, "epoch": 2.3966065747613996, "grad_norm": 0.3455301523208618, "learning_rate": 9.048887347007018e-06, "loss": 0.00510659, "memory(GiB)": 26.31, "step": 2260, "train_speed(iter/s)": 0.582786 }, { "acc": 0.99830542, "epoch": 2.4019088016967127, "grad_norm": 0.621752142906189, "learning_rate": 9.043737156071728e-06, "loss": 0.00655494, "memory(GiB)": 26.31, "step": 2265, "train_speed(iter/s)": 0.582785 }, { "acc": 0.99765148, "epoch": 2.4072110286320254, "grad_norm": 0.7850625514984131, "learning_rate": 9.038574532972108e-06, "loss": 0.00760993, "memory(GiB)": 26.31, "step": 2270, "train_speed(iter/s)": 0.582794 }, { "acc": 0.99838095, "epoch": 2.412513255567338, "grad_norm": 0.8043009638786316, "learning_rate": 9.033399493582208e-06, "loss": 0.0075691, "memory(GiB)": 26.31, "step": 2275, "train_speed(iter/s)": 0.582803 }, { "acc": 0.99808836, "epoch": 2.417815482502651, "grad_norm": 0.609122097492218, "learning_rate": 9.028212053814269e-06, "loss": 0.00771583, "memory(GiB)": 26.31, "step": 2280, "train_speed(iter/s)": 0.582776 }, { "acc": 0.99779768, "epoch": 2.423117709437964, "grad_norm": 0.6378943920135498, "learning_rate": 9.023012229618651e-06, "loss": 0.00826254, "memory(GiB)": 26.31, "step": 2285, "train_speed(iter/s)": 0.582742 }, { "acc": 0.99719963, "epoch": 2.4284199363732766, "grad_norm": 0.5774968266487122, "learning_rate": 9.0178000369838e-06, "loss": 0.00730962, "memory(GiB)": 26.31, "step": 2290, "train_speed(iter/s)": 0.582725 }, { "acc": 0.99702272, "epoch": 2.4337221633085897, "grad_norm": 0.7756372094154358, "learning_rate": 9.012575491936189e-06, "loss": 0.01053988, "memory(GiB)": 26.31, "step": 2295, "train_speed(iter/s)": 0.582741 }, { "acc": 0.99702606, "epoch": 2.4390243902439024, "grad_norm": 0.4530414342880249, "learning_rate": 9.007338610540274e-06, "loss": 0.00954269, "memory(GiB)": 26.31, "step": 2300, "train_speed(iter/s)": 0.582737 }, { "acc": 0.99730387, "epoch": 2.4443266171792155, "grad_norm": 0.4954255223274231, "learning_rate": 9.00208940889844e-06, "loss": 0.00952832, "memory(GiB)": 26.31, "step": 2305, "train_speed(iter/s)": 0.582736 }, { "acc": 0.99814606, "epoch": 2.449628844114528, "grad_norm": 0.5628678202629089, "learning_rate": 8.996827903150959e-06, "loss": 0.00628898, "memory(GiB)": 26.31, "step": 2310, "train_speed(iter/s)": 0.582742 }, { "acc": 0.99818916, "epoch": 2.454931071049841, "grad_norm": 0.5463048815727234, "learning_rate": 8.991554109475933e-06, "loss": 0.00808192, "memory(GiB)": 26.31, "step": 2315, "train_speed(iter/s)": 0.582743 }, { "acc": 0.99799252, "epoch": 2.4602332979851536, "grad_norm": 0.40346306562423706, "learning_rate": 8.986268044089247e-06, "loss": 0.00596498, "memory(GiB)": 26.31, "step": 2320, "train_speed(iter/s)": 0.58275 }, { "acc": 0.99781971, "epoch": 2.4655355249204667, "grad_norm": 0.6184868812561035, "learning_rate": 8.980969723244518e-06, "loss": 0.00875518, "memory(GiB)": 26.31, "step": 2325, "train_speed(iter/s)": 0.58275 }, { "acc": 0.99754887, "epoch": 2.4708377518557794, "grad_norm": 0.45224523544311523, "learning_rate": 8.97565916323305e-06, "loss": 0.00927275, "memory(GiB)": 26.31, "step": 2330, "train_speed(iter/s)": 0.582754 }, { "acc": 0.99780216, "epoch": 2.4761399787910925, "grad_norm": 0.8157827258110046, "learning_rate": 8.970336380383773e-06, "loss": 0.00661554, "memory(GiB)": 26.31, "step": 2335, "train_speed(iter/s)": 0.582754 }, { "acc": 0.99679346, "epoch": 2.481442205726405, "grad_norm": 0.8614205121994019, "learning_rate": 8.965001391063212e-06, "loss": 0.01007259, "memory(GiB)": 26.31, "step": 2340, "train_speed(iter/s)": 0.582759 }, { "acc": 0.99660053, "epoch": 2.486744432661718, "grad_norm": 0.7390111684799194, "learning_rate": 8.95965421167541e-06, "loss": 0.00984244, "memory(GiB)": 26.31, "step": 2345, "train_speed(iter/s)": 0.58276 }, { "acc": 0.99715099, "epoch": 2.4920466595970305, "grad_norm": 0.7102832794189453, "learning_rate": 8.9542948586619e-06, "loss": 0.00887956, "memory(GiB)": 26.31, "step": 2350, "train_speed(iter/s)": 0.582762 }, { "acc": 0.99650688, "epoch": 2.4973488865323437, "grad_norm": 0.7877767086029053, "learning_rate": 8.948923348501646e-06, "loss": 0.01136077, "memory(GiB)": 26.31, "step": 2355, "train_speed(iter/s)": 0.582769 }, { "acc": 0.9968586, "epoch": 2.5026511134676563, "grad_norm": 0.6240387558937073, "learning_rate": 8.943539697710996e-06, "loss": 0.00829753, "memory(GiB)": 26.31, "step": 2360, "train_speed(iter/s)": 0.582782 }, { "acc": 0.99727345, "epoch": 2.5079533404029695, "grad_norm": 0.6722913980484009, "learning_rate": 8.93814392284362e-06, "loss": 0.0081228, "memory(GiB)": 26.31, "step": 2365, "train_speed(iter/s)": 0.58278 }, { "acc": 0.99726486, "epoch": 2.513255567338282, "grad_norm": 0.6057406067848206, "learning_rate": 8.932736040490472e-06, "loss": 0.00857747, "memory(GiB)": 26.31, "step": 2370, "train_speed(iter/s)": 0.582779 }, { "acc": 0.99796257, "epoch": 2.518557794273595, "grad_norm": 0.32492902874946594, "learning_rate": 8.927316067279736e-06, "loss": 0.00711781, "memory(GiB)": 26.31, "step": 2375, "train_speed(iter/s)": 0.582783 }, { "acc": 0.99653759, "epoch": 2.5238600212089075, "grad_norm": 0.4648723304271698, "learning_rate": 8.921884019876768e-06, "loss": 0.01062218, "memory(GiB)": 26.31, "step": 2380, "train_speed(iter/s)": 0.582786 }, { "acc": 0.99777451, "epoch": 2.5291622481442206, "grad_norm": 0.44860148429870605, "learning_rate": 8.916439914984055e-06, "loss": 0.00815452, "memory(GiB)": 26.31, "step": 2385, "train_speed(iter/s)": 0.582782 }, { "acc": 0.99838314, "epoch": 2.5344644750795333, "grad_norm": 0.6119086146354675, "learning_rate": 8.910983769341154e-06, "loss": 0.00666094, "memory(GiB)": 26.31, "step": 2390, "train_speed(iter/s)": 0.582789 }, { "acc": 0.99723606, "epoch": 2.5397667020148464, "grad_norm": 0.46494340896606445, "learning_rate": 8.905515599724649e-06, "loss": 0.00702635, "memory(GiB)": 26.31, "step": 2395, "train_speed(iter/s)": 0.58279 }, { "acc": 0.99754772, "epoch": 2.545068928950159, "grad_norm": 0.5167229771614075, "learning_rate": 8.90003542294809e-06, "loss": 0.00611775, "memory(GiB)": 26.31, "step": 2400, "train_speed(iter/s)": 0.582791 }, { "acc": 0.99728813, "epoch": 2.550371155885472, "grad_norm": 0.5445597767829895, "learning_rate": 8.894543255861953e-06, "loss": 0.00897106, "memory(GiB)": 26.31, "step": 2405, "train_speed(iter/s)": 0.582791 }, { "acc": 0.99713316, "epoch": 2.5556733828207845, "grad_norm": 0.5003531575202942, "learning_rate": 8.889039115353577e-06, "loss": 0.00824212, "memory(GiB)": 26.31, "step": 2410, "train_speed(iter/s)": 0.582792 }, { "acc": 0.99826508, "epoch": 2.5609756097560976, "grad_norm": 0.6428782343864441, "learning_rate": 8.883523018347122e-06, "loss": 0.00542104, "memory(GiB)": 26.31, "step": 2415, "train_speed(iter/s)": 0.582795 }, { "acc": 0.99676142, "epoch": 2.5662778366914103, "grad_norm": 1.0514658689498901, "learning_rate": 8.877994981803503e-06, "loss": 0.01014628, "memory(GiB)": 26.31, "step": 2420, "train_speed(iter/s)": 0.582802 }, { "acc": 0.99781837, "epoch": 2.5715800636267234, "grad_norm": 0.49720627069473267, "learning_rate": 8.872455022720356e-06, "loss": 0.00676076, "memory(GiB)": 26.31, "step": 2425, "train_speed(iter/s)": 0.582817 }, { "acc": 0.99802704, "epoch": 2.576882290562036, "grad_norm": 0.5802571773529053, "learning_rate": 8.866903158131972e-06, "loss": 0.00726605, "memory(GiB)": 26.31, "step": 2430, "train_speed(iter/s)": 0.582818 }, { "acc": 0.99644756, "epoch": 2.582184517497349, "grad_norm": 1.120002031326294, "learning_rate": 8.861339405109253e-06, "loss": 0.01294475, "memory(GiB)": 26.31, "step": 2435, "train_speed(iter/s)": 0.582841 }, { "acc": 0.99776955, "epoch": 2.5874867444326615, "grad_norm": 0.4678545892238617, "learning_rate": 8.855763780759646e-06, "loss": 0.00850418, "memory(GiB)": 26.31, "step": 2440, "train_speed(iter/s)": 0.582849 }, { "acc": 0.99819298, "epoch": 2.5927889713679746, "grad_norm": 0.4603610932826996, "learning_rate": 8.850176302227113e-06, "loss": 0.0065477, "memory(GiB)": 26.31, "step": 2445, "train_speed(iter/s)": 0.582848 }, { "acc": 0.99753914, "epoch": 2.5980911983032873, "grad_norm": 0.7381535172462463, "learning_rate": 8.844576986692056e-06, "loss": 0.00721828, "memory(GiB)": 26.31, "step": 2450, "train_speed(iter/s)": 0.58285 }, { "acc": 0.99687433, "epoch": 2.6033934252386004, "grad_norm": 0.42761534452438354, "learning_rate": 8.838965851371274e-06, "loss": 0.00944657, "memory(GiB)": 26.31, "step": 2455, "train_speed(iter/s)": 0.582857 }, { "acc": 0.99773045, "epoch": 2.608695652173913, "grad_norm": 0.5802911520004272, "learning_rate": 8.833342913517916e-06, "loss": 0.00601513, "memory(GiB)": 26.31, "step": 2460, "train_speed(iter/s)": 0.582836 }, { "acc": 0.99647751, "epoch": 2.6139978791092258, "grad_norm": 0.5661634802818298, "learning_rate": 8.827708190421416e-06, "loss": 0.01108431, "memory(GiB)": 26.31, "step": 2465, "train_speed(iter/s)": 0.582849 }, { "acc": 0.99830036, "epoch": 2.6193001060445384, "grad_norm": 0.5017287731170654, "learning_rate": 8.822061699407447e-06, "loss": 0.00618156, "memory(GiB)": 26.31, "step": 2470, "train_speed(iter/s)": 0.582828 }, { "acc": 0.9973937, "epoch": 2.6246023329798516, "grad_norm": 0.5929028391838074, "learning_rate": 8.816403457837865e-06, "loss": 0.0077717, "memory(GiB)": 26.31, "step": 2475, "train_speed(iter/s)": 0.582848 }, { "acc": 0.99757099, "epoch": 2.6299045599151643, "grad_norm": 0.6684390902519226, "learning_rate": 8.810733483110656e-06, "loss": 0.00653151, "memory(GiB)": 26.31, "step": 2480, "train_speed(iter/s)": 0.582856 }, { "acc": 0.99810505, "epoch": 2.6352067868504774, "grad_norm": 0.3855116069316864, "learning_rate": 8.805051792659887e-06, "loss": 0.00685595, "memory(GiB)": 26.31, "step": 2485, "train_speed(iter/s)": 0.582866 }, { "acc": 0.99766197, "epoch": 2.64050901378579, "grad_norm": 0.641624927520752, "learning_rate": 8.799358403955646e-06, "loss": 0.00616185, "memory(GiB)": 26.31, "step": 2490, "train_speed(iter/s)": 0.582875 }, { "acc": 0.99728374, "epoch": 2.6458112407211027, "grad_norm": 0.594316840171814, "learning_rate": 8.79365333450399e-06, "loss": 0.00788632, "memory(GiB)": 26.31, "step": 2495, "train_speed(iter/s)": 0.582888 }, { "acc": 0.99828377, "epoch": 2.6511134676564154, "grad_norm": 0.34747976064682007, "learning_rate": 8.787936601846892e-06, "loss": 0.00582717, "memory(GiB)": 26.31, "step": 2500, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99804268, "epoch": 2.6564156945917285, "grad_norm": 0.47376132011413574, "learning_rate": 8.78220822356219e-06, "loss": 0.00711892, "memory(GiB)": 26.31, "step": 2505, "train_speed(iter/s)": 0.582886 }, { "acc": 0.9984189, "epoch": 2.6617179215270412, "grad_norm": 0.38001590967178345, "learning_rate": 8.776468217263526e-06, "loss": 0.00497809, "memory(GiB)": 26.31, "step": 2510, "train_speed(iter/s)": 0.582885 }, { "acc": 0.99746351, "epoch": 2.6670201484623544, "grad_norm": 0.9638261198997498, "learning_rate": 8.770716600600301e-06, "loss": 0.00803462, "memory(GiB)": 26.31, "step": 2515, "train_speed(iter/s)": 0.582892 }, { "acc": 0.99772987, "epoch": 2.672322375397667, "grad_norm": 0.3004850149154663, "learning_rate": 8.764953391257611e-06, "loss": 0.00633329, "memory(GiB)": 26.31, "step": 2520, "train_speed(iter/s)": 0.582891 }, { "acc": 0.99645672, "epoch": 2.6776246023329797, "grad_norm": 1.1040234565734863, "learning_rate": 8.759178606956197e-06, "loss": 0.00878785, "memory(GiB)": 26.31, "step": 2525, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99747276, "epoch": 2.682926829268293, "grad_norm": 0.582634687423706, "learning_rate": 8.753392265452395e-06, "loss": 0.00888529, "memory(GiB)": 26.31, "step": 2530, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99669046, "epoch": 2.6882290562036055, "grad_norm": 0.5359108448028564, "learning_rate": 8.747594384538073e-06, "loss": 0.00974076, "memory(GiB)": 26.31, "step": 2535, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99788017, "epoch": 2.693531283138918, "grad_norm": 1.008933663368225, "learning_rate": 8.741784982040583e-06, "loss": 0.0074301, "memory(GiB)": 26.31, "step": 2540, "train_speed(iter/s)": 0.582892 }, { "acc": 0.99824905, "epoch": 2.6988335100742313, "grad_norm": 0.43502724170684814, "learning_rate": 8.735964075822702e-06, "loss": 0.00669004, "memory(GiB)": 26.31, "step": 2545, "train_speed(iter/s)": 0.582879 }, { "acc": 0.99731178, "epoch": 2.704135737009544, "grad_norm": 0.8386071920394897, "learning_rate": 8.730131683782583e-06, "loss": 0.00993623, "memory(GiB)": 26.31, "step": 2550, "train_speed(iter/s)": 0.582877 }, { "acc": 0.99741039, "epoch": 2.7094379639448567, "grad_norm": 0.5602560639381409, "learning_rate": 8.724287823853687e-06, "loss": 0.00661452, "memory(GiB)": 26.31, "step": 2555, "train_speed(iter/s)": 0.582883 }, { "acc": 0.99765863, "epoch": 2.71474019088017, "grad_norm": 0.7029976844787598, "learning_rate": 8.718432514004743e-06, "loss": 0.00765093, "memory(GiB)": 26.31, "step": 2560, "train_speed(iter/s)": 0.582883 }, { "acc": 0.99744787, "epoch": 2.7200424178154825, "grad_norm": 0.8403060436248779, "learning_rate": 8.712565772239685e-06, "loss": 0.00870134, "memory(GiB)": 26.31, "step": 2565, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99806385, "epoch": 2.725344644750795, "grad_norm": 0.7096578478813171, "learning_rate": 8.706687616597599e-06, "loss": 0.00648894, "memory(GiB)": 26.31, "step": 2570, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99784031, "epoch": 2.7306468716861083, "grad_norm": 0.9655284285545349, "learning_rate": 8.700798065152664e-06, "loss": 0.00564081, "memory(GiB)": 26.31, "step": 2575, "train_speed(iter/s)": 0.582883 }, { "acc": 0.99837542, "epoch": 2.735949098621421, "grad_norm": 0.5595547556877136, "learning_rate": 8.694897136014102e-06, "loss": 0.00597643, "memory(GiB)": 26.31, "step": 2580, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99757414, "epoch": 2.7412513255567337, "grad_norm": 0.845004141330719, "learning_rate": 8.688984847326113e-06, "loss": 0.00632632, "memory(GiB)": 26.31, "step": 2585, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99758339, "epoch": 2.746553552492047, "grad_norm": 0.8130578994750977, "learning_rate": 8.683061217267834e-06, "loss": 0.00771255, "memory(GiB)": 26.31, "step": 2590, "train_speed(iter/s)": 0.582894 }, { "acc": 0.99802599, "epoch": 2.7518557794273595, "grad_norm": 0.8608774542808533, "learning_rate": 8.677126264053266e-06, "loss": 0.00774147, "memory(GiB)": 26.31, "step": 2595, "train_speed(iter/s)": 0.582895 }, { "acc": 0.99850788, "epoch": 2.757158006362672, "grad_norm": 0.6176683902740479, "learning_rate": 8.671180005931236e-06, "loss": 0.0063642, "memory(GiB)": 26.31, "step": 2600, "train_speed(iter/s)": 0.582892 }, { "acc": 0.99837418, "epoch": 2.7624602332979853, "grad_norm": 0.29822659492492676, "learning_rate": 8.665222461185323e-06, "loss": 0.00637889, "memory(GiB)": 26.31, "step": 2605, "train_speed(iter/s)": 0.582891 }, { "acc": 0.99730186, "epoch": 2.767762460233298, "grad_norm": 0.8909950256347656, "learning_rate": 8.659253648133812e-06, "loss": 0.00681949, "memory(GiB)": 26.31, "step": 2610, "train_speed(iter/s)": 0.582888 }, { "acc": 0.99875526, "epoch": 2.7730646871686107, "grad_norm": 0.3553677797317505, "learning_rate": 8.653273585129638e-06, "loss": 0.00568229, "memory(GiB)": 26.31, "step": 2615, "train_speed(iter/s)": 0.582887 }, { "acc": 0.99821167, "epoch": 2.778366914103924, "grad_norm": 0.4157305657863617, "learning_rate": 8.647282290560328e-06, "loss": 0.00531367, "memory(GiB)": 26.31, "step": 2620, "train_speed(iter/s)": 0.582894 }, { "acc": 0.99853363, "epoch": 2.7836691410392365, "grad_norm": 0.48346978425979614, "learning_rate": 8.64127978284794e-06, "loss": 0.00515425, "memory(GiB)": 26.31, "step": 2625, "train_speed(iter/s)": 0.582895 }, { "acc": 0.99797735, "epoch": 2.7889713679745496, "grad_norm": 0.824645459651947, "learning_rate": 8.635266080449015e-06, "loss": 0.00621112, "memory(GiB)": 26.31, "step": 2630, "train_speed(iter/s)": 0.582901 }, { "acc": 0.99723949, "epoch": 2.7942735949098623, "grad_norm": 0.4443477690219879, "learning_rate": 8.62924120185451e-06, "loss": 0.00894453, "memory(GiB)": 26.31, "step": 2635, "train_speed(iter/s)": 0.582915 }, { "acc": 0.99760208, "epoch": 2.799575821845175, "grad_norm": 0.6249599456787109, "learning_rate": 8.623205165589752e-06, "loss": 0.00710995, "memory(GiB)": 26.31, "step": 2640, "train_speed(iter/s)": 0.582927 }, { "acc": 0.99724007, "epoch": 2.8048780487804876, "grad_norm": 0.8389932513237, "learning_rate": 8.61715799021437e-06, "loss": 0.00801658, "memory(GiB)": 26.31, "step": 2645, "train_speed(iter/s)": 0.582928 }, { "acc": 0.99735107, "epoch": 2.8101802757158008, "grad_norm": 0.6563589572906494, "learning_rate": 8.61109969432225e-06, "loss": 0.00741932, "memory(GiB)": 26.31, "step": 2650, "train_speed(iter/s)": 0.582929 }, { "acc": 0.9972703, "epoch": 2.8154825026511134, "grad_norm": 0.9069436192512512, "learning_rate": 8.60503029654147e-06, "loss": 0.00955604, "memory(GiB)": 26.31, "step": 2655, "train_speed(iter/s)": 0.582935 }, { "acc": 0.99771433, "epoch": 2.8207847295864266, "grad_norm": 0.540453314781189, "learning_rate": 8.598949815534237e-06, "loss": 0.00701306, "memory(GiB)": 26.31, "step": 2660, "train_speed(iter/s)": 0.582935 }, { "acc": 0.99766121, "epoch": 2.8260869565217392, "grad_norm": 0.49637776613235474, "learning_rate": 8.592858269996845e-06, "loss": 0.00793933, "memory(GiB)": 26.31, "step": 2665, "train_speed(iter/s)": 0.582938 }, { "acc": 0.99823942, "epoch": 2.831389183457052, "grad_norm": 0.752679169178009, "learning_rate": 8.58675567865961e-06, "loss": 0.0062173, "memory(GiB)": 26.31, "step": 2670, "train_speed(iter/s)": 0.58296 }, { "acc": 0.99808493, "epoch": 2.8366914103923646, "grad_norm": 0.2555808424949646, "learning_rate": 8.580642060286801e-06, "loss": 0.00525451, "memory(GiB)": 26.31, "step": 2675, "train_speed(iter/s)": 0.58296 }, { "acc": 0.99891806, "epoch": 2.8419936373276777, "grad_norm": 0.4761360287666321, "learning_rate": 8.574517433676606e-06, "loss": 0.00387856, "memory(GiB)": 26.31, "step": 2680, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99823341, "epoch": 2.8472958642629904, "grad_norm": 0.39361944794654846, "learning_rate": 8.56838181766105e-06, "loss": 0.00706662, "memory(GiB)": 26.31, "step": 2685, "train_speed(iter/s)": 0.582979 }, { "acc": 0.99847803, "epoch": 2.8525980911983035, "grad_norm": 0.6813247799873352, "learning_rate": 8.56223523110596e-06, "loss": 0.00404909, "memory(GiB)": 26.31, "step": 2690, "train_speed(iter/s)": 0.582982 }, { "acc": 0.99783859, "epoch": 2.8579003181336162, "grad_norm": 0.539756178855896, "learning_rate": 8.556077692910884e-06, "loss": 0.00924466, "memory(GiB)": 26.31, "step": 2695, "train_speed(iter/s)": 0.582983 }, { "acc": 0.99754581, "epoch": 2.863202545068929, "grad_norm": 0.3688136041164398, "learning_rate": 8.549909222009049e-06, "loss": 0.00816878, "memory(GiB)": 26.31, "step": 2700, "train_speed(iter/s)": 0.582981 }, { "acc": 0.99778786, "epoch": 2.8685047720042416, "grad_norm": 0.6642614006996155, "learning_rate": 8.543729837367299e-06, "loss": 0.00844958, "memory(GiB)": 26.31, "step": 2705, "train_speed(iter/s)": 0.58298 }, { "acc": 0.99753714, "epoch": 2.8738069989395547, "grad_norm": 0.40582075715065, "learning_rate": 8.537539557986036e-06, "loss": 0.00675502, "memory(GiB)": 26.31, "step": 2710, "train_speed(iter/s)": 0.582986 }, { "acc": 0.9975502, "epoch": 2.8791092258748674, "grad_norm": 0.6373099684715271, "learning_rate": 8.531338402899158e-06, "loss": 0.00913531, "memory(GiB)": 26.31, "step": 2715, "train_speed(iter/s)": 0.582992 }, { "acc": 0.997229, "epoch": 2.8844114528101805, "grad_norm": 0.6850215792655945, "learning_rate": 8.525126391174008e-06, "loss": 0.00866958, "memory(GiB)": 26.31, "step": 2720, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99879761, "epoch": 2.889713679745493, "grad_norm": 0.2462194859981537, "learning_rate": 8.518903541911302e-06, "loss": 0.00522173, "memory(GiB)": 26.31, "step": 2725, "train_speed(iter/s)": 0.582995 }, { "acc": 0.99818954, "epoch": 2.895015906680806, "grad_norm": 0.6007863879203796, "learning_rate": 8.512669874245093e-06, "loss": 0.00626441, "memory(GiB)": 26.31, "step": 2730, "train_speed(iter/s)": 0.583001 }, { "acc": 0.99893417, "epoch": 2.9003181336161186, "grad_norm": 0.24022957682609558, "learning_rate": 8.506425407342687e-06, "loss": 0.0040527, "memory(GiB)": 26.31, "step": 2735, "train_speed(iter/s)": 0.583006 }, { "acc": 0.9976409, "epoch": 2.9056203605514317, "grad_norm": 0.7551782131195068, "learning_rate": 8.500170160404601e-06, "loss": 0.00866469, "memory(GiB)": 26.31, "step": 2740, "train_speed(iter/s)": 0.583005 }, { "acc": 0.99791851, "epoch": 2.9109225874867444, "grad_norm": 0.48232194781303406, "learning_rate": 8.493904152664496e-06, "loss": 0.00649021, "memory(GiB)": 26.31, "step": 2745, "train_speed(iter/s)": 0.583006 }, { "acc": 0.99849854, "epoch": 2.9162248144220575, "grad_norm": 0.17249707877635956, "learning_rate": 8.487627403389123e-06, "loss": 0.005985, "memory(GiB)": 26.31, "step": 2750, "train_speed(iter/s)": 0.583012 }, { "acc": 0.9975729, "epoch": 2.92152704135737, "grad_norm": 0.4319506287574768, "learning_rate": 8.48133993187826e-06, "loss": 0.0076484, "memory(GiB)": 26.31, "step": 2755, "train_speed(iter/s)": 0.583011 }, { "acc": 0.99815025, "epoch": 2.926829268292683, "grad_norm": 0.5459288358688354, "learning_rate": 8.475041757464654e-06, "loss": 0.00624922, "memory(GiB)": 26.31, "step": 2760, "train_speed(iter/s)": 0.583013 }, { "acc": 0.99748049, "epoch": 2.9321314952279955, "grad_norm": 0.5239591002464294, "learning_rate": 8.468732899513958e-06, "loss": 0.00663889, "memory(GiB)": 26.31, "step": 2765, "train_speed(iter/s)": 0.583014 }, { "acc": 0.9986208, "epoch": 2.9374337221633087, "grad_norm": 0.29796159267425537, "learning_rate": 8.462413377424682e-06, "loss": 0.0047507, "memory(GiB)": 26.31, "step": 2770, "train_speed(iter/s)": 0.583011 }, { "acc": 0.99765453, "epoch": 2.9427359490986214, "grad_norm": 0.6221350431442261, "learning_rate": 8.456083210628117e-06, "loss": 0.00811146, "memory(GiB)": 26.31, "step": 2775, "train_speed(iter/s)": 0.583018 }, { "acc": 0.99878178, "epoch": 2.9480381760339345, "grad_norm": 0.45186954736709595, "learning_rate": 8.449742418588293e-06, "loss": 0.00400441, "memory(GiB)": 26.31, "step": 2780, "train_speed(iter/s)": 0.583017 }, { "acc": 0.9988843, "epoch": 2.953340402969247, "grad_norm": 0.4978770315647125, "learning_rate": 8.443391020801904e-06, "loss": 0.00453811, "memory(GiB)": 26.31, "step": 2785, "train_speed(iter/s)": 0.583021 }, { "acc": 0.99816494, "epoch": 2.95864262990456, "grad_norm": 0.46046286821365356, "learning_rate": 8.437029036798259e-06, "loss": 0.00635334, "memory(GiB)": 26.31, "step": 2790, "train_speed(iter/s)": 0.583016 }, { "acc": 0.99850636, "epoch": 2.9639448568398725, "grad_norm": 0.6356995105743408, "learning_rate": 8.430656486139217e-06, "loss": 0.00549962, "memory(GiB)": 26.31, "step": 2795, "train_speed(iter/s)": 0.583014 }, { "acc": 0.99839563, "epoch": 2.9692470837751856, "grad_norm": 0.6807392239570618, "learning_rate": 8.424273388419122e-06, "loss": 0.0061514, "memory(GiB)": 26.31, "step": 2800, "train_speed(iter/s)": 0.583014 }, { "acc": 0.99671831, "epoch": 2.9745493107104983, "grad_norm": 0.6449872255325317, "learning_rate": 8.417879763264759e-06, "loss": 0.01174504, "memory(GiB)": 26.31, "step": 2805, "train_speed(iter/s)": 0.58302 }, { "acc": 0.99840851, "epoch": 2.9798515376458115, "grad_norm": 0.5310406684875488, "learning_rate": 8.411475630335267e-06, "loss": 0.00404135, "memory(GiB)": 26.31, "step": 2810, "train_speed(iter/s)": 0.583024 }, { "acc": 0.99735947, "epoch": 2.985153764581124, "grad_norm": 0.4129229784011841, "learning_rate": 8.405061009322113e-06, "loss": 0.00868261, "memory(GiB)": 26.31, "step": 2815, "train_speed(iter/s)": 0.583022 }, { "acc": 0.99885445, "epoch": 2.990455991516437, "grad_norm": 0.4066607654094696, "learning_rate": 8.398635919948998e-06, "loss": 0.0049273, "memory(GiB)": 26.31, "step": 2820, "train_speed(iter/s)": 0.583025 }, { "acc": 0.99865417, "epoch": 2.9957582184517495, "grad_norm": 0.45932242274284363, "learning_rate": 8.392200381971819e-06, "loss": 0.00448716, "memory(GiB)": 26.31, "step": 2825, "train_speed(iter/s)": 0.583027 }, { "acc": 0.99809284, "epoch": 3.0010604453870626, "grad_norm": 0.6505632400512695, "learning_rate": 8.385754415178594e-06, "loss": 0.00660937, "memory(GiB)": 26.31, "step": 2830, "train_speed(iter/s)": 0.582904 }, { "acc": 0.99857645, "epoch": 3.0063626723223753, "grad_norm": 0.5474786758422852, "learning_rate": 8.379298039389418e-06, "loss": 0.00656885, "memory(GiB)": 26.31, "step": 2835, "train_speed(iter/s)": 0.582903 }, { "acc": 0.99797611, "epoch": 3.0116648992576884, "grad_norm": 0.5049321055412292, "learning_rate": 8.372831274456378e-06, "loss": 0.0067019, "memory(GiB)": 26.31, "step": 2840, "train_speed(iter/s)": 0.582908 }, { "acc": 0.9989069, "epoch": 3.016967126193001, "grad_norm": 0.25424474477767944, "learning_rate": 8.366354140263519e-06, "loss": 0.00447071, "memory(GiB)": 26.31, "step": 2845, "train_speed(iter/s)": 0.582914 }, { "acc": 0.99727268, "epoch": 3.022269353128314, "grad_norm": 0.8304399847984314, "learning_rate": 8.35986665672676e-06, "loss": 0.00797994, "memory(GiB)": 26.31, "step": 2850, "train_speed(iter/s)": 0.582914 }, { "acc": 0.99812622, "epoch": 3.027571580063627, "grad_norm": 0.5393794775009155, "learning_rate": 8.353368843793847e-06, "loss": 0.00611412, "memory(GiB)": 26.31, "step": 2855, "train_speed(iter/s)": 0.582917 }, { "acc": 0.99841595, "epoch": 3.0328738069989396, "grad_norm": 0.5000630021095276, "learning_rate": 8.346860721444284e-06, "loss": 0.00692058, "memory(GiB)": 26.31, "step": 2860, "train_speed(iter/s)": 0.582916 }, { "acc": 0.9972909, "epoch": 3.0381760339342523, "grad_norm": 0.6896098256111145, "learning_rate": 8.340342309689274e-06, "loss": 0.0062426, "memory(GiB)": 26.31, "step": 2865, "train_speed(iter/s)": 0.582925 }, { "acc": 0.99858389, "epoch": 3.0434782608695654, "grad_norm": 0.49193787574768066, "learning_rate": 8.333813628571665e-06, "loss": 0.00430128, "memory(GiB)": 26.31, "step": 2870, "train_speed(iter/s)": 0.582926 }, { "acc": 0.99836273, "epoch": 3.048780487804878, "grad_norm": 0.20785479247570038, "learning_rate": 8.32727469816587e-06, "loss": 0.00575757, "memory(GiB)": 26.31, "step": 2875, "train_speed(iter/s)": 0.582938 }, { "acc": 0.99837456, "epoch": 3.0540827147401908, "grad_norm": 0.4574993848800659, "learning_rate": 8.320725538577825e-06, "loss": 0.00428646, "memory(GiB)": 26.31, "step": 2880, "train_speed(iter/s)": 0.58294 }, { "acc": 0.99787979, "epoch": 3.059384941675504, "grad_norm": 0.3935782015323639, "learning_rate": 8.314166169944919e-06, "loss": 0.0048808, "memory(GiB)": 26.31, "step": 2885, "train_speed(iter/s)": 0.582947 }, { "acc": 0.99847889, "epoch": 3.0646871686108166, "grad_norm": 0.34742140769958496, "learning_rate": 8.307596612435925e-06, "loss": 0.00527749, "memory(GiB)": 26.31, "step": 2890, "train_speed(iter/s)": 0.582951 }, { "acc": 0.99888916, "epoch": 3.0699893955461293, "grad_norm": 0.1643511801958084, "learning_rate": 8.30101688625095e-06, "loss": 0.00485554, "memory(GiB)": 26.31, "step": 2895, "train_speed(iter/s)": 0.582947 }, { "acc": 0.99863644, "epoch": 3.0752916224814424, "grad_norm": 0.4972369074821472, "learning_rate": 8.294427011621367e-06, "loss": 0.0042267, "memory(GiB)": 26.31, "step": 2900, "train_speed(iter/s)": 0.582946 }, { "acc": 0.99793358, "epoch": 3.080593849416755, "grad_norm": 0.8058770895004272, "learning_rate": 8.287827008809755e-06, "loss": 0.00536377, "memory(GiB)": 26.31, "step": 2905, "train_speed(iter/s)": 0.582953 }, { "acc": 0.99707813, "epoch": 3.0858960763520678, "grad_norm": 0.595598578453064, "learning_rate": 8.281216898109827e-06, "loss": 0.00957358, "memory(GiB)": 26.31, "step": 2910, "train_speed(iter/s)": 0.582953 }, { "acc": 0.99796486, "epoch": 3.091198303287381, "grad_norm": 0.6424652934074402, "learning_rate": 8.27459669984639e-06, "loss": 0.00617009, "memory(GiB)": 26.31, "step": 2915, "train_speed(iter/s)": 0.582958 }, { "acc": 0.99840441, "epoch": 3.0965005302226936, "grad_norm": 0.7321480512619019, "learning_rate": 8.267966434375255e-06, "loss": 0.00683057, "memory(GiB)": 26.31, "step": 2920, "train_speed(iter/s)": 0.582957 }, { "acc": 0.99913998, "epoch": 3.1018027571580062, "grad_norm": 0.8098109364509583, "learning_rate": 8.261326122083194e-06, "loss": 0.00538251, "memory(GiB)": 26.31, "step": 2925, "train_speed(iter/s)": 0.582962 }, { "acc": 0.99811344, "epoch": 3.1071049840933194, "grad_norm": 0.4597850441932678, "learning_rate": 8.25467578338787e-06, "loss": 0.00507367, "memory(GiB)": 26.31, "step": 2930, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99837627, "epoch": 3.112407211028632, "grad_norm": 0.2444171905517578, "learning_rate": 8.248015438737775e-06, "loss": 0.00495638, "memory(GiB)": 26.31, "step": 2935, "train_speed(iter/s)": 0.582967 }, { "acc": 0.99865885, "epoch": 3.1177094379639447, "grad_norm": 0.6311067938804626, "learning_rate": 8.241345108612172e-06, "loss": 0.00445099, "memory(GiB)": 26.31, "step": 2940, "train_speed(iter/s)": 0.582975 }, { "acc": 0.99781256, "epoch": 3.123011664899258, "grad_norm": 0.39378607273101807, "learning_rate": 8.234664813521014e-06, "loss": 0.00721694, "memory(GiB)": 26.31, "step": 2945, "train_speed(iter/s)": 0.582972 }, { "acc": 0.99817371, "epoch": 3.1283138918345705, "grad_norm": 0.5058181285858154, "learning_rate": 8.227974574004911e-06, "loss": 0.00749256, "memory(GiB)": 26.31, "step": 2950, "train_speed(iter/s)": 0.582973 }, { "acc": 0.99721832, "epoch": 3.133616118769883, "grad_norm": 0.814859926700592, "learning_rate": 8.22127441063504e-06, "loss": 0.00681448, "memory(GiB)": 26.31, "step": 2955, "train_speed(iter/s)": 0.582983 }, { "acc": 0.99774599, "epoch": 3.1389183457051963, "grad_norm": 0.5003991723060608, "learning_rate": 8.214564344013093e-06, "loss": 0.00531913, "memory(GiB)": 26.31, "step": 2960, "train_speed(iter/s)": 0.582988 }, { "acc": 0.99780588, "epoch": 3.144220572640509, "grad_norm": 0.5367461442947388, "learning_rate": 8.207844394771218e-06, "loss": 0.00721164, "memory(GiB)": 26.31, "step": 2965, "train_speed(iter/s)": 0.58299 }, { "acc": 0.99879112, "epoch": 3.1495227995758217, "grad_norm": 0.35926058888435364, "learning_rate": 8.20111458357194e-06, "loss": 0.00350981, "memory(GiB)": 26.31, "step": 2970, "train_speed(iter/s)": 0.582989 }, { "acc": 0.9974081, "epoch": 3.154825026511135, "grad_norm": 1.1875823736190796, "learning_rate": 8.194374931108117e-06, "loss": 0.00708345, "memory(GiB)": 26.31, "step": 2975, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99869471, "epoch": 3.1601272534464475, "grad_norm": 0.48193642497062683, "learning_rate": 8.187625458102865e-06, "loss": 0.00425645, "memory(GiB)": 26.31, "step": 2980, "train_speed(iter/s)": 0.582988 }, { "acc": 0.99819031, "epoch": 3.16542948038176, "grad_norm": 0.7630764842033386, "learning_rate": 8.180866185309493e-06, "loss": 0.00596784, "memory(GiB)": 26.31, "step": 2985, "train_speed(iter/s)": 0.582988 }, { "acc": 0.99840889, "epoch": 3.1707317073170733, "grad_norm": 0.7666534781455994, "learning_rate": 8.174097133511444e-06, "loss": 0.00508381, "memory(GiB)": 26.31, "step": 2990, "train_speed(iter/s)": 0.582986 }, { "acc": 0.99853182, "epoch": 3.176033934252386, "grad_norm": 0.497490793466568, "learning_rate": 8.167318323522232e-06, "loss": 0.00554109, "memory(GiB)": 26.31, "step": 2995, "train_speed(iter/s)": 0.582988 }, { "acc": 0.99774895, "epoch": 3.1813361611876987, "grad_norm": 0.9739230275154114, "learning_rate": 8.160529776185369e-06, "loss": 0.00726523, "memory(GiB)": 26.31, "step": 3000, "train_speed(iter/s)": 0.582985 }, { "acc": 0.99790821, "epoch": 3.186638388123012, "grad_norm": 0.8522117137908936, "learning_rate": 8.153731512374317e-06, "loss": 0.00599418, "memory(GiB)": 26.31, "step": 3005, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99876814, "epoch": 3.1919406150583245, "grad_norm": 0.3050813674926758, "learning_rate": 8.146923552992406e-06, "loss": 0.00465048, "memory(GiB)": 26.31, "step": 3010, "train_speed(iter/s)": 0.582994 }, { "acc": 0.99889717, "epoch": 3.197242841993637, "grad_norm": 0.42212408781051636, "learning_rate": 8.14010591897278e-06, "loss": 0.00312484, "memory(GiB)": 26.31, "step": 3015, "train_speed(iter/s)": 0.582998 }, { "acc": 0.99857616, "epoch": 3.2025450689289503, "grad_norm": 0.4540407061576843, "learning_rate": 8.133278631278335e-06, "loss": 0.00428031, "memory(GiB)": 26.31, "step": 3020, "train_speed(iter/s)": 0.582996 }, { "acc": 0.99821129, "epoch": 3.207847295864263, "grad_norm": 0.46233507990837097, "learning_rate": 8.126441710901645e-06, "loss": 0.00802353, "memory(GiB)": 26.31, "step": 3025, "train_speed(iter/s)": 0.583002 }, { "acc": 0.99873466, "epoch": 3.2131495227995757, "grad_norm": 0.3596537411212921, "learning_rate": 8.119595178864904e-06, "loss": 0.00341255, "memory(GiB)": 26.31, "step": 3030, "train_speed(iter/s)": 0.583002 }, { "acc": 0.99799576, "epoch": 3.218451749734889, "grad_norm": 0.576862633228302, "learning_rate": 8.112739056219863e-06, "loss": 0.00627927, "memory(GiB)": 26.31, "step": 3035, "train_speed(iter/s)": 0.583007 }, { "acc": 0.99877367, "epoch": 3.2237539766702015, "grad_norm": 0.6581805348396301, "learning_rate": 8.105873364047757e-06, "loss": 0.00428308, "memory(GiB)": 26.31, "step": 3040, "train_speed(iter/s)": 0.583009 }, { "acc": 0.9980196, "epoch": 3.229056203605514, "grad_norm": 0.5988194942474365, "learning_rate": 8.098998123459246e-06, "loss": 0.00572769, "memory(GiB)": 26.31, "step": 3045, "train_speed(iter/s)": 0.583008 }, { "acc": 0.9987751, "epoch": 3.2343584305408273, "grad_norm": 0.5027371644973755, "learning_rate": 8.092113355594356e-06, "loss": 0.00393928, "memory(GiB)": 26.31, "step": 3050, "train_speed(iter/s)": 0.583009 }, { "acc": 0.99803228, "epoch": 3.23966065747614, "grad_norm": 0.9007562398910522, "learning_rate": 8.085219081622403e-06, "loss": 0.0059559, "memory(GiB)": 26.31, "step": 3055, "train_speed(iter/s)": 0.583009 }, { "acc": 0.99805775, "epoch": 3.2449628844114526, "grad_norm": 1.047590970993042, "learning_rate": 8.078315322741928e-06, "loss": 0.0064345, "memory(GiB)": 26.31, "step": 3060, "train_speed(iter/s)": 0.58301 }, { "acc": 0.99793072, "epoch": 3.2502651113467658, "grad_norm": 0.828264057636261, "learning_rate": 8.071402100180646e-06, "loss": 0.00672631, "memory(GiB)": 26.31, "step": 3065, "train_speed(iter/s)": 0.583012 }, { "acc": 0.99873915, "epoch": 3.2555673382820784, "grad_norm": 0.2712744176387787, "learning_rate": 8.064479435195362e-06, "loss": 0.00426545, "memory(GiB)": 26.31, "step": 3070, "train_speed(iter/s)": 0.58301 }, { "acc": 0.99843769, "epoch": 3.260869565217391, "grad_norm": 0.7335013151168823, "learning_rate": 8.05754734907192e-06, "loss": 0.00619958, "memory(GiB)": 26.31, "step": 3075, "train_speed(iter/s)": 0.583015 }, { "acc": 0.99847326, "epoch": 3.2661717921527043, "grad_norm": 0.544634997844696, "learning_rate": 8.050605863125132e-06, "loss": 0.00407071, "memory(GiB)": 26.31, "step": 3080, "train_speed(iter/s)": 0.583014 }, { "acc": 0.99833088, "epoch": 3.271474019088017, "grad_norm": 0.6793374419212341, "learning_rate": 8.04365499869871e-06, "loss": 0.00461064, "memory(GiB)": 26.31, "step": 3085, "train_speed(iter/s)": 0.583018 }, { "acc": 0.99863014, "epoch": 3.2767762460233296, "grad_norm": 0.5401763319969177, "learning_rate": 8.036694777165202e-06, "loss": 0.00503164, "memory(GiB)": 26.31, "step": 3090, "train_speed(iter/s)": 0.583023 }, { "acc": 0.99816036, "epoch": 3.2820784729586427, "grad_norm": 0.3845391571521759, "learning_rate": 8.029725219925932e-06, "loss": 0.00504886, "memory(GiB)": 26.31, "step": 3095, "train_speed(iter/s)": 0.583034 }, { "acc": 0.99808779, "epoch": 3.2873806998939554, "grad_norm": 0.49690601229667664, "learning_rate": 8.022746348410924e-06, "loss": 0.00739293, "memory(GiB)": 26.31, "step": 3100, "train_speed(iter/s)": 0.58304 }, { "acc": 0.99887943, "epoch": 3.292682926829268, "grad_norm": 0.667151927947998, "learning_rate": 8.015758184078849e-06, "loss": 0.00365211, "memory(GiB)": 26.31, "step": 3105, "train_speed(iter/s)": 0.583039 }, { "acc": 0.99844265, "epoch": 3.2979851537645812, "grad_norm": 0.32358503341674805, "learning_rate": 8.008760748416942e-06, "loss": 0.00485204, "memory(GiB)": 26.31, "step": 3110, "train_speed(iter/s)": 0.583039 }, { "acc": 0.99788647, "epoch": 3.303287380699894, "grad_norm": 0.7840277552604675, "learning_rate": 8.001754062940956e-06, "loss": 0.00626139, "memory(GiB)": 26.31, "step": 3115, "train_speed(iter/s)": 0.583038 }, { "acc": 0.99826546, "epoch": 3.3085896076352066, "grad_norm": 0.38156384229660034, "learning_rate": 7.994738149195074e-06, "loss": 0.00547561, "memory(GiB)": 26.31, "step": 3120, "train_speed(iter/s)": 0.583036 }, { "acc": 0.99824371, "epoch": 3.3138918345705197, "grad_norm": 0.6840994358062744, "learning_rate": 7.987713028751866e-06, "loss": 0.00518895, "memory(GiB)": 26.31, "step": 3125, "train_speed(iter/s)": 0.583035 }, { "acc": 0.99864225, "epoch": 3.3191940615058324, "grad_norm": 0.4946794807910919, "learning_rate": 7.9806787232122e-06, "loss": 0.00464142, "memory(GiB)": 26.31, "step": 3130, "train_speed(iter/s)": 0.583034 }, { "acc": 0.99752941, "epoch": 3.3244962884411455, "grad_norm": 0.7485939860343933, "learning_rate": 7.973635254205194e-06, "loss": 0.00718605, "memory(GiB)": 26.31, "step": 3135, "train_speed(iter/s)": 0.583046 }, { "acc": 0.99850063, "epoch": 3.329798515376458, "grad_norm": 0.5883116722106934, "learning_rate": 7.96658264338814e-06, "loss": 0.00474916, "memory(GiB)": 26.31, "step": 3140, "train_speed(iter/s)": 0.583044 }, { "acc": 0.99846611, "epoch": 3.335100742311771, "grad_norm": 0.5782555937767029, "learning_rate": 7.959520912446434e-06, "loss": 0.00569647, "memory(GiB)": 26.31, "step": 3145, "train_speed(iter/s)": 0.583046 }, { "acc": 0.99788933, "epoch": 3.3404029692470836, "grad_norm": 0.5309669375419617, "learning_rate": 7.952450083093521e-06, "loss": 0.00625908, "memory(GiB)": 26.31, "step": 3150, "train_speed(iter/s)": 0.583047 }, { "acc": 0.99900951, "epoch": 3.3457051961823967, "grad_norm": 0.366931676864624, "learning_rate": 7.945370177070823e-06, "loss": 0.00502579, "memory(GiB)": 26.31, "step": 3155, "train_speed(iter/s)": 0.583052 }, { "acc": 0.99902477, "epoch": 3.3510074231177094, "grad_norm": 0.38507696986198425, "learning_rate": 7.938281216147664e-06, "loss": 0.00419005, "memory(GiB)": 26.31, "step": 3160, "train_speed(iter/s)": 0.583057 }, { "acc": 0.99903908, "epoch": 3.3563096500530225, "grad_norm": 0.18632544577121735, "learning_rate": 7.931183222121217e-06, "loss": 0.00402411, "memory(GiB)": 26.31, "step": 3165, "train_speed(iter/s)": 0.583059 }, { "acc": 0.99872265, "epoch": 3.361611876988335, "grad_norm": 0.5425890684127808, "learning_rate": 7.924076216816423e-06, "loss": 0.00426805, "memory(GiB)": 26.31, "step": 3170, "train_speed(iter/s)": 0.58307 }, { "acc": 0.99785099, "epoch": 3.366914103923648, "grad_norm": 0.5266686081886292, "learning_rate": 7.916960222085938e-06, "loss": 0.00595398, "memory(GiB)": 26.31, "step": 3175, "train_speed(iter/s)": 0.583065 }, { "acc": 0.99811115, "epoch": 3.3722163308589606, "grad_norm": 0.8457557559013367, "learning_rate": 7.909835259810054e-06, "loss": 0.00490667, "memory(GiB)": 26.31, "step": 3180, "train_speed(iter/s)": 0.583063 }, { "acc": 0.99839916, "epoch": 3.3775185577942737, "grad_norm": 0.762008786201477, "learning_rate": 7.90270135189664e-06, "loss": 0.00429381, "memory(GiB)": 26.31, "step": 3185, "train_speed(iter/s)": 0.583065 }, { "acc": 0.9985775, "epoch": 3.3828207847295864, "grad_norm": 0.6448890566825867, "learning_rate": 7.895558520281066e-06, "loss": 0.0047732, "memory(GiB)": 26.31, "step": 3190, "train_speed(iter/s)": 0.583062 }, { "acc": 0.99828854, "epoch": 3.3881230116648995, "grad_norm": 0.6847289204597473, "learning_rate": 7.888406786926148e-06, "loss": 0.00535885, "memory(GiB)": 26.31, "step": 3195, "train_speed(iter/s)": 0.583062 }, { "acc": 0.99830017, "epoch": 3.393425238600212, "grad_norm": 0.4629160165786743, "learning_rate": 7.881246173822066e-06, "loss": 0.00592602, "memory(GiB)": 26.31, "step": 3200, "train_speed(iter/s)": 0.583058 }, { "acc": 0.99878588, "epoch": 3.398727465535525, "grad_norm": 0.4068206250667572, "learning_rate": 7.874076702986305e-06, "loss": 0.00484697, "memory(GiB)": 26.31, "step": 3205, "train_speed(iter/s)": 0.583052 }, { "acc": 0.99862919, "epoch": 3.4040296924708375, "grad_norm": 0.5081034898757935, "learning_rate": 7.866898396463588e-06, "loss": 0.00621317, "memory(GiB)": 26.31, "step": 3210, "train_speed(iter/s)": 0.583047 }, { "acc": 0.99848499, "epoch": 3.4093319194061507, "grad_norm": 0.20022617280483246, "learning_rate": 7.859711276325807e-06, "loss": 0.00451258, "memory(GiB)": 26.31, "step": 3215, "train_speed(iter/s)": 0.583041 }, { "acc": 0.99856482, "epoch": 3.4146341463414633, "grad_norm": 0.7532142400741577, "learning_rate": 7.85251536467195e-06, "loss": 0.00636707, "memory(GiB)": 26.31, "step": 3220, "train_speed(iter/s)": 0.583045 }, { "acc": 0.99817324, "epoch": 3.4199363732767765, "grad_norm": 0.6302582621574402, "learning_rate": 7.845310683628044e-06, "loss": 0.00610769, "memory(GiB)": 26.31, "step": 3225, "train_speed(iter/s)": 0.583041 }, { "acc": 0.99841137, "epoch": 3.425238600212089, "grad_norm": 0.32306987047195435, "learning_rate": 7.83809725534707e-06, "loss": 0.00437971, "memory(GiB)": 26.31, "step": 3230, "train_speed(iter/s)": 0.583039 }, { "acc": 0.99868917, "epoch": 3.430540827147402, "grad_norm": 0.7746492028236389, "learning_rate": 7.830875102008913e-06, "loss": 0.00378766, "memory(GiB)": 26.31, "step": 3235, "train_speed(iter/s)": 0.583036 }, { "acc": 0.99835024, "epoch": 3.4358430540827145, "grad_norm": 0.8032320141792297, "learning_rate": 7.823644245820282e-06, "loss": 0.00470252, "memory(GiB)": 26.31, "step": 3240, "train_speed(iter/s)": 0.583033 }, { "acc": 0.99794064, "epoch": 3.4411452810180276, "grad_norm": 0.5391832590103149, "learning_rate": 7.81640470901465e-06, "loss": 0.00547033, "memory(GiB)": 26.31, "step": 3245, "train_speed(iter/s)": 0.583029 }, { "acc": 0.99810715, "epoch": 3.4464475079533403, "grad_norm": 0.7630506753921509, "learning_rate": 7.80915651385218e-06, "loss": 0.00489411, "memory(GiB)": 26.31, "step": 3250, "train_speed(iter/s)": 0.583026 }, { "acc": 0.99926071, "epoch": 3.4517497348886534, "grad_norm": 0.25475117564201355, "learning_rate": 7.801899682619649e-06, "loss": 0.00258712, "memory(GiB)": 26.31, "step": 3255, "train_speed(iter/s)": 0.583022 }, { "acc": 0.99808798, "epoch": 3.457051961823966, "grad_norm": 0.8293874263763428, "learning_rate": 7.794634237630399e-06, "loss": 0.00512287, "memory(GiB)": 26.31, "step": 3260, "train_speed(iter/s)": 0.583018 }, { "acc": 0.99783936, "epoch": 3.462354188759279, "grad_norm": 0.6153422594070435, "learning_rate": 7.787360201224255e-06, "loss": 0.00503631, "memory(GiB)": 26.31, "step": 3265, "train_speed(iter/s)": 0.583018 }, { "acc": 0.99905224, "epoch": 3.4676564156945915, "grad_norm": 0.44570717215538025, "learning_rate": 7.780077595767458e-06, "loss": 0.00378264, "memory(GiB)": 26.31, "step": 3270, "train_speed(iter/s)": 0.583013 }, { "acc": 0.99846916, "epoch": 3.4729586426299046, "grad_norm": 0.2163510024547577, "learning_rate": 7.772786443652594e-06, "loss": 0.00520158, "memory(GiB)": 26.31, "step": 3275, "train_speed(iter/s)": 0.58301 }, { "acc": 0.99852314, "epoch": 3.4782608695652173, "grad_norm": 0.646382749080658, "learning_rate": 7.765486767298536e-06, "loss": 0.00608887, "memory(GiB)": 26.31, "step": 3280, "train_speed(iter/s)": 0.583007 }, { "acc": 0.99863243, "epoch": 3.4835630965005304, "grad_norm": 0.19896253943443298, "learning_rate": 7.758178589150358e-06, "loss": 0.00385373, "memory(GiB)": 26.31, "step": 3285, "train_speed(iter/s)": 0.583011 }, { "acc": 0.99773331, "epoch": 3.488865323435843, "grad_norm": 0.7520610690116882, "learning_rate": 7.750861931679285e-06, "loss": 0.00724002, "memory(GiB)": 26.31, "step": 3290, "train_speed(iter/s)": 0.583006 }, { "acc": 0.99894562, "epoch": 3.494167550371156, "grad_norm": 0.26213565468788147, "learning_rate": 7.743536817382603e-06, "loss": 0.00309457, "memory(GiB)": 26.31, "step": 3295, "train_speed(iter/s)": 0.583003 }, { "acc": 0.99864101, "epoch": 3.499469777306469, "grad_norm": 0.6381789445877075, "learning_rate": 7.73620326878361e-06, "loss": 0.00587287, "memory(GiB)": 26.31, "step": 3300, "train_speed(iter/s)": 0.582999 }, { "acc": 0.99802217, "epoch": 3.5047720042417816, "grad_norm": 0.4187053442001343, "learning_rate": 7.728861308431538e-06, "loss": 0.00545754, "memory(GiB)": 26.31, "step": 3305, "train_speed(iter/s)": 0.582994 }, { "acc": 0.9990015, "epoch": 3.5100742311770943, "grad_norm": 0.3954903781414032, "learning_rate": 7.721510958901476e-06, "loss": 0.0044807, "memory(GiB)": 26.31, "step": 3310, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99838123, "epoch": 3.5153764581124074, "grad_norm": 0.635510265827179, "learning_rate": 7.714152242794319e-06, "loss": 0.00571715, "memory(GiB)": 26.31, "step": 3315, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99914103, "epoch": 3.52067868504772, "grad_norm": 0.6559579968452454, "learning_rate": 7.706785182736675e-06, "loss": 0.00398727, "memory(GiB)": 26.31, "step": 3320, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99828911, "epoch": 3.5259809119830328, "grad_norm": 0.6975615620613098, "learning_rate": 7.699409801380816e-06, "loss": 0.00605518, "memory(GiB)": 26.31, "step": 3325, "train_speed(iter/s)": 0.582989 }, { "acc": 0.99820814, "epoch": 3.5312831389183454, "grad_norm": 0.4516032338142395, "learning_rate": 7.692026121404602e-06, "loss": 0.00592186, "memory(GiB)": 26.31, "step": 3330, "train_speed(iter/s)": 0.582983 }, { "acc": 0.99784756, "epoch": 3.5365853658536586, "grad_norm": 0.6378289461135864, "learning_rate": 7.684634165511404e-06, "loss": 0.0053414, "memory(GiB)": 26.31, "step": 3335, "train_speed(iter/s)": 0.58299 }, { "acc": 0.99784336, "epoch": 3.5418875927889713, "grad_norm": 0.6353384852409363, "learning_rate": 7.677233956430041e-06, "loss": 0.00759888, "memory(GiB)": 26.31, "step": 3340, "train_speed(iter/s)": 0.582989 }, { "acc": 0.9990695, "epoch": 3.5471898197242844, "grad_norm": 0.4579092264175415, "learning_rate": 7.669825516914713e-06, "loss": 0.00361843, "memory(GiB)": 26.31, "step": 3345, "train_speed(iter/s)": 0.582995 }, { "acc": 0.99890776, "epoch": 3.552492046659597, "grad_norm": 0.6212195754051208, "learning_rate": 7.662408869744921e-06, "loss": 0.00401128, "memory(GiB)": 26.31, "step": 3350, "train_speed(iter/s)": 0.582995 }, { "acc": 0.99883776, "epoch": 3.5577942735949097, "grad_norm": 0.2507479786872864, "learning_rate": 7.65498403772541e-06, "loss": 0.00377725, "memory(GiB)": 26.31, "step": 3355, "train_speed(iter/s)": 0.582991 }, { "acc": 0.99893551, "epoch": 3.5630965005302224, "grad_norm": 0.6308388113975525, "learning_rate": 7.647551043686084e-06, "loss": 0.00425741, "memory(GiB)": 26.31, "step": 3360, "train_speed(iter/s)": 0.582986 }, { "acc": 0.99891872, "epoch": 3.5683987274655355, "grad_norm": 0.3604690134525299, "learning_rate": 7.640109910481947e-06, "loss": 0.00557023, "memory(GiB)": 26.31, "step": 3365, "train_speed(iter/s)": 0.582982 }, { "acc": 0.99903145, "epoch": 3.5737009544008482, "grad_norm": 0.19093440473079681, "learning_rate": 7.632660660993036e-06, "loss": 0.00356767, "memory(GiB)": 26.31, "step": 3370, "train_speed(iter/s)": 0.582979 }, { "acc": 0.99842091, "epoch": 3.5790031813361614, "grad_norm": 0.8038215637207031, "learning_rate": 7.625203318124332e-06, "loss": 0.00609244, "memory(GiB)": 26.31, "step": 3375, "train_speed(iter/s)": 0.582973 }, { "acc": 0.99883862, "epoch": 3.584305408271474, "grad_norm": 0.42334866523742676, "learning_rate": 7.617737904805709e-06, "loss": 0.00335372, "memory(GiB)": 26.31, "step": 3380, "train_speed(iter/s)": 0.582972 }, { "acc": 0.99890652, "epoch": 3.5896076352067867, "grad_norm": 0.5734203457832336, "learning_rate": 7.610264443991855e-06, "loss": 0.0062226, "memory(GiB)": 26.31, "step": 3385, "train_speed(iter/s)": 0.582974 }, { "acc": 0.99906788, "epoch": 3.5949098621421, "grad_norm": 0.5392136573791504, "learning_rate": 7.6027829586622016e-06, "loss": 0.00349057, "memory(GiB)": 26.31, "step": 3390, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99890661, "epoch": 3.6002120890774125, "grad_norm": 0.44584783911705017, "learning_rate": 7.5952934718208525e-06, "loss": 0.00261679, "memory(GiB)": 26.31, "step": 3395, "train_speed(iter/s)": 0.582966 }, { "acc": 0.99926147, "epoch": 3.605514316012725, "grad_norm": 0.3416913151741028, "learning_rate": 7.587796006496522e-06, "loss": 0.00221901, "memory(GiB)": 26.31, "step": 3400, "train_speed(iter/s)": 0.582958 }, { "acc": 0.99889278, "epoch": 3.6108165429480383, "grad_norm": 0.5704333186149597, "learning_rate": 7.580290585742445e-06, "loss": 0.0049392, "memory(GiB)": 26.31, "step": 3405, "train_speed(iter/s)": 0.582953 }, { "acc": 0.99948616, "epoch": 3.616118769883351, "grad_norm": 0.42870983481407166, "learning_rate": 7.572777232636328e-06, "loss": 0.00195065, "memory(GiB)": 26.31, "step": 3410, "train_speed(iter/s)": 0.582951 }, { "acc": 0.99882116, "epoch": 3.6214209968186637, "grad_norm": 0.202960804104805, "learning_rate": 7.565255970280263e-06, "loss": 0.00449147, "memory(GiB)": 26.31, "step": 3415, "train_speed(iter/s)": 0.582946 }, { "acc": 0.99910755, "epoch": 3.626723223753977, "grad_norm": 0.6247968077659607, "learning_rate": 7.557726821800661e-06, "loss": 0.00416288, "memory(GiB)": 26.31, "step": 3420, "train_speed(iter/s)": 0.582942 }, { "acc": 0.99842358, "epoch": 3.6320254506892895, "grad_norm": 0.5606263875961304, "learning_rate": 7.550189810348183e-06, "loss": 0.0057487, "memory(GiB)": 26.31, "step": 3425, "train_speed(iter/s)": 0.582939 }, { "acc": 0.99884071, "epoch": 3.637327677624602, "grad_norm": 0.521111786365509, "learning_rate": 7.542644959097668e-06, "loss": 0.00436895, "memory(GiB)": 26.31, "step": 3430, "train_speed(iter/s)": 0.582937 }, { "acc": 0.99864883, "epoch": 3.6426299045599153, "grad_norm": 0.42435264587402344, "learning_rate": 7.535092291248058e-06, "loss": 0.00424842, "memory(GiB)": 26.31, "step": 3435, "train_speed(iter/s)": 0.582939 }, { "acc": 0.99865971, "epoch": 3.647932131495228, "grad_norm": 0.5255548357963562, "learning_rate": 7.5275318300223345e-06, "loss": 0.00307308, "memory(GiB)": 26.31, "step": 3440, "train_speed(iter/s)": 0.582934 }, { "acc": 0.99876957, "epoch": 3.6532343584305407, "grad_norm": 0.34655559062957764, "learning_rate": 7.519963598667434e-06, "loss": 0.00385971, "memory(GiB)": 26.31, "step": 3445, "train_speed(iter/s)": 0.582934 }, { "acc": 0.99846992, "epoch": 3.658536585365854, "grad_norm": 0.617057740688324, "learning_rate": 7.5123876204541925e-06, "loss": 0.00614815, "memory(GiB)": 26.31, "step": 3450, "train_speed(iter/s)": 0.582928 }, { "acc": 0.99899044, "epoch": 3.6638388123011665, "grad_norm": 0.5251142978668213, "learning_rate": 7.504803918677261e-06, "loss": 0.00470486, "memory(GiB)": 26.31, "step": 3455, "train_speed(iter/s)": 0.582919 }, { "acc": 0.99908333, "epoch": 3.669141039236479, "grad_norm": 0.7263888120651245, "learning_rate": 7.497212516655043e-06, "loss": 0.00408497, "memory(GiB)": 26.31, "step": 3460, "train_speed(iter/s)": 0.582912 }, { "acc": 0.99742146, "epoch": 3.6744432661717923, "grad_norm": 0.6099832653999329, "learning_rate": 7.489613437729614e-06, "loss": 0.00595545, "memory(GiB)": 26.31, "step": 3465, "train_speed(iter/s)": 0.582906 }, { "acc": 0.99899769, "epoch": 3.679745493107105, "grad_norm": 1.0556585788726807, "learning_rate": 7.482006705266659e-06, "loss": 0.00584042, "memory(GiB)": 26.31, "step": 3470, "train_speed(iter/s)": 0.582903 }, { "acc": 0.99856606, "epoch": 3.6850477200424177, "grad_norm": 0.09941625595092773, "learning_rate": 7.474392342655393e-06, "loss": 0.00341795, "memory(GiB)": 26.31, "step": 3475, "train_speed(iter/s)": 0.582897 }, { "acc": 0.99857359, "epoch": 3.6903499469777308, "grad_norm": 0.6893133521080017, "learning_rate": 7.466770373308494e-06, "loss": 0.00646025, "memory(GiB)": 26.31, "step": 3480, "train_speed(iter/s)": 0.582891 }, { "acc": 0.99913788, "epoch": 3.6956521739130435, "grad_norm": 0.5789036750793457, "learning_rate": 7.459140820662029e-06, "loss": 0.00307661, "memory(GiB)": 26.31, "step": 3485, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99864273, "epoch": 3.7009544008483566, "grad_norm": 0.5417762398719788, "learning_rate": 7.451503708175382e-06, "loss": 0.00463357, "memory(GiB)": 26.31, "step": 3490, "train_speed(iter/s)": 0.582884 }, { "acc": 0.99816399, "epoch": 3.7062566277836693, "grad_norm": 0.8491963148117065, "learning_rate": 7.4438590593311795e-06, "loss": 0.00563908, "memory(GiB)": 26.31, "step": 3495, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99839602, "epoch": 3.711558854718982, "grad_norm": 0.42541468143463135, "learning_rate": 7.436206897635227e-06, "loss": 0.00373492, "memory(GiB)": 26.31, "step": 3500, "train_speed(iter/s)": 0.582891 }, { "acc": 0.99912691, "epoch": 3.7168610816542946, "grad_norm": 0.6051958799362183, "learning_rate": 7.428547246616425e-06, "loss": 0.00373506, "memory(GiB)": 26.31, "step": 3505, "train_speed(iter/s)": 0.58289 }, { "acc": 0.99895039, "epoch": 3.7221633085896078, "grad_norm": 0.6306867003440857, "learning_rate": 7.420880129826703e-06, "loss": 0.00296825, "memory(GiB)": 26.31, "step": 3510, "train_speed(iter/s)": 0.582881 }, { "acc": 0.99890804, "epoch": 3.7274655355249204, "grad_norm": 0.2302398830652237, "learning_rate": 7.413205570840947e-06, "loss": 0.00509791, "memory(GiB)": 26.31, "step": 3515, "train_speed(iter/s)": 0.58288 }, { "acc": 0.99802408, "epoch": 3.7327677624602336, "grad_norm": 0.7267009615898132, "learning_rate": 7.405523593256929e-06, "loss": 0.00455983, "memory(GiB)": 26.31, "step": 3520, "train_speed(iter/s)": 0.582875 }, { "acc": 0.99777699, "epoch": 3.7380699893955462, "grad_norm": 0.44353216886520386, "learning_rate": 7.397834220695225e-06, "loss": 0.00718109, "memory(GiB)": 26.31, "step": 3525, "train_speed(iter/s)": 0.582872 }, { "acc": 0.99837513, "epoch": 3.743372216330859, "grad_norm": 0.31030791997909546, "learning_rate": 7.390137476799156e-06, "loss": 0.00507879, "memory(GiB)": 26.31, "step": 3530, "train_speed(iter/s)": 0.582872 }, { "acc": 0.99864407, "epoch": 3.7486744432661716, "grad_norm": 0.37967705726623535, "learning_rate": 7.382433385234707e-06, "loss": 0.00317531, "memory(GiB)": 26.31, "step": 3535, "train_speed(iter/s)": 0.582872 }, { "acc": 0.99891911, "epoch": 3.7539766702014847, "grad_norm": 0.437358558177948, "learning_rate": 7.374721969690455e-06, "loss": 0.00412428, "memory(GiB)": 26.31, "step": 3540, "train_speed(iter/s)": 0.582873 }, { "acc": 0.99910431, "epoch": 3.7592788971367974, "grad_norm": 0.7306728363037109, "learning_rate": 7.367003253877494e-06, "loss": 0.00288328, "memory(GiB)": 26.31, "step": 3545, "train_speed(iter/s)": 0.582878 }, { "acc": 0.99876919, "epoch": 3.7645811240721105, "grad_norm": 0.45699283480644226, "learning_rate": 7.359277261529366e-06, "loss": 0.00326591, "memory(GiB)": 26.31, "step": 3550, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99903383, "epoch": 3.7698833510074232, "grad_norm": 0.35999491810798645, "learning_rate": 7.35154401640199e-06, "loss": 0.00265577, "memory(GiB)": 26.31, "step": 3555, "train_speed(iter/s)": 0.582879 }, { "acc": 0.99812126, "epoch": 3.775185577942736, "grad_norm": 0.6905408501625061, "learning_rate": 7.343803542273583e-06, "loss": 0.00630668, "memory(GiB)": 26.31, "step": 3560, "train_speed(iter/s)": 0.582879 }, { "acc": 0.99930096, "epoch": 3.7804878048780486, "grad_norm": 0.20918644964694977, "learning_rate": 7.336055862944592e-06, "loss": 0.00258163, "memory(GiB)": 26.31, "step": 3565, "train_speed(iter/s)": 0.582885 }, { "acc": 0.99900961, "epoch": 3.7857900318133617, "grad_norm": 0.3650825023651123, "learning_rate": 7.328301002237616e-06, "loss": 0.00405125, "memory(GiB)": 26.31, "step": 3570, "train_speed(iter/s)": 0.582894 }, { "acc": 0.99857883, "epoch": 3.7910922587486744, "grad_norm": 0.4976001977920532, "learning_rate": 7.320538983997338e-06, "loss": 0.00360008, "memory(GiB)": 26.31, "step": 3575, "train_speed(iter/s)": 0.582893 }, { "acc": 0.99873447, "epoch": 3.7963944856839875, "grad_norm": 0.16031405329704285, "learning_rate": 7.312769832090447e-06, "loss": 0.00385965, "memory(GiB)": 26.31, "step": 3580, "train_speed(iter/s)": 0.582893 }, { "acc": 0.99866476, "epoch": 3.8016967126193, "grad_norm": 0.4230116307735443, "learning_rate": 7.304993570405567e-06, "loss": 0.0040944, "memory(GiB)": 26.31, "step": 3585, "train_speed(iter/s)": 0.582891 }, { "acc": 0.99939251, "epoch": 3.806998939554613, "grad_norm": 0.18150892853736877, "learning_rate": 7.297210222853182e-06, "loss": 0.00284926, "memory(GiB)": 26.31, "step": 3590, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99903259, "epoch": 3.8123011664899256, "grad_norm": 0.49581795930862427, "learning_rate": 7.2894198133655665e-06, "loss": 0.00506911, "memory(GiB)": 26.31, "step": 3595, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99930048, "epoch": 3.8176033934252387, "grad_norm": 0.1171208992600441, "learning_rate": 7.28162236589671e-06, "loss": 0.00261142, "memory(GiB)": 26.31, "step": 3600, "train_speed(iter/s)": 0.582888 }, { "acc": 0.99935799, "epoch": 3.8229056203605514, "grad_norm": 0.319844126701355, "learning_rate": 7.273817904422237e-06, "loss": 0.00172885, "memory(GiB)": 26.31, "step": 3605, "train_speed(iter/s)": 0.582886 }, { "acc": 0.99837723, "epoch": 3.8282078472958645, "grad_norm": 0.5585457682609558, "learning_rate": 7.266006452939342e-06, "loss": 0.0053467, "memory(GiB)": 26.31, "step": 3610, "train_speed(iter/s)": 0.582884 }, { "acc": 0.99833784, "epoch": 3.833510074231177, "grad_norm": 0.700864851474762, "learning_rate": 7.258188035466714e-06, "loss": 0.00526347, "memory(GiB)": 26.31, "step": 3615, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99890614, "epoch": 3.83881230116649, "grad_norm": 0.5112093687057495, "learning_rate": 7.250362676044458e-06, "loss": 0.00356812, "memory(GiB)": 26.31, "step": 3620, "train_speed(iter/s)": 0.582879 }, { "acc": 0.99928646, "epoch": 3.8441145281018025, "grad_norm": 0.23124846816062927, "learning_rate": 7.2425303987340236e-06, "loss": 0.00238931, "memory(GiB)": 26.31, "step": 3625, "train_speed(iter/s)": 0.582878 }, { "acc": 0.99836931, "epoch": 3.8494167550371157, "grad_norm": 0.7917247414588928, "learning_rate": 7.234691227618136e-06, "loss": 0.00518584, "memory(GiB)": 26.31, "step": 3630, "train_speed(iter/s)": 0.582878 }, { "acc": 0.99892216, "epoch": 3.8547189819724283, "grad_norm": 0.30003005266189575, "learning_rate": 7.226845186800714e-06, "loss": 0.00322571, "memory(GiB)": 26.31, "step": 3635, "train_speed(iter/s)": 0.582877 }, { "acc": 0.99873142, "epoch": 3.8600212089077415, "grad_norm": 0.6209852695465088, "learning_rate": 7.218992300406802e-06, "loss": 0.00309149, "memory(GiB)": 26.31, "step": 3640, "train_speed(iter/s)": 0.582878 }, { "acc": 0.99855528, "epoch": 3.865323435843054, "grad_norm": 0.4968811571598053, "learning_rate": 7.211132592582487e-06, "loss": 0.00370005, "memory(GiB)": 26.31, "step": 3645, "train_speed(iter/s)": 0.582878 }, { "acc": 0.99924831, "epoch": 3.870625662778367, "grad_norm": 0.22578194737434387, "learning_rate": 7.2032660874948405e-06, "loss": 0.00231544, "memory(GiB)": 26.31, "step": 3650, "train_speed(iter/s)": 0.582882 }, { "acc": 0.99914141, "epoch": 3.8759278897136795, "grad_norm": 0.361869752407074, "learning_rate": 7.195392809331824e-06, "loss": 0.0024379, "memory(GiB)": 26.31, "step": 3655, "train_speed(iter/s)": 0.582881 }, { "acc": 0.9986701, "epoch": 3.8812301166489926, "grad_norm": 0.6052594184875488, "learning_rate": 7.1875127823022326e-06, "loss": 0.00489041, "memory(GiB)": 26.31, "step": 3660, "train_speed(iter/s)": 0.582889 }, { "acc": 0.99880333, "epoch": 3.8865323435843053, "grad_norm": 0.34441888332366943, "learning_rate": 7.179626030635611e-06, "loss": 0.00405798, "memory(GiB)": 26.31, "step": 3665, "train_speed(iter/s)": 0.582888 }, { "acc": 0.99863358, "epoch": 3.8918345705196185, "grad_norm": 0.7074710726737976, "learning_rate": 7.171732578582176e-06, "loss": 0.0041889, "memory(GiB)": 26.31, "step": 3670, "train_speed(iter/s)": 0.582887 }, { "acc": 0.99890327, "epoch": 3.897136797454931, "grad_norm": 0.5451133847236633, "learning_rate": 7.163832450412752e-06, "loss": 0.00401347, "memory(GiB)": 26.31, "step": 3675, "train_speed(iter/s)": 0.582885 }, { "acc": 0.99922581, "epoch": 3.902439024390244, "grad_norm": 0.23822365701198578, "learning_rate": 7.155925670418691e-06, "loss": 0.00292925, "memory(GiB)": 26.31, "step": 3680, "train_speed(iter/s)": 0.582892 }, { "acc": 0.99865017, "epoch": 3.9077412513255565, "grad_norm": 0.6244868040084839, "learning_rate": 7.148012262911795e-06, "loss": 0.00549463, "memory(GiB)": 26.31, "step": 3685, "train_speed(iter/s)": 0.58289 }, { "acc": 0.99927311, "epoch": 3.9130434782608696, "grad_norm": 0.47247928380966187, "learning_rate": 7.140092252224247e-06, "loss": 0.00199212, "memory(GiB)": 26.31, "step": 3690, "train_speed(iter/s)": 0.582894 }, { "acc": 0.99855614, "epoch": 3.9183457051961823, "grad_norm": 0.6953317523002625, "learning_rate": 7.1321656627085315e-06, "loss": 0.00421569, "memory(GiB)": 26.31, "step": 3695, "train_speed(iter/s)": 0.582895 }, { "acc": 0.99920654, "epoch": 3.9236479321314954, "grad_norm": 0.7520684599876404, "learning_rate": 7.124232518737365e-06, "loss": 0.00460742, "memory(GiB)": 26.31, "step": 3700, "train_speed(iter/s)": 0.582895 }, { "acc": 0.99842453, "epoch": 3.928950159066808, "grad_norm": 0.3329322040081024, "learning_rate": 7.116292844703613e-06, "loss": 0.00535532, "memory(GiB)": 26.31, "step": 3705, "train_speed(iter/s)": 0.582895 }, { "acc": 0.99893045, "epoch": 3.934252386002121, "grad_norm": 0.5064216256141663, "learning_rate": 7.108346665020224e-06, "loss": 0.00310686, "memory(GiB)": 26.31, "step": 3710, "train_speed(iter/s)": 0.582899 }, { "acc": 0.99881392, "epoch": 3.9395546129374335, "grad_norm": 0.5223778486251831, "learning_rate": 7.100394004120146e-06, "loss": 0.00423855, "memory(GiB)": 26.31, "step": 3715, "train_speed(iter/s)": 0.582897 }, { "acc": 0.99847584, "epoch": 3.9448568398727466, "grad_norm": 0.4084107279777527, "learning_rate": 7.092434886456258e-06, "loss": 0.00451448, "memory(GiB)": 26.31, "step": 3720, "train_speed(iter/s)": 0.582896 }, { "acc": 0.99924583, "epoch": 3.9501590668080593, "grad_norm": 0.4136032164096832, "learning_rate": 7.084469336501293e-06, "loss": 0.00311526, "memory(GiB)": 26.31, "step": 3725, "train_speed(iter/s)": 0.582899 }, { "acc": 0.9987318, "epoch": 3.9554612937433724, "grad_norm": 0.0378059521317482, "learning_rate": 7.076497378747761e-06, "loss": 0.00337362, "memory(GiB)": 26.31, "step": 3730, "train_speed(iter/s)": 0.582897 }, { "acc": 0.99868851, "epoch": 3.960763520678685, "grad_norm": 0.6353269219398499, "learning_rate": 7.068519037707873e-06, "loss": 0.00564696, "memory(GiB)": 26.31, "step": 3735, "train_speed(iter/s)": 0.582897 }, { "acc": 0.99876595, "epoch": 3.9660657476139978, "grad_norm": 0.6448261141777039, "learning_rate": 7.060534337913472e-06, "loss": 0.00475402, "memory(GiB)": 26.31, "step": 3740, "train_speed(iter/s)": 0.582898 }, { "acc": 0.99950418, "epoch": 3.9713679745493105, "grad_norm": 0.48298442363739014, "learning_rate": 7.052543303915944e-06, "loss": 0.00231354, "memory(GiB)": 26.31, "step": 3745, "train_speed(iter/s)": 0.582907 }, { "acc": 0.99877338, "epoch": 3.9766702014846236, "grad_norm": 0.329344242811203, "learning_rate": 7.044545960286163e-06, "loss": 0.00474007, "memory(GiB)": 26.31, "step": 3750, "train_speed(iter/s)": 0.58291 }, { "acc": 0.99898186, "epoch": 3.9819724284199363, "grad_norm": 0.2393208146095276, "learning_rate": 7.036542331614395e-06, "loss": 0.00327823, "memory(GiB)": 26.31, "step": 3755, "train_speed(iter/s)": 0.58291 }, { "acc": 0.99976311, "epoch": 3.9872746553552494, "grad_norm": 0.32558730244636536, "learning_rate": 7.028532442510238e-06, "loss": 0.00244776, "memory(GiB)": 26.31, "step": 3760, "train_speed(iter/s)": 0.582914 }, { "acc": 0.99965019, "epoch": 3.992576882290562, "grad_norm": 0.2150496244430542, "learning_rate": 7.020516317602532e-06, "loss": 0.00221206, "memory(GiB)": 26.31, "step": 3765, "train_speed(iter/s)": 0.582919 }, { "acc": 0.99868097, "epoch": 3.9978791092258747, "grad_norm": 1.0313560962677002, "learning_rate": 7.0124939815392985e-06, "loss": 0.00410159, "memory(GiB)": 26.31, "step": 3770, "train_speed(iter/s)": 0.582924 }, { "acc": 0.99927502, "epoch": 4.003181336161187, "grad_norm": 0.49767929315567017, "learning_rate": 7.0044654589876526e-06, "loss": 0.00273472, "memory(GiB)": 26.31, "step": 3775, "train_speed(iter/s)": 0.58284 }, { "acc": 0.99961834, "epoch": 4.0084835630965, "grad_norm": 0.1090165302157402, "learning_rate": 6.996430774633731e-06, "loss": 0.00123068, "memory(GiB)": 26.31, "step": 3780, "train_speed(iter/s)": 0.582849 }, { "acc": 0.99917088, "epoch": 4.013785790031814, "grad_norm": 0.4388119578361511, "learning_rate": 6.988389953182618e-06, "loss": 0.00246109, "memory(GiB)": 26.31, "step": 3785, "train_speed(iter/s)": 0.582852 }, { "acc": 0.99895935, "epoch": 4.019088016967126, "grad_norm": 0.7294406294822693, "learning_rate": 6.980343019358272e-06, "loss": 0.00368474, "memory(GiB)": 26.31, "step": 3790, "train_speed(iter/s)": 0.582858 }, { "acc": 0.99912395, "epoch": 4.024390243902439, "grad_norm": 0.7474674582481384, "learning_rate": 6.9722899979034404e-06, "loss": 0.0029763, "memory(GiB)": 26.31, "step": 3795, "train_speed(iter/s)": 0.58286 }, { "acc": 0.99937649, "epoch": 4.029692470837752, "grad_norm": 0.351416677236557, "learning_rate": 6.964230913579589e-06, "loss": 0.00383181, "memory(GiB)": 26.31, "step": 3800, "train_speed(iter/s)": 0.582865 }, { "acc": 0.99912262, "epoch": 4.034994697773064, "grad_norm": 0.2787763774394989, "learning_rate": 6.956165791166834e-06, "loss": 0.00277095, "memory(GiB)": 26.31, "step": 3805, "train_speed(iter/s)": 0.582864 }, { "acc": 0.99912815, "epoch": 4.040296924708377, "grad_norm": 0.9426448941230774, "learning_rate": 6.948094655463843e-06, "loss": 0.00392118, "memory(GiB)": 26.31, "step": 3810, "train_speed(iter/s)": 0.58287 }, { "acc": 0.99938164, "epoch": 4.045599151643691, "grad_norm": 0.4804195165634155, "learning_rate": 6.940017531287786e-06, "loss": 0.00344517, "memory(GiB)": 26.31, "step": 3815, "train_speed(iter/s)": 0.582869 }, { "acc": 0.99925079, "epoch": 4.050901378579003, "grad_norm": 0.6506812572479248, "learning_rate": 6.9319344434742395e-06, "loss": 0.00181158, "memory(GiB)": 26.31, "step": 3820, "train_speed(iter/s)": 0.582868 }, { "acc": 0.99804354, "epoch": 4.056203605514316, "grad_norm": 0.4788365662097931, "learning_rate": 6.923845416877123e-06, "loss": 0.0051, "memory(GiB)": 26.31, "step": 3825, "train_speed(iter/s)": 0.58287 }, { "acc": 0.99923143, "epoch": 4.061505832449629, "grad_norm": 0.34085676074028015, "learning_rate": 6.91575047636861e-06, "loss": 0.00306839, "memory(GiB)": 26.31, "step": 3830, "train_speed(iter/s)": 0.58287 }, { "acc": 0.99936924, "epoch": 4.066808059384941, "grad_norm": 0.35691240429878235, "learning_rate": 6.907649646839062e-06, "loss": 0.00194274, "memory(GiB)": 26.31, "step": 3835, "train_speed(iter/s)": 0.58287 }, { "acc": 0.99891167, "epoch": 4.072110286320255, "grad_norm": 0.3848298192024231, "learning_rate": 6.899542953196948e-06, "loss": 0.00519823, "memory(GiB)": 26.31, "step": 3840, "train_speed(iter/s)": 0.582869 }, { "acc": 0.99913979, "epoch": 4.077412513255568, "grad_norm": 0.2372390478849411, "learning_rate": 6.891430420368765e-06, "loss": 0.00238386, "memory(GiB)": 26.31, "step": 3845, "train_speed(iter/s)": 0.582872 }, { "acc": 0.99897785, "epoch": 4.08271474019088, "grad_norm": 0.5104010105133057, "learning_rate": 6.883312073298965e-06, "loss": 0.00465745, "memory(GiB)": 26.31, "step": 3850, "train_speed(iter/s)": 0.582872 }, { "acc": 0.99932671, "epoch": 4.088016967126193, "grad_norm": 0.13594026863574982, "learning_rate": 6.875187936949884e-06, "loss": 0.00181529, "memory(GiB)": 26.31, "step": 3855, "train_speed(iter/s)": 0.582873 }, { "acc": 0.99900427, "epoch": 4.093319194061506, "grad_norm": 0.45077431201934814, "learning_rate": 6.867058036301653e-06, "loss": 0.00304938, "memory(GiB)": 26.31, "step": 3860, "train_speed(iter/s)": 0.582874 }, { "acc": 0.99903927, "epoch": 4.098621420996818, "grad_norm": 0.2113030105829239, "learning_rate": 6.858922396352126e-06, "loss": 0.00292636, "memory(GiB)": 26.31, "step": 3865, "train_speed(iter/s)": 0.582874 }, { "acc": 0.99841099, "epoch": 4.103923647932131, "grad_norm": 0.6499300003051758, "learning_rate": 6.850781042116808e-06, "loss": 0.0049599, "memory(GiB)": 26.31, "step": 3870, "train_speed(iter/s)": 0.582876 }, { "acc": 0.99916124, "epoch": 4.109225874867445, "grad_norm": 0.3014342486858368, "learning_rate": 6.842633998628772e-06, "loss": 0.00299525, "memory(GiB)": 26.31, "step": 3875, "train_speed(iter/s)": 0.582881 }, { "acc": 0.99861727, "epoch": 4.114528101802757, "grad_norm": 1.1793856620788574, "learning_rate": 6.834481290938586e-06, "loss": 0.00433057, "memory(GiB)": 26.31, "step": 3880, "train_speed(iter/s)": 0.582885 }, { "acc": 0.99974747, "epoch": 4.11983032873807, "grad_norm": 0.1994122564792633, "learning_rate": 6.8263229441142296e-06, "loss": 0.00139203, "memory(GiB)": 26.31, "step": 3885, "train_speed(iter/s)": 0.582885 }, { "acc": 0.99805698, "epoch": 4.125132555673383, "grad_norm": 0.5237070322036743, "learning_rate": 6.818158983241031e-06, "loss": 0.00496773, "memory(GiB)": 26.31, "step": 3890, "train_speed(iter/s)": 0.582884 }, { "acc": 0.9987545, "epoch": 4.130434782608695, "grad_norm": 0.7213680148124695, "learning_rate": 6.809989433421572e-06, "loss": 0.00347864, "memory(GiB)": 26.31, "step": 3895, "train_speed(iter/s)": 0.582884 }, { "acc": 0.99951344, "epoch": 4.135737009544009, "grad_norm": 0.4179275929927826, "learning_rate": 6.801814319775623e-06, "loss": 0.00191055, "memory(GiB)": 26.31, "step": 3900, "train_speed(iter/s)": 0.582894 }, { "acc": 0.99886799, "epoch": 4.141039236479322, "grad_norm": 0.49606838822364807, "learning_rate": 6.79363366744006e-06, "loss": 0.00437418, "memory(GiB)": 26.31, "step": 3905, "train_speed(iter/s)": 0.582903 }, { "acc": 0.99911499, "epoch": 4.146341463414634, "grad_norm": 0.3049047589302063, "learning_rate": 6.785447501568789e-06, "loss": 0.00369115, "memory(GiB)": 26.31, "step": 3910, "train_speed(iter/s)": 0.582902 }, { "acc": 0.99878025, "epoch": 4.151643690349947, "grad_norm": 0.4471993148326874, "learning_rate": 6.777255847332676e-06, "loss": 0.00471104, "memory(GiB)": 26.31, "step": 3915, "train_speed(iter/s)": 0.582901 }, { "acc": 0.99911194, "epoch": 4.15694591728526, "grad_norm": 0.6976534128189087, "learning_rate": 6.769058729919454e-06, "loss": 0.00338649, "memory(GiB)": 26.31, "step": 3920, "train_speed(iter/s)": 0.582899 }, { "acc": 0.99886684, "epoch": 4.162248144220572, "grad_norm": 0.44167786836624146, "learning_rate": 6.76085617453366e-06, "loss": 0.0030509, "memory(GiB)": 26.31, "step": 3925, "train_speed(iter/s)": 0.582903 }, { "acc": 0.99937372, "epoch": 4.167550371155886, "grad_norm": 0.6355365514755249, "learning_rate": 6.752648206396546e-06, "loss": 0.00211098, "memory(GiB)": 26.31, "step": 3930, "train_speed(iter/s)": 0.582904 }, { "acc": 0.99952736, "epoch": 4.172852598091199, "grad_norm": 0.11797591298818588, "learning_rate": 6.744434850746011e-06, "loss": 0.00186485, "memory(GiB)": 26.31, "step": 3935, "train_speed(iter/s)": 0.582909 }, { "acc": 0.99925127, "epoch": 4.178154825026511, "grad_norm": 0.496011883020401, "learning_rate": 6.736216132836522e-06, "loss": 0.00295694, "memory(GiB)": 26.31, "step": 3940, "train_speed(iter/s)": 0.582911 }, { "acc": 0.99831095, "epoch": 4.183457051961824, "grad_norm": 0.7491662502288818, "learning_rate": 6.727992077939027e-06, "loss": 0.00389315, "memory(GiB)": 26.31, "step": 3945, "train_speed(iter/s)": 0.582912 }, { "acc": 0.99911022, "epoch": 4.188759278897137, "grad_norm": 0.3298349678516388, "learning_rate": 6.7197627113408905e-06, "loss": 0.00260169, "memory(GiB)": 26.31, "step": 3950, "train_speed(iter/s)": 0.582917 }, { "acc": 0.99928799, "epoch": 4.194061505832449, "grad_norm": 0.5814478993415833, "learning_rate": 6.711528058345805e-06, "loss": 0.0016897, "memory(GiB)": 26.31, "step": 3955, "train_speed(iter/s)": 0.582918 }, { "acc": 0.99962626, "epoch": 4.199363732767763, "grad_norm": 0.3780122697353363, "learning_rate": 6.703288144273724e-06, "loss": 0.00158315, "memory(GiB)": 26.31, "step": 3960, "train_speed(iter/s)": 0.582919 }, { "acc": 0.99910164, "epoch": 4.2046659597030756, "grad_norm": 0.5787876844406128, "learning_rate": 6.695042994460768e-06, "loss": 0.00327033, "memory(GiB)": 26.31, "step": 3965, "train_speed(iter/s)": 0.582919 }, { "acc": 0.99942017, "epoch": 4.209968186638388, "grad_norm": 0.538512110710144, "learning_rate": 6.686792634259165e-06, "loss": 0.00221283, "memory(GiB)": 26.31, "step": 3970, "train_speed(iter/s)": 0.58292 }, { "acc": 0.99916143, "epoch": 4.215270413573701, "grad_norm": 0.29662081599235535, "learning_rate": 6.678537089037162e-06, "loss": 0.00368981, "memory(GiB)": 26.31, "step": 3975, "train_speed(iter/s)": 0.582921 }, { "acc": 0.9993372, "epoch": 4.220572640509014, "grad_norm": 0.1427513062953949, "learning_rate": 6.670276384178945e-06, "loss": 0.00276616, "memory(GiB)": 26.31, "step": 3980, "train_speed(iter/s)": 0.582921 }, { "acc": 0.99876518, "epoch": 4.225874867444326, "grad_norm": 0.7543923854827881, "learning_rate": 6.6620105450845664e-06, "loss": 0.0031005, "memory(GiB)": 26.31, "step": 3985, "train_speed(iter/s)": 0.58292 }, { "acc": 0.99929838, "epoch": 4.23117709437964, "grad_norm": 0.41538140177726746, "learning_rate": 6.653739597169871e-06, "loss": 0.00201908, "memory(GiB)": 26.31, "step": 3990, "train_speed(iter/s)": 0.582925 }, { "acc": 0.99893169, "epoch": 4.2364793213149525, "grad_norm": 0.7107354402542114, "learning_rate": 6.645463565866404e-06, "loss": 0.00467964, "memory(GiB)": 26.31, "step": 3995, "train_speed(iter/s)": 0.582923 }, { "acc": 0.99891777, "epoch": 4.241781548250265, "grad_norm": 0.6780441999435425, "learning_rate": 6.637182476621346e-06, "loss": 0.00379675, "memory(GiB)": 26.31, "step": 4000, "train_speed(iter/s)": 0.582929 }, { "acc": 0.99919319, "epoch": 4.247083775185578, "grad_norm": 0.27641725540161133, "learning_rate": 6.628896354897429e-06, "loss": 0.00266308, "memory(GiB)": 26.31, "step": 4005, "train_speed(iter/s)": 0.582929 }, { "acc": 0.99924898, "epoch": 4.252386002120891, "grad_norm": 0.40583816170692444, "learning_rate": 6.620605226172858e-06, "loss": 0.00284122, "memory(GiB)": 26.31, "step": 4010, "train_speed(iter/s)": 0.582927 }, { "acc": 0.99864264, "epoch": 4.257688229056203, "grad_norm": 0.5590219497680664, "learning_rate": 6.6123091159412335e-06, "loss": 0.0070276, "memory(GiB)": 26.31, "step": 4015, "train_speed(iter/s)": 0.582926 }, { "acc": 0.99863548, "epoch": 4.262990455991517, "grad_norm": 0.7009803056716919, "learning_rate": 6.604008049711474e-06, "loss": 0.00366253, "memory(GiB)": 26.31, "step": 4020, "train_speed(iter/s)": 0.582925 }, { "acc": 0.99885464, "epoch": 4.2682926829268295, "grad_norm": 0.28384023904800415, "learning_rate": 6.595702053007738e-06, "loss": 0.00409203, "memory(GiB)": 26.31, "step": 4025, "train_speed(iter/s)": 0.582926 }, { "acc": 0.99863319, "epoch": 4.273594909862142, "grad_norm": 0.22047458589076996, "learning_rate": 6.5873911513693415e-06, "loss": 0.00477225, "memory(GiB)": 26.31, "step": 4030, "train_speed(iter/s)": 0.582931 }, { "acc": 0.99915199, "epoch": 4.278897136797455, "grad_norm": 0.4781891107559204, "learning_rate": 6.5790753703506814e-06, "loss": 0.00297477, "memory(GiB)": 26.31, "step": 4035, "train_speed(iter/s)": 0.58293 }, { "acc": 0.99888287, "epoch": 4.2841993637327676, "grad_norm": 0.12410692870616913, "learning_rate": 6.570754735521163e-06, "loss": 0.00334938, "memory(GiB)": 26.31, "step": 4040, "train_speed(iter/s)": 0.58293 }, { "acc": 0.99848261, "epoch": 4.28950159066808, "grad_norm": 0.6257500648498535, "learning_rate": 6.56242927246511e-06, "loss": 0.00495994, "memory(GiB)": 26.31, "step": 4045, "train_speed(iter/s)": 0.582935 }, { "acc": 0.99935036, "epoch": 4.294803817603394, "grad_norm": 0.5982515811920166, "learning_rate": 6.554099006781696e-06, "loss": 0.00224422, "memory(GiB)": 26.31, "step": 4050, "train_speed(iter/s)": 0.582936 }, { "acc": 0.99870186, "epoch": 4.3001060445387065, "grad_norm": 0.45738568902015686, "learning_rate": 6.545763964084861e-06, "loss": 0.00415319, "memory(GiB)": 26.31, "step": 4055, "train_speed(iter/s)": 0.582941 }, { "acc": 0.99977303, "epoch": 4.305408271474019, "grad_norm": 0.1528371125459671, "learning_rate": 6.537424170003233e-06, "loss": 0.00109178, "memory(GiB)": 26.31, "step": 4060, "train_speed(iter/s)": 0.582939 }, { "acc": 0.99936209, "epoch": 4.310710498409332, "grad_norm": 0.30072876811027527, "learning_rate": 6.529079650180048e-06, "loss": 0.00162833, "memory(GiB)": 26.31, "step": 4065, "train_speed(iter/s)": 0.582937 }, { "acc": 0.99911661, "epoch": 4.3160127253446445, "grad_norm": 0.7277874946594238, "learning_rate": 6.5207304302730755e-06, "loss": 0.00278707, "memory(GiB)": 26.31, "step": 4070, "train_speed(iter/s)": 0.582938 }, { "acc": 0.99898167, "epoch": 4.321314952279957, "grad_norm": 0.5926617980003357, "learning_rate": 6.512376535954534e-06, "loss": 0.00526775, "memory(GiB)": 26.31, "step": 4075, "train_speed(iter/s)": 0.582938 }, { "acc": 0.9995492, "epoch": 4.326617179215271, "grad_norm": 0.23842601478099823, "learning_rate": 6.504017992911017e-06, "loss": 0.00162778, "memory(GiB)": 26.31, "step": 4080, "train_speed(iter/s)": 0.582937 }, { "acc": 0.99949112, "epoch": 4.3319194061505835, "grad_norm": 0.2551683187484741, "learning_rate": 6.495654826843414e-06, "loss": 0.00149569, "memory(GiB)": 26.31, "step": 4085, "train_speed(iter/s)": 0.582941 }, { "acc": 0.99977875, "epoch": 4.337221633085896, "grad_norm": 0.04848746582865715, "learning_rate": 6.487287063466824e-06, "loss": 0.00142655, "memory(GiB)": 26.31, "step": 4090, "train_speed(iter/s)": 0.582945 }, { "acc": 0.99942417, "epoch": 4.342523860021209, "grad_norm": 0.15790040791034698, "learning_rate": 6.478914728510485e-06, "loss": 0.00233415, "memory(GiB)": 26.31, "step": 4095, "train_speed(iter/s)": 0.582945 }, { "acc": 0.99927073, "epoch": 4.3478260869565215, "grad_norm": 0.49188557267189026, "learning_rate": 6.470537847717692e-06, "loss": 0.00392356, "memory(GiB)": 26.31, "step": 4100, "train_speed(iter/s)": 0.582944 }, { "acc": 0.99959507, "epoch": 4.353128313891834, "grad_norm": 0.23772254586219788, "learning_rate": 6.462156446845715e-06, "loss": 0.00239973, "memory(GiB)": 26.31, "step": 4105, "train_speed(iter/s)": 0.582947 }, { "acc": 0.99922543, "epoch": 4.358430540827148, "grad_norm": 0.16435517370700836, "learning_rate": 6.453770551665727e-06, "loss": 0.00276972, "memory(GiB)": 26.31, "step": 4110, "train_speed(iter/s)": 0.582947 }, { "acc": 0.99857063, "epoch": 4.36373276776246, "grad_norm": 0.5997322201728821, "learning_rate": 6.445380187962715e-06, "loss": 0.00485413, "memory(GiB)": 26.31, "step": 4115, "train_speed(iter/s)": 0.582955 }, { "acc": 0.99888248, "epoch": 4.369034994697773, "grad_norm": 0.6237581372261047, "learning_rate": 6.43698538153541e-06, "loss": 0.00486804, "memory(GiB)": 26.31, "step": 4120, "train_speed(iter/s)": 0.582954 }, { "acc": 0.99929352, "epoch": 4.374337221633086, "grad_norm": 0.49355337023735046, "learning_rate": 6.4285861581962005e-06, "loss": 0.00364117, "memory(GiB)": 26.31, "step": 4125, "train_speed(iter/s)": 0.582954 }, { "acc": 0.99813728, "epoch": 4.3796394485683985, "grad_norm": 0.6591293215751648, "learning_rate": 6.4201825437710565e-06, "loss": 0.00500336, "memory(GiB)": 26.31, "step": 4130, "train_speed(iter/s)": 0.582953 }, { "acc": 0.9992382, "epoch": 4.384941675503711, "grad_norm": 0.1336752027273178, "learning_rate": 6.411774564099454e-06, "loss": 0.00252956, "memory(GiB)": 26.31, "step": 4135, "train_speed(iter/s)": 0.582955 }, { "acc": 0.99899483, "epoch": 4.390243902439025, "grad_norm": 0.30952557921409607, "learning_rate": 6.403362245034283e-06, "loss": 0.00370638, "memory(GiB)": 26.31, "step": 4140, "train_speed(iter/s)": 0.58296 }, { "acc": 0.9987875, "epoch": 4.395546129374337, "grad_norm": 0.29703906178474426, "learning_rate": 6.3949456124417855e-06, "loss": 0.00523653, "memory(GiB)": 26.31, "step": 4145, "train_speed(iter/s)": 0.58296 }, { "acc": 0.99912205, "epoch": 4.40084835630965, "grad_norm": 0.33913636207580566, "learning_rate": 6.386524692201459e-06, "loss": 0.00239438, "memory(GiB)": 26.31, "step": 4150, "train_speed(iter/s)": 0.582965 }, { "acc": 0.99977617, "epoch": 4.406150583244963, "grad_norm": 0.3019471764564514, "learning_rate": 6.378099510205991e-06, "loss": 0.00104185, "memory(GiB)": 26.31, "step": 4155, "train_speed(iter/s)": 0.582965 }, { "acc": 0.99841003, "epoch": 4.4114528101802755, "grad_norm": 0.7617125511169434, "learning_rate": 6.369670092361169e-06, "loss": 0.00464066, "memory(GiB)": 26.31, "step": 4160, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99932842, "epoch": 4.416755037115588, "grad_norm": 0.7266931533813477, "learning_rate": 6.361236464585805e-06, "loss": 0.00196429, "memory(GiB)": 26.31, "step": 4165, "train_speed(iter/s)": 0.582969 }, { "acc": 0.99902029, "epoch": 4.422057264050902, "grad_norm": 0.5494599938392639, "learning_rate": 6.352798652811657e-06, "loss": 0.00302969, "memory(GiB)": 26.31, "step": 4170, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99920559, "epoch": 4.427359490986214, "grad_norm": 0.44917190074920654, "learning_rate": 6.3443566829833485e-06, "loss": 0.00246186, "memory(GiB)": 26.31, "step": 4175, "train_speed(iter/s)": 0.582969 }, { "acc": 0.99954557, "epoch": 4.432661717921527, "grad_norm": 0.626193106174469, "learning_rate": 6.335910581058287e-06, "loss": 0.00248692, "memory(GiB)": 26.31, "step": 4180, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99891415, "epoch": 4.43796394485684, "grad_norm": 0.5783212184906006, "learning_rate": 6.327460373006584e-06, "loss": 0.00308748, "memory(GiB)": 26.31, "step": 4185, "train_speed(iter/s)": 0.582968 }, { "acc": 0.99954481, "epoch": 4.443266171792152, "grad_norm": 0.3852193057537079, "learning_rate": 6.319006084810983e-06, "loss": 0.00142698, "memory(GiB)": 26.31, "step": 4190, "train_speed(iter/s)": 0.582971 }, { "acc": 0.9996376, "epoch": 4.448568398727465, "grad_norm": 0.11862189322710037, "learning_rate": 6.310547742466766e-06, "loss": 0.00133829, "memory(GiB)": 26.31, "step": 4195, "train_speed(iter/s)": 0.582974 }, { "acc": 0.99929657, "epoch": 4.453870625662779, "grad_norm": 0.7278153896331787, "learning_rate": 6.302085371981682e-06, "loss": 0.00171, "memory(GiB)": 26.31, "step": 4200, "train_speed(iter/s)": 0.582982 }, { "acc": 0.99895, "epoch": 4.459172852598091, "grad_norm": 0.712354302406311, "learning_rate": 6.293618999375868e-06, "loss": 0.00472792, "memory(GiB)": 26.31, "step": 4205, "train_speed(iter/s)": 0.582981 }, { "acc": 0.99929352, "epoch": 4.464475079533404, "grad_norm": 0.6949740648269653, "learning_rate": 6.2851486506817635e-06, "loss": 0.00217784, "memory(GiB)": 26.31, "step": 4210, "train_speed(iter/s)": 0.582979 }, { "acc": 0.99937687, "epoch": 4.469777306468717, "grad_norm": 0.4581748843193054, "learning_rate": 6.276674351944042e-06, "loss": 0.00175911, "memory(GiB)": 26.31, "step": 4215, "train_speed(iter/s)": 0.582981 }, { "acc": 0.99987373, "epoch": 4.475079533404029, "grad_norm": 0.06770281493663788, "learning_rate": 6.2681961292195105e-06, "loss": 0.00088498, "memory(GiB)": 26.31, "step": 4220, "train_speed(iter/s)": 0.582986 }, { "acc": 0.99874878, "epoch": 4.480381760339343, "grad_norm": 0.5159828066825867, "learning_rate": 6.25971400857705e-06, "loss": 0.002787, "memory(GiB)": 26.31, "step": 4225, "train_speed(iter/s)": 0.582986 }, { "acc": 0.999405, "epoch": 4.485683987274656, "grad_norm": 0.49573227763175964, "learning_rate": 6.251228016097524e-06, "loss": 0.00399416, "memory(GiB)": 26.31, "step": 4230, "train_speed(iter/s)": 0.582995 }, { "acc": 0.99900513, "epoch": 4.490986214209968, "grad_norm": 0.26090532541275024, "learning_rate": 6.242738177873702e-06, "loss": 0.00458506, "memory(GiB)": 26.31, "step": 4235, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99961739, "epoch": 4.496288441145281, "grad_norm": 0.11822298914194107, "learning_rate": 6.2342445200101755e-06, "loss": 0.00199409, "memory(GiB)": 26.31, "step": 4240, "train_speed(iter/s)": 0.582993 }, { "acc": 0.99923649, "epoch": 4.501590668080594, "grad_norm": 0.3978080749511719, "learning_rate": 6.2257470686232846e-06, "loss": 0.0028736, "memory(GiB)": 26.31, "step": 4245, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99936199, "epoch": 4.506892895015906, "grad_norm": 0.2833632528781891, "learning_rate": 6.2172458498410336e-06, "loss": 0.0029819, "memory(GiB)": 26.31, "step": 4250, "train_speed(iter/s)": 0.582992 }, { "acc": 0.99923077, "epoch": 4.512195121951219, "grad_norm": 0.23151080310344696, "learning_rate": 6.2087408898030075e-06, "loss": 0.00369481, "memory(GiB)": 26.31, "step": 4255, "train_speed(iter/s)": 0.582993 }, { "acc": 0.99963894, "epoch": 4.517497348886533, "grad_norm": 0.5830540060997009, "learning_rate": 6.200232214660299e-06, "loss": 0.00243596, "memory(GiB)": 26.31, "step": 4260, "train_speed(iter/s)": 0.582998 }, { "acc": 0.99961758, "epoch": 4.522799575821845, "grad_norm": 0.3385343551635742, "learning_rate": 6.191719850575419e-06, "loss": 0.0011743, "memory(GiB)": 26.31, "step": 4265, "train_speed(iter/s)": 0.582996 }, { "acc": 0.99963942, "epoch": 4.528101802757158, "grad_norm": 0.44956743717193604, "learning_rate": 6.183203823722227e-06, "loss": 0.00175311, "memory(GiB)": 26.31, "step": 4270, "train_speed(iter/s)": 0.583003 }, { "acc": 0.99880333, "epoch": 4.533404029692471, "grad_norm": 0.4423983693122864, "learning_rate": 6.174684160285838e-06, "loss": 0.00511739, "memory(GiB)": 26.31, "step": 4275, "train_speed(iter/s)": 0.583007 }, { "acc": 0.99928932, "epoch": 4.538706256627783, "grad_norm": 0.3532782196998596, "learning_rate": 6.166160886462556e-06, "loss": 0.00230165, "memory(GiB)": 26.31, "step": 4280, "train_speed(iter/s)": 0.583012 }, { "acc": 0.99882469, "epoch": 4.544008483563097, "grad_norm": 0.5737372040748596, "learning_rate": 6.157634028459782e-06, "loss": 0.00340655, "memory(GiB)": 26.31, "step": 4285, "train_speed(iter/s)": 0.583014 }, { "acc": 0.99885597, "epoch": 4.54931071049841, "grad_norm": 1.0472294092178345, "learning_rate": 6.149103612495937e-06, "loss": 0.00416699, "memory(GiB)": 26.31, "step": 4290, "train_speed(iter/s)": 0.583027 }, { "acc": 0.99931746, "epoch": 4.554612937433722, "grad_norm": 0.3487650454044342, "learning_rate": 6.1405696648003845e-06, "loss": 0.00149525, "memory(GiB)": 26.31, "step": 4295, "train_speed(iter/s)": 0.583027 }, { "acc": 0.99887199, "epoch": 4.559915164369035, "grad_norm": 0.7196864485740662, "learning_rate": 6.132032211613346e-06, "loss": 0.00416826, "memory(GiB)": 26.31, "step": 4300, "train_speed(iter/s)": 0.583027 }, { "acc": 0.99932594, "epoch": 4.565217391304348, "grad_norm": 0.547074019908905, "learning_rate": 6.123491279185825e-06, "loss": 0.00252956, "memory(GiB)": 26.31, "step": 4305, "train_speed(iter/s)": 0.583032 }, { "acc": 0.99934731, "epoch": 4.57051961823966, "grad_norm": 0.8476079702377319, "learning_rate": 6.1149468937795145e-06, "loss": 0.00192753, "memory(GiB)": 26.31, "step": 4310, "train_speed(iter/s)": 0.583031 }, { "acc": 0.99914036, "epoch": 4.575821845174973, "grad_norm": 0.15940773487091064, "learning_rate": 6.106399081666734e-06, "loss": 0.00218152, "memory(GiB)": 26.31, "step": 4315, "train_speed(iter/s)": 0.583029 }, { "acc": 0.99965105, "epoch": 4.581124072110287, "grad_norm": 0.2619980275630951, "learning_rate": 6.0978478691303365e-06, "loss": 0.00189752, "memory(GiB)": 26.31, "step": 4320, "train_speed(iter/s)": 0.583034 }, { "acc": 0.99938726, "epoch": 4.586426299045599, "grad_norm": 0.09260746836662292, "learning_rate": 6.089293282463629e-06, "loss": 0.00186144, "memory(GiB)": 26.31, "step": 4325, "train_speed(iter/s)": 0.583033 }, { "acc": 0.99938898, "epoch": 4.591728525980912, "grad_norm": 0.5282265543937683, "learning_rate": 6.080735347970294e-06, "loss": 0.00238004, "memory(GiB)": 26.31, "step": 4330, "train_speed(iter/s)": 0.583033 }, { "acc": 0.99915371, "epoch": 4.597030752916225, "grad_norm": 0.27696260809898376, "learning_rate": 6.0721740919643066e-06, "loss": 0.00277539, "memory(GiB)": 26.31, "step": 4335, "train_speed(iter/s)": 0.583041 }, { "acc": 0.99952774, "epoch": 4.602332979851537, "grad_norm": 0.2598687410354614, "learning_rate": 6.063609540769858e-06, "loss": 0.00203173, "memory(GiB)": 26.31, "step": 4340, "train_speed(iter/s)": 0.583044 }, { "acc": 0.99864531, "epoch": 4.607635206786851, "grad_norm": 0.7812467813491821, "learning_rate": 6.055041720721268e-06, "loss": 0.00370566, "memory(GiB)": 26.31, "step": 4345, "train_speed(iter/s)": 0.583048 }, { "acc": 0.99851122, "epoch": 4.612937433722164, "grad_norm": 0.7187480330467224, "learning_rate": 6.046470658162914e-06, "loss": 0.00516219, "memory(GiB)": 26.31, "step": 4350, "train_speed(iter/s)": 0.583056 }, { "acc": 0.99863443, "epoch": 4.618239660657476, "grad_norm": 0.3305424153804779, "learning_rate": 6.037896379449135e-06, "loss": 0.00443163, "memory(GiB)": 26.31, "step": 4355, "train_speed(iter/s)": 0.583057 }, { "acc": 0.99921246, "epoch": 4.623541887592789, "grad_norm": 0.3764010965824127, "learning_rate": 6.029318910944164e-06, "loss": 0.00273427, "memory(GiB)": 26.31, "step": 4360, "train_speed(iter/s)": 0.583062 }, { "acc": 0.99932175, "epoch": 4.628844114528102, "grad_norm": 0.29550793766975403, "learning_rate": 6.02073827902204e-06, "loss": 0.00184609, "memory(GiB)": 26.31, "step": 4365, "train_speed(iter/s)": 0.583062 }, { "acc": 0.99910212, "epoch": 4.634146341463414, "grad_norm": 0.5608096122741699, "learning_rate": 6.012154510066532e-06, "loss": 0.00197052, "memory(GiB)": 26.31, "step": 4370, "train_speed(iter/s)": 0.58306 }, { "acc": 0.9996316, "epoch": 4.639448568398727, "grad_norm": 0.13115724921226501, "learning_rate": 6.003567630471049e-06, "loss": 0.00155056, "memory(GiB)": 26.31, "step": 4375, "train_speed(iter/s)": 0.583062 }, { "acc": 0.9994236, "epoch": 4.644750795334041, "grad_norm": 0.6535782814025879, "learning_rate": 5.994977666638571e-06, "loss": 0.00236299, "memory(GiB)": 26.31, "step": 4380, "train_speed(iter/s)": 0.583061 }, { "acc": 0.99901667, "epoch": 4.650053022269353, "grad_norm": 0.6454782485961914, "learning_rate": 5.986384644981558e-06, "loss": 0.0031538, "memory(GiB)": 26.31, "step": 4385, "train_speed(iter/s)": 0.583062 }, { "acc": 0.99875145, "epoch": 4.655355249204666, "grad_norm": 0.8551376461982727, "learning_rate": 5.977788591921871e-06, "loss": 0.00375093, "memory(GiB)": 26.31, "step": 4390, "train_speed(iter/s)": 0.583063 }, { "acc": 0.99974909, "epoch": 4.660657476139979, "grad_norm": 0.2662411332130432, "learning_rate": 5.969189533890697e-06, "loss": 0.00120749, "memory(GiB)": 26.31, "step": 4395, "train_speed(iter/s)": 0.583066 }, { "acc": 0.99951324, "epoch": 4.665959703075291, "grad_norm": 0.23411191999912262, "learning_rate": 5.960587497328457e-06, "loss": 0.00206366, "memory(GiB)": 26.31, "step": 4400, "train_speed(iter/s)": 0.583075 }, { "acc": 0.99952765, "epoch": 4.671261930010605, "grad_norm": 0.2640402019023895, "learning_rate": 5.951982508684733e-06, "loss": 0.00111532, "memory(GiB)": 26.31, "step": 4405, "train_speed(iter/s)": 0.583075 }, { "acc": 0.99938078, "epoch": 4.6765641569459175, "grad_norm": 0.5517047047615051, "learning_rate": 5.943374594418185e-06, "loss": 0.00198936, "memory(GiB)": 26.31, "step": 4410, "train_speed(iter/s)": 0.583076 }, { "acc": 0.99863434, "epoch": 4.68186638388123, "grad_norm": 0.8088098764419556, "learning_rate": 5.934763780996467e-06, "loss": 0.00394955, "memory(GiB)": 26.31, "step": 4415, "train_speed(iter/s)": 0.583074 }, { "acc": 0.99955444, "epoch": 4.687168610816543, "grad_norm": 0.11458373069763184, "learning_rate": 5.9261500948961496e-06, "loss": 0.00121496, "memory(GiB)": 26.31, "step": 4420, "train_speed(iter/s)": 0.583075 }, { "acc": 0.99962997, "epoch": 4.692470837751856, "grad_norm": 0.37143048644065857, "learning_rate": 5.917533562602632e-06, "loss": 0.00115484, "memory(GiB)": 26.31, "step": 4425, "train_speed(iter/s)": 0.583082 }, { "acc": 0.99890938, "epoch": 4.697773064687168, "grad_norm": 0.4180762767791748, "learning_rate": 5.90891421061007e-06, "loss": 0.00400325, "memory(GiB)": 26.31, "step": 4430, "train_speed(iter/s)": 0.583081 }, { "acc": 0.9990202, "epoch": 4.703075291622481, "grad_norm": 0.45803967118263245, "learning_rate": 5.900292065421285e-06, "loss": 0.00252781, "memory(GiB)": 26.31, "step": 4435, "train_speed(iter/s)": 0.583081 }, { "acc": 0.9988286, "epoch": 4.7083775185577945, "grad_norm": 0.3728193938732147, "learning_rate": 5.8916671535476886e-06, "loss": 0.00431239, "memory(GiB)": 26.31, "step": 4440, "train_speed(iter/s)": 0.583085 }, { "acc": 0.99901409, "epoch": 4.713679745493107, "grad_norm": 0.6913246512413025, "learning_rate": 5.8830395015092035e-06, "loss": 0.00349989, "memory(GiB)": 26.31, "step": 4445, "train_speed(iter/s)": 0.583085 }, { "acc": 0.99923077, "epoch": 4.71898197242842, "grad_norm": 0.8583769202232361, "learning_rate": 5.8744091358341706e-06, "loss": 0.00261297, "memory(GiB)": 26.31, "step": 4450, "train_speed(iter/s)": 0.583084 }, { "acc": 0.9992691, "epoch": 4.724284199363733, "grad_norm": 0.39945945143699646, "learning_rate": 5.865776083059279e-06, "loss": 0.00270182, "memory(GiB)": 26.31, "step": 4455, "train_speed(iter/s)": 0.583084 }, { "acc": 0.99868488, "epoch": 4.729586426299045, "grad_norm": 0.7474715709686279, "learning_rate": 5.8571403697294805e-06, "loss": 0.00387175, "memory(GiB)": 26.31, "step": 4460, "train_speed(iter/s)": 0.583083 }, { "acc": 0.99916058, "epoch": 4.734888653234359, "grad_norm": 0.2578558027744293, "learning_rate": 5.848502022397904e-06, "loss": 0.00217835, "memory(GiB)": 26.31, "step": 4465, "train_speed(iter/s)": 0.583082 }, { "acc": 0.9994772, "epoch": 4.7401908801696715, "grad_norm": 0.14401961863040924, "learning_rate": 5.839861067625784e-06, "loss": 0.00253854, "memory(GiB)": 26.31, "step": 4470, "train_speed(iter/s)": 0.583081 }, { "acc": 0.99930344, "epoch": 4.745493107104984, "grad_norm": 0.2320011854171753, "learning_rate": 5.831217531982364e-06, "loss": 0.00262179, "memory(GiB)": 26.31, "step": 4475, "train_speed(iter/s)": 0.583085 }, { "acc": 0.99925175, "epoch": 4.750795334040297, "grad_norm": 0.36902445554733276, "learning_rate": 5.822571442044829e-06, "loss": 0.00252635, "memory(GiB)": 26.31, "step": 4480, "train_speed(iter/s)": 0.583085 }, { "acc": 0.99939899, "epoch": 4.7560975609756095, "grad_norm": 0.5605974197387695, "learning_rate": 5.813922824398218e-06, "loss": 0.00225574, "memory(GiB)": 26.31, "step": 4485, "train_speed(iter/s)": 0.583089 }, { "acc": 0.99903069, "epoch": 4.761399787910922, "grad_norm": 0.5396479964256287, "learning_rate": 5.805271705635339e-06, "loss": 0.00240382, "memory(GiB)": 26.31, "step": 4490, "train_speed(iter/s)": 0.583087 }, { "acc": 0.99966717, "epoch": 4.766702014846236, "grad_norm": 0.5953534245491028, "learning_rate": 5.796618112356691e-06, "loss": 0.00141264, "memory(GiB)": 26.31, "step": 4495, "train_speed(iter/s)": 0.583086 }, { "acc": 0.99952297, "epoch": 4.7720042417815485, "grad_norm": 0.21910008788108826, "learning_rate": 5.787962071170385e-06, "loss": 0.00160427, "memory(GiB)": 26.31, "step": 4500, "train_speed(iter/s)": 0.583087 }, { "acc": 0.99864674, "epoch": 4.777306468716861, "grad_norm": 0.26406243443489075, "learning_rate": 5.779303608692054e-06, "loss": 0.00467952, "memory(GiB)": 26.31, "step": 4505, "train_speed(iter/s)": 0.583088 }, { "acc": 0.99888878, "epoch": 4.782608695652174, "grad_norm": 0.19614329934120178, "learning_rate": 5.7706427515447794e-06, "loss": 0.00390195, "memory(GiB)": 26.31, "step": 4510, "train_speed(iter/s)": 0.583088 }, { "acc": 0.9993845, "epoch": 4.7879109225874865, "grad_norm": 0.1621880978345871, "learning_rate": 5.761979526359009e-06, "loss": 0.00212824, "memory(GiB)": 26.31, "step": 4515, "train_speed(iter/s)": 0.583091 }, { "acc": 0.99922352, "epoch": 4.793213149522799, "grad_norm": 0.09368137270212173, "learning_rate": 5.753313959772461e-06, "loss": 0.00180674, "memory(GiB)": 26.31, "step": 4520, "train_speed(iter/s)": 0.58309 }, { "acc": 0.99976778, "epoch": 4.798515376458113, "grad_norm": 0.06735534965991974, "learning_rate": 5.744646078430065e-06, "loss": 0.00071397, "memory(GiB)": 26.31, "step": 4525, "train_speed(iter/s)": 0.58309 }, { "acc": 0.99915466, "epoch": 4.8038176033934255, "grad_norm": 0.1537499576807022, "learning_rate": 5.735975908983859e-06, "loss": 0.00218606, "memory(GiB)": 26.31, "step": 4530, "train_speed(iter/s)": 0.583094 }, { "acc": 0.99936771, "epoch": 4.809119830328738, "grad_norm": 0.4072479009628296, "learning_rate": 5.727303478092922e-06, "loss": 0.00248822, "memory(GiB)": 26.31, "step": 4535, "train_speed(iter/s)": 0.583093 }, { "acc": 1.0, "epoch": 4.814422057264051, "grad_norm": 0.10145048052072525, "learning_rate": 5.718628812423285e-06, "loss": 0.00044452, "memory(GiB)": 26.31, "step": 4540, "train_speed(iter/s)": 0.583099 }, { "acc": 0.99870358, "epoch": 4.8197242841993635, "grad_norm": 0.4591797888278961, "learning_rate": 5.70995193864785e-06, "loss": 0.00218927, "memory(GiB)": 26.31, "step": 4545, "train_speed(iter/s)": 0.583106 }, { "acc": 0.99918861, "epoch": 4.825026511134676, "grad_norm": 0.4732370972633362, "learning_rate": 5.701272883446308e-06, "loss": 0.00389646, "memory(GiB)": 26.31, "step": 4550, "train_speed(iter/s)": 0.58311 }, { "acc": 0.99949169, "epoch": 4.83032873806999, "grad_norm": 0.30895915627479553, "learning_rate": 5.692591673505058e-06, "loss": 0.00123058, "memory(GiB)": 26.31, "step": 4555, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99925365, "epoch": 4.835630965005302, "grad_norm": 0.2049397975206375, "learning_rate": 5.683908335517124e-06, "loss": 0.00311317, "memory(GiB)": 26.31, "step": 4560, "train_speed(iter/s)": 0.583116 }, { "acc": 0.99915085, "epoch": 4.840933191940615, "grad_norm": 0.27029553055763245, "learning_rate": 5.675222896182074e-06, "loss": 0.00321835, "memory(GiB)": 26.31, "step": 4565, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99948587, "epoch": 4.846235418875928, "grad_norm": 0.4786251187324524, "learning_rate": 5.666535382205941e-06, "loss": 0.00240655, "memory(GiB)": 26.31, "step": 4570, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99940729, "epoch": 4.8515376458112405, "grad_norm": 0.35688337683677673, "learning_rate": 5.657845820301128e-06, "loss": 0.00213064, "memory(GiB)": 26.31, "step": 4575, "train_speed(iter/s)": 0.583112 }, { "acc": 0.99936666, "epoch": 4.856839872746553, "grad_norm": 0.12073680013418198, "learning_rate": 5.649154237186342e-06, "loss": 0.00155683, "memory(GiB)": 26.31, "step": 4580, "train_speed(iter/s)": 0.583112 }, { "acc": 0.99914742, "epoch": 4.862142099681867, "grad_norm": 0.41659778356552124, "learning_rate": 5.640460659586504e-06, "loss": 0.00253949, "memory(GiB)": 26.31, "step": 4585, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99952326, "epoch": 4.867444326617179, "grad_norm": 0.6744159460067749, "learning_rate": 5.631765114232667e-06, "loss": 0.0022239, "memory(GiB)": 26.31, "step": 4590, "train_speed(iter/s)": 0.583114 }, { "acc": 0.99954004, "epoch": 4.872746553552492, "grad_norm": 0.874528706073761, "learning_rate": 5.623067627861931e-06, "loss": 0.00239744, "memory(GiB)": 26.31, "step": 4595, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99966507, "epoch": 4.878048780487805, "grad_norm": 0.06950568407773972, "learning_rate": 5.6143682272173716e-06, "loss": 0.00160969, "memory(GiB)": 26.31, "step": 4600, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99939823, "epoch": 4.8833510074231175, "grad_norm": 0.6629308462142944, "learning_rate": 5.605666939047942e-06, "loss": 0.00220318, "memory(GiB)": 26.31, "step": 4605, "train_speed(iter/s)": 0.583116 }, { "acc": 0.99966755, "epoch": 4.888653234358431, "grad_norm": 0.7309154868125916, "learning_rate": 5.596963790108406e-06, "loss": 0.00109128, "memory(GiB)": 26.31, "step": 4610, "train_speed(iter/s)": 0.583116 }, { "acc": 0.99976997, "epoch": 4.893955461293744, "grad_norm": 0.412337064743042, "learning_rate": 5.588258807159247e-06, "loss": 0.00075653, "memory(GiB)": 26.31, "step": 4615, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99912081, "epoch": 4.899257688229056, "grad_norm": 0.44406965374946594, "learning_rate": 5.579552016966583e-06, "loss": 0.00252374, "memory(GiB)": 26.31, "step": 4620, "train_speed(iter/s)": 0.58312 }, { "acc": 0.99974184, "epoch": 4.904559915164369, "grad_norm": 0.325088769197464, "learning_rate": 5.570843446302096e-06, "loss": 0.00167765, "memory(GiB)": 26.31, "step": 4625, "train_speed(iter/s)": 0.583122 }, { "acc": 0.99926529, "epoch": 4.909862142099682, "grad_norm": 0.6848169565200806, "learning_rate": 5.562133121942941e-06, "loss": 0.00173596, "memory(GiB)": 26.31, "step": 4630, "train_speed(iter/s)": 0.583125 }, { "acc": 0.99988041, "epoch": 4.915164369034994, "grad_norm": 0.08015554398298264, "learning_rate": 5.5534210706716595e-06, "loss": 0.00062716, "memory(GiB)": 26.31, "step": 4635, "train_speed(iter/s)": 0.583128 }, { "acc": 0.99927998, "epoch": 4.920466595970307, "grad_norm": 0.2661329209804535, "learning_rate": 5.5447073192761095e-06, "loss": 0.00297623, "memory(GiB)": 26.31, "step": 4640, "train_speed(iter/s)": 0.583127 }, { "acc": 0.99924908, "epoch": 4.925768822905621, "grad_norm": 1.0690864324569702, "learning_rate": 5.5359918945493725e-06, "loss": 0.00258645, "memory(GiB)": 26.31, "step": 4645, "train_speed(iter/s)": 0.583127 }, { "acc": 0.99939957, "epoch": 4.931071049840933, "grad_norm": 0.19076597690582275, "learning_rate": 5.52727482328968e-06, "loss": 0.00235107, "memory(GiB)": 26.31, "step": 4650, "train_speed(iter/s)": 0.583127 }, { "acc": 0.9994956, "epoch": 4.936373276776246, "grad_norm": 0.4487408995628357, "learning_rate": 5.518556132300321e-06, "loss": 0.00155169, "memory(GiB)": 26.31, "step": 4655, "train_speed(iter/s)": 0.583128 }, { "acc": 0.99989033, "epoch": 4.941675503711559, "grad_norm": 0.20197904109954834, "learning_rate": 5.509835848389566e-06, "loss": 0.00048707, "memory(GiB)": 26.31, "step": 4660, "train_speed(iter/s)": 0.583128 }, { "acc": 0.99916096, "epoch": 4.946977730646871, "grad_norm": 0.18906237185001373, "learning_rate": 5.501113998370588e-06, "loss": 0.00304128, "memory(GiB)": 26.31, "step": 4665, "train_speed(iter/s)": 0.583131 }, { "acc": 0.99973669, "epoch": 4.952279957582185, "grad_norm": 0.11769578605890274, "learning_rate": 5.492390609061365e-06, "loss": 0.0011104, "memory(GiB)": 26.31, "step": 4670, "train_speed(iter/s)": 0.583129 }, { "acc": 0.99927292, "epoch": 4.957582184517498, "grad_norm": 0.06326154619455338, "learning_rate": 5.48366570728462e-06, "loss": 0.0023735, "memory(GiB)": 26.31, "step": 4675, "train_speed(iter/s)": 0.583129 }, { "acc": 0.99961071, "epoch": 4.96288441145281, "grad_norm": 0.15000776946544647, "learning_rate": 5.4749393198677225e-06, "loss": 0.00089669, "memory(GiB)": 26.31, "step": 4680, "train_speed(iter/s)": 0.583127 }, { "acc": 0.99912415, "epoch": 4.968186638388123, "grad_norm": 0.5361363887786865, "learning_rate": 5.466211473642606e-06, "loss": 0.00306102, "memory(GiB)": 26.31, "step": 4685, "train_speed(iter/s)": 0.583135 }, { "acc": 0.99913349, "epoch": 4.973488865323436, "grad_norm": 0.9397711753845215, "learning_rate": 5.457482195445693e-06, "loss": 0.00252779, "memory(GiB)": 26.31, "step": 4690, "train_speed(iter/s)": 0.583135 }, { "acc": 0.99924164, "epoch": 4.978791092258748, "grad_norm": 0.8357404470443726, "learning_rate": 5.44875151211781e-06, "loss": 0.00215698, "memory(GiB)": 26.31, "step": 4695, "train_speed(iter/s)": 0.583141 }, { "acc": 0.99975786, "epoch": 4.984093319194061, "grad_norm": 0.2508851885795593, "learning_rate": 5.440019450504101e-06, "loss": 0.00104784, "memory(GiB)": 26.31, "step": 4700, "train_speed(iter/s)": 0.583141 }, { "acc": 0.99964581, "epoch": 4.989395546129375, "grad_norm": 0.41223564743995667, "learning_rate": 5.431286037453949e-06, "loss": 0.00124911, "memory(GiB)": 26.31, "step": 4705, "train_speed(iter/s)": 0.583141 }, { "acc": 0.99922943, "epoch": 4.994697773064687, "grad_norm": 0.41894757747650146, "learning_rate": 5.422551299820895e-06, "loss": 0.00315995, "memory(GiB)": 26.31, "step": 4710, "train_speed(iter/s)": 0.583149 }, { "acc": 0.99934368, "epoch": 5.0, "grad_norm": 0.036096345633268356, "learning_rate": 5.4138152644625495e-06, "loss": 0.0019122, "memory(GiB)": 26.31, "step": 4715, "train_speed(iter/s)": 0.583142 }, { "acc": 0.99912224, "epoch": 5.005302226935313, "grad_norm": 0.32283639907836914, "learning_rate": 5.405077958240514e-06, "loss": 0.00247847, "memory(GiB)": 26.31, "step": 4720, "train_speed(iter/s)": 0.58308 }, { "acc": 0.99974022, "epoch": 5.010604453870625, "grad_norm": 0.3302988111972809, "learning_rate": 5.3963394080203e-06, "loss": 0.00123621, "memory(GiB)": 26.31, "step": 4725, "train_speed(iter/s)": 0.583087 }, { "acc": 0.99912148, "epoch": 5.015906680805939, "grad_norm": 0.8059219717979431, "learning_rate": 5.387599640671238e-06, "loss": 0.00170408, "memory(GiB)": 26.31, "step": 4730, "train_speed(iter/s)": 0.583087 }, { "acc": 0.99928551, "epoch": 5.021208907741252, "grad_norm": 0.8436117768287659, "learning_rate": 5.37885868306641e-06, "loss": 0.00291735, "memory(GiB)": 26.31, "step": 4735, "train_speed(iter/s)": 0.583086 }, { "acc": 0.99951, "epoch": 5.026511134676564, "grad_norm": 0.06486281007528305, "learning_rate": 5.370116562082551e-06, "loss": 0.00148729, "memory(GiB)": 26.31, "step": 4740, "train_speed(iter/s)": 0.583089 }, { "acc": 0.99920187, "epoch": 5.031813361611877, "grad_norm": 0.5544983148574829, "learning_rate": 5.361373304599975e-06, "loss": 0.00254798, "memory(GiB)": 26.31, "step": 4745, "train_speed(iter/s)": 0.583089 }, { "acc": 0.99915466, "epoch": 5.03711558854719, "grad_norm": 0.16032570600509644, "learning_rate": 5.352628937502491e-06, "loss": 0.00236086, "memory(GiB)": 26.31, "step": 4750, "train_speed(iter/s)": 0.583093 }, { "acc": 0.9997427, "epoch": 5.042417815482502, "grad_norm": 0.13562826812267303, "learning_rate": 5.343883487677319e-06, "loss": 0.00140394, "memory(GiB)": 26.31, "step": 4755, "train_speed(iter/s)": 0.583093 }, { "acc": 0.99940615, "epoch": 5.047720042417816, "grad_norm": 0.3697991967201233, "learning_rate": 5.335136982015008e-06, "loss": 0.00208272, "memory(GiB)": 26.31, "step": 4760, "train_speed(iter/s)": 0.583092 }, { "acc": 0.99912567, "epoch": 5.053022269353129, "grad_norm": 0.7061968445777893, "learning_rate": 5.326389447409356e-06, "loss": 0.00275432, "memory(GiB)": 26.31, "step": 4765, "train_speed(iter/s)": 0.583092 }, { "acc": 0.99899225, "epoch": 5.058324496288441, "grad_norm": 0.3614329993724823, "learning_rate": 5.31764091075732e-06, "loss": 0.0032732, "memory(GiB)": 26.31, "step": 4770, "train_speed(iter/s)": 0.583094 }, { "acc": 0.99961834, "epoch": 5.063626723223754, "grad_norm": 0.3613954186439514, "learning_rate": 5.308891398958944e-06, "loss": 0.00152841, "memory(GiB)": 26.31, "step": 4775, "train_speed(iter/s)": 0.583098 }, { "acc": 0.99952335, "epoch": 5.068928950159067, "grad_norm": 0.5642781853675842, "learning_rate": 5.300140938917265e-06, "loss": 0.00217171, "memory(GiB)": 26.31, "step": 4780, "train_speed(iter/s)": 0.583097 }, { "acc": 0.99950047, "epoch": 5.074231177094379, "grad_norm": 0.09304577857255936, "learning_rate": 5.29138955753824e-06, "loss": 0.00100198, "memory(GiB)": 26.31, "step": 4785, "train_speed(iter/s)": 0.583098 }, { "acc": 0.99951153, "epoch": 5.079533404029693, "grad_norm": 0.5502454042434692, "learning_rate": 5.282637281730657e-06, "loss": 0.00078877, "memory(GiB)": 26.31, "step": 4790, "train_speed(iter/s)": 0.583097 }, { "acc": 0.99952335, "epoch": 5.084835630965006, "grad_norm": 0.08497211337089539, "learning_rate": 5.273884138406053e-06, "loss": 0.00157482, "memory(GiB)": 26.31, "step": 4795, "train_speed(iter/s)": 0.583101 }, { "acc": 0.99952164, "epoch": 5.090137857900318, "grad_norm": 0.1908387541770935, "learning_rate": 5.265130154478633e-06, "loss": 0.00112022, "memory(GiB)": 26.31, "step": 4800, "train_speed(iter/s)": 0.583104 }, { "acc": 0.9996439, "epoch": 5.095440084835631, "grad_norm": 0.07793429493904114, "learning_rate": 5.25637535686519e-06, "loss": 0.00093803, "memory(GiB)": 26.31, "step": 4805, "train_speed(iter/s)": 0.583103 }, { "acc": 0.99963684, "epoch": 5.100742311770944, "grad_norm": 0.31934216618537903, "learning_rate": 5.247619772485013e-06, "loss": 0.00145084, "memory(GiB)": 26.31, "step": 4810, "train_speed(iter/s)": 0.583103 }, { "acc": 0.99963875, "epoch": 5.106044538706256, "grad_norm": 0.26898130774497986, "learning_rate": 5.238863428259817e-06, "loss": 0.00137375, "memory(GiB)": 26.31, "step": 4815, "train_speed(iter/s)": 0.583109 }, { "acc": 0.99987183, "epoch": 5.11134676564157, "grad_norm": 0.05188106745481491, "learning_rate": 5.230106351113646e-06, "loss": 0.00048276, "memory(GiB)": 26.31, "step": 4820, "train_speed(iter/s)": 0.583108 }, { "acc": 0.99940996, "epoch": 5.1166489925768825, "grad_norm": 0.23561055958271027, "learning_rate": 5.221348567972804e-06, "loss": 0.00167051, "memory(GiB)": 26.31, "step": 4825, "train_speed(iter/s)": 0.58311 }, { "acc": 0.99960842, "epoch": 5.121951219512195, "grad_norm": 0.11288487911224365, "learning_rate": 5.212590105765762e-06, "loss": 0.00139646, "memory(GiB)": 26.31, "step": 4830, "train_speed(iter/s)": 0.583114 }, { "acc": 0.99960985, "epoch": 5.127253446447508, "grad_norm": 0.46378016471862793, "learning_rate": 5.203830991423079e-06, "loss": 0.00243313, "memory(GiB)": 26.31, "step": 4835, "train_speed(iter/s)": 0.583114 }, { "acc": 0.99950123, "epoch": 5.132555673382821, "grad_norm": 0.47227156162261963, "learning_rate": 5.195071251877325e-06, "loss": 0.00209684, "memory(GiB)": 26.31, "step": 4840, "train_speed(iter/s)": 0.583117 }, { "acc": 0.99961729, "epoch": 5.137857900318133, "grad_norm": 0.3798484802246094, "learning_rate": 5.186310914062983e-06, "loss": 0.00167997, "memory(GiB)": 26.31, "step": 4845, "train_speed(iter/s)": 0.583117 }, { "acc": 0.99963169, "epoch": 5.143160127253447, "grad_norm": 0.23786208033561707, "learning_rate": 5.177550004916381e-06, "loss": 0.00179124, "memory(GiB)": 26.31, "step": 4850, "train_speed(iter/s)": 0.583127 }, { "acc": 0.99936047, "epoch": 5.1484623541887595, "grad_norm": 0.7719597220420837, "learning_rate": 5.168788551375607e-06, "loss": 0.00196373, "memory(GiB)": 26.31, "step": 4855, "train_speed(iter/s)": 0.583126 }, { "acc": 0.99967613, "epoch": 5.153764581124072, "grad_norm": 0.25141432881355286, "learning_rate": 5.160026580380412e-06, "loss": 0.00157652, "memory(GiB)": 26.31, "step": 4860, "train_speed(iter/s)": 0.583125 }, { "acc": 0.99951057, "epoch": 5.159066808059385, "grad_norm": 0.37007245421409607, "learning_rate": 5.15126411887215e-06, "loss": 0.00161061, "memory(GiB)": 26.31, "step": 4865, "train_speed(iter/s)": 0.583127 }, { "acc": 0.99910822, "epoch": 5.164369034994698, "grad_norm": 0.10854408144950867, "learning_rate": 5.142501193793677e-06, "loss": 0.00538175, "memory(GiB)": 26.31, "step": 4870, "train_speed(iter/s)": 0.583126 }, { "acc": 0.99895782, "epoch": 5.16967126193001, "grad_norm": 0.27228420972824097, "learning_rate": 5.133737832089277e-06, "loss": 0.00419342, "memory(GiB)": 26.31, "step": 4875, "train_speed(iter/s)": 0.583134 }, { "acc": 0.99924507, "epoch": 5.174973488865324, "grad_norm": 0.42566487193107605, "learning_rate": 5.124974060704574e-06, "loss": 0.00254047, "memory(GiB)": 26.31, "step": 4880, "train_speed(iter/s)": 0.583133 }, { "acc": 0.99926548, "epoch": 5.1802757158006365, "grad_norm": 0.09999915957450867, "learning_rate": 5.116209906586451e-06, "loss": 0.0023471, "memory(GiB)": 26.31, "step": 4885, "train_speed(iter/s)": 0.583137 }, { "acc": 0.99965687, "epoch": 5.185577942735949, "grad_norm": 0.17713366448879242, "learning_rate": 5.107445396682971e-06, "loss": 0.00108484, "memory(GiB)": 26.31, "step": 4890, "train_speed(iter/s)": 0.583138 }, { "acc": 0.9994688, "epoch": 5.190880169671262, "grad_norm": 0.3907831311225891, "learning_rate": 5.098680557943291e-06, "loss": 0.00225697, "memory(GiB)": 26.31, "step": 4895, "train_speed(iter/s)": 0.583141 }, { "acc": 0.99924278, "epoch": 5.1961823966065745, "grad_norm": 0.45619088411331177, "learning_rate": 5.089915417317577e-06, "loss": 0.0022859, "memory(GiB)": 26.31, "step": 4900, "train_speed(iter/s)": 0.583133 }, { "acc": 0.99950504, "epoch": 5.201484623541887, "grad_norm": 0.26541733741760254, "learning_rate": 5.081150001756924e-06, "loss": 0.0016038, "memory(GiB)": 26.31, "step": 4905, "train_speed(iter/s)": 0.583112 }, { "acc": 0.99899044, "epoch": 5.206786850477201, "grad_norm": 0.39049601554870605, "learning_rate": 5.072384338213271e-06, "loss": 0.00281408, "memory(GiB)": 26.31, "step": 4910, "train_speed(iter/s)": 0.5831 }, { "acc": 0.99951391, "epoch": 5.2120890774125135, "grad_norm": 0.9988411068916321, "learning_rate": 5.063618453639322e-06, "loss": 0.00213081, "memory(GiB)": 26.31, "step": 4915, "train_speed(iter/s)": 0.583079 }, { "acc": 0.99915142, "epoch": 5.217391304347826, "grad_norm": 0.15615594387054443, "learning_rate": 5.054852374988459e-06, "loss": 0.0027803, "memory(GiB)": 26.31, "step": 4920, "train_speed(iter/s)": 0.583078 }, { "acc": 0.99976625, "epoch": 5.222693531283139, "grad_norm": 0.1486440747976303, "learning_rate": 5.046086129214663e-06, "loss": 0.00100315, "memory(GiB)": 26.31, "step": 4925, "train_speed(iter/s)": 0.583081 }, { "acc": 0.99928789, "epoch": 5.2279957582184515, "grad_norm": 0.2671545743942261, "learning_rate": 5.037319743272424e-06, "loss": 0.0028695, "memory(GiB)": 26.31, "step": 4930, "train_speed(iter/s)": 0.583088 }, { "acc": 0.99950676, "epoch": 5.233297985153764, "grad_norm": 0.27838829159736633, "learning_rate": 5.028553244116671e-06, "loss": 0.00226881, "memory(GiB)": 26.31, "step": 4935, "train_speed(iter/s)": 0.583092 }, { "acc": 0.99974127, "epoch": 5.238600212089078, "grad_norm": 0.1396360993385315, "learning_rate": 5.01978665870267e-06, "loss": 0.00160907, "memory(GiB)": 26.31, "step": 4940, "train_speed(iter/s)": 0.583091 }, { "acc": 0.99960365, "epoch": 5.2439024390243905, "grad_norm": 0.18549151718616486, "learning_rate": 5.011020013985961e-06, "loss": 0.00230109, "memory(GiB)": 26.31, "step": 4945, "train_speed(iter/s)": 0.583095 }, { "acc": 0.99975605, "epoch": 5.249204665959703, "grad_norm": 0.3893892467021942, "learning_rate": 5.002253336922267e-06, "loss": 0.00109208, "memory(GiB)": 26.31, "step": 4950, "train_speed(iter/s)": 0.583098 }, { "acc": 0.99975195, "epoch": 5.254506892895016, "grad_norm": 0.3478403389453888, "learning_rate": 4.993486654467404e-06, "loss": 0.00157792, "memory(GiB)": 26.31, "step": 4955, "train_speed(iter/s)": 0.583105 }, { "acc": 0.99963999, "epoch": 5.2598091198303285, "grad_norm": 0.4681493639945984, "learning_rate": 4.984719993577207e-06, "loss": 0.00123787, "memory(GiB)": 26.31, "step": 4960, "train_speed(iter/s)": 0.583105 }, { "acc": 0.99987803, "epoch": 5.265111346765641, "grad_norm": 0.10217458754777908, "learning_rate": 4.9759533812074465e-06, "loss": 0.00166023, "memory(GiB)": 26.31, "step": 4965, "train_speed(iter/s)": 0.58311 }, { "acc": 0.99974689, "epoch": 5.270413573700955, "grad_norm": 0.04794376716017723, "learning_rate": 4.967186844313744e-06, "loss": 0.00107571, "memory(GiB)": 26.31, "step": 4970, "train_speed(iter/s)": 0.583114 }, { "acc": 0.99917164, "epoch": 5.275715800636267, "grad_norm": 1.2356112003326416, "learning_rate": 4.958420409851488e-06, "loss": 0.00387285, "memory(GiB)": 26.31, "step": 4975, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99936285, "epoch": 5.28101802757158, "grad_norm": 0.6110440492630005, "learning_rate": 4.94965410477575e-06, "loss": 0.00184677, "memory(GiB)": 26.31, "step": 4980, "train_speed(iter/s)": 0.583113 }, { "acc": 0.9995368, "epoch": 5.286320254506893, "grad_norm": 0.2816583514213562, "learning_rate": 4.940887956041206e-06, "loss": 0.0014614, "memory(GiB)": 26.31, "step": 4985, "train_speed(iter/s)": 0.583111 }, { "acc": 1.0, "epoch": 5.2916224814422055, "grad_norm": 0.28331875801086426, "learning_rate": 4.932121990602051e-06, "loss": 0.0004526, "memory(GiB)": 26.31, "step": 4990, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99961424, "epoch": 5.296924708377518, "grad_norm": 0.41179099678993225, "learning_rate": 4.9233562354119146e-06, "loss": 0.00149953, "memory(GiB)": 26.31, "step": 4995, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99923592, "epoch": 5.302226935312832, "grad_norm": 1.1502220630645752, "learning_rate": 4.914590717423784e-06, "loss": 0.00241425, "memory(GiB)": 26.31, "step": 5000, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99924545, "epoch": 5.307529162248144, "grad_norm": 0.2299642413854599, "learning_rate": 4.905825463589912e-06, "loss": 0.00222843, "memory(GiB)": 26.31, "step": 5005, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99898901, "epoch": 5.312831389183457, "grad_norm": 0.696652889251709, "learning_rate": 4.897060500861745e-06, "loss": 0.00209142, "memory(GiB)": 26.31, "step": 5010, "train_speed(iter/s)": 0.583115 }, { "acc": 0.99973383, "epoch": 5.31813361611877, "grad_norm": 0.57330721616745, "learning_rate": 4.888295856189828e-06, "loss": 0.00127123, "memory(GiB)": 26.31, "step": 5015, "train_speed(iter/s)": 0.583113 }, { "acc": 0.99952068, "epoch": 5.3234358430540825, "grad_norm": 0.04265210032463074, "learning_rate": 4.8795315565237325e-06, "loss": 0.00123829, "memory(GiB)": 26.31, "step": 5020, "train_speed(iter/s)": 0.583116 }, { "acc": 0.99938927, "epoch": 5.328738069989395, "grad_norm": 0.05427992716431618, "learning_rate": 4.870767628811968e-06, "loss": 0.00237837, "memory(GiB)": 26.31, "step": 5025, "train_speed(iter/s)": 0.583117 }, { "acc": 0.99955378, "epoch": 5.334040296924709, "grad_norm": 0.4924948215484619, "learning_rate": 4.862004100001898e-06, "loss": 0.00116051, "memory(GiB)": 26.31, "step": 5030, "train_speed(iter/s)": 0.583117 }, { "acc": 0.99962788, "epoch": 5.339342523860021, "grad_norm": 0.5894216299057007, "learning_rate": 4.853240997039663e-06, "loss": 0.00151597, "memory(GiB)": 26.31, "step": 5035, "train_speed(iter/s)": 0.58312 }, { "acc": 0.99962692, "epoch": 5.344644750795334, "grad_norm": 0.6638916730880737, "learning_rate": 4.8444783468700925e-06, "loss": 0.00201382, "memory(GiB)": 26.31, "step": 5040, "train_speed(iter/s)": 0.583123 }, { "acc": 0.99963865, "epoch": 5.349946977730647, "grad_norm": 0.33775195479393005, "learning_rate": 4.835716176436624e-06, "loss": 0.00127131, "memory(GiB)": 26.31, "step": 5045, "train_speed(iter/s)": 0.579829 }, { "acc": 0.99938679, "epoch": 5.355249204665959, "grad_norm": 0.35410913825035095, "learning_rate": 4.826954512681219e-06, "loss": 0.00201825, "memory(GiB)": 26.31, "step": 5050, "train_speed(iter/s)": 0.579835 }, { "acc": 0.99975891, "epoch": 5.360551431601272, "grad_norm": 0.05381745472550392, "learning_rate": 4.818193382544282e-06, "loss": 0.00096042, "memory(GiB)": 26.31, "step": 5055, "train_speed(iter/s)": 0.579845 }, { "acc": 0.99938011, "epoch": 5.365853658536586, "grad_norm": 0.1406700760126114, "learning_rate": 4.809432812964577e-06, "loss": 0.00262718, "memory(GiB)": 26.31, "step": 5060, "train_speed(iter/s)": 0.579849 }, { "acc": 0.99934797, "epoch": 5.371155885471898, "grad_norm": 0.4387652575969696, "learning_rate": 4.800672830879143e-06, "loss": 0.00249025, "memory(GiB)": 26.31, "step": 5065, "train_speed(iter/s)": 0.579852 }, { "acc": 0.99944916, "epoch": 5.376458112407211, "grad_norm": 0.17490963637828827, "learning_rate": 4.791913463223214e-06, "loss": 0.00172883, "memory(GiB)": 26.31, "step": 5070, "train_speed(iter/s)": 0.57986 }, { "acc": 0.9992794, "epoch": 5.381760339342524, "grad_norm": 0.642207682132721, "learning_rate": 4.7831547369301365e-06, "loss": 0.00203475, "memory(GiB)": 26.31, "step": 5075, "train_speed(iter/s)": 0.579867 }, { "acc": 0.99916515, "epoch": 5.387062566277836, "grad_norm": 0.1799846887588501, "learning_rate": 4.774396678931278e-06, "loss": 0.00362162, "memory(GiB)": 26.31, "step": 5080, "train_speed(iter/s)": 0.579871 }, { "acc": 0.99951611, "epoch": 5.392364793213149, "grad_norm": 0.38178086280822754, "learning_rate": 4.76563931615596e-06, "loss": 0.00107371, "memory(GiB)": 26.31, "step": 5085, "train_speed(iter/s)": 0.579875 }, { "acc": 0.99987803, "epoch": 5.397667020148463, "grad_norm": 0.07540518790483475, "learning_rate": 4.75688267553136e-06, "loss": 0.00127289, "memory(GiB)": 26.31, "step": 5090, "train_speed(iter/s)": 0.579882 }, { "acc": 0.9998889, "epoch": 5.402969247083775, "grad_norm": 0.10213793814182281, "learning_rate": 4.748126783982437e-06, "loss": 0.00054123, "memory(GiB)": 26.31, "step": 5095, "train_speed(iter/s)": 0.579889 }, { "acc": 0.99975166, "epoch": 5.408271474019088, "grad_norm": 0.5178495049476624, "learning_rate": 4.739371668431848e-06, "loss": 0.00119375, "memory(GiB)": 26.31, "step": 5100, "train_speed(iter/s)": 0.579894 }, { "acc": 0.99971905, "epoch": 5.413573700954401, "grad_norm": 0.019593840464949608, "learning_rate": 4.730617355799862e-06, "loss": 0.00118645, "memory(GiB)": 26.31, "step": 5105, "train_speed(iter/s)": 0.579899 }, { "acc": 0.99951067, "epoch": 5.418875927889713, "grad_norm": 0.05931418016552925, "learning_rate": 4.72186387300428e-06, "loss": 0.00153462, "memory(GiB)": 26.31, "step": 5110, "train_speed(iter/s)": 0.579902 }, { "acc": 0.99974537, "epoch": 5.424178154825027, "grad_norm": 0.16675366461277008, "learning_rate": 4.7131112469603526e-06, "loss": 0.0011792, "memory(GiB)": 26.31, "step": 5115, "train_speed(iter/s)": 0.579905 }, { "acc": 0.9995225, "epoch": 5.42948038176034, "grad_norm": 0.6896766424179077, "learning_rate": 4.704359504580694e-06, "loss": 0.0019345, "memory(GiB)": 26.31, "step": 5120, "train_speed(iter/s)": 0.579909 }, { "acc": 1.0, "epoch": 5.434782608695652, "grad_norm": 0.04017867520451546, "learning_rate": 4.695608672775202e-06, "loss": 0.00042916, "memory(GiB)": 26.31, "step": 5125, "train_speed(iter/s)": 0.57991 }, { "acc": 0.99963627, "epoch": 5.440084835630965, "grad_norm": 0.3894314765930176, "learning_rate": 4.686858778450975e-06, "loss": 0.00213848, "memory(GiB)": 26.31, "step": 5130, "train_speed(iter/s)": 0.579916 }, { "acc": 0.9996685, "epoch": 5.445387062566278, "grad_norm": 0.4688229560852051, "learning_rate": 4.678109848512228e-06, "loss": 0.00064681, "memory(GiB)": 26.31, "step": 5135, "train_speed(iter/s)": 0.57992 }, { "acc": 0.99963751, "epoch": 5.45068928950159, "grad_norm": 0.12050074338912964, "learning_rate": 4.669361909860213e-06, "loss": 0.00217735, "memory(GiB)": 26.31, "step": 5140, "train_speed(iter/s)": 0.579924 }, { "acc": 0.99936428, "epoch": 5.455991516436903, "grad_norm": 0.4008522629737854, "learning_rate": 4.660614989393132e-06, "loss": 0.001995, "memory(GiB)": 26.31, "step": 5145, "train_speed(iter/s)": 0.579927 }, { "acc": 0.99941254, "epoch": 5.461293743372217, "grad_norm": 0.2535358965396881, "learning_rate": 4.6518691140060545e-06, "loss": 0.00125535, "memory(GiB)": 26.31, "step": 5150, "train_speed(iter/s)": 0.57993 }, { "acc": 0.99964075, "epoch": 5.466595970307529, "grad_norm": 0.4533134996891022, "learning_rate": 4.64312431059084e-06, "loss": 0.00111725, "memory(GiB)": 26.31, "step": 5155, "train_speed(iter/s)": 0.579937 }, { "acc": 0.99975224, "epoch": 5.471898197242842, "grad_norm": 0.364401638507843, "learning_rate": 4.63438060603605e-06, "loss": 0.00101442, "memory(GiB)": 26.31, "step": 5160, "train_speed(iter/s)": 0.57994 }, { "acc": 0.9994957, "epoch": 5.477200424178155, "grad_norm": 0.11963590234518051, "learning_rate": 4.625638027226868e-06, "loss": 0.00167611, "memory(GiB)": 26.31, "step": 5165, "train_speed(iter/s)": 0.579947 }, { "acc": 0.99962101, "epoch": 5.482502651113467, "grad_norm": 0.37187841534614563, "learning_rate": 4.616896601045017e-06, "loss": 0.00190198, "memory(GiB)": 26.31, "step": 5170, "train_speed(iter/s)": 0.579944 }, { "acc": 0.9997221, "epoch": 5.487804878048781, "grad_norm": 0.09011055529117584, "learning_rate": 4.608156354368674e-06, "loss": 0.00183465, "memory(GiB)": 26.31, "step": 5175, "train_speed(iter/s)": 0.579952 }, { "acc": 0.99932022, "epoch": 5.493107104984094, "grad_norm": 0.5692781805992126, "learning_rate": 4.5994173140723894e-06, "loss": 0.00170957, "memory(GiB)": 26.31, "step": 5180, "train_speed(iter/s)": 0.579958 }, { "acc": 0.99976263, "epoch": 5.498409331919406, "grad_norm": 0.2293304055929184, "learning_rate": 4.590679507027005e-06, "loss": 0.00127389, "memory(GiB)": 26.31, "step": 5185, "train_speed(iter/s)": 0.57996 }, { "acc": 0.99975605, "epoch": 5.503711558854719, "grad_norm": 0.2158653885126114, "learning_rate": 4.581942960099572e-06, "loss": 0.00080789, "memory(GiB)": 26.31, "step": 5190, "train_speed(iter/s)": 0.579965 }, { "acc": 0.99951315, "epoch": 5.509013785790032, "grad_norm": 0.07053355872631073, "learning_rate": 4.5732077001532605e-06, "loss": 0.00099423, "memory(GiB)": 26.31, "step": 5195, "train_speed(iter/s)": 0.579968 }, { "acc": 0.99961748, "epoch": 5.514316012725344, "grad_norm": 0.08148149400949478, "learning_rate": 4.564473754047294e-06, "loss": 0.00141822, "memory(GiB)": 26.31, "step": 5200, "train_speed(iter/s)": 0.579971 }, { "acc": 0.99975567, "epoch": 5.519618239660657, "grad_norm": 0.391696959733963, "learning_rate": 4.555741148636848e-06, "loss": 0.00058978, "memory(GiB)": 26.31, "step": 5205, "train_speed(iter/s)": 0.579974 }, { "acc": 0.999753, "epoch": 5.524920466595971, "grad_norm": 0.0765453651547432, "learning_rate": 4.547009910772977e-06, "loss": 0.00054346, "memory(GiB)": 26.31, "step": 5210, "train_speed(iter/s)": 0.579977 }, { "acc": 0.99962978, "epoch": 5.530222693531283, "grad_norm": 0.12428858876228333, "learning_rate": 4.538280067302533e-06, "loss": 0.00103908, "memory(GiB)": 26.31, "step": 5215, "train_speed(iter/s)": 0.579983 }, { "acc": 0.99985294, "epoch": 5.535524920466596, "grad_norm": 0.0501282699406147, "learning_rate": 4.529551645068079e-06, "loss": 0.00052765, "memory(GiB)": 26.31, "step": 5220, "train_speed(iter/s)": 0.579989 }, { "acc": 0.99949055, "epoch": 5.540827147401909, "grad_norm": 0.6878573894500732, "learning_rate": 4.520824670907807e-06, "loss": 0.00138112, "memory(GiB)": 26.31, "step": 5225, "train_speed(iter/s)": 0.579995 }, { "acc": 0.9998724, "epoch": 5.546129374337221, "grad_norm": 0.4195837676525116, "learning_rate": 4.51209917165546e-06, "loss": 0.0007006, "memory(GiB)": 26.31, "step": 5230, "train_speed(iter/s)": 0.579999 }, { "acc": 0.99986629, "epoch": 5.551431601272535, "grad_norm": 0.13989785313606262, "learning_rate": 4.5033751741402414e-06, "loss": 0.00037461, "memory(GiB)": 26.31, "step": 5235, "train_speed(iter/s)": 0.580002 }, { "acc": 0.99955978, "epoch": 5.556733828207848, "grad_norm": 0.35394933819770813, "learning_rate": 4.49465270518674e-06, "loss": 0.00158651, "memory(GiB)": 26.31, "step": 5240, "train_speed(iter/s)": 0.580005 }, { "acc": 0.99952345, "epoch": 5.56203605514316, "grad_norm": 0.49062466621398926, "learning_rate": 4.485931791614843e-06, "loss": 0.00145427, "memory(GiB)": 26.31, "step": 5245, "train_speed(iter/s)": 0.580008 }, { "acc": 0.99963207, "epoch": 5.567338282078473, "grad_norm": 0.08707881718873978, "learning_rate": 4.477212460239658e-06, "loss": 0.00094686, "memory(GiB)": 26.31, "step": 5250, "train_speed(iter/s)": 0.58001 }, { "acc": 0.99987745, "epoch": 5.572640509013786, "grad_norm": 0.4491136968135834, "learning_rate": 4.468494737871423e-06, "loss": 0.00051695, "memory(GiB)": 26.31, "step": 5255, "train_speed(iter/s)": 0.580013 }, { "acc": 0.99973965, "epoch": 5.577942735949098, "grad_norm": 0.38590842485427856, "learning_rate": 4.45977865131543e-06, "loss": 0.00096882, "memory(GiB)": 26.31, "step": 5260, "train_speed(iter/s)": 0.580019 }, { "acc": 0.99960003, "epoch": 5.583244962884411, "grad_norm": 0.10056951642036438, "learning_rate": 4.451064227371946e-06, "loss": 0.0011963, "memory(GiB)": 26.31, "step": 5265, "train_speed(iter/s)": 0.580024 }, { "acc": 0.99985027, "epoch": 5.5885471898197245, "grad_norm": 0.023627113550901413, "learning_rate": 4.4423514928361204e-06, "loss": 0.00056202, "memory(GiB)": 26.31, "step": 5270, "train_speed(iter/s)": 0.580026 }, { "acc": 0.99952812, "epoch": 5.593849416755037, "grad_norm": 0.15069565176963806, "learning_rate": 4.433640474497909e-06, "loss": 0.0010099, "memory(GiB)": 26.31, "step": 5275, "train_speed(iter/s)": 0.580031 }, { "acc": 0.99974689, "epoch": 5.59915164369035, "grad_norm": 0.9341526031494141, "learning_rate": 4.424931199141993e-06, "loss": 0.00080414, "memory(GiB)": 26.31, "step": 5280, "train_speed(iter/s)": 0.580034 }, { "acc": 0.99950294, "epoch": 5.604453870625663, "grad_norm": 0.670882523059845, "learning_rate": 4.416223693547691e-06, "loss": 0.00239468, "memory(GiB)": 26.31, "step": 5285, "train_speed(iter/s)": 0.580036 }, { "acc": 0.9994276, "epoch": 5.609756097560975, "grad_norm": 0.03292407467961311, "learning_rate": 4.407517984488881e-06, "loss": 0.00245402, "memory(GiB)": 26.31, "step": 5290, "train_speed(iter/s)": 0.580039 }, { "acc": 0.99988422, "epoch": 5.615058324496289, "grad_norm": 0.22681885957717896, "learning_rate": 4.39881409873392e-06, "loss": 0.00070906, "memory(GiB)": 26.31, "step": 5295, "train_speed(iter/s)": 0.580041 }, { "acc": 0.99941082, "epoch": 5.6203605514316015, "grad_norm": 0.7250832319259644, "learning_rate": 4.390112063045555e-06, "loss": 0.00160247, "memory(GiB)": 26.31, "step": 5300, "train_speed(iter/s)": 0.580052 }, { "acc": 0.99921331, "epoch": 5.625662778366914, "grad_norm": 1.260987639427185, "learning_rate": 4.381411904180846e-06, "loss": 0.00214222, "memory(GiB)": 26.31, "step": 5305, "train_speed(iter/s)": 0.580058 }, { "acc": 0.9991765, "epoch": 5.630965005302227, "grad_norm": 0.24449238181114197, "learning_rate": 4.372713648891081e-06, "loss": 0.00172313, "memory(GiB)": 26.31, "step": 5310, "train_speed(iter/s)": 0.580061 }, { "acc": 0.99954405, "epoch": 5.63626723223754, "grad_norm": 0.8919406533241272, "learning_rate": 4.364017323921696e-06, "loss": 0.0016624, "memory(GiB)": 26.31, "step": 5315, "train_speed(iter/s)": 0.580063 }, { "acc": 0.99960203, "epoch": 5.641569459172852, "grad_norm": 0.2842835485935211, "learning_rate": 4.355322956012191e-06, "loss": 0.0010582, "memory(GiB)": 26.31, "step": 5320, "train_speed(iter/s)": 0.580066 }, { "acc": 0.99978628, "epoch": 5.646871686108166, "grad_norm": 0.6368212699890137, "learning_rate": 4.346630571896048e-06, "loss": 0.00189297, "memory(GiB)": 26.31, "step": 5325, "train_speed(iter/s)": 0.580071 }, { "acc": 0.99936647, "epoch": 5.6521739130434785, "grad_norm": 0.6470702290534973, "learning_rate": 4.337940198300652e-06, "loss": 0.00140759, "memory(GiB)": 26.31, "step": 5330, "train_speed(iter/s)": 0.580074 }, { "acc": 0.99936848, "epoch": 5.657476139978791, "grad_norm": 0.49525970220565796, "learning_rate": 4.329251861947202e-06, "loss": 0.00237361, "memory(GiB)": 26.31, "step": 5335, "train_speed(iter/s)": 0.580077 }, { "acc": 0.99964027, "epoch": 5.662778366914104, "grad_norm": 0.19526098668575287, "learning_rate": 4.320565589550637e-06, "loss": 0.00113255, "memory(GiB)": 26.31, "step": 5340, "train_speed(iter/s)": 0.580083 }, { "acc": 1.0, "epoch": 5.6680805938494165, "grad_norm": 0.20106367766857147, "learning_rate": 4.311881407819546e-06, "loss": 0.00045028, "memory(GiB)": 26.31, "step": 5345, "train_speed(iter/s)": 0.580089 }, { "acc": 0.99977512, "epoch": 5.673382820784729, "grad_norm": 0.2478099912405014, "learning_rate": 4.303199343456091e-06, "loss": 0.00076979, "memory(GiB)": 26.31, "step": 5350, "train_speed(iter/s)": 0.580095 }, { "acc": 0.99937286, "epoch": 5.678685047720043, "grad_norm": 0.7594713568687439, "learning_rate": 4.294519423155924e-06, "loss": 0.00199138, "memory(GiB)": 26.31, "step": 5355, "train_speed(iter/s)": 0.580108 }, { "acc": 0.99938717, "epoch": 5.6839872746553555, "grad_norm": 0.4824899733066559, "learning_rate": 4.285841673608106e-06, "loss": 0.0022531, "memory(GiB)": 26.31, "step": 5360, "train_speed(iter/s)": 0.580114 }, { "acc": 0.99939022, "epoch": 5.689289501590668, "grad_norm": 0.055846281349658966, "learning_rate": 4.2771661214950185e-06, "loss": 0.00241133, "memory(GiB)": 26.31, "step": 5365, "train_speed(iter/s)": 0.580123 }, { "acc": 1.0, "epoch": 5.694591728525981, "grad_norm": 0.22975574433803558, "learning_rate": 4.2684927934922925e-06, "loss": 0.00032793, "memory(GiB)": 26.31, "step": 5370, "train_speed(iter/s)": 0.580129 }, { "acc": 0.99939823, "epoch": 5.6998939554612935, "grad_norm": 0.4694140553474426, "learning_rate": 4.259821716268714e-06, "loss": 0.00242309, "memory(GiB)": 26.31, "step": 5375, "train_speed(iter/s)": 0.580134 }, { "acc": 0.99965668, "epoch": 5.705196182396606, "grad_norm": 0.22332924604415894, "learning_rate": 4.251152916486151e-06, "loss": 0.00169533, "memory(GiB)": 26.31, "step": 5380, "train_speed(iter/s)": 0.580136 }, { "acc": 0.99950085, "epoch": 5.71049840933192, "grad_norm": 0.08591309189796448, "learning_rate": 4.242486420799474e-06, "loss": 0.0019324, "memory(GiB)": 26.31, "step": 5385, "train_speed(iter/s)": 0.580138 }, { "acc": 0.99951143, "epoch": 5.7158006362672324, "grad_norm": 0.6218582391738892, "learning_rate": 4.233822255856459e-06, "loss": 0.00133204, "memory(GiB)": 26.31, "step": 5390, "train_speed(iter/s)": 0.58014 }, { "acc": 0.99975815, "epoch": 5.721102863202545, "grad_norm": 0.1280733346939087, "learning_rate": 4.225160448297724e-06, "loss": 0.00122716, "memory(GiB)": 26.31, "step": 5395, "train_speed(iter/s)": 0.580142 }, { "acc": 0.9993741, "epoch": 5.726405090137858, "grad_norm": 0.5542604327201843, "learning_rate": 4.216501024756633e-06, "loss": 0.00164174, "memory(GiB)": 26.31, "step": 5400, "train_speed(iter/s)": 0.580144 }, { "acc": 0.9996562, "epoch": 5.7317073170731705, "grad_norm": 0.3382243812084198, "learning_rate": 4.207844011859222e-06, "loss": 0.00148793, "memory(GiB)": 26.31, "step": 5405, "train_speed(iter/s)": 0.580147 }, { "acc": 0.9995121, "epoch": 5.737009544008483, "grad_norm": 0.5271421074867249, "learning_rate": 4.199189436224115e-06, "loss": 0.0008573, "memory(GiB)": 26.31, "step": 5410, "train_speed(iter/s)": 0.580148 }, { "acc": 0.99942265, "epoch": 5.742311770943797, "grad_norm": 0.20610488951206207, "learning_rate": 4.190537324462441e-06, "loss": 0.00160457, "memory(GiB)": 26.31, "step": 5415, "train_speed(iter/s)": 0.580149 }, { "acc": 0.99972239, "epoch": 5.747613997879109, "grad_norm": 0.7685815095901489, "learning_rate": 4.181887703177751e-06, "loss": 0.00161561, "memory(GiB)": 26.31, "step": 5420, "train_speed(iter/s)": 0.580155 }, { "acc": 0.99925451, "epoch": 5.752916224814422, "grad_norm": 0.04731239378452301, "learning_rate": 4.173240598965944e-06, "loss": 0.00405467, "memory(GiB)": 26.31, "step": 5425, "train_speed(iter/s)": 0.580157 }, { "acc": 0.99961529, "epoch": 5.758218451749735, "grad_norm": 0.22838689386844635, "learning_rate": 4.164596038415176e-06, "loss": 0.0010858, "memory(GiB)": 26.31, "step": 5430, "train_speed(iter/s)": 0.580164 }, { "acc": 0.9991354, "epoch": 5.7635206786850475, "grad_norm": 0.07062011957168579, "learning_rate": 4.155954048105779e-06, "loss": 0.00281267, "memory(GiB)": 26.31, "step": 5435, "train_speed(iter/s)": 0.580165 }, { "acc": 0.99977236, "epoch": 5.768822905620361, "grad_norm": 0.06927081197500229, "learning_rate": 4.1473146546101865e-06, "loss": 0.00067332, "memory(GiB)": 26.31, "step": 5440, "train_speed(iter/s)": 0.580172 }, { "acc": 0.9998889, "epoch": 5.774125132555674, "grad_norm": 0.14817936718463898, "learning_rate": 4.138677884492846e-06, "loss": 0.00083515, "memory(GiB)": 26.31, "step": 5445, "train_speed(iter/s)": 0.580178 }, { "acc": 0.99987869, "epoch": 5.779427359490986, "grad_norm": 0.05728980153799057, "learning_rate": 4.130043764310138e-06, "loss": 0.00039365, "memory(GiB)": 26.31, "step": 5450, "train_speed(iter/s)": 0.580179 }, { "acc": 0.99962482, "epoch": 5.784729586426299, "grad_norm": 0.5075556635856628, "learning_rate": 4.121412320610294e-06, "loss": 0.00210712, "memory(GiB)": 26.31, "step": 5455, "train_speed(iter/s)": 0.58018 }, { "acc": 0.99939346, "epoch": 5.790031813361612, "grad_norm": 0.4643910825252533, "learning_rate": 4.112783579933319e-06, "loss": 0.00085811, "memory(GiB)": 26.31, "step": 5460, "train_speed(iter/s)": 0.580182 }, { "acc": 1.0, "epoch": 5.7953340402969244, "grad_norm": 0.22472567856311798, "learning_rate": 4.1041575688109034e-06, "loss": 0.00029708, "memory(GiB)": 26.31, "step": 5465, "train_speed(iter/s)": 0.580188 }, { "acc": 0.99951172, "epoch": 5.800636267232237, "grad_norm": 0.0859142392873764, "learning_rate": 4.0955343137663466e-06, "loss": 0.00106492, "memory(GiB)": 26.31, "step": 5470, "train_speed(iter/s)": 0.58019 }, { "acc": 0.99962921, "epoch": 5.805938494167551, "grad_norm": 0.0403328463435173, "learning_rate": 4.086913841314474e-06, "loss": 0.0008122, "memory(GiB)": 26.31, "step": 5475, "train_speed(iter/s)": 0.580193 }, { "acc": 0.99966984, "epoch": 5.811240721102863, "grad_norm": 0.1314668506383896, "learning_rate": 4.078296177961553e-06, "loss": 0.00078211, "memory(GiB)": 26.31, "step": 5480, "train_speed(iter/s)": 0.580202 }, { "acc": 0.99951582, "epoch": 5.816542948038176, "grad_norm": 0.12210696190595627, "learning_rate": 4.069681350205214e-06, "loss": 0.00171753, "memory(GiB)": 26.31, "step": 5485, "train_speed(iter/s)": 0.580204 }, { "acc": 0.99915504, "epoch": 5.821845174973489, "grad_norm": 0.7598603963851929, "learning_rate": 4.06106938453437e-06, "loss": 0.00366015, "memory(GiB)": 26.31, "step": 5490, "train_speed(iter/s)": 0.580207 }, { "acc": 0.99903126, "epoch": 5.827147401908801, "grad_norm": 0.2603316903114319, "learning_rate": 4.0524603074291355e-06, "loss": 0.00248764, "memory(GiB)": 26.31, "step": 5495, "train_speed(iter/s)": 0.580214 }, { "acc": 0.99948807, "epoch": 5.832449628844115, "grad_norm": 0.06389316916465759, "learning_rate": 4.043854145360737e-06, "loss": 0.00247518, "memory(GiB)": 26.31, "step": 5500, "train_speed(iter/s)": 0.580215 }, { "acc": 0.99949951, "epoch": 5.837751855779428, "grad_norm": 0.43321940302848816, "learning_rate": 4.035250924791445e-06, "loss": 0.00341372, "memory(GiB)": 26.31, "step": 5505, "train_speed(iter/s)": 0.580221 }, { "acc": 1.0, "epoch": 5.84305408271474, "grad_norm": 0.21487638354301453, "learning_rate": 4.026650672174478e-06, "loss": 0.0005853, "memory(GiB)": 26.31, "step": 5510, "train_speed(iter/s)": 0.580226 }, { "acc": 0.99974747, "epoch": 5.848356309650053, "grad_norm": 0.10891429334878922, "learning_rate": 4.018053413953936e-06, "loss": 0.00077117, "memory(GiB)": 26.31, "step": 5515, "train_speed(iter/s)": 0.580229 }, { "acc": 0.99962959, "epoch": 5.853658536585366, "grad_norm": 0.06120121479034424, "learning_rate": 4.0094591765647055e-06, "loss": 0.00084518, "memory(GiB)": 26.31, "step": 5520, "train_speed(iter/s)": 0.580233 }, { "acc": 0.99952469, "epoch": 5.858960763520678, "grad_norm": 0.7826321125030518, "learning_rate": 4.00086798643239e-06, "loss": 0.00178249, "memory(GiB)": 26.31, "step": 5525, "train_speed(iter/s)": 0.580244 }, { "acc": 0.99960136, "epoch": 5.864262990455991, "grad_norm": 0.0882963016629219, "learning_rate": 3.992279869973219e-06, "loss": 0.00086801, "memory(GiB)": 26.31, "step": 5530, "train_speed(iter/s)": 0.580247 }, { "acc": 0.99987049, "epoch": 5.869565217391305, "grad_norm": 0.08971802890300751, "learning_rate": 3.983694853593975e-06, "loss": 0.00069402, "memory(GiB)": 26.31, "step": 5535, "train_speed(iter/s)": 0.580249 }, { "acc": 0.99986706, "epoch": 5.874867444326617, "grad_norm": 0.09412523359060287, "learning_rate": 3.975112963691903e-06, "loss": 0.00077367, "memory(GiB)": 26.31, "step": 5540, "train_speed(iter/s)": 0.580254 }, { "acc": 0.99965477, "epoch": 5.88016967126193, "grad_norm": 0.44179901480674744, "learning_rate": 3.966534226654638e-06, "loss": 0.00075631, "memory(GiB)": 26.31, "step": 5545, "train_speed(iter/s)": 0.580256 }, { "acc": 0.99975748, "epoch": 5.885471898197243, "grad_norm": 0.07655708491802216, "learning_rate": 3.957958668860124e-06, "loss": 0.00042496, "memory(GiB)": 26.31, "step": 5550, "train_speed(iter/s)": 0.580261 }, { "acc": 1.0, "epoch": 5.890774125132555, "grad_norm": 0.11391156911849976, "learning_rate": 3.9493863166765216e-06, "loss": 0.00015957, "memory(GiB)": 26.31, "step": 5555, "train_speed(iter/s)": 0.580265 }, { "acc": 0.99986839, "epoch": 5.896076352067869, "grad_norm": 0.0224043820053339, "learning_rate": 3.940817196462143e-06, "loss": 0.00036973, "memory(GiB)": 26.31, "step": 5560, "train_speed(iter/s)": 0.580273 }, { "acc": 0.99961348, "epoch": 5.901378579003182, "grad_norm": 0.495105504989624, "learning_rate": 3.932251334565355e-06, "loss": 0.00118398, "memory(GiB)": 26.31, "step": 5565, "train_speed(iter/s)": 0.580274 }, { "acc": 0.99988375, "epoch": 5.906680805938494, "grad_norm": 0.08777466416358948, "learning_rate": 3.923688757324512e-06, "loss": 0.00026788, "memory(GiB)": 26.31, "step": 5570, "train_speed(iter/s)": 0.580277 }, { "acc": 0.99988422, "epoch": 5.911983032873807, "grad_norm": 0.014756974764168262, "learning_rate": 3.915129491067865e-06, "loss": 0.00051979, "memory(GiB)": 26.31, "step": 5575, "train_speed(iter/s)": 0.580279 }, { "acc": 0.9996336, "epoch": 5.91728525980912, "grad_norm": 0.39819061756134033, "learning_rate": 3.906573562113485e-06, "loss": 0.00142694, "memory(GiB)": 26.31, "step": 5580, "train_speed(iter/s)": 0.580286 }, { "acc": 0.99911852, "epoch": 5.922587486744432, "grad_norm": 0.43225204944610596, "learning_rate": 3.898020996769183e-06, "loss": 0.00255523, "memory(GiB)": 26.31, "step": 5585, "train_speed(iter/s)": 0.58029 }, { "acc": 0.99950981, "epoch": 5.927889713679745, "grad_norm": 0.39705580472946167, "learning_rate": 3.8894718213324265e-06, "loss": 0.00156185, "memory(GiB)": 26.31, "step": 5590, "train_speed(iter/s)": 0.580292 }, { "acc": 0.99985552, "epoch": 5.933191940615059, "grad_norm": 0.013175041414797306, "learning_rate": 3.88092606209026e-06, "loss": 0.00043316, "memory(GiB)": 26.31, "step": 5595, "train_speed(iter/s)": 0.580292 }, { "acc": 0.99986343, "epoch": 5.938494167550371, "grad_norm": 0.03439586982131004, "learning_rate": 3.872383745319222e-06, "loss": 0.00046334, "memory(GiB)": 26.31, "step": 5600, "train_speed(iter/s)": 0.580293 }, { "acc": 0.99962111, "epoch": 5.943796394485684, "grad_norm": 0.24579821527004242, "learning_rate": 3.8638448972852696e-06, "loss": 0.00096211, "memory(GiB)": 26.31, "step": 5605, "train_speed(iter/s)": 0.580295 }, { "acc": 0.99963531, "epoch": 5.949098621420997, "grad_norm": 0.3907880187034607, "learning_rate": 3.8553095442436914e-06, "loss": 0.00164328, "memory(GiB)": 26.31, "step": 5610, "train_speed(iter/s)": 0.580297 }, { "acc": 0.99963646, "epoch": 5.954400848356309, "grad_norm": 0.0487142838537693, "learning_rate": 3.8467777124390305e-06, "loss": 0.00131084, "memory(GiB)": 26.31, "step": 5615, "train_speed(iter/s)": 0.580299 }, { "acc": 0.99961796, "epoch": 5.959703075291623, "grad_norm": 0.18401463329792023, "learning_rate": 3.838249428105002e-06, "loss": 0.00148167, "memory(GiB)": 26.31, "step": 5620, "train_speed(iter/s)": 0.580301 }, { "acc": 0.9992384, "epoch": 5.965005302226936, "grad_norm": 0.6816220879554749, "learning_rate": 3.829724717464415e-06, "loss": 0.00182049, "memory(GiB)": 26.31, "step": 5625, "train_speed(iter/s)": 0.580303 }, { "acc": 0.99937592, "epoch": 5.970307529162248, "grad_norm": 0.031764183193445206, "learning_rate": 3.82120360672909e-06, "loss": 0.00159306, "memory(GiB)": 26.31, "step": 5630, "train_speed(iter/s)": 0.580304 }, { "acc": 0.99964085, "epoch": 5.975609756097561, "grad_norm": 0.01764695718884468, "learning_rate": 3.812686122099777e-06, "loss": 0.00055448, "memory(GiB)": 26.31, "step": 5635, "train_speed(iter/s)": 0.580313 }, { "acc": 0.99962826, "epoch": 5.980911983032874, "grad_norm": 0.054920535534620285, "learning_rate": 3.8041722897660766e-06, "loss": 0.00077166, "memory(GiB)": 26.31, "step": 5640, "train_speed(iter/s)": 0.580314 }, { "acc": 0.99951496, "epoch": 5.986214209968186, "grad_norm": 0.5747213959693909, "learning_rate": 3.7956621359063607e-06, "loss": 0.00069109, "memory(GiB)": 26.31, "step": 5645, "train_speed(iter/s)": 0.580319 }, { "acc": 0.99986773, "epoch": 5.991516436903499, "grad_norm": 0.014520245604217052, "learning_rate": 3.7871556866876886e-06, "loss": 0.00121044, "memory(GiB)": 26.31, "step": 5650, "train_speed(iter/s)": 0.580322 }, { "acc": 0.99964981, "epoch": 5.996818663838813, "grad_norm": 0.18743537366390228, "learning_rate": 3.7786529682657307e-06, "loss": 0.00232155, "memory(GiB)": 26.31, "step": 5655, "train_speed(iter/s)": 0.580325 }, { "acc": 0.99961834, "epoch": 6.002120890774125, "grad_norm": 0.1396491974592209, "learning_rate": 3.7701540067846855e-06, "loss": 0.00328599, "memory(GiB)": 26.31, "step": 5660, "train_speed(iter/s)": 0.580272 }, { "acc": 0.99976521, "epoch": 6.007423117709438, "grad_norm": 0.07440268248319626, "learning_rate": 3.7616588283771987e-06, "loss": 0.00044463, "memory(GiB)": 26.31, "step": 5665, "train_speed(iter/s)": 0.580273 }, { "acc": 0.99973955, "epoch": 6.012725344644751, "grad_norm": 0.12697941064834595, "learning_rate": 3.7531674591642843e-06, "loss": 0.00148034, "memory(GiB)": 26.31, "step": 5670, "train_speed(iter/s)": 0.580275 }, { "acc": 0.9996397, "epoch": 6.018027571580063, "grad_norm": 0.05559253692626953, "learning_rate": 3.7446799252552435e-06, "loss": 0.00097631, "memory(GiB)": 26.31, "step": 5675, "train_speed(iter/s)": 0.580278 }, { "acc": 0.99927549, "epoch": 6.023329798515377, "grad_norm": 0.07566984742879868, "learning_rate": 3.736196252747585e-06, "loss": 0.00127908, "memory(GiB)": 26.31, "step": 5680, "train_speed(iter/s)": 0.58028 }, { "acc": 0.99911137, "epoch": 6.0286320254506895, "grad_norm": 0.5210506319999695, "learning_rate": 3.7277164677269428e-06, "loss": 0.00300755, "memory(GiB)": 26.31, "step": 5685, "train_speed(iter/s)": 0.580285 }, { "acc": 0.99961376, "epoch": 6.033934252386002, "grad_norm": 0.026948614045977592, "learning_rate": 3.7192405962670007e-06, "loss": 0.00089632, "memory(GiB)": 26.31, "step": 5690, "train_speed(iter/s)": 0.580288 }, { "acc": 0.99959393, "epoch": 6.039236479321315, "grad_norm": 0.08892546594142914, "learning_rate": 3.710768664429409e-06, "loss": 0.00135952, "memory(GiB)": 26.31, "step": 5695, "train_speed(iter/s)": 0.58029 }, { "acc": 0.99962883, "epoch": 6.044538706256628, "grad_norm": 0.039922814816236496, "learning_rate": 3.7023006982637e-06, "loss": 0.0013433, "memory(GiB)": 26.31, "step": 5700, "train_speed(iter/s)": 0.580292 }, { "acc": 0.99964199, "epoch": 6.04984093319194, "grad_norm": 0.30086588859558105, "learning_rate": 3.693836723807217e-06, "loss": 0.00114452, "memory(GiB)": 26.31, "step": 5705, "train_speed(iter/s)": 0.5803 }, { "acc": 1.0, "epoch": 6.055143160127254, "grad_norm": 0.05265273526310921, "learning_rate": 3.6853767670850277e-06, "loss": 0.00043265, "memory(GiB)": 26.31, "step": 5710, "train_speed(iter/s)": 0.580305 }, { "acc": 0.99988737, "epoch": 6.0604453870625665, "grad_norm": 0.20193615555763245, "learning_rate": 3.6769208541098445e-06, "loss": 0.0004146, "memory(GiB)": 26.31, "step": 5715, "train_speed(iter/s)": 0.58031 }, { "acc": 0.99975872, "epoch": 6.065747613997879, "grad_norm": 0.09985252469778061, "learning_rate": 3.6684690108819503e-06, "loss": 0.00076501, "memory(GiB)": 26.31, "step": 5720, "train_speed(iter/s)": 0.580312 }, { "acc": 0.99988317, "epoch": 6.071049840933192, "grad_norm": 0.12704992294311523, "learning_rate": 3.6600212633891115e-06, "loss": 0.00077298, "memory(GiB)": 26.31, "step": 5725, "train_speed(iter/s)": 0.580317 }, { "acc": 0.99954891, "epoch": 6.076352067868505, "grad_norm": 0.24382582306861877, "learning_rate": 3.6515776376064993e-06, "loss": 0.00140351, "memory(GiB)": 26.31, "step": 5730, "train_speed(iter/s)": 0.580321 }, { "acc": 0.99947262, "epoch": 6.081654294803817, "grad_norm": 0.2057705819606781, "learning_rate": 3.6431381594966132e-06, "loss": 0.00178049, "memory(GiB)": 26.31, "step": 5735, "train_speed(iter/s)": 0.580325 }, { "acc": 0.99937897, "epoch": 6.086956521739131, "grad_norm": 0.12836192548274994, "learning_rate": 3.634702855009202e-06, "loss": 0.00141335, "memory(GiB)": 26.31, "step": 5740, "train_speed(iter/s)": 0.580327 }, { "acc": 0.99946241, "epoch": 6.0922587486744435, "grad_norm": 0.024506429210305214, "learning_rate": 3.626271750081179e-06, "loss": 0.00189124, "memory(GiB)": 26.31, "step": 5745, "train_speed(iter/s)": 0.580333 }, { "acc": 0.99987984, "epoch": 6.097560975609756, "grad_norm": 0.09324854612350464, "learning_rate": 3.6178448706365425e-06, "loss": 0.00046005, "memory(GiB)": 26.31, "step": 5750, "train_speed(iter/s)": 0.580335 }, { "acc": 0.99986706, "epoch": 6.102863202545069, "grad_norm": 0.2579773962497711, "learning_rate": 3.609422242586302e-06, "loss": 0.00109415, "memory(GiB)": 26.31, "step": 5755, "train_speed(iter/s)": 0.580341 }, { "acc": 0.99936991, "epoch": 6.1081654294803815, "grad_norm": 1.3310717344284058, "learning_rate": 3.601003891828393e-06, "loss": 0.00154827, "memory(GiB)": 26.31, "step": 5760, "train_speed(iter/s)": 0.580343 }, { "acc": 0.99950371, "epoch": 6.113467656415694, "grad_norm": 0.8870381712913513, "learning_rate": 3.592589844247599e-06, "loss": 0.00193892, "memory(GiB)": 26.31, "step": 5765, "train_speed(iter/s)": 0.580345 }, { "acc": 0.9996335, "epoch": 6.118769883351008, "grad_norm": 0.17148469388484955, "learning_rate": 3.5841801257154724e-06, "loss": 0.00105367, "memory(GiB)": 26.31, "step": 5770, "train_speed(iter/s)": 0.580347 }, { "acc": 0.99933376, "epoch": 6.1240721102863205, "grad_norm": 0.3207806646823883, "learning_rate": 3.575774762090255e-06, "loss": 0.00165185, "memory(GiB)": 26.31, "step": 5775, "train_speed(iter/s)": 0.58035 }, { "acc": 0.99961987, "epoch": 6.129374337221633, "grad_norm": 0.5090070962905884, "learning_rate": 3.5673737792167974e-06, "loss": 0.00113542, "memory(GiB)": 26.31, "step": 5780, "train_speed(iter/s)": 0.580356 }, { "acc": 0.99986839, "epoch": 6.134676564156946, "grad_norm": 0.870831310749054, "learning_rate": 3.5589772029264806e-06, "loss": 0.00054594, "memory(GiB)": 26.31, "step": 5785, "train_speed(iter/s)": 0.580358 }, { "acc": 0.99954042, "epoch": 6.1399787910922585, "grad_norm": 0.043732356280088425, "learning_rate": 3.550585059037138e-06, "loss": 0.00141125, "memory(GiB)": 26.31, "step": 5790, "train_speed(iter/s)": 0.58036 }, { "acc": 0.99966211, "epoch": 6.145281018027571, "grad_norm": 0.08508095890283585, "learning_rate": 3.5421973733529703e-06, "loss": 0.00087431, "memory(GiB)": 26.31, "step": 5795, "train_speed(iter/s)": 0.580365 }, { "acc": 0.99954786, "epoch": 6.150583244962885, "grad_norm": 0.06490044295787811, "learning_rate": 3.5338141716644734e-06, "loss": 0.00147838, "memory(GiB)": 26.31, "step": 5800, "train_speed(iter/s)": 0.580367 }, { "acc": 0.99926376, "epoch": 6.1558854718981975, "grad_norm": 0.5829356908798218, "learning_rate": 3.5254354797483547e-06, "loss": 0.00214265, "memory(GiB)": 26.31, "step": 5805, "train_speed(iter/s)": 0.580374 }, { "acc": 0.99986916, "epoch": 6.16118769883351, "grad_norm": 0.05491666868329048, "learning_rate": 3.517061323367454e-06, "loss": 0.00057996, "memory(GiB)": 26.31, "step": 5810, "train_speed(iter/s)": 0.580377 }, { "acc": 0.99975939, "epoch": 6.166489925768823, "grad_norm": 0.16530375182628632, "learning_rate": 3.508691728270666e-06, "loss": 0.00065435, "memory(GiB)": 26.31, "step": 5815, "train_speed(iter/s)": 0.58038 }, { "acc": 0.99928436, "epoch": 6.1717921527041355, "grad_norm": 0.3584500849246979, "learning_rate": 3.500326720192862e-06, "loss": 0.00136761, "memory(GiB)": 26.31, "step": 5820, "train_speed(iter/s)": 0.580382 }, { "acc": 0.99927006, "epoch": 6.177094379639448, "grad_norm": 0.483460396528244, "learning_rate": 3.4919663248548074e-06, "loss": 0.00137038, "memory(GiB)": 26.31, "step": 5825, "train_speed(iter/s)": 0.580385 }, { "acc": 1.0, "epoch": 6.182396606574762, "grad_norm": 0.40385866165161133, "learning_rate": 3.483610567963083e-06, "loss": 0.00039675, "memory(GiB)": 26.31, "step": 5830, "train_speed(iter/s)": 0.580387 }, { "acc": 0.99945812, "epoch": 6.187698833510074, "grad_norm": 0.3637162744998932, "learning_rate": 3.4752594752100104e-06, "loss": 0.00089644, "memory(GiB)": 26.31, "step": 5835, "train_speed(iter/s)": 0.580392 }, { "acc": 0.99965353, "epoch": 6.193001060445387, "grad_norm": 0.01765528880059719, "learning_rate": 3.4669130722735677e-06, "loss": 0.00234709, "memory(GiB)": 26.31, "step": 5840, "train_speed(iter/s)": 0.580395 }, { "acc": 0.99975843, "epoch": 6.1983032873807, "grad_norm": 0.05318576842546463, "learning_rate": 3.4585713848173103e-06, "loss": 0.00053864, "memory(GiB)": 26.31, "step": 5845, "train_speed(iter/s)": 0.580397 }, { "acc": 0.99962549, "epoch": 6.2036055143160125, "grad_norm": 0.1479036659002304, "learning_rate": 3.450234438490302e-06, "loss": 0.00229071, "memory(GiB)": 26.31, "step": 5850, "train_speed(iter/s)": 0.580397 }, { "acc": 0.9994688, "epoch": 6.208907741251325, "grad_norm": 0.29508110880851746, "learning_rate": 3.441902258927023e-06, "loss": 0.00189362, "memory(GiB)": 26.31, "step": 5855, "train_speed(iter/s)": 0.580402 }, { "acc": 0.99962225, "epoch": 6.214209968186639, "grad_norm": 0.07441110908985138, "learning_rate": 3.4335748717472966e-06, "loss": 0.00136693, "memory(GiB)": 26.31, "step": 5860, "train_speed(iter/s)": 0.580404 }, { "acc": 0.99952202, "epoch": 6.219512195121951, "grad_norm": 0.049704521894454956, "learning_rate": 3.4252523025562127e-06, "loss": 0.00066762, "memory(GiB)": 26.31, "step": 5865, "train_speed(iter/s)": 0.580406 }, { "acc": 0.99935627, "epoch": 6.224814422057264, "grad_norm": 0.3847588896751404, "learning_rate": 3.4169345769440435e-06, "loss": 0.0027355, "memory(GiB)": 26.31, "step": 5870, "train_speed(iter/s)": 0.580411 }, { "acc": 0.99940147, "epoch": 6.230116648992577, "grad_norm": 0.10670210421085358, "learning_rate": 3.4086217204861722e-06, "loss": 0.00174768, "memory(GiB)": 26.31, "step": 5875, "train_speed(iter/s)": 0.580413 }, { "acc": 0.99975185, "epoch": 6.2354188759278895, "grad_norm": 0.04808919504284859, "learning_rate": 3.400313758743006e-06, "loss": 0.00071273, "memory(GiB)": 26.31, "step": 5880, "train_speed(iter/s)": 0.580414 }, { "acc": 0.99987183, "epoch": 6.240721102863202, "grad_norm": 0.04007013142108917, "learning_rate": 3.392010717259907e-06, "loss": 0.00057667, "memory(GiB)": 26.31, "step": 5885, "train_speed(iter/s)": 0.580419 }, { "acc": 0.99977446, "epoch": 6.246023329798516, "grad_norm": 0.38751649856567383, "learning_rate": 3.383712621567104e-06, "loss": 0.0004282, "memory(GiB)": 26.31, "step": 5890, "train_speed(iter/s)": 0.580424 }, { "acc": 1.0, "epoch": 6.251325556733828, "grad_norm": 0.02446839213371277, "learning_rate": 3.37541949717962e-06, "loss": 0.00021479, "memory(GiB)": 26.31, "step": 5895, "train_speed(iter/s)": 0.580426 }, { "acc": 1.0, "epoch": 6.256627783669141, "grad_norm": 0.03881816565990448, "learning_rate": 3.367131369597193e-06, "loss": 0.00025941, "memory(GiB)": 26.31, "step": 5900, "train_speed(iter/s)": 0.580428 }, { "acc": 0.99954081, "epoch": 6.261930010604454, "grad_norm": 0.10912594944238663, "learning_rate": 3.3588482643041955e-06, "loss": 0.00139892, "memory(GiB)": 26.31, "step": 5905, "train_speed(iter/s)": 0.580434 }, { "acc": 0.9998641, "epoch": 6.267232237539766, "grad_norm": 0.03487817570567131, "learning_rate": 3.3505702067695577e-06, "loss": 0.00065392, "memory(GiB)": 26.31, "step": 5910, "train_speed(iter/s)": 0.580436 }, { "acc": 0.99950733, "epoch": 6.272534464475079, "grad_norm": 0.21163144707679749, "learning_rate": 3.3422972224466905e-06, "loss": 0.00111407, "memory(GiB)": 26.31, "step": 5915, "train_speed(iter/s)": 0.580442 }, { "acc": 1.0, "epoch": 6.277836691410393, "grad_norm": 0.3267722427845001, "learning_rate": 3.334029336773403e-06, "loss": 0.00024705, "memory(GiB)": 26.31, "step": 5920, "train_speed(iter/s)": 0.580444 }, { "acc": 0.9996254, "epoch": 6.283138918345705, "grad_norm": 0.06944040954113007, "learning_rate": 3.32576657517183e-06, "loss": 0.00047762, "memory(GiB)": 26.31, "step": 5925, "train_speed(iter/s)": 0.580446 }, { "acc": 0.99964447, "epoch": 6.288441145281018, "grad_norm": 0.5120355486869812, "learning_rate": 3.3175089630483474e-06, "loss": 0.00079916, "memory(GiB)": 26.31, "step": 5930, "train_speed(iter/s)": 0.580449 }, { "acc": 0.99948606, "epoch": 6.293743372216331, "grad_norm": 0.38711291551589966, "learning_rate": 3.3092565257935004e-06, "loss": 0.00121321, "memory(GiB)": 26.31, "step": 5935, "train_speed(iter/s)": 0.58045 }, { "acc": 0.99974937, "epoch": 6.299045599151643, "grad_norm": 0.2560780644416809, "learning_rate": 3.3010092887819207e-06, "loss": 0.00046337, "memory(GiB)": 26.31, "step": 5940, "train_speed(iter/s)": 0.580453 }, { "acc": 0.99935503, "epoch": 6.304347826086957, "grad_norm": 0.19002583622932434, "learning_rate": 3.29276727737225e-06, "loss": 0.00214132, "memory(GiB)": 26.31, "step": 5945, "train_speed(iter/s)": 0.580458 }, { "acc": 0.99951153, "epoch": 6.30965005302227, "grad_norm": 0.35275277495384216, "learning_rate": 3.2845305169070658e-06, "loss": 0.00183648, "memory(GiB)": 26.31, "step": 5950, "train_speed(iter/s)": 0.580463 }, { "acc": 0.99953251, "epoch": 6.314952279957582, "grad_norm": 0.36648836731910706, "learning_rate": 3.2762990327127924e-06, "loss": 0.00148667, "memory(GiB)": 26.31, "step": 5955, "train_speed(iter/s)": 0.580466 }, { "acc": 0.99934921, "epoch": 6.320254506892895, "grad_norm": 0.5325089693069458, "learning_rate": 3.268072850099642e-06, "loss": 0.00145414, "memory(GiB)": 26.31, "step": 5960, "train_speed(iter/s)": 0.580472 }, { "acc": 0.99988632, "epoch": 6.325556733828208, "grad_norm": 0.43265655636787415, "learning_rate": 3.259851994361516e-06, "loss": 0.00036228, "memory(GiB)": 26.31, "step": 5965, "train_speed(iter/s)": 0.580474 }, { "acc": 0.99985638, "epoch": 6.33085896076352, "grad_norm": 0.12824462354183197, "learning_rate": 3.2516364907759384e-06, "loss": 0.00056045, "memory(GiB)": 26.31, "step": 5970, "train_speed(iter/s)": 0.580479 }, { "acc": 1.0, "epoch": 6.336161187698833, "grad_norm": 0.09254786372184753, "learning_rate": 3.24342636460398e-06, "loss": 0.00025173, "memory(GiB)": 26.31, "step": 5975, "train_speed(iter/s)": 0.580481 }, { "acc": 0.99964771, "epoch": 6.341463414634147, "grad_norm": 0.018121229484677315, "learning_rate": 3.2352216410901717e-06, "loss": 0.00077881, "memory(GiB)": 26.31, "step": 5980, "train_speed(iter/s)": 0.580483 }, { "acc": 0.99964867, "epoch": 6.346765641569459, "grad_norm": 0.2775912284851074, "learning_rate": 3.227022345462438e-06, "loss": 0.00143707, "memory(GiB)": 26.31, "step": 5985, "train_speed(iter/s)": 0.580484 }, { "acc": 0.9995945, "epoch": 6.352067868504772, "grad_norm": 0.21149085462093353, "learning_rate": 3.218828502932011e-06, "loss": 0.00178856, "memory(GiB)": 26.31, "step": 5990, "train_speed(iter/s)": 0.580487 }, { "acc": 0.99965143, "epoch": 6.357370095440085, "grad_norm": 0.11793508380651474, "learning_rate": 3.210640138693354e-06, "loss": 0.00077981, "memory(GiB)": 26.31, "step": 5995, "train_speed(iter/s)": 0.580491 }, { "acc": 1.0, "epoch": 6.362672322375397, "grad_norm": 0.08975546807050705, "learning_rate": 3.2024572779240894e-06, "loss": 0.00041794, "memory(GiB)": 26.31, "step": 6000, "train_speed(iter/s)": 0.580493 }, { "acc": 0.99987803, "epoch": 6.367974549310711, "grad_norm": 0.07812675833702087, "learning_rate": 3.1942799457849133e-06, "loss": 0.00044167, "memory(GiB)": 26.31, "step": 6005, "train_speed(iter/s)": 0.580495 }, { "acc": 0.99948139, "epoch": 6.373276776246024, "grad_norm": 0.2446129322052002, "learning_rate": 3.1861081674195256e-06, "loss": 0.00090716, "memory(GiB)": 26.31, "step": 6010, "train_speed(iter/s)": 0.580499 }, { "acc": 0.99952393, "epoch": 6.378579003181336, "grad_norm": 0.06943213939666748, "learning_rate": 3.1779419679545477e-06, "loss": 0.00096079, "memory(GiB)": 26.31, "step": 6015, "train_speed(iter/s)": 0.580503 }, { "acc": 0.99987497, "epoch": 6.383881230116649, "grad_norm": 0.0138889504596591, "learning_rate": 3.1697813724994486e-06, "loss": 0.00091873, "memory(GiB)": 26.31, "step": 6020, "train_speed(iter/s)": 0.580504 }, { "acc": 0.99928799, "epoch": 6.389183457051962, "grad_norm": 0.6376550793647766, "learning_rate": 3.161626406146464e-06, "loss": 0.00331742, "memory(GiB)": 26.31, "step": 6025, "train_speed(iter/s)": 0.580505 }, { "acc": 0.99977684, "epoch": 6.394485683987274, "grad_norm": 0.24388806521892548, "learning_rate": 3.153477093970523e-06, "loss": 0.00077564, "memory(GiB)": 26.31, "step": 6030, "train_speed(iter/s)": 0.580507 }, { "acc": 0.9996335, "epoch": 6.399787910922587, "grad_norm": 0.03800429403781891, "learning_rate": 3.1453334610291675e-06, "loss": 0.00044972, "memory(GiB)": 26.31, "step": 6035, "train_speed(iter/s)": 0.580512 }, { "acc": 0.9997385, "epoch": 6.405090137857901, "grad_norm": 0.27016544342041016, "learning_rate": 3.1371955323624764e-06, "loss": 0.00117639, "memory(GiB)": 26.31, "step": 6040, "train_speed(iter/s)": 0.580514 }, { "acc": 0.99986916, "epoch": 6.410392364793213, "grad_norm": 0.4027722179889679, "learning_rate": 3.1290633329929947e-06, "loss": 0.00075675, "memory(GiB)": 26.31, "step": 6045, "train_speed(iter/s)": 0.580516 }, { "acc": 0.9998744, "epoch": 6.415694591728526, "grad_norm": 0.09925177693367004, "learning_rate": 3.1209368879256437e-06, "loss": 0.00023921, "memory(GiB)": 26.31, "step": 6050, "train_speed(iter/s)": 0.580523 }, { "acc": 0.9996172, "epoch": 6.420996818663839, "grad_norm": 0.04051406309008598, "learning_rate": 3.112816222147655e-06, "loss": 0.00120936, "memory(GiB)": 26.31, "step": 6055, "train_speed(iter/s)": 0.580525 }, { "acc": 0.99975414, "epoch": 6.426299045599151, "grad_norm": 0.10684379935264587, "learning_rate": 3.1047013606284887e-06, "loss": 0.00150528, "memory(GiB)": 26.31, "step": 6060, "train_speed(iter/s)": 0.58053 }, { "acc": 0.99965429, "epoch": 6.431601272534465, "grad_norm": 0.0392468124628067, "learning_rate": 3.096592328319758e-06, "loss": 0.00054083, "memory(GiB)": 26.31, "step": 6065, "train_speed(iter/s)": 0.580537 }, { "acc": 0.99987803, "epoch": 6.436903499469778, "grad_norm": 0.04810251295566559, "learning_rate": 3.0884891501551552e-06, "loss": 0.00025685, "memory(GiB)": 26.31, "step": 6070, "train_speed(iter/s)": 0.580542 }, { "acc": 0.99963684, "epoch": 6.44220572640509, "grad_norm": 0.06519544869661331, "learning_rate": 3.0803918510503688e-06, "loss": 0.00039638, "memory(GiB)": 26.31, "step": 6075, "train_speed(iter/s)": 0.580553 }, { "acc": 0.99988785, "epoch": 6.447507953340403, "grad_norm": 0.024030832573771477, "learning_rate": 3.072300455903011e-06, "loss": 0.00040283, "memory(GiB)": 26.31, "step": 6080, "train_speed(iter/s)": 0.580555 }, { "acc": 0.99975195, "epoch": 6.452810180275716, "grad_norm": 0.03808549419045448, "learning_rate": 3.0642149895925435e-06, "loss": 0.00141243, "memory(GiB)": 26.31, "step": 6085, "train_speed(iter/s)": 0.580556 }, { "acc": 0.99953356, "epoch": 6.458112407211028, "grad_norm": 0.032950401306152344, "learning_rate": 3.056135476980193e-06, "loss": 0.00124777, "memory(GiB)": 26.31, "step": 6090, "train_speed(iter/s)": 0.580558 }, { "acc": 0.99964943, "epoch": 6.463414634146342, "grad_norm": 0.09107203036546707, "learning_rate": 3.0480619429088845e-06, "loss": 0.00064693, "memory(GiB)": 26.31, "step": 6095, "train_speed(iter/s)": 0.580561 }, { "acc": 0.99975853, "epoch": 6.468716861081655, "grad_norm": 0.4441612660884857, "learning_rate": 3.039994412203155e-06, "loss": 0.00034327, "memory(GiB)": 26.31, "step": 6100, "train_speed(iter/s)": 0.580563 }, { "acc": 0.99963551, "epoch": 6.474019088016967, "grad_norm": 0.2487131804227829, "learning_rate": 3.0319329096690882e-06, "loss": 0.00101695, "memory(GiB)": 26.31, "step": 6105, "train_speed(iter/s)": 0.580564 }, { "acc": 0.99965181, "epoch": 6.47932131495228, "grad_norm": 0.5057806968688965, "learning_rate": 3.023877460094226e-06, "loss": 0.00179216, "memory(GiB)": 26.31, "step": 6110, "train_speed(iter/s)": 0.580565 }, { "acc": 0.99960289, "epoch": 6.484623541887593, "grad_norm": 0.1770421415567398, "learning_rate": 3.0158280882475062e-06, "loss": 0.00084595, "memory(GiB)": 26.31, "step": 6115, "train_speed(iter/s)": 0.580567 }, { "acc": 0.99987621, "epoch": 6.489925768822905, "grad_norm": 0.013583851978182793, "learning_rate": 3.0077848188791724e-06, "loss": 0.00031157, "memory(GiB)": 26.31, "step": 6120, "train_speed(iter/s)": 0.580567 }, { "acc": 0.99923391, "epoch": 6.495227995758219, "grad_norm": 0.10873568058013916, "learning_rate": 2.999747676720706e-06, "loss": 0.00287378, "memory(GiB)": 26.31, "step": 6125, "train_speed(iter/s)": 0.580569 }, { "acc": 0.9996233, "epoch": 6.5005302226935315, "grad_norm": 0.021153470501303673, "learning_rate": 2.991716686484751e-06, "loss": 0.00045944, "memory(GiB)": 26.31, "step": 6130, "train_speed(iter/s)": 0.580571 }, { "acc": 0.99950199, "epoch": 6.505832449628844, "grad_norm": 0.4688558876514435, "learning_rate": 2.9836918728650304e-06, "loss": 0.00141591, "memory(GiB)": 26.31, "step": 6135, "train_speed(iter/s)": 0.580573 }, { "acc": 0.99975605, "epoch": 6.511134676564157, "grad_norm": 0.511949896812439, "learning_rate": 2.97567326053628e-06, "loss": 0.00100874, "memory(GiB)": 26.31, "step": 6140, "train_speed(iter/s)": 0.580578 }, { "acc": 1.0, "epoch": 6.51643690349947, "grad_norm": 0.07330206036567688, "learning_rate": 2.967660874154166e-06, "loss": 0.00015084, "memory(GiB)": 26.31, "step": 6145, "train_speed(iter/s)": 0.58058 }, { "acc": 0.99946985, "epoch": 6.521739130434782, "grad_norm": 0.05954563617706299, "learning_rate": 2.9596547383552127e-06, "loss": 0.00098971, "memory(GiB)": 26.31, "step": 6150, "train_speed(iter/s)": 0.580583 }, { "acc": 0.9996397, "epoch": 6.527041357370095, "grad_norm": 0.08538658916950226, "learning_rate": 2.9516548777567216e-06, "loss": 0.00134288, "memory(GiB)": 26.31, "step": 6155, "train_speed(iter/s)": 0.580586 }, { "acc": 0.99972944, "epoch": 6.5323435843054085, "grad_norm": 0.03336101025342941, "learning_rate": 2.9436613169567006e-06, "loss": 0.00101618, "memory(GiB)": 26.31, "step": 6160, "train_speed(iter/s)": 0.580587 }, { "acc": 0.9998641, "epoch": 6.537645811240721, "grad_norm": 0.1260402351617813, "learning_rate": 2.9356740805337897e-06, "loss": 0.00048851, "memory(GiB)": 26.31, "step": 6165, "train_speed(iter/s)": 0.58059 }, { "acc": 0.99977436, "epoch": 6.542948038176034, "grad_norm": 0.1818159520626068, "learning_rate": 2.9276931930471765e-06, "loss": 0.00140418, "memory(GiB)": 26.31, "step": 6170, "train_speed(iter/s)": 0.580592 }, { "acc": 0.99977322, "epoch": 6.548250265111347, "grad_norm": 0.5192388296127319, "learning_rate": 2.919718679036535e-06, "loss": 0.00106864, "memory(GiB)": 26.31, "step": 6175, "train_speed(iter/s)": 0.580594 }, { "acc": 0.99987564, "epoch": 6.553552492046659, "grad_norm": 0.46558713912963867, "learning_rate": 2.9117505630219366e-06, "loss": 0.00072226, "memory(GiB)": 26.31, "step": 6180, "train_speed(iter/s)": 0.580594 }, { "acc": 0.99974155, "epoch": 6.558854718981973, "grad_norm": 0.03769663721323013, "learning_rate": 2.903788869503782e-06, "loss": 0.00071387, "memory(GiB)": 26.31, "step": 6185, "train_speed(iter/s)": 0.580596 }, { "acc": 1.0, "epoch": 6.5641569459172855, "grad_norm": 0.009774613194167614, "learning_rate": 2.8958336229627208e-06, "loss": 0.00032132, "memory(GiB)": 26.31, "step": 6190, "train_speed(iter/s)": 0.580602 }, { "acc": 0.99953594, "epoch": 6.569459172852598, "grad_norm": 0.061937376856803894, "learning_rate": 2.8878848478595844e-06, "loss": 0.00216506, "memory(GiB)": 26.31, "step": 6195, "train_speed(iter/s)": 0.580604 }, { "acc": 0.99976463, "epoch": 6.574761399787911, "grad_norm": 0.03381747752428055, "learning_rate": 2.8799425686353022e-06, "loss": 0.00101449, "memory(GiB)": 26.31, "step": 6200, "train_speed(iter/s)": 0.580608 }, { "acc": 0.99985552, "epoch": 6.5800636267232235, "grad_norm": 0.02245154045522213, "learning_rate": 2.8720068097108316e-06, "loss": 0.00081072, "memory(GiB)": 26.31, "step": 6205, "train_speed(iter/s)": 0.580613 }, { "acc": 0.99988422, "epoch": 6.585365853658536, "grad_norm": 0.040584757924079895, "learning_rate": 2.8640775954870803e-06, "loss": 0.00035723, "memory(GiB)": 26.31, "step": 6210, "train_speed(iter/s)": 0.580614 }, { "acc": 1.0, "epoch": 6.59066808059385, "grad_norm": 0.02291068434715271, "learning_rate": 2.856154950344833e-06, "loss": 0.00016179, "memory(GiB)": 26.31, "step": 6215, "train_speed(iter/s)": 0.580616 }, { "acc": 0.99978943, "epoch": 6.5959703075291625, "grad_norm": 0.16216276586055756, "learning_rate": 2.8482388986446763e-06, "loss": 0.00112295, "memory(GiB)": 26.31, "step": 6220, "train_speed(iter/s)": 0.580618 }, { "acc": 0.99913769, "epoch": 6.601272534464475, "grad_norm": 0.33439528942108154, "learning_rate": 2.84032946472692e-06, "loss": 0.00229247, "memory(GiB)": 26.31, "step": 6225, "train_speed(iter/s)": 0.580622 }, { "acc": 0.99950619, "epoch": 6.606574761399788, "grad_norm": 0.15778516232967377, "learning_rate": 2.8324266729115323e-06, "loss": 0.00081515, "memory(GiB)": 26.31, "step": 6230, "train_speed(iter/s)": 0.580624 }, { "acc": 0.99988585, "epoch": 6.6118769883351005, "grad_norm": 0.056077904999256134, "learning_rate": 2.824530547498052e-06, "loss": 0.00068123, "memory(GiB)": 26.31, "step": 6235, "train_speed(iter/s)": 0.580629 }, { "acc": 0.99976425, "epoch": 6.617179215270413, "grad_norm": 0.3225567042827606, "learning_rate": 2.816641112765523e-06, "loss": 0.00101443, "memory(GiB)": 26.31, "step": 6240, "train_speed(iter/s)": 0.580632 }, { "acc": 0.99988632, "epoch": 6.622481442205727, "grad_norm": 0.33098331093788147, "learning_rate": 2.8087583929724137e-06, "loss": 0.00050625, "memory(GiB)": 26.31, "step": 6245, "train_speed(iter/s)": 0.580633 }, { "acc": 0.99946451, "epoch": 6.6277836691410394, "grad_norm": 0.08109508454799652, "learning_rate": 2.80088241235655e-06, "loss": 0.00112214, "memory(GiB)": 26.31, "step": 6250, "train_speed(iter/s)": 0.580634 }, { "acc": 0.99975662, "epoch": 6.633085896076352, "grad_norm": 0.02807941474020481, "learning_rate": 2.793013195135032e-06, "loss": 0.00050048, "memory(GiB)": 26.31, "step": 6255, "train_speed(iter/s)": 0.58064 }, { "acc": 0.99951725, "epoch": 6.638388123011665, "grad_norm": 0.6170868277549744, "learning_rate": 2.7851507655041646e-06, "loss": 0.00163178, "memory(GiB)": 26.31, "step": 6260, "train_speed(iter/s)": 0.580648 }, { "acc": 0.99974747, "epoch": 6.6436903499469775, "grad_norm": 0.03456702455878258, "learning_rate": 2.777295147639385e-06, "loss": 0.00059733, "memory(GiB)": 26.31, "step": 6265, "train_speed(iter/s)": 0.580653 }, { "acc": 0.99989538, "epoch": 6.648992576882291, "grad_norm": 0.032612890005111694, "learning_rate": 2.7694463656951816e-06, "loss": 0.00089563, "memory(GiB)": 26.31, "step": 6270, "train_speed(iter/s)": 0.580658 }, { "acc": 0.99964666, "epoch": 6.654294803817604, "grad_norm": 0.13008488714694977, "learning_rate": 2.761604443805025e-06, "loss": 0.00074435, "memory(GiB)": 26.31, "step": 6275, "train_speed(iter/s)": 0.580662 }, { "acc": 1.0, "epoch": 6.659597030752916, "grad_norm": 0.03558952361345291, "learning_rate": 2.7537694060812935e-06, "loss": 0.00016105, "memory(GiB)": 26.31, "step": 6280, "train_speed(iter/s)": 0.580663 }, { "acc": 0.99962788, "epoch": 6.664899257688229, "grad_norm": 0.5213852524757385, "learning_rate": 2.7459412766151992e-06, "loss": 0.0011747, "memory(GiB)": 26.31, "step": 6285, "train_speed(iter/s)": 0.580668 }, { "acc": 0.99911413, "epoch": 6.670201484623542, "grad_norm": 0.12430860847234726, "learning_rate": 2.738120079476708e-06, "loss": 0.00155401, "memory(GiB)": 26.31, "step": 6290, "train_speed(iter/s)": 0.580669 }, { "acc": 0.99988937, "epoch": 6.6755037115588545, "grad_norm": 0.04833541810512543, "learning_rate": 2.730305838714476e-06, "loss": 0.00025307, "memory(GiB)": 26.31, "step": 6295, "train_speed(iter/s)": 0.58067 }, { "acc": 0.99987307, "epoch": 6.680805938494167, "grad_norm": 0.21674484014511108, "learning_rate": 2.7224985783557656e-06, "loss": 0.00076243, "memory(GiB)": 26.31, "step": 6300, "train_speed(iter/s)": 0.580672 }, { "acc": 0.99974499, "epoch": 6.686108165429481, "grad_norm": 0.062140047550201416, "learning_rate": 2.7146983224063777e-06, "loss": 0.00059126, "memory(GiB)": 26.31, "step": 6305, "train_speed(iter/s)": 0.580674 }, { "acc": 0.99949837, "epoch": 6.691410392364793, "grad_norm": 0.5021100044250488, "learning_rate": 2.7069050948505743e-06, "loss": 0.00240989, "memory(GiB)": 26.31, "step": 6310, "train_speed(iter/s)": 0.580675 }, { "acc": 0.99959183, "epoch": 6.696712619300106, "grad_norm": 0.02061682753264904, "learning_rate": 2.6991189196510078e-06, "loss": 0.00075686, "memory(GiB)": 26.31, "step": 6315, "train_speed(iter/s)": 0.580677 }, { "acc": 0.99961433, "epoch": 6.702014846235419, "grad_norm": 0.011941331438720226, "learning_rate": 2.691339820748646e-06, "loss": 0.00153979, "memory(GiB)": 26.31, "step": 6320, "train_speed(iter/s)": 0.580678 }, { "acc": 0.9996357, "epoch": 6.7073170731707314, "grad_norm": 0.03024384006857872, "learning_rate": 2.683567822062698e-06, "loss": 0.00087377, "memory(GiB)": 26.31, "step": 6325, "train_speed(iter/s)": 0.58068 }, { "acc": 0.99974327, "epoch": 6.712619300106045, "grad_norm": 0.0527118556201458, "learning_rate": 2.6758029474905423e-06, "loss": 0.00085505, "memory(GiB)": 26.31, "step": 6330, "train_speed(iter/s)": 0.580683 }, { "acc": 0.99950848, "epoch": 6.717921527041358, "grad_norm": 0.009388357400894165, "learning_rate": 2.6680452209076494e-06, "loss": 0.00144344, "memory(GiB)": 26.31, "step": 6335, "train_speed(iter/s)": 0.580686 }, { "acc": 0.99961433, "epoch": 6.72322375397667, "grad_norm": 0.1637595295906067, "learning_rate": 2.6602946661675144e-06, "loss": 0.00094269, "memory(GiB)": 26.31, "step": 6340, "train_speed(iter/s)": 0.580692 }, { "acc": 0.99936409, "epoch": 6.728525980911983, "grad_norm": 0.3204542100429535, "learning_rate": 2.6525513071015786e-06, "loss": 0.0012643, "memory(GiB)": 26.31, "step": 6345, "train_speed(iter/s)": 0.580696 }, { "acc": 0.99953194, "epoch": 6.733828207847296, "grad_norm": 0.26317375898361206, "learning_rate": 2.6448151675191585e-06, "loss": 0.00247378, "memory(GiB)": 26.31, "step": 6350, "train_speed(iter/s)": 0.580697 }, { "acc": 0.99960594, "epoch": 6.739130434782608, "grad_norm": 0.04262370988726616, "learning_rate": 2.6370862712073693e-06, "loss": 0.00100168, "memory(GiB)": 26.31, "step": 6355, "train_speed(iter/s)": 0.580701 }, { "acc": 0.99975929, "epoch": 6.744432661717921, "grad_norm": 0.030493035912513733, "learning_rate": 2.629364641931062e-06, "loss": 0.00062508, "memory(GiB)": 26.31, "step": 6360, "train_speed(iter/s)": 0.580702 }, { "acc": 0.99924755, "epoch": 6.749734888653235, "grad_norm": 0.13393616676330566, "learning_rate": 2.6216503034327344e-06, "loss": 0.00160743, "memory(GiB)": 26.31, "step": 6365, "train_speed(iter/s)": 0.580705 }, { "acc": 0.9998908, "epoch": 6.755037115588547, "grad_norm": 0.07078687846660614, "learning_rate": 2.613943279432472e-06, "loss": 0.00053432, "memory(GiB)": 26.31, "step": 6370, "train_speed(iter/s)": 0.580709 }, { "acc": 0.99951706, "epoch": 6.76033934252386, "grad_norm": 0.05957571789622307, "learning_rate": 2.606243593627868e-06, "loss": 0.00148512, "memory(GiB)": 26.31, "step": 6375, "train_speed(iter/s)": 0.580711 }, { "acc": 0.9995348, "epoch": 6.765641569459173, "grad_norm": 0.12280748784542084, "learning_rate": 2.598551269693951e-06, "loss": 0.00106705, "memory(GiB)": 26.31, "step": 6380, "train_speed(iter/s)": 0.580712 }, { "acc": 0.99965029, "epoch": 6.770943796394485, "grad_norm": 0.6474934816360474, "learning_rate": 2.590866331283114e-06, "loss": 0.00132872, "memory(GiB)": 26.31, "step": 6385, "train_speed(iter/s)": 0.580718 }, { "acc": 1.0, "epoch": 6.776246023329799, "grad_norm": 0.0352616161108017, "learning_rate": 2.58318880202504e-06, "loss": 0.00019549, "memory(GiB)": 26.31, "step": 6390, "train_speed(iter/s)": 0.580719 }, { "acc": 0.99986486, "epoch": 6.781548250265112, "grad_norm": 0.0694318413734436, "learning_rate": 2.5755187055266353e-06, "loss": 0.000316, "memory(GiB)": 26.31, "step": 6395, "train_speed(iter/s)": 0.580724 }, { "acc": 0.99922085, "epoch": 6.786850477200424, "grad_norm": 0.0368763767182827, "learning_rate": 2.567856065371946e-06, "loss": 0.00130442, "memory(GiB)": 26.31, "step": 6400, "train_speed(iter/s)": 0.580728 }, { "acc": 0.99974422, "epoch": 6.792152704135737, "grad_norm": 0.32792964577674866, "learning_rate": 2.5602009051220933e-06, "loss": 0.00090845, "memory(GiB)": 26.31, "step": 6405, "train_speed(iter/s)": 0.580729 }, { "acc": 0.99988422, "epoch": 6.79745493107105, "grad_norm": 0.10647040605545044, "learning_rate": 2.5525532483152006e-06, "loss": 0.00050243, "memory(GiB)": 26.31, "step": 6410, "train_speed(iter/s)": 0.580731 }, { "acc": 1.0, "epoch": 6.802757158006362, "grad_norm": 0.005562162026762962, "learning_rate": 2.5449131184663174e-06, "loss": 0.00012104, "memory(GiB)": 26.31, "step": 6415, "train_speed(iter/s)": 0.580734 }, { "acc": 0.99987679, "epoch": 6.808059384941675, "grad_norm": 0.031007250770926476, "learning_rate": 2.5372805390673477e-06, "loss": 0.0004752, "memory(GiB)": 26.31, "step": 6420, "train_speed(iter/s)": 0.580736 }, { "acc": 0.99961834, "epoch": 6.813361611876989, "grad_norm": 0.0886315107345581, "learning_rate": 2.529655533586987e-06, "loss": 0.0014479, "memory(GiB)": 26.31, "step": 6425, "train_speed(iter/s)": 0.580737 }, { "acc": 0.99987803, "epoch": 6.818663838812301, "grad_norm": 0.01469503901898861, "learning_rate": 2.522038125470636e-06, "loss": 0.00058394, "memory(GiB)": 26.31, "step": 6430, "train_speed(iter/s)": 0.580739 }, { "acc": 0.9995821, "epoch": 6.823966065747614, "grad_norm": 0.09232966601848602, "learning_rate": 2.5144283381403357e-06, "loss": 0.00080152, "memory(GiB)": 26.31, "step": 6435, "train_speed(iter/s)": 0.580741 }, { "acc": 1.0, "epoch": 6.829268292682927, "grad_norm": 0.032236695289611816, "learning_rate": 2.5068261949946947e-06, "loss": 0.00030665, "memory(GiB)": 26.31, "step": 6440, "train_speed(iter/s)": 0.580742 }, { "acc": 0.99977741, "epoch": 6.834570519618239, "grad_norm": 0.011980585753917694, "learning_rate": 2.49923171940882e-06, "loss": 0.00056685, "memory(GiB)": 26.31, "step": 6445, "train_speed(iter/s)": 0.580747 }, { "acc": 0.99964066, "epoch": 6.839872746553553, "grad_norm": 0.2035069316625595, "learning_rate": 2.491644934734238e-06, "loss": 0.00075537, "memory(GiB)": 26.31, "step": 6450, "train_speed(iter/s)": 0.580748 }, { "acc": 0.99988585, "epoch": 6.845174973488866, "grad_norm": 0.008113852702081203, "learning_rate": 2.4840658642988314e-06, "loss": 0.00041419, "memory(GiB)": 26.31, "step": 6455, "train_speed(iter/s)": 0.58075 }, { "acc": 0.99985638, "epoch": 6.850477200424178, "grad_norm": 0.04093657806515694, "learning_rate": 2.476494531406759e-06, "loss": 0.00032245, "memory(GiB)": 26.31, "step": 6460, "train_speed(iter/s)": 0.580754 }, { "acc": 0.99964466, "epoch": 6.855779427359491, "grad_norm": 0.2267787754535675, "learning_rate": 2.468930959338392e-06, "loss": 0.00157925, "memory(GiB)": 26.31, "step": 6465, "train_speed(iter/s)": 0.580755 }, { "acc": 0.99977474, "epoch": 6.861081654294804, "grad_norm": 0.025810925289988518, "learning_rate": 2.4613751713502355e-06, "loss": 0.00087813, "memory(GiB)": 26.31, "step": 6470, "train_speed(iter/s)": 0.58076 }, { "acc": 0.99976044, "epoch": 6.866383881230116, "grad_norm": 0.06630630046129227, "learning_rate": 2.4538271906748628e-06, "loss": 0.00129472, "memory(GiB)": 26.31, "step": 6475, "train_speed(iter/s)": 0.580761 }, { "acc": 0.99987984, "epoch": 6.871686108165429, "grad_norm": 0.162663072347641, "learning_rate": 2.446287040520838e-06, "loss": 0.00037377, "memory(GiB)": 26.31, "step": 6480, "train_speed(iter/s)": 0.580763 }, { "acc": 0.99947395, "epoch": 6.876988335100743, "grad_norm": 0.961998462677002, "learning_rate": 2.4387547440726496e-06, "loss": 0.00098506, "memory(GiB)": 26.31, "step": 6485, "train_speed(iter/s)": 0.580764 }, { "acc": 0.99952316, "epoch": 6.882290562036055, "grad_norm": 0.5905699729919434, "learning_rate": 2.431230324490641e-06, "loss": 0.00158523, "memory(GiB)": 26.31, "step": 6490, "train_speed(iter/s)": 0.580766 }, { "acc": 0.99961891, "epoch": 6.887592788971368, "grad_norm": 0.2418178915977478, "learning_rate": 2.42371380491093e-06, "loss": 0.00067505, "memory(GiB)": 26.31, "step": 6495, "train_speed(iter/s)": 0.580771 }, { "acc": 0.99974556, "epoch": 6.892895015906681, "grad_norm": 0.6824781894683838, "learning_rate": 2.4162052084453438e-06, "loss": 0.00131153, "memory(GiB)": 26.31, "step": 6500, "train_speed(iter/s)": 0.580775 }, { "acc": 0.99976358, "epoch": 6.898197242841993, "grad_norm": 0.055095236748456955, "learning_rate": 2.408704558181354e-06, "loss": 0.00070585, "memory(GiB)": 26.31, "step": 6505, "train_speed(iter/s)": 0.580781 }, { "acc": 0.99989271, "epoch": 6.903499469777307, "grad_norm": 0.035954736173152924, "learning_rate": 2.4012118771819924e-06, "loss": 0.00029226, "memory(GiB)": 26.31, "step": 6510, "train_speed(iter/s)": 0.580784 }, { "acc": 1.0, "epoch": 6.90880169671262, "grad_norm": 0.03536577895283699, "learning_rate": 2.3937271884857895e-06, "loss": 0.00017438, "memory(GiB)": 26.31, "step": 6515, "train_speed(iter/s)": 0.580786 }, { "acc": 0.9996439, "epoch": 6.914103923647932, "grad_norm": 0.025644930079579353, "learning_rate": 2.3862505151067004e-06, "loss": 0.00101558, "memory(GiB)": 26.31, "step": 6520, "train_speed(iter/s)": 0.580788 }, { "acc": 0.99974003, "epoch": 6.919406150583245, "grad_norm": 0.014122070744633675, "learning_rate": 2.378781880034036e-06, "loss": 0.00101173, "memory(GiB)": 26.31, "step": 6525, "train_speed(iter/s)": 0.580792 }, { "acc": 0.99975986, "epoch": 6.924708377518558, "grad_norm": 0.35527971386909485, "learning_rate": 2.3713213062323886e-06, "loss": 0.00107869, "memory(GiB)": 26.31, "step": 6530, "train_speed(iter/s)": 0.580799 }, { "acc": 0.99964218, "epoch": 6.93001060445387, "grad_norm": 0.8211327791213989, "learning_rate": 2.363868816641566e-06, "loss": 0.00098987, "memory(GiB)": 26.31, "step": 6535, "train_speed(iter/s)": 0.580804 }, { "acc": 0.99974022, "epoch": 6.935312831389183, "grad_norm": 0.1665266454219818, "learning_rate": 2.3564244341765173e-06, "loss": 0.00067698, "memory(GiB)": 26.31, "step": 6540, "train_speed(iter/s)": 0.580808 }, { "acc": 1.0, "epoch": 6.9406150583244965, "grad_norm": 0.0726097822189331, "learning_rate": 2.3489881817272647e-06, "loss": 0.00023324, "memory(GiB)": 26.31, "step": 6545, "train_speed(iter/s)": 0.580812 }, { "acc": 0.99985952, "epoch": 6.945917285259809, "grad_norm": 0.018996795639395714, "learning_rate": 2.3415600821588293e-06, "loss": 0.00049761, "memory(GiB)": 26.31, "step": 6550, "train_speed(iter/s)": 0.580813 }, { "acc": 0.99961214, "epoch": 6.951219512195122, "grad_norm": 0.3382207155227661, "learning_rate": 2.33414015831117e-06, "loss": 0.00079759, "memory(GiB)": 26.31, "step": 6555, "train_speed(iter/s)": 0.580816 }, { "acc": 1.0, "epoch": 6.956521739130435, "grad_norm": 0.018769921734929085, "learning_rate": 2.3267284329991015e-06, "loss": 0.00018237, "memory(GiB)": 26.31, "step": 6560, "train_speed(iter/s)": 0.580823 }, { "acc": 1.0, "epoch": 6.961823966065747, "grad_norm": 0.0020895751658827066, "learning_rate": 2.3193249290122304e-06, "loss": 4.325e-05, "memory(GiB)": 26.31, "step": 6565, "train_speed(iter/s)": 0.580824 }, { "acc": 0.99958591, "epoch": 6.967126193001061, "grad_norm": 0.030543280765414238, "learning_rate": 2.3119296691148854e-06, "loss": 0.00201222, "memory(GiB)": 26.31, "step": 6570, "train_speed(iter/s)": 0.580829 }, { "acc": 1.0, "epoch": 6.9724284199363735, "grad_norm": 0.012603729963302612, "learning_rate": 2.3045426760460463e-06, "loss": 0.00011642, "memory(GiB)": 26.31, "step": 6575, "train_speed(iter/s)": 0.58083 }, { "acc": 0.99963036, "epoch": 6.977730646871686, "grad_norm": 0.09143660217523575, "learning_rate": 2.2971639725192722e-06, "loss": 0.00087152, "memory(GiB)": 26.31, "step": 6580, "train_speed(iter/s)": 0.580831 }, { "acc": 1.0, "epoch": 6.983032873806999, "grad_norm": 0.02645263820886612, "learning_rate": 2.2897935812226367e-06, "loss": 0.00021495, "memory(GiB)": 26.31, "step": 6585, "train_speed(iter/s)": 0.580836 }, { "acc": 0.99941025, "epoch": 6.988335100742312, "grad_norm": 1.0920523405075073, "learning_rate": 2.2824315248186522e-06, "loss": 0.00135035, "memory(GiB)": 26.31, "step": 6590, "train_speed(iter/s)": 0.580838 }, { "acc": 0.99966278, "epoch": 6.993637327677624, "grad_norm": 0.0877826139330864, "learning_rate": 2.2750778259442033e-06, "loss": 0.00058898, "memory(GiB)": 26.31, "step": 6595, "train_speed(iter/s)": 0.580838 }, { "acc": 0.99973173, "epoch": 6.998939554612938, "grad_norm": 0.10175105929374695, "learning_rate": 2.267732507210478e-06, "loss": 0.00033401, "memory(GiB)": 26.31, "step": 6600, "train_speed(iter/s)": 0.580839 }, { "acc": 1.0, "epoch": 7.0042417815482505, "grad_norm": 0.03302436321973801, "learning_rate": 2.2603955912028968e-06, "loss": 0.00014786, "memory(GiB)": 26.31, "step": 6605, "train_speed(iter/s)": 0.580791 }, { "acc": 0.99987745, "epoch": 7.009544008483563, "grad_norm": 0.049822960048913956, "learning_rate": 2.2530671004810408e-06, "loss": 0.00034393, "memory(GiB)": 26.31, "step": 6610, "train_speed(iter/s)": 0.580792 }, { "acc": 0.99964399, "epoch": 7.014846235418876, "grad_norm": 0.014102225191891193, "learning_rate": 2.24574705757859e-06, "loss": 0.0008734, "memory(GiB)": 26.31, "step": 6615, "train_speed(iter/s)": 0.580794 }, { "acc": 0.99962807, "epoch": 7.0201484623541885, "grad_norm": 0.02806282229721546, "learning_rate": 2.238435485003244e-06, "loss": 0.00088563, "memory(GiB)": 26.31, "step": 6620, "train_speed(iter/s)": 0.580795 }, { "acc": 0.99974384, "epoch": 7.025450689289501, "grad_norm": 0.05678127333521843, "learning_rate": 2.231132405236663e-06, "loss": 0.00090697, "memory(GiB)": 26.31, "step": 6625, "train_speed(iter/s)": 0.580799 }, { "acc": 0.99975758, "epoch": 7.030752916224815, "grad_norm": 0.0719962865114212, "learning_rate": 2.223837840734388e-06, "loss": 0.00198463, "memory(GiB)": 26.31, "step": 6630, "train_speed(iter/s)": 0.5808 }, { "acc": 1.0, "epoch": 7.0360551431601275, "grad_norm": 0.022466253489255905, "learning_rate": 2.2165518139257804e-06, "loss": 4.929e-05, "memory(GiB)": 26.31, "step": 6635, "train_speed(iter/s)": 0.580802 }, { "acc": 0.9998724, "epoch": 7.04135737009544, "grad_norm": 0.05142033100128174, "learning_rate": 2.2092743472139495e-06, "loss": 0.00026469, "memory(GiB)": 26.31, "step": 6640, "train_speed(iter/s)": 0.580804 }, { "acc": 0.9998826, "epoch": 7.046659597030753, "grad_norm": 0.006165654398500919, "learning_rate": 2.2020054629756837e-06, "loss": 0.00029751, "memory(GiB)": 26.31, "step": 6645, "train_speed(iter/s)": 0.580805 }, { "acc": 0.99987984, "epoch": 7.0519618239660655, "grad_norm": 0.14492355287075043, "learning_rate": 2.1947451835613813e-06, "loss": 0.00028882, "memory(GiB)": 26.31, "step": 6650, "train_speed(iter/s)": 0.580811 }, { "acc": 0.99928799, "epoch": 7.057264050901378, "grad_norm": 0.21041271090507507, "learning_rate": 2.1874935312949828e-06, "loss": 0.00141331, "memory(GiB)": 26.31, "step": 6655, "train_speed(iter/s)": 0.580812 }, { "acc": 0.99960728, "epoch": 7.062566277836692, "grad_norm": 0.048662737011909485, "learning_rate": 2.180250528473903e-06, "loss": 0.00084743, "memory(GiB)": 26.31, "step": 6660, "train_speed(iter/s)": 0.580812 }, { "acc": 0.99976778, "epoch": 7.0678685047720045, "grad_norm": 0.033747218549251556, "learning_rate": 2.17301619736896e-06, "loss": 0.00140712, "memory(GiB)": 26.31, "step": 6665, "train_speed(iter/s)": 0.580813 }, { "acc": 0.99985123, "epoch": 7.073170731707317, "grad_norm": 0.030595524236559868, "learning_rate": 2.1657905602243093e-06, "loss": 0.00018092, "memory(GiB)": 26.31, "step": 6670, "train_speed(iter/s)": 0.580819 }, { "acc": 0.99975853, "epoch": 7.07847295864263, "grad_norm": 0.24107412993907928, "learning_rate": 2.1585736392573724e-06, "loss": 0.0007273, "memory(GiB)": 26.31, "step": 6675, "train_speed(iter/s)": 0.58082 }, { "acc": 0.99972353, "epoch": 7.0837751855779425, "grad_norm": 0.01268590334802866, "learning_rate": 2.1513654566587705e-06, "loss": 0.00061308, "memory(GiB)": 26.31, "step": 6680, "train_speed(iter/s)": 0.580821 }, { "acc": 0.99951534, "epoch": 7.089077412513255, "grad_norm": 0.0627276599407196, "learning_rate": 2.14416603459226e-06, "loss": 0.00072982, "memory(GiB)": 26.31, "step": 6685, "train_speed(iter/s)": 0.580822 }, { "acc": 0.99988317, "epoch": 7.094379639448569, "grad_norm": 0.3131565451622009, "learning_rate": 2.1369753951946548e-06, "loss": 0.00025356, "memory(GiB)": 26.31, "step": 6690, "train_speed(iter/s)": 0.580825 }, { "acc": 0.99987745, "epoch": 7.099681866383881, "grad_norm": 0.020614469423890114, "learning_rate": 2.129793560575767e-06, "loss": 0.00057884, "memory(GiB)": 26.31, "step": 6695, "train_speed(iter/s)": 0.580828 }, { "acc": 0.99987869, "epoch": 7.104984093319194, "grad_norm": 0.041664209216833115, "learning_rate": 2.122620552818335e-06, "loss": 0.00065411, "memory(GiB)": 26.31, "step": 6700, "train_speed(iter/s)": 0.580832 }, { "acc": 1.0, "epoch": 7.110286320254507, "grad_norm": 0.025744246318936348, "learning_rate": 2.115456393977956e-06, "loss": 8.391e-05, "memory(GiB)": 26.31, "step": 6705, "train_speed(iter/s)": 0.580832 }, { "acc": 0.99976645, "epoch": 7.1155885471898195, "grad_norm": 0.06860660761594772, "learning_rate": 2.1083011060830183e-06, "loss": 0.00106212, "memory(GiB)": 26.31, "step": 6710, "train_speed(iter/s)": 0.580833 }, { "acc": 0.99988155, "epoch": 7.120890774125132, "grad_norm": 0.14570750296115875, "learning_rate": 2.101154711134634e-06, "loss": 0.00041477, "memory(GiB)": 26.31, "step": 6715, "train_speed(iter/s)": 0.580835 }, { "acc": 0.99988155, "epoch": 7.126193001060446, "grad_norm": 0.1013423353433609, "learning_rate": 2.0940172311065734e-06, "loss": 0.00021523, "memory(GiB)": 26.31, "step": 6720, "train_speed(iter/s)": 0.580836 }, { "acc": 0.99975052, "epoch": 7.131495227995758, "grad_norm": 0.10225103795528412, "learning_rate": 2.086888687945192e-06, "loss": 0.00051952, "memory(GiB)": 26.31, "step": 6725, "train_speed(iter/s)": 0.580836 }, { "acc": 0.99963379, "epoch": 7.136797454931071, "grad_norm": 0.03145146742463112, "learning_rate": 2.079769103569367e-06, "loss": 0.00069922, "memory(GiB)": 26.31, "step": 6730, "train_speed(iter/s)": 0.580837 }, { "acc": 0.99976254, "epoch": 7.142099681866384, "grad_norm": 0.009951247833669186, "learning_rate": 2.0726584998704293e-06, "loss": 0.00049842, "memory(GiB)": 26.31, "step": 6735, "train_speed(iter/s)": 0.580837 }, { "acc": 1.0, "epoch": 7.1474019088016965, "grad_norm": 0.02770557440817356, "learning_rate": 2.065556898712098e-06, "loss": 6.999e-05, "memory(GiB)": 26.31, "step": 6740, "train_speed(iter/s)": 0.580841 }, { "acc": 0.99976578, "epoch": 7.152704135737009, "grad_norm": 0.15290257334709167, "learning_rate": 2.0584643219304063e-06, "loss": 0.00046628, "memory(GiB)": 26.31, "step": 6745, "train_speed(iter/s)": 0.580842 }, { "acc": 0.99974947, "epoch": 7.158006362672323, "grad_norm": 0.025673704221844673, "learning_rate": 2.051380791333642e-06, "loss": 0.00038622, "memory(GiB)": 26.31, "step": 6750, "train_speed(iter/s)": 0.580842 }, { "acc": 0.99988422, "epoch": 7.163308589607635, "grad_norm": 0.005067603662610054, "learning_rate": 2.044306328702281e-06, "loss": 0.00027976, "memory(GiB)": 26.31, "step": 6755, "train_speed(iter/s)": 0.580845 }, { "acc": 0.99974613, "epoch": 7.168610816542948, "grad_norm": 0.004405306186527014, "learning_rate": 2.0372409557889127e-06, "loss": 0.00149132, "memory(GiB)": 26.31, "step": 6760, "train_speed(iter/s)": 0.580846 }, { "acc": 0.99988213, "epoch": 7.173913043478261, "grad_norm": 0.34647825360298157, "learning_rate": 2.030184694318177e-06, "loss": 0.00023813, "memory(GiB)": 26.31, "step": 6765, "train_speed(iter/s)": 0.58085 }, { "acc": 1.0, "epoch": 7.179215270413573, "grad_norm": 0.1090255081653595, "learning_rate": 2.0231375659867e-06, "loss": 0.00013765, "memory(GiB)": 26.31, "step": 6770, "train_speed(iter/s)": 0.58085 }, { "acc": 1.0, "epoch": 7.184517497348886, "grad_norm": 0.009488901123404503, "learning_rate": 2.0160995924630258e-06, "loss": 0.00031856, "memory(GiB)": 26.31, "step": 6775, "train_speed(iter/s)": 0.580851 }, { "acc": 0.9994812, "epoch": 7.1898197242842, "grad_norm": 0.13912135362625122, "learning_rate": 2.0090707953875464e-06, "loss": 0.00101557, "memory(GiB)": 26.31, "step": 6780, "train_speed(iter/s)": 0.580853 }, { "acc": 1.0, "epoch": 7.195121951219512, "grad_norm": 0.006838640663772821, "learning_rate": 2.00205119637244e-06, "loss": 0.00016393, "memory(GiB)": 26.31, "step": 6785, "train_speed(iter/s)": 0.580853 }, { "acc": 0.99976959, "epoch": 7.200424178154825, "grad_norm": 1.121747612953186, "learning_rate": 1.9950408170016023e-06, "loss": 0.00135582, "memory(GiB)": 26.31, "step": 6790, "train_speed(iter/s)": 0.580855 }, { "acc": 1.0, "epoch": 7.205726405090138, "grad_norm": 0.003712412202730775, "learning_rate": 1.98803967883058e-06, "loss": 0.00012902, "memory(GiB)": 26.31, "step": 6795, "train_speed(iter/s)": 0.580855 }, { "acc": 0.99988689, "epoch": 7.21102863202545, "grad_norm": 0.007916714064776897, "learning_rate": 1.981047803386506e-06, "loss": 0.00045676, "memory(GiB)": 26.31, "step": 6800, "train_speed(iter/s)": 0.580856 }, { "acc": 0.99976072, "epoch": 7.216330858960763, "grad_norm": 0.017491836100816727, "learning_rate": 1.97406521216803e-06, "loss": 0.00086029, "memory(GiB)": 26.31, "step": 6805, "train_speed(iter/s)": 0.580857 }, { "acc": 0.99939404, "epoch": 7.221633085896077, "grad_norm": 0.07758060842752457, "learning_rate": 1.9670919266452552e-06, "loss": 0.00167624, "memory(GiB)": 26.31, "step": 6810, "train_speed(iter/s)": 0.580857 }, { "acc": 0.9996129, "epoch": 7.226935312831389, "grad_norm": 0.22923319041728973, "learning_rate": 1.960127968259675e-06, "loss": 0.00129077, "memory(GiB)": 26.31, "step": 6815, "train_speed(iter/s)": 0.580859 }, { "acc": 1.0, "epoch": 7.232237539766702, "grad_norm": 0.027276834473013878, "learning_rate": 1.9531733584240996e-06, "loss": 0.00020395, "memory(GiB)": 26.31, "step": 6820, "train_speed(iter/s)": 0.580859 }, { "acc": 0.99987497, "epoch": 7.237539766702015, "grad_norm": 0.010682585649192333, "learning_rate": 1.946228118522594e-06, "loss": 0.00020047, "memory(GiB)": 26.31, "step": 6825, "train_speed(iter/s)": 0.580861 }, { "acc": 0.99988422, "epoch": 7.242841993637327, "grad_norm": 0.05258682742714882, "learning_rate": 1.9392922699104164e-06, "loss": 0.00039802, "memory(GiB)": 26.31, "step": 6830, "train_speed(iter/s)": 0.580862 }, { "acc": 0.99986038, "epoch": 7.248144220572641, "grad_norm": 0.140438511967659, "learning_rate": 1.9323658339139455e-06, "loss": 0.00027934, "memory(GiB)": 26.31, "step": 6835, "train_speed(iter/s)": 0.580864 }, { "acc": 0.99959145, "epoch": 7.253446447507954, "grad_norm": 0.024344947189092636, "learning_rate": 1.9254488318306183e-06, "loss": 0.00097043, "memory(GiB)": 26.31, "step": 6840, "train_speed(iter/s)": 0.580864 }, { "acc": 0.99974985, "epoch": 7.258748674443266, "grad_norm": 0.04792139679193497, "learning_rate": 1.918541284928866e-06, "loss": 0.00058359, "memory(GiB)": 26.31, "step": 6845, "train_speed(iter/s)": 0.580867 }, { "acc": 0.99988098, "epoch": 7.264050901378579, "grad_norm": 0.014974468387663364, "learning_rate": 1.911643214448044e-06, "loss": 0.00070725, "memory(GiB)": 26.31, "step": 6850, "train_speed(iter/s)": 0.580868 }, { "acc": 0.99977837, "epoch": 7.269353128313892, "grad_norm": 0.011281152255833149, "learning_rate": 1.9047546415983719e-06, "loss": 0.00041005, "memory(GiB)": 26.31, "step": 6855, "train_speed(iter/s)": 0.580868 }, { "acc": 0.99963131, "epoch": 7.274655355249204, "grad_norm": 0.06311699002981186, "learning_rate": 1.897875587560866e-06, "loss": 0.00196712, "memory(GiB)": 26.31, "step": 6860, "train_speed(iter/s)": 0.580869 }, { "acc": 0.99988041, "epoch": 7.279957582184517, "grad_norm": 0.28648215532302856, "learning_rate": 1.8910060734872732e-06, "loss": 0.00027213, "memory(GiB)": 26.31, "step": 6865, "train_speed(iter/s)": 0.58087 }, { "acc": 0.99951057, "epoch": 7.285259809119831, "grad_norm": 0.06947100162506104, "learning_rate": 1.8841461205000073e-06, "loss": 0.00103844, "memory(GiB)": 26.31, "step": 6870, "train_speed(iter/s)": 0.580872 }, { "acc": 0.99936943, "epoch": 7.290562036055143, "grad_norm": 0.1860935091972351, "learning_rate": 1.8772957496920822e-06, "loss": 0.00224305, "memory(GiB)": 26.31, "step": 6875, "train_speed(iter/s)": 0.580875 }, { "acc": 0.99958391, "epoch": 7.295864262990456, "grad_norm": 0.05386153236031532, "learning_rate": 1.8704549821270519e-06, "loss": 0.00067724, "memory(GiB)": 26.31, "step": 6880, "train_speed(iter/s)": 0.580881 }, { "acc": 0.99984941, "epoch": 7.301166489925769, "grad_norm": 0.06292575597763062, "learning_rate": 1.8636238388389394e-06, "loss": 0.00104737, "memory(GiB)": 26.31, "step": 6885, "train_speed(iter/s)": 0.580885 }, { "acc": 0.99929142, "epoch": 7.306468716861081, "grad_norm": 0.14226506650447845, "learning_rate": 1.8568023408321762e-06, "loss": 0.00195795, "memory(GiB)": 26.31, "step": 6890, "train_speed(iter/s)": 0.580884 }, { "acc": 0.99963522, "epoch": 7.311770943796395, "grad_norm": 0.09832887351512909, "learning_rate": 1.8499905090815348e-06, "loss": 0.00130671, "memory(GiB)": 26.31, "step": 6895, "train_speed(iter/s)": 0.580887 }, { "acc": 0.99989033, "epoch": 7.317073170731708, "grad_norm": 0.03976750746369362, "learning_rate": 1.8431883645320677e-06, "loss": 0.00054767, "memory(GiB)": 26.31, "step": 6900, "train_speed(iter/s)": 0.580888 }, { "acc": 0.99963274, "epoch": 7.32237539766702, "grad_norm": 0.0048141395673155785, "learning_rate": 1.8363959280990408e-06, "loss": 0.00075108, "memory(GiB)": 26.31, "step": 6905, "train_speed(iter/s)": 0.580889 }, { "acc": 0.99974098, "epoch": 7.327677624602333, "grad_norm": 0.9042002558708191, "learning_rate": 1.8296132206678684e-06, "loss": 0.00052153, "memory(GiB)": 26.31, "step": 6910, "train_speed(iter/s)": 0.580895 }, { "acc": 0.99987803, "epoch": 7.332979851537646, "grad_norm": 0.06263940036296844, "learning_rate": 1.8228402630940513e-06, "loss": 0.00030076, "memory(GiB)": 26.31, "step": 6915, "train_speed(iter/s)": 0.580896 }, { "acc": 0.99974251, "epoch": 7.338282078472958, "grad_norm": 0.05377192422747612, "learning_rate": 1.8160770762031102e-06, "loss": 0.00032192, "memory(GiB)": 26.31, "step": 6920, "train_speed(iter/s)": 0.580897 }, { "acc": 0.99951305, "epoch": 7.343584305408271, "grad_norm": 0.3651827871799469, "learning_rate": 1.8093236807905241e-06, "loss": 0.00175905, "memory(GiB)": 26.31, "step": 6925, "train_speed(iter/s)": 0.580899 }, { "acc": 0.99974251, "epoch": 7.348886532343585, "grad_norm": 0.0032980055548250675, "learning_rate": 1.8025800976216638e-06, "loss": 0.00044379, "memory(GiB)": 26.31, "step": 6930, "train_speed(iter/s)": 0.5809 }, { "acc": 1.0, "epoch": 7.354188759278897, "grad_norm": 0.1116834357380867, "learning_rate": 1.795846347431729e-06, "loss": 8.933e-05, "memory(GiB)": 26.31, "step": 6935, "train_speed(iter/s)": 0.580907 }, { "acc": 0.99988585, "epoch": 7.35949098621421, "grad_norm": 0.0936645120382309, "learning_rate": 1.789122450925689e-06, "loss": 0.00096569, "memory(GiB)": 26.31, "step": 6940, "train_speed(iter/s)": 0.58091 }, { "acc": 0.99987116, "epoch": 7.364793213149523, "grad_norm": 0.01857062429189682, "learning_rate": 1.782408428778208e-06, "loss": 0.00049213, "memory(GiB)": 26.31, "step": 6945, "train_speed(iter/s)": 0.580912 }, { "acc": 0.99951744, "epoch": 7.370095440084835, "grad_norm": 0.12067139893770218, "learning_rate": 1.7757043016335974e-06, "loss": 0.00087103, "memory(GiB)": 26.31, "step": 6950, "train_speed(iter/s)": 0.580912 }, { "acc": 0.99972391, "epoch": 7.375397667020149, "grad_norm": 0.03747297078371048, "learning_rate": 1.7690100901057356e-06, "loss": 0.00047853, "memory(GiB)": 26.31, "step": 6955, "train_speed(iter/s)": 0.580916 }, { "acc": 1.0, "epoch": 7.3806998939554616, "grad_norm": 0.05759872496128082, "learning_rate": 1.7623258147780149e-06, "loss": 0.00017719, "memory(GiB)": 26.31, "step": 6960, "train_speed(iter/s)": 0.580916 }, { "acc": 0.99973459, "epoch": 7.386002120890774, "grad_norm": 0.012503202073276043, "learning_rate": 1.7556514962032767e-06, "loss": 0.0014173, "memory(GiB)": 26.31, "step": 6965, "train_speed(iter/s)": 0.580917 }, { "acc": 0.99985714, "epoch": 7.391304347826087, "grad_norm": 0.004481893964111805, "learning_rate": 1.748987154903746e-06, "loss": 0.00022981, "memory(GiB)": 26.31, "step": 6970, "train_speed(iter/s)": 0.580917 }, { "acc": 0.99988041, "epoch": 7.3966065747614, "grad_norm": 0.006199334282428026, "learning_rate": 1.7423328113709714e-06, "loss": 0.00029288, "memory(GiB)": 26.31, "step": 6975, "train_speed(iter/s)": 0.580921 }, { "acc": 0.99976082, "epoch": 7.401908801696712, "grad_norm": 0.04824783280491829, "learning_rate": 1.735688486065758e-06, "loss": 0.00051028, "memory(GiB)": 26.31, "step": 6980, "train_speed(iter/s)": 0.580927 }, { "acc": 0.99965048, "epoch": 7.407211028632026, "grad_norm": 0.013628070242702961, "learning_rate": 1.7290541994181089e-06, "loss": 0.00077747, "memory(GiB)": 26.31, "step": 6985, "train_speed(iter/s)": 0.580928 }, { "acc": 0.99987745, "epoch": 7.4125132555673385, "grad_norm": 0.16469603776931763, "learning_rate": 1.722429971827159e-06, "loss": 0.00054255, "memory(GiB)": 26.31, "step": 6990, "train_speed(iter/s)": 0.580929 }, { "acc": 0.99987049, "epoch": 7.417815482502651, "grad_norm": 0.020483864471316338, "learning_rate": 1.7158158236611144e-06, "loss": 0.00021344, "memory(GiB)": 26.31, "step": 6995, "train_speed(iter/s)": 0.58093 }, { "acc": 0.99958992, "epoch": 7.423117709437964, "grad_norm": 0.0954505056142807, "learning_rate": 1.7092117752571875e-06, "loss": 0.00145397, "memory(GiB)": 26.31, "step": 7000, "train_speed(iter/s)": 0.58093 }, { "acc": 0.99971962, "epoch": 7.428419936373277, "grad_norm": 0.43828457593917847, "learning_rate": 1.7026178469215362e-06, "loss": 0.00043246, "memory(GiB)": 26.31, "step": 7005, "train_speed(iter/s)": 0.580933 }, { "acc": 1.0, "epoch": 7.433722163308589, "grad_norm": 0.03433601185679436, "learning_rate": 1.6960340589292051e-06, "loss": 0.00017182, "memory(GiB)": 26.31, "step": 7010, "train_speed(iter/s)": 0.580934 }, { "acc": 0.99989223, "epoch": 7.439024390243903, "grad_norm": 0.021550146862864494, "learning_rate": 1.689460431524054e-06, "loss": 0.00029675, "memory(GiB)": 26.31, "step": 7015, "train_speed(iter/s)": 0.580934 }, { "acc": 0.99976244, "epoch": 7.4443266171792155, "grad_norm": 0.12215669453144073, "learning_rate": 1.6828969849187032e-06, "loss": 0.00055049, "memory(GiB)": 26.31, "step": 7020, "train_speed(iter/s)": 0.580936 }, { "acc": 1.0, "epoch": 7.449628844114528, "grad_norm": 0.01776804029941559, "learning_rate": 1.6763437392944688e-06, "loss": 0.00015911, "memory(GiB)": 26.31, "step": 7025, "train_speed(iter/s)": 0.580937 }, { "acc": 0.99964476, "epoch": 7.454931071049841, "grad_norm": 0.053600966930389404, "learning_rate": 1.6698007148013e-06, "loss": 0.00056505, "memory(GiB)": 26.31, "step": 7030, "train_speed(iter/s)": 0.580936 }, { "acc": 0.99975433, "epoch": 7.4602332979851536, "grad_norm": 0.01400748547166586, "learning_rate": 1.66326793155772e-06, "loss": 0.00049177, "memory(GiB)": 26.31, "step": 7035, "train_speed(iter/s)": 0.580942 }, { "acc": 1.0, "epoch": 7.465535524920466, "grad_norm": 0.09328199923038483, "learning_rate": 1.6567454096507598e-06, "loss": 0.0001565, "memory(GiB)": 26.31, "step": 7040, "train_speed(iter/s)": 0.580943 }, { "acc": 0.99989405, "epoch": 7.47083775185578, "grad_norm": 0.03550637513399124, "learning_rate": 1.6502331691358995e-06, "loss": 0.00025591, "memory(GiB)": 26.31, "step": 7045, "train_speed(iter/s)": 0.580944 }, { "acc": 0.99951725, "epoch": 7.4761399787910925, "grad_norm": 1.2384247779846191, "learning_rate": 1.6437312300370084e-06, "loss": 0.00159495, "memory(GiB)": 26.31, "step": 7050, "train_speed(iter/s)": 0.580946 }, { "acc": 0.99976501, "epoch": 7.481442205726405, "grad_norm": 0.026157498359680176, "learning_rate": 1.6372396123462784e-06, "loss": 0.0002734, "memory(GiB)": 26.31, "step": 7055, "train_speed(iter/s)": 0.580946 }, { "acc": 0.99975224, "epoch": 7.486744432661718, "grad_norm": 0.07278779149055481, "learning_rate": 1.6307583360241658e-06, "loss": 0.00074835, "memory(GiB)": 26.31, "step": 7060, "train_speed(iter/s)": 0.580952 }, { "acc": 1.0, "epoch": 7.4920466595970305, "grad_norm": 0.03025503270328045, "learning_rate": 1.6242874209993275e-06, "loss": 5.68e-05, "memory(GiB)": 26.31, "step": 7065, "train_speed(iter/s)": 0.580955 }, { "acc": 1.0, "epoch": 7.497348886532343, "grad_norm": 0.007420065347105265, "learning_rate": 1.6178268871685647e-06, "loss": 0.00010299, "memory(GiB)": 26.31, "step": 7070, "train_speed(iter/s)": 0.580961 }, { "acc": 1.0, "epoch": 7.502651113467657, "grad_norm": 0.05877010524272919, "learning_rate": 1.611376754396754e-06, "loss": 9.454e-05, "memory(GiB)": 26.31, "step": 7075, "train_speed(iter/s)": 0.580962 }, { "acc": 1.0, "epoch": 7.5079533404029695, "grad_norm": 0.0142194963991642, "learning_rate": 1.604937042516797e-06, "loss": 0.00010233, "memory(GiB)": 26.31, "step": 7080, "train_speed(iter/s)": 0.580964 }, { "acc": 1.0, "epoch": 7.513255567338282, "grad_norm": 0.0043426030315458775, "learning_rate": 1.598507771329549e-06, "loss": 0.00012825, "memory(GiB)": 26.31, "step": 7085, "train_speed(iter/s)": 0.580964 }, { "acc": 0.99961729, "epoch": 7.518557794273595, "grad_norm": 0.27042967081069946, "learning_rate": 1.5920889606037612e-06, "loss": 0.00112635, "memory(GiB)": 26.31, "step": 7090, "train_speed(iter/s)": 0.580964 }, { "acc": 0.99977627, "epoch": 7.5238600212089075, "grad_norm": 0.004301531706005335, "learning_rate": 1.585680630076023e-06, "loss": 0.0010411, "memory(GiB)": 26.31, "step": 7095, "train_speed(iter/s)": 0.580966 }, { "acc": 0.99988317, "epoch": 7.52916224814422, "grad_norm": 0.05325544625520706, "learning_rate": 1.5792827994507001e-06, "loss": 0.00031735, "memory(GiB)": 26.31, "step": 7100, "train_speed(iter/s)": 0.58097 }, { "acc": 0.99973316, "epoch": 7.534464475079534, "grad_norm": 0.02882418781518936, "learning_rate": 1.57289548839987e-06, "loss": 0.00033525, "memory(GiB)": 26.31, "step": 7105, "train_speed(iter/s)": 0.580974 }, { "acc": 0.99977551, "epoch": 7.5397667020148464, "grad_norm": 0.06291340291500092, "learning_rate": 1.5665187165632675e-06, "loss": 0.0002386, "memory(GiB)": 26.31, "step": 7110, "train_speed(iter/s)": 0.580975 }, { "acc": 0.99977036, "epoch": 7.545068928950159, "grad_norm": 0.1016281470656395, "learning_rate": 1.5601525035482201e-06, "loss": 0.00037299, "memory(GiB)": 26.31, "step": 7115, "train_speed(iter/s)": 0.580977 }, { "acc": 1.0, "epoch": 7.550371155885472, "grad_norm": 0.04114781692624092, "learning_rate": 1.5537968689295879e-06, "loss": 0.00021627, "memory(GiB)": 26.31, "step": 7120, "train_speed(iter/s)": 0.580979 }, { "acc": 0.99988317, "epoch": 7.5556733828207845, "grad_norm": 0.01582081988453865, "learning_rate": 1.547451832249707e-06, "loss": 0.00034413, "memory(GiB)": 26.31, "step": 7125, "train_speed(iter/s)": 0.580981 }, { "acc": 0.99986982, "epoch": 7.560975609756097, "grad_norm": 0.004217559937387705, "learning_rate": 1.5411174130183246e-06, "loss": 0.00028027, "memory(GiB)": 26.31, "step": 7130, "train_speed(iter/s)": 0.580986 }, { "acc": 0.99988041, "epoch": 7.566277836691411, "grad_norm": 0.03366623446345329, "learning_rate": 1.5347936307125414e-06, "loss": 0.0001684, "memory(GiB)": 26.31, "step": 7135, "train_speed(iter/s)": 0.580991 }, { "acc": 1.0, "epoch": 7.571580063626723, "grad_norm": 0.11424966156482697, "learning_rate": 1.5284805047767555e-06, "loss": 0.00016731, "memory(GiB)": 26.31, "step": 7140, "train_speed(iter/s)": 0.580994 }, { "acc": 0.99987049, "epoch": 7.576882290562036, "grad_norm": 0.0017340040067210793, "learning_rate": 1.5221780546225942e-06, "loss": 0.0014659, "memory(GiB)": 26.31, "step": 7145, "train_speed(iter/s)": 0.581 }, { "acc": 0.999755, "epoch": 7.582184517497349, "grad_norm": 0.008549396879971027, "learning_rate": 1.5158862996288584e-06, "loss": 0.00028182, "memory(GiB)": 26.31, "step": 7150, "train_speed(iter/s)": 0.581002 }, { "acc": 0.99944859, "epoch": 7.5874867444326615, "grad_norm": 0.07049953192472458, "learning_rate": 1.509605259141469e-06, "loss": 0.00190711, "memory(GiB)": 26.31, "step": 7155, "train_speed(iter/s)": 0.581003 }, { "acc": 1.0, "epoch": 7.592788971367975, "grad_norm": 0.0234018936753273, "learning_rate": 1.503334952473397e-06, "loss": 5.961e-05, "memory(GiB)": 26.31, "step": 7160, "train_speed(iter/s)": 0.581003 }, { "acc": 0.99917202, "epoch": 7.598091198303288, "grad_norm": 0.20772714912891388, "learning_rate": 1.49707539890461e-06, "loss": 0.00167405, "memory(GiB)": 26.31, "step": 7165, "train_speed(iter/s)": 0.581009 }, { "acc": 0.99976158, "epoch": 7.6033934252386, "grad_norm": 0.0049901618622243404, "learning_rate": 1.490826617682013e-06, "loss": 0.00034158, "memory(GiB)": 26.31, "step": 7170, "train_speed(iter/s)": 0.58101 }, { "acc": 0.99975185, "epoch": 7.608695652173913, "grad_norm": 0.06982958316802979, "learning_rate": 1.4845886280193864e-06, "loss": 0.00028914, "memory(GiB)": 26.31, "step": 7175, "train_speed(iter/s)": 0.581013 }, { "acc": 1.0, "epoch": 7.613997879109226, "grad_norm": 0.048727817833423615, "learning_rate": 1.4783614490973306e-06, "loss": 4.181e-05, "memory(GiB)": 26.31, "step": 7180, "train_speed(iter/s)": 0.581015 }, { "acc": 0.99975662, "epoch": 7.6193001060445384, "grad_norm": 0.12339375168085098, "learning_rate": 1.4721451000632039e-06, "loss": 0.00052848, "memory(GiB)": 26.31, "step": 7185, "train_speed(iter/s)": 0.581018 }, { "acc": 1.0, "epoch": 7.624602332979851, "grad_norm": 0.036420684307813644, "learning_rate": 1.4659396000310644e-06, "loss": 0.00010585, "memory(GiB)": 26.31, "step": 7190, "train_speed(iter/s)": 0.581023 }, { "acc": 0.99966774, "epoch": 7.629904559915165, "grad_norm": 0.37844347953796387, "learning_rate": 1.4597449680816136e-06, "loss": 0.00042416, "memory(GiB)": 26.31, "step": 7195, "train_speed(iter/s)": 0.581026 }, { "acc": 0.99973469, "epoch": 7.635206786850477, "grad_norm": 0.1244843453168869, "learning_rate": 1.4535612232621336e-06, "loss": 0.00036726, "memory(GiB)": 26.31, "step": 7200, "train_speed(iter/s)": 0.581029 }, { "acc": 0.99987183, "epoch": 7.64050901378579, "grad_norm": 0.03416428714990616, "learning_rate": 1.4473883845864307e-06, "loss": 0.00026054, "memory(GiB)": 26.31, "step": 7205, "train_speed(iter/s)": 0.58103 }, { "acc": 0.99987679, "epoch": 7.645811240721103, "grad_norm": 0.03423444554209709, "learning_rate": 1.4412264710347803e-06, "loss": 0.00099486, "memory(GiB)": 26.31, "step": 7210, "train_speed(iter/s)": 0.581038 }, { "acc": 0.99987431, "epoch": 7.651113467656415, "grad_norm": 0.030545970425009727, "learning_rate": 1.4350755015538615e-06, "loss": 0.00020079, "memory(GiB)": 26.31, "step": 7215, "train_speed(iter/s)": 0.581038 }, { "acc": 0.99946852, "epoch": 7.656415694591729, "grad_norm": 0.521009087562561, "learning_rate": 1.4289354950567039e-06, "loss": 0.0009209, "memory(GiB)": 26.31, "step": 7220, "train_speed(iter/s)": 0.581041 }, { "acc": 0.99974804, "epoch": 7.661717921527042, "grad_norm": 0.00251702475361526, "learning_rate": 1.4228064704226276e-06, "loss": 0.00047443, "memory(GiB)": 26.31, "step": 7225, "train_speed(iter/s)": 0.581045 }, { "acc": 0.99924774, "epoch": 7.667020148462354, "grad_norm": 1.2205266952514648, "learning_rate": 1.4166884464971858e-06, "loss": 0.00217944, "memory(GiB)": 26.31, "step": 7230, "train_speed(iter/s)": 0.581047 }, { "acc": 0.99965, "epoch": 7.672322375397667, "grad_norm": 0.007064941339194775, "learning_rate": 1.4105814420921073e-06, "loss": 0.00124084, "memory(GiB)": 26.31, "step": 7235, "train_speed(iter/s)": 0.58105 }, { "acc": 0.99972963, "epoch": 7.67762460233298, "grad_norm": 0.028828129172325134, "learning_rate": 1.4044854759852378e-06, "loss": 0.00059189, "memory(GiB)": 26.31, "step": 7240, "train_speed(iter/s)": 0.581051 }, { "acc": 0.9998641, "epoch": 7.682926829268292, "grad_norm": 0.01702168956398964, "learning_rate": 1.3984005669204808e-06, "loss": 0.00023274, "memory(GiB)": 26.31, "step": 7245, "train_speed(iter/s)": 0.581055 }, { "acc": 0.99950886, "epoch": 7.688229056203605, "grad_norm": 0.09453985095024109, "learning_rate": 1.392326733607744e-06, "loss": 0.00087211, "memory(GiB)": 26.31, "step": 7250, "train_speed(iter/s)": 0.581057 }, { "acc": 0.99919186, "epoch": 7.693531283138919, "grad_norm": 0.06911499798297882, "learning_rate": 1.3862639947228785e-06, "loss": 0.00130631, "memory(GiB)": 26.31, "step": 7255, "train_speed(iter/s)": 0.581061 }, { "acc": 0.99972954, "epoch": 7.698833510074231, "grad_norm": 0.0033358214423060417, "learning_rate": 1.3802123689076192e-06, "loss": 0.00047809, "memory(GiB)": 26.31, "step": 7260, "train_speed(iter/s)": 0.581062 }, { "acc": 0.99975901, "epoch": 7.704135737009544, "grad_norm": 0.046811651438474655, "learning_rate": 1.3741718747695368e-06, "loss": 0.00049594, "memory(GiB)": 26.31, "step": 7265, "train_speed(iter/s)": 0.581065 }, { "acc": 1.0, "epoch": 7.709437963944857, "grad_norm": 0.006222330033779144, "learning_rate": 1.3681425308819673e-06, "loss": 0.0001086, "memory(GiB)": 26.31, "step": 7270, "train_speed(iter/s)": 0.581069 }, { "acc": 0.99987497, "epoch": 7.714740190880169, "grad_norm": 0.4325442910194397, "learning_rate": 1.3621243557839688e-06, "loss": 0.00109212, "memory(GiB)": 26.31, "step": 7275, "train_speed(iter/s)": 0.581071 }, { "acc": 0.99974489, "epoch": 7.720042417815483, "grad_norm": 0.10214357823133469, "learning_rate": 1.3561173679802524e-06, "loss": 0.00060738, "memory(GiB)": 26.31, "step": 7280, "train_speed(iter/s)": 0.581077 }, { "acc": 0.99987564, "epoch": 7.725344644750796, "grad_norm": 0.01606924459338188, "learning_rate": 1.3501215859411318e-06, "loss": 0.00048148, "memory(GiB)": 26.31, "step": 7285, "train_speed(iter/s)": 0.581079 }, { "acc": 0.99961681, "epoch": 7.730646871686108, "grad_norm": 0.06415250152349472, "learning_rate": 1.3441370281024654e-06, "loss": 0.00113499, "memory(GiB)": 26.31, "step": 7290, "train_speed(iter/s)": 0.581081 }, { "acc": 1.0, "epoch": 7.735949098621421, "grad_norm": 0.0027941372245550156, "learning_rate": 1.3381637128655995e-06, "loss": 0.00013706, "memory(GiB)": 26.31, "step": 7295, "train_speed(iter/s)": 0.581085 }, { "acc": 0.99935102, "epoch": 7.741251325556734, "grad_norm": 0.05772147700190544, "learning_rate": 1.3322016585973113e-06, "loss": 0.00217016, "memory(GiB)": 26.31, "step": 7300, "train_speed(iter/s)": 0.581086 }, { "acc": 0.99988098, "epoch": 7.746553552492046, "grad_norm": 0.2209397852420807, "learning_rate": 1.326250883629753e-06, "loss": 0.00022564, "memory(GiB)": 26.31, "step": 7305, "train_speed(iter/s)": 0.581088 }, { "acc": 1.0, "epoch": 7.751855779427359, "grad_norm": 0.03734031319618225, "learning_rate": 1.3203114062603944e-06, "loss": 0.00019665, "memory(GiB)": 26.31, "step": 7310, "train_speed(iter/s)": 0.581088 }, { "acc": 0.99974689, "epoch": 7.757158006362673, "grad_norm": 0.24819055199623108, "learning_rate": 1.3143832447519692e-06, "loss": 0.00042938, "memory(GiB)": 26.31, "step": 7315, "train_speed(iter/s)": 0.581089 }, { "acc": 0.99972458, "epoch": 7.762460233297985, "grad_norm": 0.016377611085772514, "learning_rate": 1.3084664173324144e-06, "loss": 0.0007581, "memory(GiB)": 26.31, "step": 7320, "train_speed(iter/s)": 0.581091 }, { "acc": 0.99985714, "epoch": 7.767762460233298, "grad_norm": 0.03054504469037056, "learning_rate": 1.30256094219482e-06, "loss": 0.00041984, "memory(GiB)": 26.31, "step": 7325, "train_speed(iter/s)": 0.581093 }, { "acc": 0.99988937, "epoch": 7.773064687168611, "grad_norm": 0.00745094520971179, "learning_rate": 1.2966668374973673e-06, "loss": 0.00020874, "memory(GiB)": 26.31, "step": 7330, "train_speed(iter/s)": 0.581097 }, { "acc": 0.99976444, "epoch": 7.778366914103923, "grad_norm": 0.009208225645124912, "learning_rate": 1.290784121363275e-06, "loss": 0.00059624, "memory(GiB)": 26.31, "step": 7335, "train_speed(iter/s)": 0.5811 }, { "acc": 0.99987926, "epoch": 7.783669141039237, "grad_norm": 0.01450218167155981, "learning_rate": 1.28491281188075e-06, "loss": 0.00031939, "memory(GiB)": 26.31, "step": 7340, "train_speed(iter/s)": 0.581101 }, { "acc": 0.99976578, "epoch": 7.78897136797455, "grad_norm": 0.007167202420532703, "learning_rate": 1.2790529271029191e-06, "loss": 0.00062234, "memory(GiB)": 26.31, "step": 7345, "train_speed(iter/s)": 0.581099 }, { "acc": 0.99965649, "epoch": 7.794273594909862, "grad_norm": 0.3587786555290222, "learning_rate": 1.2732044850477839e-06, "loss": 0.00082365, "memory(GiB)": 26.31, "step": 7350, "train_speed(iter/s)": 0.581101 }, { "acc": 0.99975052, "epoch": 7.799575821845175, "grad_norm": 0.15072518587112427, "learning_rate": 1.2673675036981609e-06, "loss": 0.00072264, "memory(GiB)": 26.31, "step": 7355, "train_speed(iter/s)": 0.581097 }, { "acc": 1.0, "epoch": 7.804878048780488, "grad_norm": 0.005496453959494829, "learning_rate": 1.2615420010016277e-06, "loss": 3.213e-05, "memory(GiB)": 26.31, "step": 7360, "train_speed(iter/s)": 0.581099 }, { "acc": 0.99988422, "epoch": 7.8101802757158, "grad_norm": 0.0037899240851402283, "learning_rate": 1.2557279948704668e-06, "loss": 0.00021584, "memory(GiB)": 26.31, "step": 7365, "train_speed(iter/s)": 0.581101 }, { "acc": 0.99967022, "epoch": 7.815482502651113, "grad_norm": 0.002349494956433773, "learning_rate": 1.2499255031816091e-06, "loss": 0.00057175, "memory(GiB)": 26.31, "step": 7370, "train_speed(iter/s)": 0.581103 }, { "acc": 0.99985123, "epoch": 7.820784729586427, "grad_norm": 0.013030619360506535, "learning_rate": 1.244134543776587e-06, "loss": 0.00116417, "memory(GiB)": 26.31, "step": 7375, "train_speed(iter/s)": 0.581104 }, { "acc": 1.0, "epoch": 7.826086956521739, "grad_norm": 0.006604825146496296, "learning_rate": 1.238355134461467e-06, "loss": 0.00014518, "memory(GiB)": 26.31, "step": 7380, "train_speed(iter/s)": 0.581105 }, { "acc": 1.0, "epoch": 7.831389183457052, "grad_norm": 0.014985980466008186, "learning_rate": 1.2325872930068038e-06, "loss": 0.00011804, "memory(GiB)": 26.31, "step": 7385, "train_speed(iter/s)": 0.58111 }, { "acc": 0.99986839, "epoch": 7.836691410392365, "grad_norm": 0.09459065645933151, "learning_rate": 1.2268310371475835e-06, "loss": 0.00033445, "memory(GiB)": 26.31, "step": 7390, "train_speed(iter/s)": 0.581111 }, { "acc": 0.99988422, "epoch": 7.841993637327677, "grad_norm": 0.840636134147644, "learning_rate": 1.2210863845831671e-06, "loss": 0.0012121, "memory(GiB)": 26.31, "step": 7395, "train_speed(iter/s)": 0.581115 }, { "acc": 0.99977112, "epoch": 7.847295864262991, "grad_norm": 0.11717978119850159, "learning_rate": 1.215353352977239e-06, "loss": 0.00099637, "memory(GiB)": 26.31, "step": 7400, "train_speed(iter/s)": 0.58112 }, { "acc": 0.99964523, "epoch": 7.8525980911983035, "grad_norm": 0.04506349936127663, "learning_rate": 1.2096319599577535e-06, "loss": 0.00089844, "memory(GiB)": 26.31, "step": 7405, "train_speed(iter/s)": 0.581127 }, { "acc": 0.99959965, "epoch": 7.857900318133616, "grad_norm": 0.02824774570763111, "learning_rate": 1.203922223116874e-06, "loss": 0.00082064, "memory(GiB)": 26.31, "step": 7410, "train_speed(iter/s)": 0.58113 }, { "acc": 0.99965115, "epoch": 7.863202545068929, "grad_norm": 0.19301556050777435, "learning_rate": 1.1982241600109274e-06, "loss": 0.00085676, "memory(GiB)": 26.31, "step": 7415, "train_speed(iter/s)": 0.581131 }, { "acc": 0.99988985, "epoch": 7.868504772004242, "grad_norm": 0.005214956123381853, "learning_rate": 1.1925377881603432e-06, "loss": 0.00020933, "memory(GiB)": 26.31, "step": 7420, "train_speed(iter/s)": 0.581132 }, { "acc": 0.99974422, "epoch": 7.873806998939554, "grad_norm": 0.0082614840939641, "learning_rate": 1.1868631250496052e-06, "loss": 0.00044404, "memory(GiB)": 26.31, "step": 7425, "train_speed(iter/s)": 0.581134 }, { "acc": 0.99974804, "epoch": 7.879109225874867, "grad_norm": 0.037489645183086395, "learning_rate": 1.1812001881271926e-06, "loss": 0.00052237, "memory(GiB)": 26.31, "step": 7430, "train_speed(iter/s)": 0.581136 }, { "acc": 1.0, "epoch": 7.8844114528101805, "grad_norm": 0.03406713902950287, "learning_rate": 1.1755489948055305e-06, "loss": 1.791e-05, "memory(GiB)": 26.31, "step": 7435, "train_speed(iter/s)": 0.581139 }, { "acc": 0.99985552, "epoch": 7.889713679745493, "grad_norm": 0.0035832603462040424, "learning_rate": 1.1699095624609343e-06, "loss": 0.00037999, "memory(GiB)": 26.31, "step": 7440, "train_speed(iter/s)": 0.581145 }, { "acc": 1.0, "epoch": 7.895015906680806, "grad_norm": 0.016665274277329445, "learning_rate": 1.1642819084335577e-06, "loss": 4.864e-05, "memory(GiB)": 26.31, "step": 7445, "train_speed(iter/s)": 0.581146 }, { "acc": 0.99987803, "epoch": 7.900318133616119, "grad_norm": 0.021315449848771095, "learning_rate": 1.1586660500273351e-06, "loss": 0.00027156, "memory(GiB)": 26.31, "step": 7450, "train_speed(iter/s)": 0.581149 }, { "acc": 0.99988155, "epoch": 7.905620360551431, "grad_norm": 0.049092620611190796, "learning_rate": 1.1530620045099361e-06, "loss": 0.00037108, "memory(GiB)": 26.31, "step": 7455, "train_speed(iter/s)": 0.58115 }, { "acc": 0.99988375, "epoch": 7.910922587486745, "grad_norm": 0.017999274656176567, "learning_rate": 1.1474697891127047e-06, "loss": 0.00031416, "memory(GiB)": 26.31, "step": 7460, "train_speed(iter/s)": 0.581151 }, { "acc": 0.9998724, "epoch": 7.9162248144220575, "grad_norm": 0.00615483894944191, "learning_rate": 1.14188942103061e-06, "loss": 0.00021588, "memory(GiB)": 26.31, "step": 7465, "train_speed(iter/s)": 0.581153 }, { "acc": 0.99965563, "epoch": 7.92152704135737, "grad_norm": 0.0026157486718147993, "learning_rate": 1.1363209174221953e-06, "loss": 0.00055385, "memory(GiB)": 26.31, "step": 7470, "train_speed(iter/s)": 0.581155 }, { "acc": 0.99960432, "epoch": 7.926829268292683, "grad_norm": 0.014894828200340271, "learning_rate": 1.130764295409521e-06, "loss": 0.00133177, "memory(GiB)": 26.31, "step": 7475, "train_speed(iter/s)": 0.581159 }, { "acc": 0.999753, "epoch": 7.9321314952279955, "grad_norm": 0.11886921525001526, "learning_rate": 1.1252195720781122e-06, "loss": 0.00043521, "memory(GiB)": 26.31, "step": 7480, "train_speed(iter/s)": 0.581161 }, { "acc": 1.0, "epoch": 7.937433722163308, "grad_norm": 0.012216082774102688, "learning_rate": 1.1196867644769127e-06, "loss": 1.175e-05, "memory(GiB)": 26.31, "step": 7485, "train_speed(iter/s)": 0.581162 }, { "acc": 0.99987621, "epoch": 7.942735949098622, "grad_norm": 0.0035921805538237095, "learning_rate": 1.1141658896182242e-06, "loss": 0.0002258, "memory(GiB)": 26.31, "step": 7490, "train_speed(iter/s)": 0.581163 }, { "acc": 0.99988585, "epoch": 7.9480381760339345, "grad_norm": 0.06408660113811493, "learning_rate": 1.1086569644776578e-06, "loss": 0.0002532, "memory(GiB)": 26.31, "step": 7495, "train_speed(iter/s)": 0.581165 }, { "acc": 1.0, "epoch": 7.953340402969247, "grad_norm": 0.06699665635824203, "learning_rate": 1.1031600059940816e-06, "loss": 5.644e-05, "memory(GiB)": 26.31, "step": 7500, "train_speed(iter/s)": 0.581169 }, { "acc": 1.0, "epoch": 7.95864262990456, "grad_norm": 0.09777071326971054, "learning_rate": 1.0976750310695696e-06, "loss": 4.99e-05, "memory(GiB)": 26.31, "step": 7505, "train_speed(iter/s)": 0.58117 }, { "acc": 0.99987183, "epoch": 7.9639448568398725, "grad_norm": 0.04002142325043678, "learning_rate": 1.0922020565693477e-06, "loss": 0.00034577, "memory(GiB)": 26.31, "step": 7510, "train_speed(iter/s)": 0.581168 }, { "acc": 0.99974804, "epoch": 7.969247083775185, "grad_norm": 0.04886091500520706, "learning_rate": 1.0867410993217438e-06, "loss": 0.00035892, "memory(GiB)": 26.31, "step": 7515, "train_speed(iter/s)": 0.581173 }, { "acc": 0.99939957, "epoch": 7.974549310710499, "grad_norm": 0.06108390912413597, "learning_rate": 1.0812921761181341e-06, "loss": 0.0011682, "memory(GiB)": 26.31, "step": 7520, "train_speed(iter/s)": 0.581173 }, { "acc": 0.99964771, "epoch": 7.9798515376458115, "grad_norm": 0.04557066038250923, "learning_rate": 1.0758553037128931e-06, "loss": 0.00045693, "memory(GiB)": 26.31, "step": 7525, "train_speed(iter/s)": 0.581175 }, { "acc": 0.99987621, "epoch": 7.985153764581124, "grad_norm": 0.4263116121292114, "learning_rate": 1.0704304988233402e-06, "loss": 0.00060724, "memory(GiB)": 26.31, "step": 7530, "train_speed(iter/s)": 0.58117 }, { "acc": 0.99987745, "epoch": 7.990455991516437, "grad_norm": 0.04181910306215286, "learning_rate": 1.0650177781296923e-06, "loss": 0.00010795, "memory(GiB)": 26.31, "step": 7535, "train_speed(iter/s)": 0.581171 }, { "acc": 0.99963703, "epoch": 7.9957582184517495, "grad_norm": 0.001203623367473483, "learning_rate": 1.0596171582750076e-06, "loss": 0.00166455, "memory(GiB)": 26.31, "step": 7540, "train_speed(iter/s)": 0.581161 }, { "acc": 1.0, "epoch": 8.001060445387063, "grad_norm": 0.004303697030991316, "learning_rate": 1.0542286558651369e-06, "loss": 0.00017417, "memory(GiB)": 26.31, "step": 7545, "train_speed(iter/s)": 0.58112 }, { "acc": 0.99951143, "epoch": 8.006362672322375, "grad_norm": 0.03637172281742096, "learning_rate": 1.048852287468672e-06, "loss": 0.00085328, "memory(GiB)": 26.31, "step": 7550, "train_speed(iter/s)": 0.581124 }, { "acc": 0.99976511, "epoch": 8.011664899257688, "grad_norm": 0.018262850120663643, "learning_rate": 1.0434880696168952e-06, "loss": 0.00045877, "memory(GiB)": 26.31, "step": 7555, "train_speed(iter/s)": 0.581125 }, { "acc": 0.99960289, "epoch": 8.016967126193, "grad_norm": 0.08547333627939224, "learning_rate": 1.0381360188037295e-06, "loss": 0.00064282, "memory(GiB)": 26.31, "step": 7560, "train_speed(iter/s)": 0.581126 }, { "acc": 0.9999012, "epoch": 8.022269353128314, "grad_norm": 0.04770037904381752, "learning_rate": 1.0327961514856845e-06, "loss": 0.00030082, "memory(GiB)": 26.31, "step": 7565, "train_speed(iter/s)": 0.581131 }, { "acc": 0.9998579, "epoch": 8.027571580063627, "grad_norm": 0.008269469253718853, "learning_rate": 1.0274684840818093e-06, "loss": 0.00032678, "memory(GiB)": 26.31, "step": 7570, "train_speed(iter/s)": 0.581133 }, { "acc": 0.9997838, "epoch": 8.03287380699894, "grad_norm": 0.0030161449685692787, "learning_rate": 1.0221530329736403e-06, "loss": 0.00061227, "memory(GiB)": 26.31, "step": 7575, "train_speed(iter/s)": 0.581134 }, { "acc": 0.9998724, "epoch": 8.038176033934253, "grad_norm": 0.13272728025913239, "learning_rate": 1.0168498145051508e-06, "loss": 0.00031736, "memory(GiB)": 26.31, "step": 7580, "train_speed(iter/s)": 0.581136 }, { "acc": 0.99987679, "epoch": 8.043478260869565, "grad_norm": 0.004710217472165823, "learning_rate": 1.0115588449827022e-06, "loss": 0.00022688, "memory(GiB)": 26.31, "step": 7585, "train_speed(iter/s)": 0.581137 }, { "acc": 0.99988213, "epoch": 8.048780487804878, "grad_norm": 0.023875171318650246, "learning_rate": 1.0062801406749908e-06, "loss": 0.00036154, "memory(GiB)": 26.31, "step": 7590, "train_speed(iter/s)": 0.581142 }, { "acc": 1.0, "epoch": 8.054082714740192, "grad_norm": 0.055615752935409546, "learning_rate": 1.0010137178130023e-06, "loss": 3.904e-05, "memory(GiB)": 26.31, "step": 7595, "train_speed(iter/s)": 0.581142 }, { "acc": 0.9998724, "epoch": 8.059384941675503, "grad_norm": 0.013751799240708351, "learning_rate": 9.957595925899576e-07, "loss": 0.00014761, "memory(GiB)": 26.31, "step": 7600, "train_speed(iter/s)": 0.581144 }, { "acc": 0.99953594, "epoch": 8.064687168610817, "grad_norm": 0.13239844143390656, "learning_rate": 9.90517781161266e-07, "loss": 0.0006467, "memory(GiB)": 26.31, "step": 7605, "train_speed(iter/s)": 0.581146 }, { "acc": 0.99987803, "epoch": 8.069989395546129, "grad_norm": 0.0015992814442142844, "learning_rate": 9.852882996444734e-07, "loss": 0.00015576, "memory(GiB)": 26.31, "step": 7610, "train_speed(iter/s)": 0.581147 }, { "acc": 0.99988155, "epoch": 8.075291622481442, "grad_norm": 0.0007152906036935747, "learning_rate": 9.800711641192137e-07, "loss": 0.000157, "memory(GiB)": 26.31, "step": 7615, "train_speed(iter/s)": 0.581148 }, { "acc": 0.99987621, "epoch": 8.080593849416754, "grad_norm": 0.14747610688209534, "learning_rate": 9.748663906271589e-07, "loss": 0.00015784, "memory(GiB)": 26.31, "step": 7620, "train_speed(iter/s)": 0.581152 }, { "acc": 0.99974289, "epoch": 8.085896076352068, "grad_norm": 0.04375872015953064, "learning_rate": 9.696739951719706e-07, "loss": 0.0003344, "memory(GiB)": 26.31, "step": 7625, "train_speed(iter/s)": 0.581152 }, { "acc": 1.0, "epoch": 8.091198303287381, "grad_norm": 0.0009364792495034635, "learning_rate": 9.644939937192512e-07, "loss": 1.078e-05, "memory(GiB)": 26.31, "step": 7630, "train_speed(iter/s)": 0.581156 }, { "acc": 0.9997592, "epoch": 8.096500530222693, "grad_norm": 0.44751542806625366, "learning_rate": 9.593264021964919e-07, "loss": 0.00053736, "memory(GiB)": 26.31, "step": 7635, "train_speed(iter/s)": 0.581158 }, { "acc": 0.99986486, "epoch": 8.101802757158007, "grad_norm": 0.08755602687597275, "learning_rate": 9.541712364930284e-07, "loss": 0.00027139, "memory(GiB)": 26.31, "step": 7640, "train_speed(iter/s)": 0.581162 }, { "acc": 1.0, "epoch": 8.107104984093318, "grad_norm": 0.06107502058148384, "learning_rate": 9.490285124599867e-07, "loss": 9.593e-05, "memory(GiB)": 26.31, "step": 7645, "train_speed(iter/s)": 0.581166 }, { "acc": 0.99987621, "epoch": 8.112407211028632, "grad_norm": 0.0004909554845653474, "learning_rate": 9.438982459102395e-07, "loss": 0.00014782, "memory(GiB)": 26.31, "step": 7650, "train_speed(iter/s)": 0.581168 }, { "acc": 0.99988422, "epoch": 8.117709437963946, "grad_norm": 0.003382364520803094, "learning_rate": 9.387804526183543e-07, "loss": 0.00013919, "memory(GiB)": 26.31, "step": 7655, "train_speed(iter/s)": 0.581171 }, { "acc": 0.99941235, "epoch": 8.123011664899257, "grad_norm": 0.0029366735834628344, "learning_rate": 9.336751483205435e-07, "loss": 0.00112311, "memory(GiB)": 26.31, "step": 7660, "train_speed(iter/s)": 0.581173 }, { "acc": 0.99973831, "epoch": 8.128313891834571, "grad_norm": 0.03579862788319588, "learning_rate": 9.285823487146234e-07, "loss": 0.00040023, "memory(GiB)": 26.31, "step": 7665, "train_speed(iter/s)": 0.581174 }, { "acc": 0.99987373, "epoch": 8.133616118769883, "grad_norm": 0.04333464428782463, "learning_rate": 9.235020694599566e-07, "loss": 0.0002684, "memory(GiB)": 26.31, "step": 7670, "train_speed(iter/s)": 0.581176 }, { "acc": 1.0, "epoch": 8.138918345705196, "grad_norm": 0.022197069600224495, "learning_rate": 9.18434326177409e-07, "loss": 0.0001166, "memory(GiB)": 26.31, "step": 7675, "train_speed(iter/s)": 0.581179 }, { "acc": 0.99988155, "epoch": 8.14422057264051, "grad_norm": 0.01878158375620842, "learning_rate": 9.133791344493017e-07, "loss": 0.00013026, "memory(GiB)": 26.31, "step": 7680, "train_speed(iter/s)": 0.581185 }, { "acc": 0.99974842, "epoch": 8.149522799575822, "grad_norm": 0.03990185260772705, "learning_rate": 9.083365098193609e-07, "loss": 0.00041892, "memory(GiB)": 26.31, "step": 7685, "train_speed(iter/s)": 0.581189 }, { "acc": 0.99949589, "epoch": 8.154825026511135, "grad_norm": 0.003369416343048215, "learning_rate": 9.033064677926724e-07, "loss": 0.00066151, "memory(GiB)": 26.31, "step": 7690, "train_speed(iter/s)": 0.58119 }, { "acc": 1.0, "epoch": 8.160127253446447, "grad_norm": 0.0658353790640831, "learning_rate": 8.982890238356318e-07, "loss": 0.00011477, "memory(GiB)": 26.31, "step": 7695, "train_speed(iter/s)": 0.581195 }, { "acc": 0.99987116, "epoch": 8.16542948038176, "grad_norm": 0.07666454464197159, "learning_rate": 8.932841933759011e-07, "loss": 0.00020501, "memory(GiB)": 26.31, "step": 7700, "train_speed(iter/s)": 0.581198 }, { "acc": 0.99975843, "epoch": 8.170731707317072, "grad_norm": 0.0565459281206131, "learning_rate": 8.882919918023548e-07, "loss": 0.00032766, "memory(GiB)": 26.31, "step": 7705, "train_speed(iter/s)": 0.5812 }, { "acc": 0.99987307, "epoch": 8.176033934252386, "grad_norm": 0.001517447759397328, "learning_rate": 8.833124344650383e-07, "loss": 0.00012655, "memory(GiB)": 26.31, "step": 7710, "train_speed(iter/s)": 0.581202 }, { "acc": 1.0, "epoch": 8.1813361611877, "grad_norm": 0.008927865885198116, "learning_rate": 8.783455366751168e-07, "loss": 8.786e-05, "memory(GiB)": 26.31, "step": 7715, "train_speed(iter/s)": 0.581204 }, { "acc": 1.0, "epoch": 8.186638388123011, "grad_norm": 0.001537539646960795, "learning_rate": 8.733913137048305e-07, "loss": 7.365e-05, "memory(GiB)": 26.31, "step": 7720, "train_speed(iter/s)": 0.581208 }, { "acc": 0.99978895, "epoch": 8.191940615058325, "grad_norm": 0.0009232127922587097, "learning_rate": 8.68449780787448e-07, "loss": 0.00085949, "memory(GiB)": 26.31, "step": 7725, "train_speed(iter/s)": 0.581209 }, { "acc": 0.99941664, "epoch": 8.197242841993637, "grad_norm": 0.1339799463748932, "learning_rate": 8.635209531172154e-07, "loss": 0.00167964, "memory(GiB)": 26.31, "step": 7730, "train_speed(iter/s)": 0.581211 }, { "acc": 0.99975662, "epoch": 8.20254506892895, "grad_norm": 0.030591564252972603, "learning_rate": 8.586048458493177e-07, "loss": 0.00045913, "memory(GiB)": 26.31, "step": 7735, "train_speed(iter/s)": 0.581213 }, { "acc": 0.9998641, "epoch": 8.207847295864262, "grad_norm": 0.03506559878587723, "learning_rate": 8.537014740998235e-07, "loss": 0.00042046, "memory(GiB)": 26.31, "step": 7740, "train_speed(iter/s)": 0.581217 }, { "acc": 1.0, "epoch": 8.213149522799576, "grad_norm": 0.01826823502779007, "learning_rate": 8.488108529456423e-07, "loss": 3.581e-05, "memory(GiB)": 26.31, "step": 7745, "train_speed(iter/s)": 0.581219 }, { "acc": 0.99974556, "epoch": 8.21845174973489, "grad_norm": 0.11806601285934448, "learning_rate": 8.439329974244791e-07, "loss": 0.00072233, "memory(GiB)": 26.31, "step": 7750, "train_speed(iter/s)": 0.581223 }, { "acc": 1.0, "epoch": 8.223753976670201, "grad_norm": 0.01329426933079958, "learning_rate": 8.390679225347866e-07, "loss": 0.00020656, "memory(GiB)": 26.31, "step": 7755, "train_speed(iter/s)": 0.581224 }, { "acc": 0.99974918, "epoch": 8.229056203605515, "grad_norm": 0.03872642666101456, "learning_rate": 8.342156432357194e-07, "loss": 0.00053237, "memory(GiB)": 26.31, "step": 7760, "train_speed(iter/s)": 0.581225 }, { "acc": 0.99976864, "epoch": 8.234358430540826, "grad_norm": 0.04402993246912956, "learning_rate": 8.293761744470884e-07, "loss": 0.0003388, "memory(GiB)": 26.31, "step": 7765, "train_speed(iter/s)": 0.581229 }, { "acc": 0.99985552, "epoch": 8.23966065747614, "grad_norm": 0.021901650354266167, "learning_rate": 8.245495310493146e-07, "loss": 0.00030562, "memory(GiB)": 26.31, "step": 7770, "train_speed(iter/s)": 0.58123 }, { "acc": 0.99988585, "epoch": 8.244962884411454, "grad_norm": 0.1328391581773758, "learning_rate": 8.197357278833834e-07, "loss": 0.00030064, "memory(GiB)": 26.31, "step": 7775, "train_speed(iter/s)": 0.581233 }, { "acc": 0.99988213, "epoch": 8.250265111346765, "grad_norm": 0.0058752140030264854, "learning_rate": 8.149347797507994e-07, "loss": 0.00020896, "memory(GiB)": 26.31, "step": 7780, "train_speed(iter/s)": 0.581236 }, { "acc": 0.99989176, "epoch": 8.255567338282079, "grad_norm": 0.06424537301063538, "learning_rate": 8.101467014135403e-07, "loss": 0.00027083, "memory(GiB)": 26.31, "step": 7785, "train_speed(iter/s)": 0.581237 }, { "acc": 0.99976292, "epoch": 8.26086956521739, "grad_norm": 0.0293315090239048, "learning_rate": 8.053715075940096e-07, "loss": 0.00043081, "memory(GiB)": 26.31, "step": 7790, "train_speed(iter/s)": 0.581239 }, { "acc": 0.99987926, "epoch": 8.266171792152704, "grad_norm": 0.015792246907949448, "learning_rate": 8.006092129749986e-07, "loss": 0.00016041, "memory(GiB)": 26.31, "step": 7795, "train_speed(iter/s)": 0.581236 }, { "acc": 1.0, "epoch": 8.271474019088018, "grad_norm": 0.0027385330758988857, "learning_rate": 7.958598321996309e-07, "loss": 6.26e-06, "memory(GiB)": 26.31, "step": 7800, "train_speed(iter/s)": 0.581237 }, { "acc": 1.0, "epoch": 8.27677624602333, "grad_norm": 0.0007324932957999408, "learning_rate": 7.91123379871324e-07, "loss": 0.0001466, "memory(GiB)": 26.31, "step": 7805, "train_speed(iter/s)": 0.58124 }, { "acc": 0.99987679, "epoch": 8.282078472958643, "grad_norm": 0.000622512015979737, "learning_rate": 7.863998705537454e-07, "loss": 0.00020528, "memory(GiB)": 26.31, "step": 7810, "train_speed(iter/s)": 0.581241 }, { "acc": 0.99974909, "epoch": 8.287380699893955, "grad_norm": 0.16693612933158875, "learning_rate": 7.816893187707619e-07, "loss": 0.00094907, "memory(GiB)": 26.31, "step": 7815, "train_speed(iter/s)": 0.581242 }, { "acc": 0.99962425, "epoch": 8.292682926829269, "grad_norm": 0.7306221127510071, "learning_rate": 7.769917390064011e-07, "loss": 0.00160853, "memory(GiB)": 26.31, "step": 7820, "train_speed(iter/s)": 0.581245 }, { "acc": 0.99975004, "epoch": 8.29798515376458, "grad_norm": 0.07459704577922821, "learning_rate": 7.72307145704802e-07, "loss": 0.00056356, "memory(GiB)": 26.31, "step": 7825, "train_speed(iter/s)": 0.581249 }, { "acc": 0.99986629, "epoch": 8.303287380699894, "grad_norm": 0.0025041713379323483, "learning_rate": 7.676355532701742e-07, "loss": 0.00018201, "memory(GiB)": 26.31, "step": 7830, "train_speed(iter/s)": 0.581255 }, { "acc": 1.0, "epoch": 8.308589607635207, "grad_norm": 0.010728196240961552, "learning_rate": 7.629769760667513e-07, "loss": 0.00011778, "memory(GiB)": 26.31, "step": 7835, "train_speed(iter/s)": 0.581257 }, { "acc": 1.0, "epoch": 8.31389183457052, "grad_norm": 0.01707724668085575, "learning_rate": 7.583314284187486e-07, "loss": 0.0001733, "memory(GiB)": 26.31, "step": 7840, "train_speed(iter/s)": 0.581258 }, { "acc": 0.99988317, "epoch": 8.319194061505833, "grad_norm": 0.002590770134702325, "learning_rate": 7.536989246103177e-07, "loss": 0.00010814, "memory(GiB)": 26.31, "step": 7845, "train_speed(iter/s)": 0.581261 }, { "acc": 0.99975109, "epoch": 8.324496288441145, "grad_norm": 0.14999988675117493, "learning_rate": 7.490794788855018e-07, "loss": 0.0007674, "memory(GiB)": 26.31, "step": 7850, "train_speed(iter/s)": 0.581263 }, { "acc": 0.99988632, "epoch": 8.329798515376458, "grad_norm": 0.0026431609876453876, "learning_rate": 7.444731054481951e-07, "loss": 0.00010075, "memory(GiB)": 26.31, "step": 7855, "train_speed(iter/s)": 0.581264 }, { "acc": 0.99971752, "epoch": 8.335100742311772, "grad_norm": 0.049115173518657684, "learning_rate": 7.398798184620941e-07, "loss": 0.00072776, "memory(GiB)": 26.31, "step": 7860, "train_speed(iter/s)": 0.581265 }, { "acc": 0.99974127, "epoch": 8.340402969247084, "grad_norm": 0.06810028851032257, "learning_rate": 7.352996320506616e-07, "loss": 0.00034255, "memory(GiB)": 26.31, "step": 7865, "train_speed(iter/s)": 0.581268 }, { "acc": 0.99976759, "epoch": 8.345705196182397, "grad_norm": 0.0013417246518656611, "learning_rate": 7.307325602970744e-07, "loss": 0.0008347, "memory(GiB)": 26.31, "step": 7870, "train_speed(iter/s)": 0.581269 }, { "acc": 0.99987373, "epoch": 8.351007423117709, "grad_norm": 0.016306953504681587, "learning_rate": 7.261786172441866e-07, "loss": 0.0002477, "memory(GiB)": 26.31, "step": 7875, "train_speed(iter/s)": 0.581272 }, { "acc": 1.0, "epoch": 8.356309650053023, "grad_norm": 0.03384576737880707, "learning_rate": 7.216378168944825e-07, "loss": 0.00020672, "memory(GiB)": 26.31, "step": 7880, "train_speed(iter/s)": 0.581277 }, { "acc": 0.9998826, "epoch": 8.361611876988334, "grad_norm": 0.00972414668649435, "learning_rate": 7.171101732100366e-07, "loss": 0.00042422, "memory(GiB)": 26.31, "step": 7885, "train_speed(iter/s)": 0.581279 }, { "acc": 0.99987803, "epoch": 8.366914103923648, "grad_norm": 0.010824889875948429, "learning_rate": 7.125957001124683e-07, "loss": 0.00028841, "memory(GiB)": 26.31, "step": 7890, "train_speed(iter/s)": 0.58128 }, { "acc": 0.99977264, "epoch": 8.372216330858961, "grad_norm": 0.004371269606053829, "learning_rate": 7.080944114829013e-07, "loss": 0.00040134, "memory(GiB)": 26.31, "step": 7895, "train_speed(iter/s)": 0.581281 }, { "acc": 1.0, "epoch": 8.377518557794273, "grad_norm": 0.0028774836100637913, "learning_rate": 7.036063211619177e-07, "loss": 0.00010729, "memory(GiB)": 26.31, "step": 7900, "train_speed(iter/s)": 0.581282 }, { "acc": 0.99987183, "epoch": 8.382820784729587, "grad_norm": 0.005267132073640823, "learning_rate": 6.991314429495186e-07, "loss": 0.00031727, "memory(GiB)": 26.31, "step": 7905, "train_speed(iter/s)": 0.581283 }, { "acc": 0.99976959, "epoch": 8.388123011664899, "grad_norm": 0.050596244633197784, "learning_rate": 6.946697906050808e-07, "loss": 0.00017752, "memory(GiB)": 26.31, "step": 7910, "train_speed(iter/s)": 0.581285 }, { "acc": 0.9996068, "epoch": 8.393425238600212, "grad_norm": 0.011514488607645035, "learning_rate": 6.902213778473115e-07, "loss": 0.00113118, "memory(GiB)": 26.31, "step": 7915, "train_speed(iter/s)": 0.581286 }, { "acc": 0.99986191, "epoch": 8.398727465535526, "grad_norm": 0.038335733115673065, "learning_rate": 6.857862183542143e-07, "loss": 0.00021071, "memory(GiB)": 26.31, "step": 7920, "train_speed(iter/s)": 0.581289 }, { "acc": 0.99976358, "epoch": 8.404029692470838, "grad_norm": 0.040934812277555466, "learning_rate": 6.813643257630354e-07, "loss": 0.00021678, "memory(GiB)": 26.31, "step": 7925, "train_speed(iter/s)": 0.581293 }, { "acc": 0.99987116, "epoch": 8.409331919406151, "grad_norm": 0.0021664213854819536, "learning_rate": 6.769557136702325e-07, "loss": 0.00011409, "memory(GiB)": 26.31, "step": 7930, "train_speed(iter/s)": 0.581294 }, { "acc": 0.99972258, "epoch": 8.414634146341463, "grad_norm": 0.07637397944927216, "learning_rate": 6.725603956314253e-07, "loss": 0.00044861, "memory(GiB)": 26.31, "step": 7935, "train_speed(iter/s)": 0.581296 }, { "acc": 0.99988537, "epoch": 8.419936373276776, "grad_norm": 0.00367990811355412, "learning_rate": 6.681783851613587e-07, "loss": 0.0002019, "memory(GiB)": 26.31, "step": 7940, "train_speed(iter/s)": 0.581297 }, { "acc": 0.99964008, "epoch": 8.425238600212088, "grad_norm": 0.07514069974422455, "learning_rate": 6.638096957338587e-07, "loss": 0.00043779, "memory(GiB)": 26.31, "step": 7945, "train_speed(iter/s)": 0.5813 }, { "acc": 0.9997551, "epoch": 8.430540827147402, "grad_norm": 0.0004876448365394026, "learning_rate": 6.594543407817915e-07, "loss": 0.00077535, "memory(GiB)": 26.31, "step": 7950, "train_speed(iter/s)": 0.581301 }, { "acc": 0.99986038, "epoch": 8.435843054082715, "grad_norm": 0.05327408015727997, "learning_rate": 6.551123336970226e-07, "loss": 0.0002358, "memory(GiB)": 26.31, "step": 7955, "train_speed(iter/s)": 0.581301 }, { "acc": 0.99986629, "epoch": 8.441145281018027, "grad_norm": 0.003401283174753189, "learning_rate": 6.507836878303758e-07, "loss": 0.00015811, "memory(GiB)": 26.31, "step": 7960, "train_speed(iter/s)": 0.581305 }, { "acc": 1.0, "epoch": 8.44644750795334, "grad_norm": 0.0021104025654494762, "learning_rate": 6.46468416491591e-07, "loss": 4.444e-05, "memory(GiB)": 26.31, "step": 7965, "train_speed(iter/s)": 0.581306 }, { "acc": 0.99987803, "epoch": 8.451749734888653, "grad_norm": 0.001826183870434761, "learning_rate": 6.421665329492848e-07, "loss": 0.00017718, "memory(GiB)": 26.31, "step": 7970, "train_speed(iter/s)": 0.581307 }, { "acc": 0.99988098, "epoch": 8.457051961823966, "grad_norm": 0.025131119415163994, "learning_rate": 6.378780504309089e-07, "loss": 0.00017582, "memory(GiB)": 26.31, "step": 7975, "train_speed(iter/s)": 0.581308 }, { "acc": 0.9997613, "epoch": 8.46235418875928, "grad_norm": 0.06657654792070389, "learning_rate": 6.336029821227086e-07, "loss": 0.00028992, "memory(GiB)": 26.31, "step": 7980, "train_speed(iter/s)": 0.58131 }, { "acc": 0.9998724, "epoch": 8.467656415694591, "grad_norm": 0.002139176009222865, "learning_rate": 6.293413411696846e-07, "loss": 0.00022113, "memory(GiB)": 26.31, "step": 7985, "train_speed(iter/s)": 0.581311 }, { "acc": 0.99986038, "epoch": 8.472958642629905, "grad_norm": 0.001917119137942791, "learning_rate": 6.250931406755482e-07, "loss": 0.00025974, "memory(GiB)": 26.31, "step": 7990, "train_speed(iter/s)": 0.581313 }, { "acc": 1.0, "epoch": 8.478260869565217, "grad_norm": 0.0010230530751869082, "learning_rate": 6.208583937026887e-07, "loss": 0.00013286, "memory(GiB)": 26.31, "step": 7995, "train_speed(iter/s)": 0.581317 }, { "acc": 0.99986267, "epoch": 8.48356309650053, "grad_norm": 0.0006629582494497299, "learning_rate": 6.166371132721243e-07, "loss": 0.00013081, "memory(GiB)": 26.31, "step": 8000, "train_speed(iter/s)": 0.581318 }, { "acc": 1.0, "epoch": 8.488865323435842, "grad_norm": 0.0006745496648363769, "learning_rate": 6.124293123634681e-07, "loss": 5.918e-05, "memory(GiB)": 26.31, "step": 8005, "train_speed(iter/s)": 0.581318 }, { "acc": 0.99989834, "epoch": 8.494167550371156, "grad_norm": 0.0012669694842770696, "learning_rate": 6.082350039148852e-07, "loss": 0.00026928, "memory(GiB)": 26.31, "step": 8010, "train_speed(iter/s)": 0.581319 }, { "acc": 0.99975195, "epoch": 8.49946977730647, "grad_norm": 0.057022713124752045, "learning_rate": 6.040542008230552e-07, "loss": 0.00072886, "memory(GiB)": 26.31, "step": 8015, "train_speed(iter/s)": 0.581322 }, { "acc": 0.99977112, "epoch": 8.504772004241781, "grad_norm": 0.0022008903324604034, "learning_rate": 5.998869159431307e-07, "loss": 0.00036921, "memory(GiB)": 26.31, "step": 8020, "train_speed(iter/s)": 0.581326 }, { "acc": 0.99954042, "epoch": 8.510074231177095, "grad_norm": 0.22246907651424408, "learning_rate": 5.957331620886968e-07, "loss": 0.00077993, "memory(GiB)": 26.31, "step": 8025, "train_speed(iter/s)": 0.581328 }, { "acc": 0.99975395, "epoch": 8.515376458112407, "grad_norm": 0.12080390006303787, "learning_rate": 5.915929520317385e-07, "loss": 0.00024467, "memory(GiB)": 26.31, "step": 8030, "train_speed(iter/s)": 0.581329 }, { "acc": 1.0, "epoch": 8.52067868504772, "grad_norm": 0.04112159460783005, "learning_rate": 5.874662985025903e-07, "loss": 0.00021345, "memory(GiB)": 26.31, "step": 8035, "train_speed(iter/s)": 0.581332 }, { "acc": 0.99977589, "epoch": 8.525980911983034, "grad_norm": 0.00460523646324873, "learning_rate": 5.833532141899069e-07, "loss": 0.00028215, "memory(GiB)": 26.31, "step": 8040, "train_speed(iter/s)": 0.581338 }, { "acc": 0.99978123, "epoch": 8.531283138918345, "grad_norm": 0.07351583242416382, "learning_rate": 5.792537117406182e-07, "loss": 0.00062952, "memory(GiB)": 26.31, "step": 8045, "train_speed(iter/s)": 0.581339 }, { "acc": 1.0, "epoch": 8.536585365853659, "grad_norm": 0.004166380036622286, "learning_rate": 5.751678037598939e-07, "loss": 8.209e-05, "memory(GiB)": 26.31, "step": 8050, "train_speed(iter/s)": 0.581341 }, { "acc": 0.99986343, "epoch": 8.54188759278897, "grad_norm": 0.004580026958137751, "learning_rate": 5.710955028111013e-07, "loss": 0.00103224, "memory(GiB)": 26.31, "step": 8055, "train_speed(iter/s)": 0.581343 }, { "acc": 0.99973564, "epoch": 8.547189819724284, "grad_norm": 0.002784633543342352, "learning_rate": 5.670368214157719e-07, "loss": 0.00031335, "memory(GiB)": 26.31, "step": 8060, "train_speed(iter/s)": 0.581345 }, { "acc": 0.99988317, "epoch": 8.552492046659598, "grad_norm": 0.03273988887667656, "learning_rate": 5.629917720535582e-07, "loss": 0.00073676, "memory(GiB)": 26.31, "step": 8065, "train_speed(iter/s)": 0.581346 }, { "acc": 1.0, "epoch": 8.55779427359491, "grad_norm": 0.0014363267691805959, "learning_rate": 5.589603671621957e-07, "loss": 8.492e-05, "memory(GiB)": 26.31, "step": 8070, "train_speed(iter/s)": 0.581354 }, { "acc": 0.99988155, "epoch": 8.563096500530223, "grad_norm": 0.0011595258256420493, "learning_rate": 5.549426191374673e-07, "loss": 0.00026058, "memory(GiB)": 26.31, "step": 8075, "train_speed(iter/s)": 0.581356 }, { "acc": 0.99974566, "epoch": 8.568398727465535, "grad_norm": 0.4120935797691345, "learning_rate": 5.509385403331628e-07, "loss": 0.00089901, "memory(GiB)": 26.31, "step": 8080, "train_speed(iter/s)": 0.581356 }, { "acc": 0.99988785, "epoch": 8.573700954400849, "grad_norm": 0.03953966125845909, "learning_rate": 5.46948143061043e-07, "loss": 0.00022376, "memory(GiB)": 26.31, "step": 8085, "train_speed(iter/s)": 0.581357 }, { "acc": 0.99988422, "epoch": 8.57900318133616, "grad_norm": 0.04297780618071556, "learning_rate": 5.429714395907992e-07, "loss": 0.00016779, "memory(GiB)": 26.31, "step": 8090, "train_speed(iter/s)": 0.581358 }, { "acc": 0.99986973, "epoch": 8.584305408271474, "grad_norm": 0.11942251026630402, "learning_rate": 5.39008442150018e-07, "loss": 0.00034182, "memory(GiB)": 26.31, "step": 8095, "train_speed(iter/s)": 0.581362 }, { "acc": 0.99975233, "epoch": 8.589607635206788, "grad_norm": 0.4256443381309509, "learning_rate": 5.350591629241419e-07, "loss": 0.00112517, "memory(GiB)": 26.31, "step": 8100, "train_speed(iter/s)": 0.581363 }, { "acc": 0.99988317, "epoch": 8.5949098621421, "grad_norm": 0.03107130341231823, "learning_rate": 5.311236140564336e-07, "loss": 0.00030147, "memory(GiB)": 26.31, "step": 8105, "train_speed(iter/s)": 0.581364 }, { "acc": 1.0, "epoch": 8.600212089077413, "grad_norm": 0.01565447449684143, "learning_rate": 5.272018076479365e-07, "loss": 0.00019858, "memory(GiB)": 26.31, "step": 8110, "train_speed(iter/s)": 0.581366 }, { "acc": 1.0, "epoch": 8.605514316012725, "grad_norm": 0.01462327130138874, "learning_rate": 5.232937557574392e-07, "loss": 0.00015697, "memory(GiB)": 26.31, "step": 8115, "train_speed(iter/s)": 0.581369 }, { "acc": 1.0, "epoch": 8.610816542948038, "grad_norm": 0.0016528492560610175, "learning_rate": 5.193994704014368e-07, "loss": 4.902e-05, "memory(GiB)": 26.31, "step": 8120, "train_speed(iter/s)": 0.581376 }, { "acc": 0.9997674, "epoch": 8.61611876988335, "grad_norm": 0.010499105788767338, "learning_rate": 5.155189635540981e-07, "loss": 0.00049511, "memory(GiB)": 26.31, "step": 8125, "train_speed(iter/s)": 0.581377 }, { "acc": 1.0, "epoch": 8.621420996818664, "grad_norm": 0.040201202034950256, "learning_rate": 5.116522471472227e-07, "loss": 3.062e-05, "memory(GiB)": 26.31, "step": 8130, "train_speed(iter/s)": 0.581379 }, { "acc": 1.0, "epoch": 8.626723223753977, "grad_norm": 0.0030659495387226343, "learning_rate": 5.07799333070206e-07, "loss": 4.329e-05, "memory(GiB)": 26.31, "step": 8135, "train_speed(iter/s)": 0.581384 }, { "acc": 0.99987679, "epoch": 8.632025450689289, "grad_norm": 0.003144504502415657, "learning_rate": 5.039602331700092e-07, "loss": 0.00025904, "memory(GiB)": 26.31, "step": 8140, "train_speed(iter/s)": 0.581386 }, { "acc": 0.99987803, "epoch": 8.637327677624603, "grad_norm": 0.026025842875242233, "learning_rate": 5.001349592511136e-07, "loss": 0.00028486, "memory(GiB)": 26.31, "step": 8145, "train_speed(iter/s)": 0.58139 }, { "acc": 0.99986706, "epoch": 8.642629904559914, "grad_norm": 0.004663050640374422, "learning_rate": 4.963235230754879e-07, "loss": 0.00026084, "memory(GiB)": 26.31, "step": 8150, "train_speed(iter/s)": 0.581392 }, { "acc": 0.99987745, "epoch": 8.647932131495228, "grad_norm": 0.029145730659365654, "learning_rate": 4.92525936362555e-07, "loss": 0.00018182, "memory(GiB)": 26.31, "step": 8155, "train_speed(iter/s)": 0.581394 }, { "acc": 0.99975986, "epoch": 8.653234358430542, "grad_norm": 0.00190520950127393, "learning_rate": 4.887422107891513e-07, "loss": 0.00081231, "memory(GiB)": 26.31, "step": 8160, "train_speed(iter/s)": 0.581395 }, { "acc": 0.99974737, "epoch": 8.658536585365853, "grad_norm": 0.0602591373026371, "learning_rate": 4.84972357989494e-07, "loss": 0.00022456, "memory(GiB)": 26.31, "step": 8165, "train_speed(iter/s)": 0.581396 }, { "acc": 0.99988375, "epoch": 8.663838812301167, "grad_norm": 0.018995080143213272, "learning_rate": 4.812163895551438e-07, "loss": 0.00012022, "memory(GiB)": 26.31, "step": 8170, "train_speed(iter/s)": 0.5814 }, { "acc": 0.99986629, "epoch": 8.669141039236479, "grad_norm": 0.04597296565771103, "learning_rate": 4.774743170349703e-07, "loss": 0.00028186, "memory(GiB)": 26.31, "step": 8175, "train_speed(iter/s)": 0.581403 }, { "acc": 0.99921989, "epoch": 8.674443266171792, "grad_norm": 0.06943114101886749, "learning_rate": 4.7374615193511503e-07, "loss": 0.0026015, "memory(GiB)": 26.31, "step": 8180, "train_speed(iter/s)": 0.581403 }, { "acc": 1.0, "epoch": 8.679745493107106, "grad_norm": 0.041785068809986115, "learning_rate": 4.7003190571895607e-07, "loss": 9.327e-05, "memory(GiB)": 26.31, "step": 8185, "train_speed(iter/s)": 0.581404 }, { "acc": 0.99987984, "epoch": 8.685047720042418, "grad_norm": 0.027803828939795494, "learning_rate": 4.663315898070774e-07, "loss": 0.00023471, "memory(GiB)": 26.31, "step": 8190, "train_speed(iter/s)": 0.581409 }, { "acc": 0.99975643, "epoch": 8.690349946977731, "grad_norm": 0.0025035461876541376, "learning_rate": 4.626452155772263e-07, "loss": 0.00022978, "memory(GiB)": 26.31, "step": 8195, "train_speed(iter/s)": 0.581412 }, { "acc": 0.99966631, "epoch": 8.695652173913043, "grad_norm": 0.028833532705903053, "learning_rate": 4.589727943642835e-07, "loss": 0.00042026, "memory(GiB)": 26.31, "step": 8200, "train_speed(iter/s)": 0.581415 }, { "acc": 0.99961128, "epoch": 8.700954400848357, "grad_norm": 0.002554490463808179, "learning_rate": 4.553143374602267e-07, "loss": 0.00063897, "memory(GiB)": 26.31, "step": 8205, "train_speed(iter/s)": 0.581416 }, { "acc": 0.99988985, "epoch": 8.706256627783668, "grad_norm": 0.5980751514434814, "learning_rate": 4.516698561140955e-07, "loss": 0.00077893, "memory(GiB)": 26.31, "step": 8210, "train_speed(iter/s)": 0.581417 }, { "acc": 0.99973564, "epoch": 8.711558854718982, "grad_norm": 0.08448156714439392, "learning_rate": 4.480393615319589e-07, "loss": 0.00050244, "memory(GiB)": 26.31, "step": 8215, "train_speed(iter/s)": 0.581419 }, { "acc": 1.0, "epoch": 8.716861081654296, "grad_norm": 0.0047545284032821655, "learning_rate": 4.4442286487687844e-07, "loss": 0.00010639, "memory(GiB)": 26.31, "step": 8220, "train_speed(iter/s)": 0.581419 }, { "acc": 0.99964352, "epoch": 8.722163308589607, "grad_norm": 0.004202633630484343, "learning_rate": 4.4082037726887366e-07, "loss": 0.0005389, "memory(GiB)": 26.31, "step": 8225, "train_speed(iter/s)": 0.58142 }, { "acc": 0.99972897, "epoch": 8.72746553552492, "grad_norm": 0.0017853471217676997, "learning_rate": 4.372319097848914e-07, "loss": 0.00035583, "memory(GiB)": 26.31, "step": 8230, "train_speed(iter/s)": 0.581421 }, { "acc": 0.99976826, "epoch": 8.732767762460233, "grad_norm": 0.03493494167923927, "learning_rate": 4.3365747345876773e-07, "loss": 0.00036009, "memory(GiB)": 26.31, "step": 8235, "train_speed(iter/s)": 0.581425 }, { "acc": 0.99986839, "epoch": 8.738069989395546, "grad_norm": 0.160511776804924, "learning_rate": 4.300970792811949e-07, "loss": 0.0003888, "memory(GiB)": 26.31, "step": 8240, "train_speed(iter/s)": 0.581432 }, { "acc": 0.99985552, "epoch": 8.743372216330858, "grad_norm": 0.036365751177072525, "learning_rate": 4.265507381996912e-07, "loss": 0.00027258, "memory(GiB)": 26.31, "step": 8245, "train_speed(iter/s)": 0.581436 }, { "acc": 0.99975319, "epoch": 8.748674443266172, "grad_norm": 0.05677983909845352, "learning_rate": 4.2301846111856155e-07, "loss": 0.00044567, "memory(GiB)": 26.31, "step": 8250, "train_speed(iter/s)": 0.581437 }, { "acc": 0.99958639, "epoch": 8.753976670201485, "grad_norm": 0.04142065346240997, "learning_rate": 4.1950025889886813e-07, "loss": 0.00106637, "memory(GiB)": 26.31, "step": 8255, "train_speed(iter/s)": 0.581441 }, { "acc": 0.99974995, "epoch": 8.759278897136797, "grad_norm": 0.002513850573450327, "learning_rate": 4.1599614235839595e-07, "loss": 0.00029619, "memory(GiB)": 26.31, "step": 8260, "train_speed(iter/s)": 0.581444 }, { "acc": 0.99987869, "epoch": 8.76458112407211, "grad_norm": 0.04915174841880798, "learning_rate": 4.1250612227161794e-07, "loss": 0.00031457, "memory(GiB)": 26.31, "step": 8265, "train_speed(iter/s)": 0.581448 }, { "acc": 0.99959469, "epoch": 8.769883351007422, "grad_norm": 0.0011660271557047963, "learning_rate": 4.0903020936966484e-07, "loss": 0.0006362, "memory(GiB)": 26.31, "step": 8270, "train_speed(iter/s)": 0.58145 }, { "acc": 0.99973412, "epoch": 8.775185577942736, "grad_norm": 0.0009855523239821196, "learning_rate": 4.0556841434028936e-07, "loss": 0.00034017, "memory(GiB)": 26.31, "step": 8275, "train_speed(iter/s)": 0.581451 }, { "acc": 0.99977989, "epoch": 8.78048780487805, "grad_norm": 0.0027486311737447977, "learning_rate": 4.0212074782783416e-07, "loss": 0.00026543, "memory(GiB)": 26.31, "step": 8280, "train_speed(iter/s)": 0.581452 }, { "acc": 1.0, "epoch": 8.785790031813361, "grad_norm": 0.0027941821608692408, "learning_rate": 3.986872204332013e-07, "loss": 5.459e-05, "memory(GiB)": 26.31, "step": 8285, "train_speed(iter/s)": 0.581454 }, { "acc": 0.99988098, "epoch": 8.791092258748675, "grad_norm": 0.012088056653738022, "learning_rate": 3.9526784271381666e-07, "loss": 0.00016703, "memory(GiB)": 26.31, "step": 8290, "train_speed(iter/s)": 0.581458 }, { "acc": 0.99985714, "epoch": 8.796394485683987, "grad_norm": 0.05154626443982124, "learning_rate": 3.9186262518359763e-07, "loss": 0.00036198, "memory(GiB)": 26.31, "step": 8295, "train_speed(iter/s)": 0.581462 }, { "acc": 0.99988937, "epoch": 8.8016967126193, "grad_norm": 0.06838499009609222, "learning_rate": 3.8847157831292366e-07, "loss": 0.0004502, "memory(GiB)": 26.31, "step": 8300, "train_speed(iter/s)": 0.581463 }, { "acc": 0.99950447, "epoch": 8.806998939554614, "grad_norm": 0.30260416865348816, "learning_rate": 3.8509471252860156e-07, "loss": 0.00166767, "memory(GiB)": 26.31, "step": 8305, "train_speed(iter/s)": 0.581468 }, { "acc": 0.99987803, "epoch": 8.812301166489926, "grad_norm": 0.0005098844994790852, "learning_rate": 3.8173203821383315e-07, "loss": 0.00017671, "memory(GiB)": 26.31, "step": 8310, "train_speed(iter/s)": 0.581471 }, { "acc": 0.9998724, "epoch": 8.81760339342524, "grad_norm": 0.005667832680046558, "learning_rate": 3.7838356570818497e-07, "loss": 0.00022659, "memory(GiB)": 26.31, "step": 8315, "train_speed(iter/s)": 0.581473 }, { "acc": 0.99974613, "epoch": 8.822905620360551, "grad_norm": 0.0008252383559010923, "learning_rate": 3.7504930530755664e-07, "loss": 0.00026339, "memory(GiB)": 26.31, "step": 8320, "train_speed(iter/s)": 0.581474 }, { "acc": 0.99961548, "epoch": 8.828207847295864, "grad_norm": 0.026456259191036224, "learning_rate": 3.7172926726414727e-07, "loss": 0.00107916, "memory(GiB)": 26.31, "step": 8325, "train_speed(iter/s)": 0.581474 }, { "acc": 0.99976177, "epoch": 8.833510074231176, "grad_norm": 0.018122496083378792, "learning_rate": 3.684234617864247e-07, "loss": 0.00023061, "memory(GiB)": 26.31, "step": 8330, "train_speed(iter/s)": 0.581477 }, { "acc": 0.99963703, "epoch": 8.83881230116649, "grad_norm": 0.052434735000133514, "learning_rate": 3.6513189903909565e-07, "loss": 0.00043928, "memory(GiB)": 26.31, "step": 8335, "train_speed(iter/s)": 0.581478 }, { "acc": 0.99975014, "epoch": 8.844114528101803, "grad_norm": 0.06747570633888245, "learning_rate": 3.618545891430718e-07, "loss": 0.00037864, "memory(GiB)": 26.31, "step": 8340, "train_speed(iter/s)": 0.581481 }, { "acc": 0.9998848, "epoch": 8.849416755037115, "grad_norm": 0.003688774537295103, "learning_rate": 3.5859154217544087e-07, "loss": 0.00032196, "memory(GiB)": 26.31, "step": 8345, "train_speed(iter/s)": 0.581482 }, { "acc": 1.0, "epoch": 8.854718981972429, "grad_norm": 0.004484543111175299, "learning_rate": 3.5534276816943463e-07, "loss": 0.00030583, "memory(GiB)": 26.31, "step": 8350, "train_speed(iter/s)": 0.581487 }, { "acc": 1.0, "epoch": 8.86002120890774, "grad_norm": 0.05573554337024689, "learning_rate": 3.5210827711439973e-07, "loss": 7.674e-05, "memory(GiB)": 26.31, "step": 8355, "train_speed(iter/s)": 0.581493 }, { "acc": 0.99985876, "epoch": 8.865323435843054, "grad_norm": 0.13664424419403076, "learning_rate": 3.488880789557624e-07, "loss": 0.00028313, "memory(GiB)": 26.31, "step": 8360, "train_speed(iter/s)": 0.581494 }, { "acc": 0.99986773, "epoch": 8.870625662778368, "grad_norm": 0.04399307072162628, "learning_rate": 3.456821835950048e-07, "loss": 0.00018446, "memory(GiB)": 26.31, "step": 8365, "train_speed(iter/s)": 0.581495 }, { "acc": 0.99987803, "epoch": 8.87592788971368, "grad_norm": 0.002994449343532324, "learning_rate": 3.4249060088962706e-07, "loss": 0.0001245, "memory(GiB)": 26.31, "step": 8370, "train_speed(iter/s)": 0.581498 }, { "acc": 0.99988842, "epoch": 8.881230116648993, "grad_norm": 0.0009798618266358972, "learning_rate": 3.393133406531237e-07, "loss": 0.00014089, "memory(GiB)": 26.31, "step": 8375, "train_speed(iter/s)": 0.5815 }, { "acc": 1.0, "epoch": 8.886532343584305, "grad_norm": 0.0361686572432518, "learning_rate": 3.3615041265494834e-07, "loss": 0.00016998, "memory(GiB)": 26.31, "step": 8380, "train_speed(iter/s)": 0.581501 }, { "acc": 0.99961624, "epoch": 8.891834570519618, "grad_norm": 0.13913202285766602, "learning_rate": 3.330018266204864e-07, "loss": 0.00047402, "memory(GiB)": 26.31, "step": 8385, "train_speed(iter/s)": 0.581501 }, { "acc": 0.99974327, "epoch": 8.89713679745493, "grad_norm": 0.0007361209718510509, "learning_rate": 3.298675922310256e-07, "loss": 0.0005421, "memory(GiB)": 26.31, "step": 8390, "train_speed(iter/s)": 0.581505 }, { "acc": 0.99986343, "epoch": 8.902439024390244, "grad_norm": 0.02788899466395378, "learning_rate": 3.2674771912372485e-07, "loss": 0.00022882, "memory(GiB)": 26.31, "step": 8395, "train_speed(iter/s)": 0.581506 }, { "acc": 0.99964542, "epoch": 8.907741251325557, "grad_norm": 0.06532458961009979, "learning_rate": 3.2364221689158365e-07, "loss": 0.00044322, "memory(GiB)": 26.31, "step": 8400, "train_speed(iter/s)": 0.581509 }, { "acc": 0.99950171, "epoch": 8.91304347826087, "grad_norm": 0.49831533432006836, "learning_rate": 3.2055109508341453e-07, "loss": 0.00124991, "memory(GiB)": 26.31, "step": 8405, "train_speed(iter/s)": 0.581512 }, { "acc": 0.99976196, "epoch": 8.918345705196183, "grad_norm": 0.05611543357372284, "learning_rate": 3.174743632038135e-07, "loss": 0.00023922, "memory(GiB)": 26.31, "step": 8410, "train_speed(iter/s)": 0.581514 }, { "acc": 0.99985952, "epoch": 8.923647932131495, "grad_norm": 0.00900158379226923, "learning_rate": 3.1441203071312993e-07, "loss": 0.00030074, "memory(GiB)": 26.31, "step": 8415, "train_speed(iter/s)": 0.581517 }, { "acc": 1.0, "epoch": 8.928950159066808, "grad_norm": 0.005048360675573349, "learning_rate": 3.113641070274376e-07, "loss": 8.591e-05, "memory(GiB)": 26.31, "step": 8420, "train_speed(iter/s)": 0.581523 }, { "acc": 0.99986706, "epoch": 8.934252386002122, "grad_norm": 0.04308345913887024, "learning_rate": 3.0833060151850695e-07, "loss": 0.00046462, "memory(GiB)": 26.31, "step": 8425, "train_speed(iter/s)": 0.581523 }, { "acc": 0.99988098, "epoch": 8.939554612937433, "grad_norm": 0.0006353101343847811, "learning_rate": 3.0531152351377423e-07, "loss": 0.00011048, "memory(GiB)": 26.31, "step": 8430, "train_speed(iter/s)": 0.581524 }, { "acc": 0.9997304, "epoch": 8.944856839872747, "grad_norm": 0.08065080642700195, "learning_rate": 3.0230688229631476e-07, "loss": 0.00043914, "memory(GiB)": 26.31, "step": 8435, "train_speed(iter/s)": 0.581527 }, { "acc": 0.9997674, "epoch": 8.950159066808059, "grad_norm": 0.008123918436467648, "learning_rate": 2.993166871048129e-07, "loss": 0.00030036, "memory(GiB)": 26.31, "step": 8440, "train_speed(iter/s)": 0.58153 }, { "acc": 0.99975395, "epoch": 8.955461293743372, "grad_norm": 0.048602763563394547, "learning_rate": 2.96340947133535e-07, "loss": 0.00044107, "memory(GiB)": 26.31, "step": 8445, "train_speed(iter/s)": 0.581531 }, { "acc": 1.0, "epoch": 8.960763520678686, "grad_norm": 0.004590745083987713, "learning_rate": 2.933796715323001e-07, "loss": 0.00012758, "memory(GiB)": 26.31, "step": 8450, "train_speed(iter/s)": 0.581533 }, { "acc": 0.9992733, "epoch": 8.966065747613998, "grad_norm": 0.03488789498806, "learning_rate": 2.9043286940645254e-07, "loss": 0.00081013, "memory(GiB)": 26.31, "step": 8455, "train_speed(iter/s)": 0.581534 }, { "acc": 1.0, "epoch": 8.971367974549311, "grad_norm": 0.02488904632627964, "learning_rate": 2.8750054981683154e-07, "loss": 0.00012265, "memory(GiB)": 26.31, "step": 8460, "train_speed(iter/s)": 0.581537 }, { "acc": 0.99952488, "epoch": 8.976670201484623, "grad_norm": 0.03694002330303192, "learning_rate": 2.845827217797496e-07, "loss": 0.00072966, "memory(GiB)": 26.31, "step": 8465, "train_speed(iter/s)": 0.58154 }, { "acc": 0.99976711, "epoch": 8.981972428419937, "grad_norm": 0.021575380116701126, "learning_rate": 2.816793942669559e-07, "loss": 0.00030038, "memory(GiB)": 26.31, "step": 8470, "train_speed(iter/s)": 0.581543 }, { "acc": 0.99987869, "epoch": 8.987274655355248, "grad_norm": 0.006561917718499899, "learning_rate": 2.7879057620561597e-07, "loss": 0.00012396, "memory(GiB)": 26.31, "step": 8475, "train_speed(iter/s)": 0.581543 }, { "acc": 0.99978104, "epoch": 8.992576882290562, "grad_norm": 0.07010827213525772, "learning_rate": 2.759162764782804e-07, "loss": 0.00043721, "memory(GiB)": 26.31, "step": 8480, "train_speed(iter/s)": 0.581544 }, { "acc": 0.99987984, "epoch": 8.997879109225876, "grad_norm": 0.018172938376665115, "learning_rate": 2.7305650392286003e-07, "loss": 0.00020561, "memory(GiB)": 26.31, "step": 8485, "train_speed(iter/s)": 0.581544 }, { "acc": 1.0, "epoch": 9.003181336161187, "grad_norm": 0.03349972888827324, "learning_rate": 2.7021126733259704e-07, "loss": 0.00010355, "memory(GiB)": 26.31, "step": 8490, "train_speed(iter/s)": 0.581504 }, { "acc": 0.99965086, "epoch": 9.008483563096501, "grad_norm": 0.0025340563151985407, "learning_rate": 2.673805754560371e-07, "loss": 0.00051078, "memory(GiB)": 26.31, "step": 8495, "train_speed(iter/s)": 0.581505 }, { "acc": 0.99989128, "epoch": 9.013785790031813, "grad_norm": 0.036855071783065796, "learning_rate": 2.64564436997005e-07, "loss": 0.00042027, "memory(GiB)": 26.31, "step": 8500, "train_speed(iter/s)": 0.581505 }, { "acc": 0.9998724, "epoch": 9.019088016967126, "grad_norm": 0.03355565294623375, "learning_rate": 2.617628606145764e-07, "loss": 0.00017373, "memory(GiB)": 26.31, "step": 8505, "train_speed(iter/s)": 0.581508 }, { "acc": 0.99989882, "epoch": 9.024390243902438, "grad_norm": 0.0011976395035162568, "learning_rate": 2.589758549230506e-07, "loss": 0.00010378, "memory(GiB)": 26.31, "step": 8510, "train_speed(iter/s)": 0.58151 }, { "acc": 0.99986916, "epoch": 9.029692470837752, "grad_norm": 0.0022142117377370596, "learning_rate": 2.562034284919272e-07, "loss": 0.0002, "memory(GiB)": 26.31, "step": 8515, "train_speed(iter/s)": 0.581514 }, { "acc": 0.99988632, "epoch": 9.034994697773065, "grad_norm": 0.06629278510808945, "learning_rate": 2.5344558984587513e-07, "loss": 0.00030244, "memory(GiB)": 26.31, "step": 8520, "train_speed(iter/s)": 0.581515 }, { "acc": 0.99987869, "epoch": 9.040296924708377, "grad_norm": 0.00030782382236793637, "learning_rate": 2.5070234746470904e-07, "loss": 0.00025399, "memory(GiB)": 26.31, "step": 8525, "train_speed(iter/s)": 0.581518 }, { "acc": 1.0, "epoch": 9.04559915164369, "grad_norm": 0.0020412448793649673, "learning_rate": 2.479737097833636e-07, "loss": 0.00021156, "memory(GiB)": 26.31, "step": 8530, "train_speed(iter/s)": 0.581519 }, { "acc": 0.9998538, "epoch": 9.050901378579002, "grad_norm": 0.03479117900133133, "learning_rate": 2.452596851918667e-07, "loss": 0.00110229, "memory(GiB)": 26.31, "step": 8535, "train_speed(iter/s)": 0.58152 }, { "acc": 0.99977226, "epoch": 9.056203605514316, "grad_norm": 0.16863708198070526, "learning_rate": 2.4256028203531364e-07, "loss": 0.00090512, "memory(GiB)": 26.31, "step": 8540, "train_speed(iter/s)": 0.581522 }, { "acc": 0.9998724, "epoch": 9.06150583244963, "grad_norm": 0.001541724894195795, "learning_rate": 2.3987550861384156e-07, "loss": 0.00036416, "memory(GiB)": 26.31, "step": 8545, "train_speed(iter/s)": 0.581523 }, { "acc": 0.99988098, "epoch": 9.066808059384941, "grad_norm": 0.06520693749189377, "learning_rate": 2.3720537318260462e-07, "loss": 0.00031393, "memory(GiB)": 26.31, "step": 8550, "train_speed(iter/s)": 0.581525 }, { "acc": 1.0, "epoch": 9.072110286320255, "grad_norm": 0.002617688849568367, "learning_rate": 2.3454988395174696e-07, "loss": 7.239e-05, "memory(GiB)": 26.31, "step": 8555, "train_speed(iter/s)": 0.581526 }, { "acc": 0.9998724, "epoch": 9.077412513255567, "grad_norm": 0.0007258942350745201, "learning_rate": 2.3190904908637995e-07, "loss": 0.00016769, "memory(GiB)": 26.31, "step": 8560, "train_speed(iter/s)": 0.58153 }, { "acc": 1.0, "epoch": 9.08271474019088, "grad_norm": 0.029376430436968803, "learning_rate": 2.2928287670655434e-07, "loss": 7.223e-05, "memory(GiB)": 26.31, "step": 8565, "train_speed(iter/s)": 0.581533 }, { "acc": 0.99989033, "epoch": 9.088016967126194, "grad_norm": 0.03681400418281555, "learning_rate": 2.266713748872365e-07, "loss": 0.0001561, "memory(GiB)": 26.31, "step": 8570, "train_speed(iter/s)": 0.581535 }, { "acc": 1.0, "epoch": 9.093319194061506, "grad_norm": 0.004051406867802143, "learning_rate": 2.2407455165828564e-07, "loss": 0.00010099, "memory(GiB)": 26.31, "step": 8575, "train_speed(iter/s)": 0.581535 }, { "acc": 0.99986486, "epoch": 9.09862142099682, "grad_norm": 0.008981379680335522, "learning_rate": 2.2149241500442432e-07, "loss": 0.0002387, "memory(GiB)": 26.31, "step": 8580, "train_speed(iter/s)": 0.581541 }, { "acc": 0.99962225, "epoch": 9.103923647932131, "grad_norm": 0.004154110327363014, "learning_rate": 2.189249728652202e-07, "loss": 0.00068734, "memory(GiB)": 26.31, "step": 8585, "train_speed(iter/s)": 0.581544 }, { "acc": 1.0, "epoch": 9.109225874867445, "grad_norm": 0.001803599065169692, "learning_rate": 2.163722331350555e-07, "loss": 3.99e-06, "memory(GiB)": 26.31, "step": 8590, "train_speed(iter/s)": 0.581545 }, { "acc": 0.9998744, "epoch": 9.114528101802756, "grad_norm": 0.058594174683094025, "learning_rate": 2.1383420366310594e-07, "loss": 0.00020836, "memory(GiB)": 26.31, "step": 8595, "train_speed(iter/s)": 0.581545 }, { "acc": 0.99986115, "epoch": 9.11983032873807, "grad_norm": 0.05806174874305725, "learning_rate": 2.1131089225331617e-07, "loss": 0.00021942, "memory(GiB)": 26.31, "step": 8600, "train_speed(iter/s)": 0.581546 }, { "acc": 1.0, "epoch": 9.125132555673384, "grad_norm": 0.0015247429255396128, "learning_rate": 2.0880230666437563e-07, "loss": 3.77e-06, "memory(GiB)": 26.31, "step": 8605, "train_speed(iter/s)": 0.581549 }, { "acc": 0.99960184, "epoch": 9.130434782608695, "grad_norm": 0.04763953387737274, "learning_rate": 2.0630845460969548e-07, "loss": 0.00065606, "memory(GiB)": 26.31, "step": 8610, "train_speed(iter/s)": 0.581552 }, { "acc": 0.99988041, "epoch": 9.135737009544009, "grad_norm": 0.005552299320697784, "learning_rate": 2.038293437573838e-07, "loss": 0.00065457, "memory(GiB)": 26.31, "step": 8615, "train_speed(iter/s)": 0.581553 }, { "acc": 1.0, "epoch": 9.14103923647932, "grad_norm": 0.0331796295940876, "learning_rate": 2.0136498173022114e-07, "loss": 7.548e-05, "memory(GiB)": 26.31, "step": 8620, "train_speed(iter/s)": 0.581554 }, { "acc": 1.0, "epoch": 9.146341463414634, "grad_norm": 0.0036993902176618576, "learning_rate": 1.9891537610563945e-07, "loss": 8.443e-05, "memory(GiB)": 26.31, "step": 8625, "train_speed(iter/s)": 0.581555 }, { "acc": 0.9998826, "epoch": 9.151643690349948, "grad_norm": 0.0014519346877932549, "learning_rate": 1.9648053441569815e-07, "loss": 0.00015098, "memory(GiB)": 26.31, "step": 8630, "train_speed(iter/s)": 0.581558 }, { "acc": 0.99976854, "epoch": 9.15694591728526, "grad_norm": 0.0005271465633995831, "learning_rate": 1.9406046414705976e-07, "loss": 0.00032699, "memory(GiB)": 26.31, "step": 8635, "train_speed(iter/s)": 0.581562 }, { "acc": 0.99975309, "epoch": 9.162248144220573, "grad_norm": 0.06643125414848328, "learning_rate": 1.9165517274096597e-07, "loss": 0.00028296, "memory(GiB)": 26.31, "step": 8640, "train_speed(iter/s)": 0.581565 }, { "acc": 1.0, "epoch": 9.167550371155885, "grad_norm": 0.0011175911640748382, "learning_rate": 1.8926466759321995e-07, "loss": 4.757e-05, "memory(GiB)": 26.31, "step": 8645, "train_speed(iter/s)": 0.581566 }, { "acc": 0.99977655, "epoch": 9.172852598091199, "grad_norm": 0.29192134737968445, "learning_rate": 1.8688895605415696e-07, "loss": 0.00076506, "memory(GiB)": 26.31, "step": 8650, "train_speed(iter/s)": 0.581567 }, { "acc": 0.99963932, "epoch": 9.17815482502651, "grad_norm": 0.024805352091789246, "learning_rate": 1.8452804542862633e-07, "loss": 0.00051685, "memory(GiB)": 26.31, "step": 8655, "train_speed(iter/s)": 0.581568 }, { "acc": 0.99968348, "epoch": 9.183457051961824, "grad_norm": 0.0008316601160913706, "learning_rate": 1.8218194297596643e-07, "loss": 0.00039648, "memory(GiB)": 26.31, "step": 8660, "train_speed(iter/s)": 0.581569 }, { "acc": 0.99961185, "epoch": 9.188759278897138, "grad_norm": 0.032385896891355515, "learning_rate": 1.798506559099847e-07, "loss": 0.00075617, "memory(GiB)": 26.31, "step": 8665, "train_speed(iter/s)": 0.581571 }, { "acc": 0.99973583, "epoch": 9.19406150583245, "grad_norm": 0.00038688286440446973, "learning_rate": 1.7753419139893244e-07, "loss": 0.00191715, "memory(GiB)": 26.31, "step": 8670, "train_speed(iter/s)": 0.581573 }, { "acc": 0.9996172, "epoch": 9.199363732767763, "grad_norm": 0.11170506477355957, "learning_rate": 1.7523255656548533e-07, "loss": 0.00074308, "memory(GiB)": 26.31, "step": 8675, "train_speed(iter/s)": 0.581575 }, { "acc": 0.99962177, "epoch": 9.204665959703075, "grad_norm": 0.0007719770655967295, "learning_rate": 1.7294575848672227e-07, "loss": 0.00152528, "memory(GiB)": 26.31, "step": 8680, "train_speed(iter/s)": 0.581576 }, { "acc": 0.99953203, "epoch": 9.209968186638388, "grad_norm": 0.0024956208653748035, "learning_rate": 1.7067380419409986e-07, "loss": 0.0006236, "memory(GiB)": 26.31, "step": 8685, "train_speed(iter/s)": 0.581577 }, { "acc": 0.99964581, "epoch": 9.215270413573702, "grad_norm": 0.06978549808263779, "learning_rate": 1.684167006734337e-07, "loss": 0.00045885, "memory(GiB)": 26.31, "step": 8690, "train_speed(iter/s)": 0.581578 }, { "acc": 0.99962521, "epoch": 9.220572640509014, "grad_norm": 0.03831864148378372, "learning_rate": 1.661744548648758e-07, "loss": 0.00048588, "memory(GiB)": 26.31, "step": 8695, "train_speed(iter/s)": 0.581585 }, { "acc": 0.99987869, "epoch": 9.225874867444327, "grad_norm": 0.04184339568018913, "learning_rate": 1.6394707366289395e-07, "loss": 0.00197599, "memory(GiB)": 26.31, "step": 8700, "train_speed(iter/s)": 0.581587 }, { "acc": 1.0, "epoch": 9.231177094379639, "grad_norm": 0.004102388396859169, "learning_rate": 1.6173456391625027e-07, "loss": 6.415e-05, "memory(GiB)": 26.31, "step": 8705, "train_speed(iter/s)": 0.58159 }, { "acc": 0.99989033, "epoch": 9.236479321314953, "grad_norm": 0.0021884252782911062, "learning_rate": 1.5953693242797986e-07, "loss": 0.00030634, "memory(GiB)": 26.31, "step": 8710, "train_speed(iter/s)": 0.581591 }, { "acc": 0.99988785, "epoch": 9.241781548250264, "grad_norm": 0.030813999474048615, "learning_rate": 1.57354185955371e-07, "loss": 0.00035523, "memory(GiB)": 26.31, "step": 8715, "train_speed(iter/s)": 0.581594 }, { "acc": 0.99987745, "epoch": 9.247083775185578, "grad_norm": 0.001368062337860465, "learning_rate": 1.5518633120994226e-07, "loss": 0.00024497, "memory(GiB)": 26.31, "step": 8720, "train_speed(iter/s)": 0.581596 }, { "acc": 0.99973602, "epoch": 9.252386002120891, "grad_norm": 0.15966103971004486, "learning_rate": 1.5303337485742323e-07, "loss": 0.00089951, "memory(GiB)": 26.31, "step": 8725, "train_speed(iter/s)": 0.581599 }, { "acc": 1.0, "epoch": 9.257688229056203, "grad_norm": 0.012701621279120445, "learning_rate": 1.5089532351773431e-07, "loss": 5.587e-05, "memory(GiB)": 26.31, "step": 8730, "train_speed(iter/s)": 0.581599 }, { "acc": 1.0, "epoch": 9.262990455991517, "grad_norm": 0.029211273416876793, "learning_rate": 1.487721837649655e-07, "loss": 4.952e-05, "memory(GiB)": 26.31, "step": 8735, "train_speed(iter/s)": 0.5816 }, { "acc": 0.99989033, "epoch": 9.268292682926829, "grad_norm": 0.0012377180391922593, "learning_rate": 1.466639621273562e-07, "loss": 0.0001556, "memory(GiB)": 26.31, "step": 8740, "train_speed(iter/s)": 0.5816 }, { "acc": 0.9997323, "epoch": 9.273594909862142, "grad_norm": 0.046300843358039856, "learning_rate": 1.445706650872768e-07, "loss": 0.00038963, "memory(GiB)": 26.31, "step": 8745, "train_speed(iter/s)": 0.581603 }, { "acc": 0.99973717, "epoch": 9.278897136797456, "grad_norm": 0.04876074939966202, "learning_rate": 1.4249229908120717e-07, "loss": 0.00023545, "memory(GiB)": 26.31, "step": 8750, "train_speed(iter/s)": 0.581604 }, { "acc": 1.0, "epoch": 9.284199363732768, "grad_norm": 0.001729241805151105, "learning_rate": 1.4042887049971513e-07, "loss": 0.00016176, "memory(GiB)": 26.31, "step": 8755, "train_speed(iter/s)": 0.581604 }, { "acc": 0.99988842, "epoch": 9.289501590668081, "grad_norm": 0.0015717835631221533, "learning_rate": 1.383803856874422e-07, "loss": 0.00022297, "memory(GiB)": 26.31, "step": 8760, "train_speed(iter/s)": 0.58161 }, { "acc": 1.0, "epoch": 9.294803817603393, "grad_norm": 0.014836992137134075, "learning_rate": 1.363468509430778e-07, "loss": 0.0001063, "memory(GiB)": 26.31, "step": 8765, "train_speed(iter/s)": 0.581611 }, { "acc": 0.99987049, "epoch": 9.300106044538706, "grad_norm": 0.0016947545809671283, "learning_rate": 1.3432827251934395e-07, "loss": 0.00029463, "memory(GiB)": 26.31, "step": 8770, "train_speed(iter/s)": 0.581612 }, { "acc": 0.99973669, "epoch": 9.305408271474018, "grad_norm": 0.04517170786857605, "learning_rate": 1.3232465662297557e-07, "loss": 0.00024452, "memory(GiB)": 26.31, "step": 8775, "train_speed(iter/s)": 0.581613 }, { "acc": 0.99987564, "epoch": 9.310710498409332, "grad_norm": 0.0015988461673259735, "learning_rate": 1.3033600941470023e-07, "loss": 0.00018611, "memory(GiB)": 26.31, "step": 8780, "train_speed(iter/s)": 0.581617 }, { "acc": 0.99976091, "epoch": 9.316012725344645, "grad_norm": 0.05802205204963684, "learning_rate": 1.2836233700921908e-07, "loss": 0.00027926, "memory(GiB)": 26.31, "step": 8785, "train_speed(iter/s)": 0.581621 }, { "acc": 0.99987803, "epoch": 9.321314952279957, "grad_norm": 0.040856651961803436, "learning_rate": 1.2640364547518917e-07, "loss": 0.00032329, "memory(GiB)": 26.31, "step": 8790, "train_speed(iter/s)": 0.581624 }, { "acc": 0.99987497, "epoch": 9.32661717921527, "grad_norm": 0.0010028522228822112, "learning_rate": 1.2445994083520462e-07, "loss": 0.00016964, "memory(GiB)": 26.31, "step": 8795, "train_speed(iter/s)": 0.581624 }, { "acc": 0.99973888, "epoch": 9.331919406150583, "grad_norm": 0.0012929553631693125, "learning_rate": 1.2253122906577757e-07, "loss": 0.00022281, "memory(GiB)": 26.31, "step": 8800, "train_speed(iter/s)": 0.581625 }, { "acc": 0.9998641, "epoch": 9.337221633085896, "grad_norm": 0.0011795631144195795, "learning_rate": 1.2061751609731894e-07, "loss": 0.00024719, "memory(GiB)": 26.31, "step": 8805, "train_speed(iter/s)": 0.581626 }, { "acc": 0.99986839, "epoch": 9.34252386002121, "grad_norm": 0.03850088268518448, "learning_rate": 1.1871880781412345e-07, "loss": 0.00030572, "memory(GiB)": 26.31, "step": 8810, "train_speed(iter/s)": 0.581627 }, { "acc": 0.99987869, "epoch": 9.347826086956522, "grad_norm": 0.000750505831092596, "learning_rate": 1.1683511005434775e-07, "loss": 0.00012297, "memory(GiB)": 26.31, "step": 8815, "train_speed(iter/s)": 0.58163 }, { "acc": 0.99974213, "epoch": 9.353128313891835, "grad_norm": 0.004773573484271765, "learning_rate": 1.1496642860999406e-07, "loss": 0.00058561, "memory(GiB)": 26.31, "step": 8820, "train_speed(iter/s)": 0.581633 }, { "acc": 1.0, "epoch": 9.358430540827147, "grad_norm": 0.001007193815894425, "learning_rate": 1.1311276922689271e-07, "loss": 5.364e-05, "memory(GiB)": 26.31, "step": 8825, "train_speed(iter/s)": 0.581633 }, { "acc": 0.99976463, "epoch": 9.36373276776246, "grad_norm": 0.07513611763715744, "learning_rate": 1.1127413760468455e-07, "loss": 0.00029684, "memory(GiB)": 26.31, "step": 8830, "train_speed(iter/s)": 0.581635 }, { "acc": 0.99988041, "epoch": 9.369034994697772, "grad_norm": 0.00388423097319901, "learning_rate": 1.0945053939680258e-07, "loss": 0.00026647, "memory(GiB)": 26.31, "step": 8835, "train_speed(iter/s)": 0.581636 }, { "acc": 0.99988842, "epoch": 9.374337221633086, "grad_norm": 0.01094045676290989, "learning_rate": 1.0764198021045414e-07, "loss": 0.00116209, "memory(GiB)": 26.31, "step": 8840, "train_speed(iter/s)": 0.581639 }, { "acc": 0.99964161, "epoch": 9.3796394485684, "grad_norm": 0.04025011137127876, "learning_rate": 1.0584846560660656e-07, "loss": 0.00033552, "memory(GiB)": 26.31, "step": 8845, "train_speed(iter/s)": 0.581644 }, { "acc": 1.0, "epoch": 9.384941675503711, "grad_norm": 0.032477378845214844, "learning_rate": 1.0407000109996655e-07, "loss": 0.00027492, "memory(GiB)": 26.31, "step": 8850, "train_speed(iter/s)": 0.581648 }, { "acc": 0.99972134, "epoch": 9.390243902439025, "grad_norm": 0.055506035685539246, "learning_rate": 1.0230659215896525e-07, "loss": 0.00033372, "memory(GiB)": 26.31, "step": 8855, "train_speed(iter/s)": 0.581651 }, { "acc": 0.9998992, "epoch": 9.395546129374337, "grad_norm": 0.0010284942109137774, "learning_rate": 1.0055824420573994e-07, "loss": 0.00014935, "memory(GiB)": 26.31, "step": 8860, "train_speed(iter/s)": 0.581653 }, { "acc": 0.99975452, "epoch": 9.40084835630965, "grad_norm": 0.026280825957655907, "learning_rate": 9.882496261611954e-08, "loss": 0.00039164, "memory(GiB)": 26.31, "step": 8865, "train_speed(iter/s)": 0.581654 }, { "acc": 0.99987307, "epoch": 9.406150583244964, "grad_norm": 0.04926552250981331, "learning_rate": 9.710675271960584e-08, "loss": 0.00021097, "memory(GiB)": 26.31, "step": 8870, "train_speed(iter/s)": 0.581656 }, { "acc": 1.0, "epoch": 9.411452810180275, "grad_norm": 0.0016084155067801476, "learning_rate": 9.54036197993589e-08, "loss": 7.331e-05, "memory(GiB)": 26.31, "step": 8875, "train_speed(iter/s)": 0.581657 }, { "acc": 1.0, "epoch": 9.416755037115589, "grad_norm": 0.0036949021741747856, "learning_rate": 9.371556909217946e-08, "loss": 1.104e-05, "memory(GiB)": 26.31, "step": 8880, "train_speed(iter/s)": 0.581658 }, { "acc": 0.99987745, "epoch": 9.4220572640509, "grad_norm": 0.03400897979736328, "learning_rate": 9.204260578849446e-08, "loss": 0.00027076, "memory(GiB)": 26.31, "step": 8885, "train_speed(iter/s)": 0.581659 }, { "acc": 1.0, "epoch": 9.427359490986214, "grad_norm": 0.0016227929154410958, "learning_rate": 9.038473503233809e-08, "loss": 2.005e-05, "memory(GiB)": 26.31, "step": 8890, "train_speed(iter/s)": 0.581662 }, { "acc": 0.99986191, "epoch": 9.432661717921526, "grad_norm": 0.08077705651521683, "learning_rate": 8.874196192133971e-08, "loss": 0.0001798, "memory(GiB)": 26.31, "step": 8895, "train_speed(iter/s)": 0.581665 }, { "acc": 0.9996232, "epoch": 9.43796394485684, "grad_norm": 0.03813088312745094, "learning_rate": 8.711429150670538e-08, "loss": 0.00063143, "memory(GiB)": 26.31, "step": 8900, "train_speed(iter/s)": 0.581666 }, { "acc": 1.0, "epoch": 9.443266171792153, "grad_norm": 0.0346071831882, "learning_rate": 8.550172879320358e-08, "loss": 0.00033753, "memory(GiB)": 26.31, "step": 8905, "train_speed(iter/s)": 0.581667 }, { "acc": 0.99988155, "epoch": 9.448568398727465, "grad_norm": 0.0006143233040347695, "learning_rate": 8.390427873915009e-08, "loss": 0.00035487, "memory(GiB)": 26.31, "step": 8910, "train_speed(iter/s)": 0.581669 }, { "acc": 1.0, "epoch": 9.453870625662779, "grad_norm": 0.0009098179871216416, "learning_rate": 8.232194625639202e-08, "loss": 5.3e-06, "memory(GiB)": 26.31, "step": 8915, "train_speed(iter/s)": 0.581671 }, { "acc": 0.99975357, "epoch": 9.45917285259809, "grad_norm": 0.0381086990237236, "learning_rate": 8.07547362102916e-08, "loss": 0.00026466, "memory(GiB)": 26.31, "step": 8920, "train_speed(iter/s)": 0.581675 }, { "acc": 1.0, "epoch": 9.464475079533404, "grad_norm": 0.0034874596167355776, "learning_rate": 7.920265341971407e-08, "loss": 8.239e-05, "memory(GiB)": 26.31, "step": 8925, "train_speed(iter/s)": 0.581675 }, { "acc": 1.0, "epoch": 9.469777306468718, "grad_norm": 0.042031679302453995, "learning_rate": 7.766570265701036e-08, "loss": 0.00014838, "memory(GiB)": 26.31, "step": 8930, "train_speed(iter/s)": 0.581678 }, { "acc": 0.9998889, "epoch": 9.47507953340403, "grad_norm": 0.0011388412676751614, "learning_rate": 7.614388864800498e-08, "loss": 0.00030469, "memory(GiB)": 26.31, "step": 8935, "train_speed(iter/s)": 0.581679 }, { "acc": 0.99949665, "epoch": 9.480381760339343, "grad_norm": 0.000366046471754089, "learning_rate": 7.46372160719771e-08, "loss": 0.00050993, "memory(GiB)": 26.31, "step": 8940, "train_speed(iter/s)": 0.581683 }, { "acc": 0.99976463, "epoch": 9.485683987274655, "grad_norm": 0.08309927582740784, "learning_rate": 7.314568956165167e-08, "loss": 0.00032754, "memory(GiB)": 26.31, "step": 8945, "train_speed(iter/s)": 0.581684 }, { "acc": 0.9993681, "epoch": 9.490986214209968, "grad_norm": 0.0019951926078647375, "learning_rate": 7.166931370318059e-08, "loss": 0.0007342, "memory(GiB)": 26.31, "step": 8950, "train_speed(iter/s)": 0.581686 }, { "acc": 0.99973173, "epoch": 9.496288441145282, "grad_norm": 0.047303207218647, "learning_rate": 7.020809303613208e-08, "loss": 0.00031462, "memory(GiB)": 26.31, "step": 8955, "train_speed(iter/s)": 0.581688 }, { "acc": 1.0, "epoch": 9.501590668080594, "grad_norm": 0.004503290168941021, "learning_rate": 6.8762032053473e-08, "loss": 8.68e-06, "memory(GiB)": 26.31, "step": 8960, "train_speed(iter/s)": 0.581689 }, { "acc": 0.99988422, "epoch": 9.506892895015907, "grad_norm": 0.0026324728969484568, "learning_rate": 6.733113520155939e-08, "loss": 0.00016019, "memory(GiB)": 26.31, "step": 8965, "train_speed(iter/s)": 0.581689 }, { "acc": 1.0, "epoch": 9.512195121951219, "grad_norm": 0.0014660786837339401, "learning_rate": 6.591540688011867e-08, "loss": 0.00019902, "memory(GiB)": 26.31, "step": 8970, "train_speed(iter/s)": 0.58169 }, { "acc": 0.99988213, "epoch": 9.517497348886533, "grad_norm": 0.0004259504785295576, "learning_rate": 6.45148514422397e-08, "loss": 0.00029562, "memory(GiB)": 26.31, "step": 8975, "train_speed(iter/s)": 0.581692 }, { "acc": 0.9997159, "epoch": 9.522799575821844, "grad_norm": 0.00198033987544477, "learning_rate": 6.312947319435664e-08, "loss": 0.00066467, "memory(GiB)": 26.31, "step": 8980, "train_speed(iter/s)": 0.581692 }, { "acc": 1.0, "epoch": 9.528101802757158, "grad_norm": 0.002281604567542672, "learning_rate": 6.175927639623681e-08, "loss": 0.00010886, "memory(GiB)": 26.31, "step": 8985, "train_speed(iter/s)": 0.581692 }, { "acc": 0.99974871, "epoch": 9.533404029692472, "grad_norm": 0.03849175199866295, "learning_rate": 6.040426526096728e-08, "loss": 0.00055996, "memory(GiB)": 26.31, "step": 8990, "train_speed(iter/s)": 0.581695 }, { "acc": 0.9996357, "epoch": 9.538706256627783, "grad_norm": 0.06854744255542755, "learning_rate": 5.906444395494326e-08, "loss": 0.00036071, "memory(GiB)": 26.31, "step": 8995, "train_speed(iter/s)": 0.581698 }, { "acc": 0.99976387, "epoch": 9.544008483563097, "grad_norm": 0.0022573957685381174, "learning_rate": 5.7739816597852564e-08, "loss": 0.00025793, "memory(GiB)": 26.31, "step": 9000, "train_speed(iter/s)": 0.581698 }, { "acc": 0.99987116, "epoch": 9.549310710498409, "grad_norm": 0.0029828150290995836, "learning_rate": 5.6430387262666145e-08, "loss": 0.00016854, "memory(GiB)": 26.31, "step": 9005, "train_speed(iter/s)": 0.581699 }, { "acc": 0.99987183, "epoch": 9.554612937433722, "grad_norm": 0.00038223754381760955, "learning_rate": 5.5136159975623705e-08, "loss": 0.00025214, "memory(GiB)": 26.31, "step": 9010, "train_speed(iter/s)": 0.581699 }, { "acc": 0.99986115, "epoch": 9.559915164369034, "grad_norm": 0.0022197780199348927, "learning_rate": 5.3857138716220866e-08, "loss": 0.0002799, "memory(GiB)": 26.31, "step": 9015, "train_speed(iter/s)": 0.581702 }, { "acc": 0.99963999, "epoch": 9.565217391304348, "grad_norm": 0.0437530018389225, "learning_rate": 5.2593327417198165e-08, "loss": 0.0003854, "memory(GiB)": 26.31, "step": 9020, "train_speed(iter/s)": 0.581703 }, { "acc": 0.9998889, "epoch": 9.570519618239661, "grad_norm": 0.0011901530670002103, "learning_rate": 5.134472996452874e-08, "loss": 0.00021868, "memory(GiB)": 26.31, "step": 9025, "train_speed(iter/s)": 0.581704 }, { "acc": 0.99976788, "epoch": 9.575821845174973, "grad_norm": 0.025663409382104874, "learning_rate": 5.011135019740506e-08, "loss": 0.00028868, "memory(GiB)": 26.31, "step": 9030, "train_speed(iter/s)": 0.581705 }, { "acc": 0.9998908, "epoch": 9.581124072110287, "grad_norm": 0.046961378306150436, "learning_rate": 4.88931919082295e-08, "loss": 0.0001989, "memory(GiB)": 26.31, "step": 9035, "train_speed(iter/s)": 0.581706 }, { "acc": 1.0, "epoch": 9.586426299045598, "grad_norm": 0.0007390428800135851, "learning_rate": 4.7690258842601e-08, "loss": 6.1e-06, "memory(GiB)": 26.31, "step": 9040, "train_speed(iter/s)": 0.581709 }, { "acc": 1.0, "epoch": 9.591728525980912, "grad_norm": 0.001649084035307169, "learning_rate": 4.65025546993034e-08, "loss": 4.62e-06, "memory(GiB)": 26.31, "step": 9045, "train_speed(iter/s)": 0.581709 }, { "acc": 0.99962807, "epoch": 9.597030752916226, "grad_norm": 0.07276555895805359, "learning_rate": 4.5330083130294896e-08, "loss": 0.00061633, "memory(GiB)": 26.31, "step": 9050, "train_speed(iter/s)": 0.581712 }, { "acc": 0.99985876, "epoch": 9.602332979851537, "grad_norm": 0.00032748220837675035, "learning_rate": 4.417284774069643e-08, "loss": 0.00020644, "memory(GiB)": 26.31, "step": 9055, "train_speed(iter/s)": 0.581715 }, { "acc": 0.99963312, "epoch": 9.607635206786851, "grad_norm": 0.0011951240012422204, "learning_rate": 4.3030852088781635e-08, "loss": 0.00039923, "memory(GiB)": 26.31, "step": 9060, "train_speed(iter/s)": 0.581716 }, { "acc": 0.99988317, "epoch": 9.612937433722163, "grad_norm": 0.05354034900665283, "learning_rate": 4.190409968596244e-08, "loss": 0.00014953, "memory(GiB)": 26.31, "step": 9065, "train_speed(iter/s)": 0.581717 }, { "acc": 0.99965878, "epoch": 9.618239660657476, "grad_norm": 0.002356098499149084, "learning_rate": 4.0792593996783495e-08, "loss": 0.00033963, "memory(GiB)": 26.31, "step": 9070, "train_speed(iter/s)": 0.581719 }, { "acc": 0.99976301, "epoch": 9.62354188759279, "grad_norm": 0.037054624408483505, "learning_rate": 3.969633843890722e-08, "loss": 0.00020338, "memory(GiB)": 26.31, "step": 9075, "train_speed(iter/s)": 0.58172 }, { "acc": 0.99988213, "epoch": 9.628844114528102, "grad_norm": 0.0032587756868451834, "learning_rate": 3.861533638310546e-08, "loss": 0.00041673, "memory(GiB)": 26.31, "step": 9080, "train_speed(iter/s)": 0.58172 }, { "acc": 0.9996336, "epoch": 9.634146341463415, "grad_norm": 0.09497502446174622, "learning_rate": 3.7549591153248365e-08, "loss": 0.00046227, "memory(GiB)": 26.31, "step": 9085, "train_speed(iter/s)": 0.58172 }, { "acc": 0.99973307, "epoch": 9.639448568398727, "grad_norm": 0.042482633143663406, "learning_rate": 3.649910602629388e-08, "loss": 0.00025767, "memory(GiB)": 26.31, "step": 9090, "train_speed(iter/s)": 0.581721 }, { "acc": 0.99988632, "epoch": 9.64475079533404, "grad_norm": 0.031739819794893265, "learning_rate": 3.546388423227937e-08, "loss": 0.00018167, "memory(GiB)": 26.31, "step": 9095, "train_speed(iter/s)": 0.581722 }, { "acc": 1.0, "epoch": 9.650053022269352, "grad_norm": 0.09785232692956924, "learning_rate": 3.4443928954308384e-08, "loss": 5.175e-05, "memory(GiB)": 26.31, "step": 9100, "train_speed(iter/s)": 0.581723 }, { "acc": 0.99978104, "epoch": 9.655355249204666, "grad_norm": 0.03592614457011223, "learning_rate": 3.343924332854555e-08, "loss": 0.00019715, "memory(GiB)": 26.31, "step": 9105, "train_speed(iter/s)": 0.581724 }, { "acc": 0.9996417, "epoch": 9.66065747613998, "grad_norm": 0.03002362884581089, "learning_rate": 3.244983044420168e-08, "loss": 0.00065632, "memory(GiB)": 26.31, "step": 9110, "train_speed(iter/s)": 0.581727 }, { "acc": 0.99974298, "epoch": 9.665959703075291, "grad_norm": 0.007756541948765516, "learning_rate": 3.1475693343528725e-08, "loss": 0.00035167, "memory(GiB)": 26.31, "step": 9115, "train_speed(iter/s)": 0.581728 }, { "acc": 1.0, "epoch": 9.671261930010605, "grad_norm": 0.0003773514472413808, "learning_rate": 3.051683502180812e-08, "loss": 8.733e-05, "memory(GiB)": 26.31, "step": 9120, "train_speed(iter/s)": 0.581728 }, { "acc": 0.99949741, "epoch": 9.676564156945917, "grad_norm": 0.03288589045405388, "learning_rate": 2.9573258427341382e-08, "loss": 0.00087277, "memory(GiB)": 26.31, "step": 9125, "train_speed(iter/s)": 0.581729 }, { "acc": 1.0, "epoch": 9.68186638388123, "grad_norm": 0.0020175960380584, "learning_rate": 2.864496646144231e-08, "loss": 1.535e-05, "memory(GiB)": 26.31, "step": 9130, "train_speed(iter/s)": 0.58173 }, { "acc": 0.99988375, "epoch": 9.687168610816542, "grad_norm": 0.0019167440477758646, "learning_rate": 2.7731961978427547e-08, "loss": 0.00056926, "memory(GiB)": 26.31, "step": 9135, "train_speed(iter/s)": 0.581732 }, { "acc": 0.99974852, "epoch": 9.692470837751856, "grad_norm": 0.040033094584941864, "learning_rate": 2.683424778560772e-08, "loss": 0.00044879, "memory(GiB)": 26.31, "step": 9140, "train_speed(iter/s)": 0.581737 }, { "acc": 1.0, "epoch": 9.69777306468717, "grad_norm": 0.0005087403696961701, "learning_rate": 2.5951826643277988e-08, "loss": 4.28e-06, "memory(GiB)": 26.31, "step": 9145, "train_speed(iter/s)": 0.581741 }, { "acc": 1.0, "epoch": 9.703075291622481, "grad_norm": 0.040018994361162186, "learning_rate": 2.508470126471083e-08, "loss": 0.00033192, "memory(GiB)": 26.31, "step": 9150, "train_speed(iter/s)": 0.581742 }, { "acc": 0.99962864, "epoch": 9.708377518557795, "grad_norm": 0.15460503101348877, "learning_rate": 2.423287431614827e-08, "loss": 0.00039951, "memory(GiB)": 26.31, "step": 9155, "train_speed(iter/s)": 0.581747 }, { "acc": 1.0, "epoch": 9.713679745493106, "grad_norm": 0.00020370646961964667, "learning_rate": 2.339634841679135e-08, "loss": 5.839e-05, "memory(GiB)": 26.31, "step": 9160, "train_speed(iter/s)": 0.581747 }, { "acc": 0.99962616, "epoch": 9.71898197242842, "grad_norm": 0.03239269554615021, "learning_rate": 2.257512613879289e-08, "loss": 0.00045688, "memory(GiB)": 26.31, "step": 9165, "train_speed(iter/s)": 0.581748 }, { "acc": 1.0, "epoch": 9.724284199363733, "grad_norm": 0.0013463557697832584, "learning_rate": 2.1769210007252503e-08, "loss": 2.67e-06, "memory(GiB)": 26.31, "step": 9170, "train_speed(iter/s)": 0.581748 }, { "acc": 0.99976797, "epoch": 9.729586426299045, "grad_norm": 0.00148050079587847, "learning_rate": 2.0978602500203827e-08, "loss": 0.00020894, "memory(GiB)": 26.31, "step": 9175, "train_speed(iter/s)": 0.581752 }, { "acc": 0.99971256, "epoch": 9.734888653234359, "grad_norm": 0.035606566816568375, "learning_rate": 2.020330604861065e-08, "loss": 0.00057267, "memory(GiB)": 26.31, "step": 9180, "train_speed(iter/s)": 0.581755 }, { "acc": 0.99963932, "epoch": 9.74019088016967, "grad_norm": 0.02676430158317089, "learning_rate": 1.9443323036358563e-08, "loss": 0.00058691, "memory(GiB)": 26.31, "step": 9185, "train_speed(iter/s)": 0.581756 }, { "acc": 0.99975109, "epoch": 9.745493107104984, "grad_norm": 0.002146985149011016, "learning_rate": 1.86986558002461e-08, "loss": 0.00033436, "memory(GiB)": 26.31, "step": 9190, "train_speed(iter/s)": 0.581757 }, { "acc": 1.0, "epoch": 9.750795334040298, "grad_norm": 0.0011241419706493616, "learning_rate": 1.7969306629980287e-08, "loss": 0.00013526, "memory(GiB)": 26.31, "step": 9195, "train_speed(iter/s)": 0.581757 }, { "acc": 0.99987183, "epoch": 9.75609756097561, "grad_norm": 0.002333273645490408, "learning_rate": 1.7255277768166646e-08, "loss": 0.0001136, "memory(GiB)": 26.31, "step": 9200, "train_speed(iter/s)": 0.58176 }, { "acc": 0.99960842, "epoch": 9.761399787910923, "grad_norm": 0.003362901508808136, "learning_rate": 1.6556571410304774e-08, "loss": 0.00092601, "memory(GiB)": 26.31, "step": 9205, "train_speed(iter/s)": 0.581765 }, { "acc": 1.0, "epoch": 9.766702014846235, "grad_norm": 0.0012096832506358624, "learning_rate": 1.587318970478055e-08, "loss": 0.00019076, "memory(GiB)": 26.31, "step": 9210, "train_speed(iter/s)": 0.581766 }, { "acc": 1.0, "epoch": 9.772004241781548, "grad_norm": 0.03516737371683121, "learning_rate": 1.5205134752858374e-08, "loss": 0.00025258, "memory(GiB)": 26.31, "step": 9215, "train_speed(iter/s)": 0.58177 }, { "acc": 0.99987183, "epoch": 9.77730646871686, "grad_norm": 0.002632312010973692, "learning_rate": 1.455240860867729e-08, "loss": 0.0003727, "memory(GiB)": 26.31, "step": 9220, "train_speed(iter/s)": 0.58177 }, { "acc": 0.9998889, "epoch": 9.782608695652174, "grad_norm": 0.027906369417905807, "learning_rate": 1.3915013279242639e-08, "loss": 0.00029545, "memory(GiB)": 26.31, "step": 9225, "train_speed(iter/s)": 0.581771 }, { "acc": 0.99974747, "epoch": 9.787910922587487, "grad_norm": 0.07243865728378296, "learning_rate": 1.3292950724419978e-08, "loss": 0.00066217, "memory(GiB)": 26.31, "step": 9230, "train_speed(iter/s)": 0.581772 }, { "acc": 0.99987984, "epoch": 9.7932131495228, "grad_norm": 0.010551735758781433, "learning_rate": 1.268622285693117e-08, "loss": 0.00021876, "memory(GiB)": 26.31, "step": 9235, "train_speed(iter/s)": 0.581772 }, { "acc": 0.99988317, "epoch": 9.798515376458113, "grad_norm": 0.0032518194057047367, "learning_rate": 1.2094831542344976e-08, "loss": 0.00018491, "memory(GiB)": 26.31, "step": 9240, "train_speed(iter/s)": 0.581772 }, { "acc": 0.99975433, "epoch": 9.803817603393425, "grad_norm": 0.038194481283426285, "learning_rate": 1.1518778599074806e-08, "loss": 0.00021037, "memory(GiB)": 26.31, "step": 9245, "train_speed(iter/s)": 0.581772 }, { "acc": 0.9997674, "epoch": 9.809119830328738, "grad_norm": 0.01002445723861456, "learning_rate": 1.0958065798370409e-08, "loss": 0.00052305, "memory(GiB)": 26.31, "step": 9250, "train_speed(iter/s)": 0.581772 }, { "acc": 1.0, "epoch": 9.814422057264052, "grad_norm": 0.00044991099275648594, "learning_rate": 1.041269486431399e-08, "loss": 6.751e-05, "memory(GiB)": 26.31, "step": 9255, "train_speed(iter/s)": 0.581772 }, { "acc": 1.0, "epoch": 9.819724284199363, "grad_norm": 0.011789188720285892, "learning_rate": 9.882667473815209e-09, "loss": 1.297e-05, "memory(GiB)": 26.31, "step": 9260, "train_speed(iter/s)": 0.581774 }, { "acc": 0.99988985, "epoch": 9.825026511134677, "grad_norm": 0.0348358228802681, "learning_rate": 9.367985256604519e-09, "loss": 0.0001662, "memory(GiB)": 26.31, "step": 9265, "train_speed(iter/s)": 0.581774 }, { "acc": 1.0, "epoch": 9.830328738069989, "grad_norm": 0.0035749017260968685, "learning_rate": 8.868649795228724e-09, "loss": 9.127e-05, "memory(GiB)": 26.31, "step": 9270, "train_speed(iter/s)": 0.581775 }, { "acc": 0.99986839, "epoch": 9.835630965005302, "grad_norm": 0.020208999514579773, "learning_rate": 8.38466262504766e-09, "loss": 0.00017922, "memory(GiB)": 26.31, "step": 9275, "train_speed(iter/s)": 0.581776 }, { "acc": 1.0, "epoch": 9.840933191940614, "grad_norm": 0.002023870823904872, "learning_rate": 7.916025234226407e-09, "loss": 0.00025274, "memory(GiB)": 26.31, "step": 9280, "train_speed(iter/s)": 0.581777 }, { "acc": 0.99986706, "epoch": 9.846235418875928, "grad_norm": 0.025883223861455917, "learning_rate": 7.462739063734198e-09, "loss": 0.0001587, "memory(GiB)": 26.31, "step": 9285, "train_speed(iter/s)": 0.581782 }, { "acc": 0.99988098, "epoch": 9.851537645811241, "grad_norm": 0.045160189270973206, "learning_rate": 7.024805507337186e-09, "loss": 0.00011031, "memory(GiB)": 26.31, "step": 9290, "train_speed(iter/s)": 0.581782 }, { "acc": 0.99988213, "epoch": 9.856839872746553, "grad_norm": 0.000631249975413084, "learning_rate": 6.602225911595128e-09, "loss": 0.00017462, "memory(GiB)": 26.31, "step": 9295, "train_speed(iter/s)": 0.581784 }, { "acc": 0.99974346, "epoch": 9.862142099681867, "grad_norm": 0.03226780891418457, "learning_rate": 6.1950015758580405e-09, "loss": 0.00077184, "memory(GiB)": 26.31, "step": 9300, "train_speed(iter/s)": 0.581785 }, { "acc": 0.99987984, "epoch": 9.867444326617179, "grad_norm": 0.07820426672697067, "learning_rate": 5.803133752260661e-09, "loss": 0.00019142, "memory(GiB)": 26.31, "step": 9305, "train_speed(iter/s)": 0.581786 }, { "acc": 0.9997633, "epoch": 9.872746553552492, "grad_norm": 0.0021600211039185524, "learning_rate": 5.426623645721333e-09, "loss": 0.00034086, "memory(GiB)": 26.31, "step": 9310, "train_speed(iter/s)": 0.581787 }, { "acc": 0.9997406, "epoch": 9.878048780487806, "grad_norm": 0.05200710520148277, "learning_rate": 5.065472413933678e-09, "loss": 0.00040007, "memory(GiB)": 26.31, "step": 9315, "train_speed(iter/s)": 0.581789 }, { "acc": 0.99986706, "epoch": 9.883351007423117, "grad_norm": 0.0008037837687879801, "learning_rate": 4.71968116736771e-09, "loss": 0.00021175, "memory(GiB)": 26.31, "step": 9320, "train_speed(iter/s)": 0.581789 }, { "acc": 0.99984474, "epoch": 9.888653234358431, "grad_norm": 0.051441438496112823, "learning_rate": 4.389250969264283e-09, "loss": 0.00016188, "memory(GiB)": 26.31, "step": 9325, "train_speed(iter/s)": 0.58179 }, { "acc": 0.99963465, "epoch": 9.893955461293743, "grad_norm": 0.02896001748740673, "learning_rate": 4.0741828356312046e-09, "loss": 0.00053013, "memory(GiB)": 26.31, "step": 9330, "train_speed(iter/s)": 0.581792 }, { "acc": 0.99972954, "epoch": 9.899257688229056, "grad_norm": 0.0005594078684225678, "learning_rate": 3.774477735241571e-09, "loss": 0.0003093, "memory(GiB)": 26.31, "step": 9335, "train_speed(iter/s)": 0.581792 }, { "acc": 0.9995079, "epoch": 9.90455991516437, "grad_norm": 0.0392618291079998, "learning_rate": 3.490136589629885e-09, "loss": 0.00048772, "memory(GiB)": 26.31, "step": 9340, "train_speed(iter/s)": 0.581795 }, { "acc": 1.0, "epoch": 9.909862142099682, "grad_norm": 0.011398269794881344, "learning_rate": 3.221160273090386e-09, "loss": 1.085e-05, "memory(GiB)": 26.31, "step": 9345, "train_speed(iter/s)": 0.581797 }, { "acc": 0.99987679, "epoch": 9.915164369034995, "grad_norm": 0.0018169950926676393, "learning_rate": 2.9675496126715013e-09, "loss": 0.00024445, "memory(GiB)": 26.31, "step": 9350, "train_speed(iter/s)": 0.581797 }, { "acc": 1.0, "epoch": 9.920466595970307, "grad_norm": 0.026712248101830482, "learning_rate": 2.7293053881769583e-09, "loss": 0.00015502, "memory(GiB)": 26.31, "step": 9355, "train_speed(iter/s)": 0.581798 }, { "acc": 1.0, "epoch": 9.92576882290562, "grad_norm": 0.0008010675082914531, "learning_rate": 2.5064283321618967e-09, "loss": 7.096e-05, "memory(GiB)": 26.31, "step": 9360, "train_speed(iter/s)": 0.581799 }, { "acc": 0.99964323, "epoch": 9.931071049840932, "grad_norm": 0.0023205948527902365, "learning_rate": 2.298919129928429e-09, "loss": 0.00044941, "memory(GiB)": 26.31, "step": 9365, "train_speed(iter/s)": 0.5818 }, { "acc": 1.0, "epoch": 9.936373276776246, "grad_norm": 0.0038338894955813885, "learning_rate": 2.1067784195278586e-09, "loss": 0.00015866, "memory(GiB)": 26.31, "step": 9370, "train_speed(iter/s)": 0.581804 }, { "acc": 0.99964285, "epoch": 9.94167550371156, "grad_norm": 0.0026238136924803257, "learning_rate": 1.9300067917551333e-09, "loss": 0.00037828, "memory(GiB)": 26.31, "step": 9375, "train_speed(iter/s)": 0.581804 }, { "acc": 1.0, "epoch": 9.946977730646871, "grad_norm": 0.0014907962176948786, "learning_rate": 1.7686047901482875e-09, "loss": 7.117e-05, "memory(GiB)": 26.31, "step": 9380, "train_speed(iter/s)": 0.581807 }, { "acc": 0.99976768, "epoch": 9.952279957582185, "grad_norm": 0.04889528080821037, "learning_rate": 1.6225729109867767e-09, "loss": 0.00037514, "memory(GiB)": 26.31, "step": 9385, "train_speed(iter/s)": 0.58181 }, { "acc": 0.99988585, "epoch": 9.957582184517497, "grad_norm": 0.0002773120941128582, "learning_rate": 1.491911603290369e-09, "loss": 0.00015811, "memory(GiB)": 26.31, "step": 9390, "train_speed(iter/s)": 0.581812 }, { "acc": 0.99976158, "epoch": 9.96288441145281, "grad_norm": 0.06627703458070755, "learning_rate": 1.3766212688169235e-09, "loss": 0.00021021, "memory(GiB)": 26.31, "step": 9395, "train_speed(iter/s)": 0.581812 }, { "acc": 0.99974079, "epoch": 9.968186638388122, "grad_norm": 0.05650794133543968, "learning_rate": 1.2767022620618365e-09, "loss": 0.00020128, "memory(GiB)": 26.31, "step": 9400, "train_speed(iter/s)": 0.581814 }, { "acc": 0.99948587, "epoch": 9.973488865323436, "grad_norm": 0.030863817781209946, "learning_rate": 1.1921548902563759e-09, "loss": 0.00049622, "memory(GiB)": 26.31, "step": 9405, "train_speed(iter/s)": 0.581818 }, { "acc": 0.9998889, "epoch": 9.97879109225875, "grad_norm": 0.0006861954461783171, "learning_rate": 1.1229794133676798e-09, "loss": 0.00015753, "memory(GiB)": 26.31, "step": 9410, "train_speed(iter/s)": 0.58182 }, { "acc": 0.99987621, "epoch": 9.984093319194061, "grad_norm": 0.0052332268096506596, "learning_rate": 1.0691760440959835e-09, "loss": 0.00019103, "memory(GiB)": 26.31, "step": 9415, "train_speed(iter/s)": 0.58182 }, { "acc": 0.99984474, "epoch": 9.989395546129375, "grad_norm": 0.0013717457186430693, "learning_rate": 1.0307449478762829e-09, "loss": 0.00013517, "memory(GiB)": 26.31, "step": 9420, "train_speed(iter/s)": 0.581821 }, { "acc": 0.99989624, "epoch": 9.994697773064686, "grad_norm": 0.0009563152561895549, "learning_rate": 1.0076862428777806e-09, "loss": 0.0004057, "memory(GiB)": 26.31, "step": 9425, "train_speed(iter/s)": 0.581826 }, { "acc": 0.99975662, "epoch": 10.0, "grad_norm": 0.040931958705186844, "learning_rate": 1e-09, "loss": 0.00025084, "memory(GiB)": 26.31, "step": 9430, "train_speed(iter/s)": 0.581823 }, { "epoch": 10.0, "eval_acc": 0.9468321662075603, "eval_loss": 1.008236289024353, "eval_runtime": 109.9561, "eval_samples_per_second": 205.555, "eval_steps_per_second": 0.809, "step": 9430 } ], "logging_steps": 5, "max_steps": 9430, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.203857723149517e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }