{ "best_metric": 1.48696911, "best_model_checkpoint": "/mnt/bn/haiyang-dataset-lq/medical/outputreport/qwen2-vl-2b-instruct/v1-20241109-145429/checkpoint-10000", "epoch": 49.9558693733451, "eval_steps": 10000, "global_step": 28300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "acc": 0.85609853, "epoch": 0.00176522506619594, "grad_norm": 14.651725769042969, "learning_rate": 0.0, "loss": 1.14392924, "memory(GiB)": 12.72, "step": 1, "train_speed(iter/s)": 0.029024 }, { "acc": 0.8501277, "epoch": 0.0088261253309797, "grad_norm": 22.414897918701172, "learning_rate": 2.218419664221993e-06, "loss": 1.08649182, "memory(GiB)": 14.16, "step": 5, "train_speed(iter/s)": 0.134456 }, { "acc": 0.86518497, "epoch": 0.0176522506619594, "grad_norm": 14.280951499938965, "learning_rate": 3.1738410095714533e-06, "loss": 0.95808268, "memory(GiB)": 14.16, "step": 10, "train_speed(iter/s)": 0.245284 }, { "acc": 0.84847279, "epoch": 0.0264783759929391, "grad_norm": 9.792591094970703, "learning_rate": 3.7327266689894446e-06, "loss": 1.11168499, "memory(GiB)": 14.16, "step": 15, "train_speed(iter/s)": 0.340693 }, { "acc": 0.85716591, "epoch": 0.0353045013239188, "grad_norm": 15.905067443847656, "learning_rate": 4.129262354920913e-06, "loss": 1.04715109, "memory(GiB)": 14.16, "step": 20, "train_speed(iter/s)": 0.4211 }, { "acc": 0.86385231, "epoch": 0.0441306266548985, "grad_norm": 10.421817779541016, "learning_rate": 4.436839328443986e-06, "loss": 0.96752071, "memory(GiB)": 14.16, "step": 25, "train_speed(iter/s)": 0.491163 }, { "acc": 0.85875874, "epoch": 0.0529567519858782, "grad_norm": 14.02324390411377, "learning_rate": 4.6881480143389055e-06, "loss": 0.98890724, "memory(GiB)": 14.16, "step": 30, "train_speed(iter/s)": 0.551542 }, { "acc": 0.84993057, "epoch": 0.0617828773168579, "grad_norm": 13.64876937866211, "learning_rate": 4.900626480727698e-06, "loss": 1.07389021, "memory(GiB)": 14.16, "step": 35, "train_speed(iter/s)": 0.605945 }, { "acc": 0.86327934, "epoch": 0.0706090026478376, "grad_norm": 11.647984504699707, "learning_rate": 5.084683700270374e-06, "loss": 0.83797007, "memory(GiB)": 14.16, "step": 40, "train_speed(iter/s)": 0.654203 }, { "acc": 0.85296698, "epoch": 0.0794351279788173, "grad_norm": 16.294282913208008, "learning_rate": 5.247033673756897e-06, "loss": 1.06365299, "memory(GiB)": 14.16, "step": 45, "train_speed(iter/s)": 0.697546 }, { "acc": 0.84158993, "epoch": 0.088261253309797, "grad_norm": 17.551116943359375, "learning_rate": 5.392260673793445e-06, "loss": 1.18469706, "memory(GiB)": 14.16, "step": 50, "train_speed(iter/s)": 0.735994 }, { "acc": 0.85941944, "epoch": 0.0970873786407767, "grad_norm": 20.091724395751953, "learning_rate": 5.523634475444901e-06, "loss": 0.98214798, "memory(GiB)": 14.16, "step": 55, "train_speed(iter/s)": 0.768512 }, { "acc": 0.8567688, "epoch": 0.1059135039717564, "grad_norm": 10.861732482910156, "learning_rate": 5.643569359688366e-06, "loss": 1.06068583, "memory(GiB)": 14.16, "step": 60, "train_speed(iter/s)": 0.802508 }, { "acc": 0.83880882, "epoch": 0.1147396293027361, "grad_norm": 14.642560005187988, "learning_rate": 5.753898758112932e-06, "loss": 1.17724476, "memory(GiB)": 14.16, "step": 65, "train_speed(iter/s)": 0.833911 }, { "acc": 0.82984314, "epoch": 0.1235657546337158, "grad_norm": 14.15579605102539, "learning_rate": 5.856047826077159e-06, "loss": 1.19530334, "memory(GiB)": 14.16, "step": 70, "train_speed(iter/s)": 0.860717 }, { "acc": 0.83807402, "epoch": 0.1323918799646955, "grad_norm": 18.4400577545166, "learning_rate": 5.951146333211437e-06, "loss": 1.08725853, "memory(GiB)": 14.16, "step": 75, "train_speed(iter/s)": 0.884851 }, { "acc": 0.85065269, "epoch": 0.1412180052956752, "grad_norm": 15.610552787780762, "learning_rate": 6.040105045619834e-06, "loss": 1.04840374, "memory(GiB)": 14.16, "step": 80, "train_speed(iter/s)": 0.906777 }, { "acc": 0.85234985, "epoch": 0.1500441306266549, "grad_norm": 13.724344253540039, "learning_rate": 6.12366891107532e-06, "loss": 1.07202253, "memory(GiB)": 14.16, "step": 85, "train_speed(iter/s)": 0.928218 }, { "acc": 0.84972496, "epoch": 0.1588702559576346, "grad_norm": 14.061919212341309, "learning_rate": 6.202455019106357e-06, "loss": 1.00122509, "memory(GiB)": 14.16, "step": 90, "train_speed(iter/s)": 0.946951 }, { "acc": 0.84771767, "epoch": 0.1676963812886143, "grad_norm": 13.212542533874512, "learning_rate": 6.276980284063251e-06, "loss": 1.0103857, "memory(GiB)": 14.16, "step": 95, "train_speed(iter/s)": 0.962892 }, { "acc": 0.84891243, "epoch": 0.176522506619594, "grad_norm": 16.057748794555664, "learning_rate": 6.347682019142907e-06, "loss": 1.12434425, "memory(GiB)": 14.16, "step": 100, "train_speed(iter/s)": 0.98208 }, { "acc": 0.8437892, "epoch": 0.1853486319505737, "grad_norm": 17.242244720458984, "learning_rate": 6.41493348549515e-06, "loss": 1.04363413, "memory(GiB)": 14.16, "step": 105, "train_speed(iter/s)": 0.997677 }, { "acc": 0.84031267, "epoch": 0.1941747572815534, "grad_norm": 44.25764083862305, "learning_rate": 6.479055820794361e-06, "loss": 1.09292545, "memory(GiB)": 14.16, "step": 110, "train_speed(iter/s)": 1.013134 }, { "acc": 0.84558134, "epoch": 0.2030008826125331, "grad_norm": 17.429080963134766, "learning_rate": 6.5403273140496205e-06, "loss": 1.05013237, "memory(GiB)": 14.16, "step": 115, "train_speed(iter/s)": 1.027069 }, { "acc": 0.85383377, "epoch": 0.2118270079435128, "grad_norm": 17.708940505981445, "learning_rate": 6.598990705037825e-06, "loss": 1.02983837, "memory(GiB)": 14.16, "step": 120, "train_speed(iter/s)": 1.041385 }, { "acc": 0.85622272, "epoch": 0.22065313327449249, "grad_norm": 14.592547416687012, "learning_rate": 6.65525899266598e-06, "loss": 0.95670757, "memory(GiB)": 14.16, "step": 125, "train_speed(iter/s)": 1.054874 }, { "acc": 0.85668678, "epoch": 0.2294792586054722, "grad_norm": 14.590570449829102, "learning_rate": 6.709320103462394e-06, "loss": 0.95585012, "memory(GiB)": 14.16, "step": 130, "train_speed(iter/s)": 1.067432 }, { "acc": 0.84441109, "epoch": 0.2383053839364519, "grad_norm": 16.04547119140625, "learning_rate": 6.761340678524349e-06, "loss": 1.1049633, "memory(GiB)": 14.16, "step": 135, "train_speed(iter/s)": 1.080466 }, { "acc": 0.84873962, "epoch": 0.2471315092674316, "grad_norm": 16.376794815063477, "learning_rate": 6.811469171426618e-06, "loss": 1.0144474, "memory(GiB)": 14.16, "step": 140, "train_speed(iter/s)": 1.088851 }, { "acc": 0.84958744, "epoch": 0.2559576345984113, "grad_norm": 20.41977310180664, "learning_rate": 6.859838402268888e-06, "loss": 1.04827213, "memory(GiB)": 14.16, "step": 145, "train_speed(iter/s)": 1.098584 }, { "acc": 0.84693375, "epoch": 0.264783759929391, "grad_norm": 17.980966567993164, "learning_rate": 6.906567678560898e-06, "loss": 1.01062307, "memory(GiB)": 14.16, "step": 150, "train_speed(iter/s)": 1.108769 }, { "acc": 0.84662094, "epoch": 0.2736098852603707, "grad_norm": 15.284897804260254, "learning_rate": 6.951764568217154e-06, "loss": 1.0376976, "memory(GiB)": 14.16, "step": 155, "train_speed(iter/s)": 1.118654 }, { "acc": 0.83441086, "epoch": 0.2824360105913504, "grad_norm": 22.383628845214844, "learning_rate": 6.995526390969294e-06, "loss": 1.05802212, "memory(GiB)": 14.16, "step": 160, "train_speed(iter/s)": 1.127511 }, { "acc": 0.84830742, "epoch": 0.2912621359223301, "grad_norm": 13.756697654724121, "learning_rate": 7.0379414802123525e-06, "loss": 1.03619394, "memory(GiB)": 14.16, "step": 165, "train_speed(iter/s)": 1.135776 }, { "acc": 0.84229088, "epoch": 0.3000882612533098, "grad_norm": 11.301949501037598, "learning_rate": 7.079090256424781e-06, "loss": 1.0430851, "memory(GiB)": 14.16, "step": 170, "train_speed(iter/s)": 1.14307 }, { "acc": 0.83564148, "epoch": 0.3089143865842895, "grad_norm": 19.758413314819336, "learning_rate": 7.119046144949692e-06, "loss": 1.05794601, "memory(GiB)": 14.16, "step": 175, "train_speed(iter/s)": 1.151003 }, { "acc": 0.84479065, "epoch": 0.3177405119152692, "grad_norm": 16.006404876708984, "learning_rate": 7.157876364455818e-06, "loss": 1.0950695, "memory(GiB)": 14.16, "step": 180, "train_speed(iter/s)": 1.159154 }, { "acc": 0.83304977, "epoch": 0.3265666372462489, "grad_norm": 18.35784339904785, "learning_rate": 7.195642607346478e-06, "loss": 1.15077944, "memory(GiB)": 14.16, "step": 185, "train_speed(iter/s)": 1.16548 }, { "acc": 0.87202711, "epoch": 0.3353927625772286, "grad_norm": 11.456375122070312, "learning_rate": 7.232401629412711e-06, "loss": 0.85602608, "memory(GiB)": 14.16, "step": 190, "train_speed(iter/s)": 1.173137 }, { "acc": 0.85403309, "epoch": 0.3442188879082083, "grad_norm": 14.63279914855957, "learning_rate": 7.2682057628803845e-06, "loss": 0.94002991, "memory(GiB)": 14.16, "step": 195, "train_speed(iter/s)": 1.17894 }, { "acc": 0.84755888, "epoch": 0.353045013239188, "grad_norm": 20.316997528076172, "learning_rate": 7.303103364492366e-06, "loss": 0.99241638, "memory(GiB)": 14.16, "step": 200, "train_speed(iter/s)": 1.184453 }, { "acc": 0.85116348, "epoch": 0.36187113857016767, "grad_norm": 55.0746955871582, "learning_rate": 7.3371392082539e-06, "loss": 0.97003441, "memory(GiB)": 14.16, "step": 205, "train_speed(iter/s)": 1.189641 }, { "acc": 0.844977, "epoch": 0.3706972639011474, "grad_norm": 28.961469650268555, "learning_rate": 7.37035483084461e-06, "loss": 0.93029585, "memory(GiB)": 14.16, "step": 210, "train_speed(iter/s)": 1.196388 }, { "acc": 0.85105257, "epoch": 0.3795233892321271, "grad_norm": 19.541975021362305, "learning_rate": 7.40278883638183e-06, "loss": 0.95558834, "memory(GiB)": 14.16, "step": 215, "train_speed(iter/s)": 1.201079 }, { "acc": 0.86030617, "epoch": 0.3883495145631068, "grad_norm": 20.67847442626953, "learning_rate": 7.434477166143822e-06, "loss": 0.95868387, "memory(GiB)": 14.16, "step": 220, "train_speed(iter/s)": 1.206789 }, { "acc": 0.84174948, "epoch": 0.3971756398940865, "grad_norm": 19.561363220214844, "learning_rate": 7.465453337978889e-06, "loss": 1.02173529, "memory(GiB)": 14.16, "step": 225, "train_speed(iter/s)": 1.212036 }, { "acc": 0.86592693, "epoch": 0.4060017652250662, "grad_norm": 11.455013275146484, "learning_rate": 7.495748659399082e-06, "loss": 0.79227252, "memory(GiB)": 14.16, "step": 230, "train_speed(iter/s)": 1.216576 }, { "acc": 0.83328362, "epoch": 0.4148278905560459, "grad_norm": 29.084524154663086, "learning_rate": 7.525392417754955e-06, "loss": 1.01948166, "memory(GiB)": 14.16, "step": 235, "train_speed(iter/s)": 1.221725 }, { "acc": 0.8429554, "epoch": 0.4236540158870256, "grad_norm": 20.903335571289062, "learning_rate": 7.554412050387287e-06, "loss": 0.99282942, "memory(GiB)": 14.16, "step": 240, "train_speed(iter/s)": 1.224743 }, { "acc": 0.85016003, "epoch": 0.4324801412180053, "grad_norm": 16.691232681274414, "learning_rate": 7.582833297233404e-06, "loss": 0.99217443, "memory(GiB)": 14.16, "step": 245, "train_speed(iter/s)": 1.230053 }, { "acc": 0.87495918, "epoch": 0.44130626654898497, "grad_norm": 12.78386402130127, "learning_rate": 7.610680338015439e-06, "loss": 0.87768221, "memory(GiB)": 14.16, "step": 250, "train_speed(iter/s)": 1.234699 }, { "acc": 0.84126263, "epoch": 0.4501323918799647, "grad_norm": 14.539063453674316, "learning_rate": 7.637975915842772e-06, "loss": 1.01728516, "memory(GiB)": 14.16, "step": 255, "train_speed(iter/s)": 1.239356 }, { "acc": 0.85397568, "epoch": 0.4589585172109444, "grad_norm": 12.476675033569336, "learning_rate": 7.664741448811854e-06, "loss": 0.99510822, "memory(GiB)": 14.16, "step": 260, "train_speed(iter/s)": 1.243331 }, { "acc": 0.85002632, "epoch": 0.4677846425419241, "grad_norm": 15.768548011779785, "learning_rate": 7.690997130975458e-06, "loss": 0.95526009, "memory(GiB)": 14.16, "step": 265, "train_speed(iter/s)": 1.246577 }, { "acc": 0.84644508, "epoch": 0.4766107678729038, "grad_norm": 15.827385902404785, "learning_rate": 7.71676202387381e-06, "loss": 0.99780674, "memory(GiB)": 14.16, "step": 270, "train_speed(iter/s)": 1.249431 }, { "acc": 0.86341457, "epoch": 0.4854368932038835, "grad_norm": 11.238011360168457, "learning_rate": 7.742054139666894e-06, "loss": 0.84536343, "memory(GiB)": 14.16, "step": 275, "train_speed(iter/s)": 1.252802 }, { "acc": 0.86229143, "epoch": 0.4942630185348632, "grad_norm": 12.086544036865234, "learning_rate": 7.76689051677608e-06, "loss": 0.85777378, "memory(GiB)": 14.16, "step": 280, "train_speed(iter/s)": 1.257632 }, { "acc": 0.84981022, "epoch": 0.5030891438658429, "grad_norm": 14.779291152954102, "learning_rate": 7.791287288830704e-06, "loss": 0.93616142, "memory(GiB)": 14.16, "step": 285, "train_speed(iter/s)": 1.261747 }, { "acc": 0.83907986, "epoch": 0.5119152691968226, "grad_norm": 17.486122131347656, "learning_rate": 7.815259747618348e-06, "loss": 1.05808353, "memory(GiB)": 14.16, "step": 290, "train_speed(iter/s)": 1.265056 }, { "acc": 0.8565239, "epoch": 0.5207413945278023, "grad_norm": 10.786604881286621, "learning_rate": 7.838822400653937e-06, "loss": 0.93398495, "memory(GiB)": 14.16, "step": 295, "train_speed(iter/s)": 1.268351 }, { "acc": 0.85258522, "epoch": 0.529567519858782, "grad_norm": 38.95402526855469, "learning_rate": 7.861989023910358e-06, "loss": 0.90065269, "memory(GiB)": 14.16, "step": 300, "train_speed(iter/s)": 1.271608 }, { "acc": 0.8546196, "epoch": 0.5383936451897617, "grad_norm": 21.24909210205078, "learning_rate": 7.884772710190602e-06, "loss": 0.8546505, "memory(GiB)": 15.03, "step": 305, "train_speed(iter/s)": 1.27394 }, { "acc": 0.85160217, "epoch": 0.5472197705207414, "grad_norm": 14.723251342773438, "learning_rate": 7.907185913566615e-06, "loss": 0.94713554, "memory(GiB)": 15.03, "step": 310, "train_speed(iter/s)": 1.277631 }, { "acc": 0.85263901, "epoch": 0.556045895851721, "grad_norm": 16.077661514282227, "learning_rate": 7.929240490262602e-06, "loss": 0.94810934, "memory(GiB)": 15.03, "step": 315, "train_speed(iter/s)": 1.280571 }, { "acc": 0.85038147, "epoch": 0.5648720211827007, "grad_norm": 15.50709056854248, "learning_rate": 7.950947736318755e-06, "loss": 0.92968903, "memory(GiB)": 15.03, "step": 320, "train_speed(iter/s)": 1.284158 }, { "acc": 0.82379665, "epoch": 0.5736981465136805, "grad_norm": 15.399169921875, "learning_rate": 7.972318422334926e-06, "loss": 1.16283875, "memory(GiB)": 15.03, "step": 325, "train_speed(iter/s)": 1.286126 }, { "acc": 0.85887756, "epoch": 0.5825242718446602, "grad_norm": 11.815301895141602, "learning_rate": 7.993362825561812e-06, "loss": 0.91573811, "memory(GiB)": 15.03, "step": 330, "train_speed(iter/s)": 1.289159 }, { "acc": 0.86126022, "epoch": 0.5913503971756399, "grad_norm": 12.373947143554688, "learning_rate": 8.014090759578977e-06, "loss": 0.8286418, "memory(GiB)": 15.03, "step": 335, "train_speed(iter/s)": 1.292452 }, { "acc": 0.84649496, "epoch": 0.6001765225066196, "grad_norm": 10.716691017150879, "learning_rate": 8.034511601774241e-06, "loss": 0.93031359, "memory(GiB)": 15.03, "step": 340, "train_speed(iter/s)": 1.295785 }, { "acc": 0.86900921, "epoch": 0.6090026478375993, "grad_norm": 9.307262420654297, "learning_rate": 8.054634318817073e-06, "loss": 0.79148679, "memory(GiB)": 15.03, "step": 345, "train_speed(iter/s)": 1.298964 }, { "acc": 0.85980539, "epoch": 0.617828773168579, "grad_norm": 19.837223052978516, "learning_rate": 8.074467490299152e-06, "loss": 0.85251389, "memory(GiB)": 15.03, "step": 350, "train_speed(iter/s)": 1.300599 }, { "acc": 0.85743332, "epoch": 0.6266548984995587, "grad_norm": 13.691913604736328, "learning_rate": 8.094019330698125e-06, "loss": 0.80583038, "memory(GiB)": 15.03, "step": 355, "train_speed(iter/s)": 1.303037 }, { "acc": 0.86313419, "epoch": 0.6354810238305384, "grad_norm": 20.739458084106445, "learning_rate": 8.113297709805278e-06, "loss": 0.8464941, "memory(GiB)": 15.03, "step": 360, "train_speed(iter/s)": 1.305274 }, { "acc": 0.85477276, "epoch": 0.6443071491615181, "grad_norm": 15.048263549804688, "learning_rate": 8.132310171744269e-06, "loss": 0.90760918, "memory(GiB)": 15.03, "step": 365, "train_speed(iter/s)": 1.308229 }, { "acc": 0.84394989, "epoch": 0.6531332744924978, "grad_norm": 22.2601261138916, "learning_rate": 8.151063952695938e-06, "loss": 0.96272449, "memory(GiB)": 15.03, "step": 370, "train_speed(iter/s)": 1.310176 }, { "acc": 0.87063484, "epoch": 0.6619593998234775, "grad_norm": 17.136972427368164, "learning_rate": 8.16956599743343e-06, "loss": 0.7666791, "memory(GiB)": 15.03, "step": 375, "train_speed(iter/s)": 1.312272 }, { "acc": 0.85499554, "epoch": 0.6707855251544572, "grad_norm": 14.284866333007812, "learning_rate": 8.187822974762172e-06, "loss": 0.86938667, "memory(GiB)": 15.03, "step": 380, "train_speed(iter/s)": 1.314682 }, { "acc": 0.84185934, "epoch": 0.6796116504854369, "grad_norm": 16.207565307617188, "learning_rate": 8.205841291950608e-06, "loss": 0.97270164, "memory(GiB)": 15.03, "step": 385, "train_speed(iter/s)": 1.316353 }, { "acc": 0.8727807, "epoch": 0.6884377758164166, "grad_norm": 13.051177978515625, "learning_rate": 8.223627108229846e-06, "loss": 0.81391993, "memory(GiB)": 15.03, "step": 390, "train_speed(iter/s)": 1.317646 }, { "acc": 0.85919895, "epoch": 0.6972639011473963, "grad_norm": 12.227827072143555, "learning_rate": 8.24118634743339e-06, "loss": 0.82849236, "memory(GiB)": 15.03, "step": 395, "train_speed(iter/s)": 1.319128 }, { "acc": 0.85667152, "epoch": 0.706090026478376, "grad_norm": 12.862465858459473, "learning_rate": 8.258524709841826e-06, "loss": 0.82565088, "memory(GiB)": 15.03, "step": 400, "train_speed(iter/s)": 1.320598 }, { "acc": 0.84189749, "epoch": 0.7149161518093556, "grad_norm": 27.904634475708008, "learning_rate": 8.275647683291801e-06, "loss": 0.94655628, "memory(GiB)": 15.03, "step": 405, "train_speed(iter/s)": 1.322937 }, { "acc": 0.85747814, "epoch": 0.7237422771403353, "grad_norm": 10.16468620300293, "learning_rate": 8.292560553603361e-06, "loss": 0.80448122, "memory(GiB)": 15.03, "step": 410, "train_speed(iter/s)": 1.325071 }, { "acc": 0.83789501, "epoch": 0.732568402471315, "grad_norm": 14.793179512023926, "learning_rate": 8.30926841437533e-06, "loss": 0.98678055, "memory(GiB)": 15.03, "step": 415, "train_speed(iter/s)": 1.327377 }, { "acc": 0.87835693, "epoch": 0.7413945278022948, "grad_norm": 10.76827621459961, "learning_rate": 8.32577617619407e-06, "loss": 0.68938456, "memory(GiB)": 15.03, "step": 420, "train_speed(iter/s)": 1.329043 }, { "acc": 0.84287548, "epoch": 0.7502206531332745, "grad_norm": 15.259495735168457, "learning_rate": 8.342088575297314e-06, "loss": 0.94884081, "memory(GiB)": 15.03, "step": 425, "train_speed(iter/s)": 1.330862 }, { "acc": 0.86222849, "epoch": 0.7590467784642542, "grad_norm": 40.286434173583984, "learning_rate": 8.358210181731291e-06, "loss": 0.77577968, "memory(GiB)": 15.03, "step": 430, "train_speed(iter/s)": 1.332436 }, { "acc": 0.85204277, "epoch": 0.7678729037952339, "grad_norm": 14.985507011413574, "learning_rate": 8.374145407036341e-06, "loss": 0.88468866, "memory(GiB)": 15.03, "step": 435, "train_speed(iter/s)": 1.334587 }, { "acc": 0.85831385, "epoch": 0.7766990291262136, "grad_norm": 20.61726188659668, "learning_rate": 8.38989851149328e-06, "loss": 0.84114628, "memory(GiB)": 15.03, "step": 440, "train_speed(iter/s)": 1.335729 }, { "acc": 0.84973698, "epoch": 0.7855251544571933, "grad_norm": 12.351398468017578, "learning_rate": 8.405473610960385e-06, "loss": 0.88097677, "memory(GiB)": 15.03, "step": 445, "train_speed(iter/s)": 1.337001 }, { "acc": 0.86829004, "epoch": 0.794351279788173, "grad_norm": 12.410344123840332, "learning_rate": 8.420874683328351e-06, "loss": 0.71265383, "memory(GiB)": 15.03, "step": 450, "train_speed(iter/s)": 1.338892 }, { "acc": 0.8560379, "epoch": 0.8031774051191527, "grad_norm": 17.64111328125, "learning_rate": 8.436105574618638e-06, "loss": 0.82901678, "memory(GiB)": 15.03, "step": 455, "train_speed(iter/s)": 1.34038 }, { "acc": 0.85172873, "epoch": 0.8120035304501324, "grad_norm": 11.742136001586914, "learning_rate": 8.45117000474854e-06, "loss": 0.88270187, "memory(GiB)": 15.03, "step": 460, "train_speed(iter/s)": 1.341647 }, { "acc": 0.8538866, "epoch": 0.8208296557811121, "grad_norm": 16.57000160217285, "learning_rate": 8.466071572984606e-06, "loss": 0.81329994, "memory(GiB)": 15.03, "step": 465, "train_speed(iter/s)": 1.343189 }, { "acc": 0.8637826, "epoch": 0.8296557811120918, "grad_norm": 13.3612642288208, "learning_rate": 8.480813763104416e-06, "loss": 0.85239983, "memory(GiB)": 15.03, "step": 470, "train_speed(iter/s)": 1.344435 }, { "acc": 0.86099529, "epoch": 0.8384819064430715, "grad_norm": 16.987106323242188, "learning_rate": 8.495399948285244e-06, "loss": 0.80814066, "memory(GiB)": 15.03, "step": 475, "train_speed(iter/s)": 1.346043 }, { "acc": 0.85453424, "epoch": 0.8473080317740512, "grad_norm": 15.087614059448242, "learning_rate": 8.509833395736747e-06, "loss": 0.83186865, "memory(GiB)": 15.03, "step": 480, "train_speed(iter/s)": 1.347373 }, { "acc": 0.87046375, "epoch": 0.8561341571050309, "grad_norm": 20.40123748779297, "learning_rate": 8.5241172710936e-06, "loss": 0.76273909, "memory(GiB)": 15.03, "step": 485, "train_speed(iter/s)": 1.349092 }, { "acc": 0.84939156, "epoch": 0.8649602824360106, "grad_norm": 24.134910583496094, "learning_rate": 8.538254642582863e-06, "loss": 0.89786205, "memory(GiB)": 15.03, "step": 490, "train_speed(iter/s)": 1.350334 }, { "acc": 0.84870033, "epoch": 0.8737864077669902, "grad_norm": 14.48202133178711, "learning_rate": 8.552248484979806e-06, "loss": 0.853339, "memory(GiB)": 15.03, "step": 495, "train_speed(iter/s)": 1.350908 }, { "acc": 0.85977726, "epoch": 0.8826125330979699, "grad_norm": 14.291667938232422, "learning_rate": 8.5661016833649e-06, "loss": 0.80336876, "memory(GiB)": 15.03, "step": 500, "train_speed(iter/s)": 1.352228 }, { "acc": 0.85660152, "epoch": 0.8914386584289496, "grad_norm": 16.971019744873047, "learning_rate": 8.579817036693938e-06, "loss": 0.79841681, "memory(GiB)": 15.03, "step": 505, "train_speed(iter/s)": 1.353377 }, { "acc": 0.84147816, "epoch": 0.9002647837599294, "grad_norm": 17.44623565673828, "learning_rate": 8.593397261192231e-06, "loss": 0.92683554, "memory(GiB)": 15.03, "step": 510, "train_speed(iter/s)": 1.354834 }, { "acc": 0.87031116, "epoch": 0.9090909090909091, "grad_norm": 12.483320236206055, "learning_rate": 8.60684499358326e-06, "loss": 0.75720358, "memory(GiB)": 15.03, "step": 515, "train_speed(iter/s)": 1.355461 }, { "acc": 0.88779926, "epoch": 0.9179170344218888, "grad_norm": 8.908056259155273, "learning_rate": 8.620162794161314e-06, "loss": 0.6034523, "memory(GiB)": 15.03, "step": 520, "train_speed(iter/s)": 1.356535 }, { "acc": 0.85247717, "epoch": 0.9267431597528685, "grad_norm": 14.843001365661621, "learning_rate": 8.633353149717143e-06, "loss": 0.87303743, "memory(GiB)": 15.03, "step": 525, "train_speed(iter/s)": 1.357547 }, { "acc": 0.86702938, "epoch": 0.9355692850838482, "grad_norm": 16.123022079467773, "learning_rate": 8.646418476324917e-06, "loss": 0.73595791, "memory(GiB)": 15.03, "step": 530, "train_speed(iter/s)": 1.358546 }, { "acc": 0.86741362, "epoch": 0.9443954104148279, "grad_norm": 15.112690925598145, "learning_rate": 8.659361121998349e-06, "loss": 0.73224492, "memory(GiB)": 15.03, "step": 535, "train_speed(iter/s)": 1.359622 }, { "acc": 0.86125507, "epoch": 0.9532215357458076, "grad_norm": 17.77021026611328, "learning_rate": 8.672183369223271e-06, "loss": 0.81944818, "memory(GiB)": 15.03, "step": 540, "train_speed(iter/s)": 1.360458 }, { "acc": 0.85830431, "epoch": 0.9620476610767873, "grad_norm": 16.815032958984375, "learning_rate": 8.684887437373492e-06, "loss": 0.88071756, "memory(GiB)": 15.03, "step": 545, "train_speed(iter/s)": 1.361365 }, { "acc": 0.8703146, "epoch": 0.970873786407767, "grad_norm": 9.287626266479492, "learning_rate": 8.697475485016354e-06, "loss": 0.73585243, "memory(GiB)": 15.03, "step": 550, "train_speed(iter/s)": 1.362273 }, { "acc": 0.87306328, "epoch": 0.9796999117387467, "grad_norm": 16.025371551513672, "learning_rate": 8.709949612113931e-06, "loss": 0.77072811, "memory(GiB)": 15.03, "step": 555, "train_speed(iter/s)": 1.363539 }, { "acc": 0.84512033, "epoch": 0.9885260370697264, "grad_norm": 210.82101440429688, "learning_rate": 8.722311862125539e-06, "loss": 0.91017971, "memory(GiB)": 15.03, "step": 560, "train_speed(iter/s)": 1.364547 }, { "acc": 0.86893339, "epoch": 0.9973521624007061, "grad_norm": 10.498364448547363, "learning_rate": 8.734564224016798e-06, "loss": 0.79639912, "memory(GiB)": 15.03, "step": 565, "train_speed(iter/s)": 1.366037 }, { "acc": 0.85702171, "epoch": 1.0061782877316858, "grad_norm": 15.488100051879883, "learning_rate": 8.746708634180163e-06, "loss": 0.76875796, "memory(GiB)": 15.03, "step": 570, "train_speed(iter/s)": 1.365807 }, { "acc": 0.86608248, "epoch": 1.0150044130626654, "grad_norm": 13.724185943603516, "learning_rate": 8.758746978271614e-06, "loss": 0.79540052, "memory(GiB)": 15.03, "step": 575, "train_speed(iter/s)": 1.366837 }, { "acc": 0.85060978, "epoch": 1.0238305383936452, "grad_norm": 11.490763664245605, "learning_rate": 8.770681092967809e-06, "loss": 0.86474276, "memory(GiB)": 15.03, "step": 580, "train_speed(iter/s)": 1.36831 }, { "acc": 0.87470121, "epoch": 1.0326566637246248, "grad_norm": 18.0929012298584, "learning_rate": 8.782512767647838e-06, "loss": 0.76726036, "memory(GiB)": 15.03, "step": 585, "train_speed(iter/s)": 1.368444 }, { "acc": 0.86219568, "epoch": 1.0414827890556047, "grad_norm": 16.510112762451172, "learning_rate": 8.794243746003395e-06, "loss": 0.80221128, "memory(GiB)": 15.03, "step": 590, "train_speed(iter/s)": 1.36943 }, { "acc": 0.87719679, "epoch": 1.0503089143865842, "grad_norm": 10.92737102508545, "learning_rate": 8.805875727581025e-06, "loss": 0.69948559, "memory(GiB)": 15.03, "step": 595, "train_speed(iter/s)": 1.36967 }, { "acc": 0.86749077, "epoch": 1.059135039717564, "grad_norm": 24.504356384277344, "learning_rate": 8.81741036925982e-06, "loss": 0.76590767, "memory(GiB)": 15.03, "step": 600, "train_speed(iter/s)": 1.370355 }, { "acc": 0.87485991, "epoch": 1.0679611650485437, "grad_norm": 12.609575271606445, "learning_rate": 8.82884928666781e-06, "loss": 0.71499319, "memory(GiB)": 15.03, "step": 605, "train_speed(iter/s)": 1.371168 }, { "acc": 0.87356024, "epoch": 1.0767872903795235, "grad_norm": 18.036577224731445, "learning_rate": 8.840194055540063e-06, "loss": 0.72343092, "memory(GiB)": 15.03, "step": 610, "train_speed(iter/s)": 1.372371 }, { "acc": 0.87252808, "epoch": 1.085613415710503, "grad_norm": 20.791675567626953, "learning_rate": 8.851446213021351e-06, "loss": 0.7672431, "memory(GiB)": 15.03, "step": 615, "train_speed(iter/s)": 1.373139 }, { "acc": 0.85088482, "epoch": 1.0944395410414829, "grad_norm": 11.626408576965332, "learning_rate": 8.862607258916076e-06, "loss": 0.83257885, "memory(GiB)": 15.03, "step": 620, "train_speed(iter/s)": 1.374001 }, { "acc": 0.879706, "epoch": 1.1032656663724625, "grad_norm": 22.43731117248535, "learning_rate": 8.873678656887972e-06, "loss": 0.68585067, "memory(GiB)": 15.03, "step": 625, "train_speed(iter/s)": 1.374912 }, { "acc": 0.87554665, "epoch": 1.1120917917034423, "grad_norm": 11.467029571533203, "learning_rate": 8.884661835612063e-06, "loss": 0.76199694, "memory(GiB)": 15.03, "step": 630, "train_speed(iter/s)": 1.375997 }, { "acc": 0.88249702, "epoch": 1.120917917034422, "grad_norm": 9.456093788146973, "learning_rate": 8.895558189881028e-06, "loss": 0.68395119, "memory(GiB)": 15.03, "step": 635, "train_speed(iter/s)": 1.376871 }, { "acc": 0.86988831, "epoch": 1.1297440423654015, "grad_norm": 14.116201400756836, "learning_rate": 8.906369081668215e-06, "loss": 0.74227285, "memory(GiB)": 15.03, "step": 640, "train_speed(iter/s)": 1.37754 }, { "acc": 0.87651501, "epoch": 1.1385701676963813, "grad_norm": 14.511102676391602, "learning_rate": 8.917095841149283e-06, "loss": 0.71151581, "memory(GiB)": 15.03, "step": 645, "train_speed(iter/s)": 1.378557 }, { "acc": 0.85872736, "epoch": 1.147396293027361, "grad_norm": 36.083534240722656, "learning_rate": 8.927739767684386e-06, "loss": 0.78520188, "memory(GiB)": 15.03, "step": 650, "train_speed(iter/s)": 1.378683 }, { "acc": 0.87407236, "epoch": 1.1562224183583407, "grad_norm": 13.215569496154785, "learning_rate": 8.938302130762743e-06, "loss": 0.71990705, "memory(GiB)": 15.03, "step": 655, "train_speed(iter/s)": 1.379885 }, { "acc": 0.85896072, "epoch": 1.1650485436893203, "grad_norm": 16.800809860229492, "learning_rate": 8.948784170911273e-06, "loss": 0.82673702, "memory(GiB)": 15.03, "step": 660, "train_speed(iter/s)": 1.381092 }, { "acc": 0.87594452, "epoch": 1.1738746690203001, "grad_norm": 19.80055046081543, "learning_rate": 8.959187100568957e-06, "loss": 0.69231596, "memory(GiB)": 15.03, "step": 665, "train_speed(iter/s)": 1.38192 }, { "acc": 0.87555962, "epoch": 1.1827007943512797, "grad_norm": 16.16434669494629, "learning_rate": 8.969512104928436e-06, "loss": 0.69566627, "memory(GiB)": 15.03, "step": 670, "train_speed(iter/s)": 1.382925 }, { "acc": 0.88165331, "epoch": 1.1915269196822595, "grad_norm": 8.82634162902832, "learning_rate": 8.979760342746341e-06, "loss": 0.62969561, "memory(GiB)": 15.03, "step": 675, "train_speed(iter/s)": 1.383412 }, { "acc": 0.87543392, "epoch": 1.2003530450132391, "grad_norm": 10.876725196838379, "learning_rate": 8.9899329471237e-06, "loss": 0.70207052, "memory(GiB)": 15.03, "step": 680, "train_speed(iter/s)": 1.384108 }, { "acc": 0.86599121, "epoch": 1.209179170344219, "grad_norm": 16.53150749206543, "learning_rate": 9.000031026257772e-06, "loss": 0.73462429, "memory(GiB)": 15.03, "step": 685, "train_speed(iter/s)": 1.384991 }, { "acc": 0.86249514, "epoch": 1.2180052956751986, "grad_norm": 11.62083625793457, "learning_rate": 9.010055664166532e-06, "loss": 0.79541802, "memory(GiB)": 15.03, "step": 690, "train_speed(iter/s)": 1.385442 }, { "acc": 0.85869989, "epoch": 1.2268314210061784, "grad_norm": 10.43891716003418, "learning_rate": 9.020007921387016e-06, "loss": 0.77812929, "memory(GiB)": 15.03, "step": 695, "train_speed(iter/s)": 1.386141 }, { "acc": 0.85712376, "epoch": 1.235657546337158, "grad_norm": 15.630046844482422, "learning_rate": 9.029888835648611e-06, "loss": 0.76715212, "memory(GiB)": 15.03, "step": 700, "train_speed(iter/s)": 1.38676 }, { "acc": 0.89375858, "epoch": 1.2444836716681378, "grad_norm": 9.320642471313477, "learning_rate": 9.039699422522407e-06, "loss": 0.61515408, "memory(GiB)": 15.03, "step": 705, "train_speed(iter/s)": 1.387531 }, { "acc": 0.87358236, "epoch": 1.2533097969991174, "grad_norm": 21.755651473999023, "learning_rate": 9.049440676047584e-06, "loss": 0.71240931, "memory(GiB)": 15.03, "step": 710, "train_speed(iter/s)": 1.38822 }, { "acc": 0.87096338, "epoch": 1.262135922330097, "grad_norm": 13.609408378601074, "learning_rate": 9.05911356933584e-06, "loss": 0.71472459, "memory(GiB)": 15.03, "step": 715, "train_speed(iter/s)": 1.389061 }, { "acc": 0.86782246, "epoch": 1.2709620476610768, "grad_norm": 16.83635902404785, "learning_rate": 9.068719055154739e-06, "loss": 0.76871777, "memory(GiB)": 15.03, "step": 720, "train_speed(iter/s)": 1.389519 }, { "acc": 0.87236013, "epoch": 1.2797881729920566, "grad_norm": 13.626837730407715, "learning_rate": 9.078258066490881e-06, "loss": 0.74826159, "memory(GiB)": 15.03, "step": 725, "train_speed(iter/s)": 1.39014 }, { "acc": 0.86891193, "epoch": 1.2886142983230362, "grad_norm": 10.106908798217773, "learning_rate": 9.087731517093729e-06, "loss": 0.73042183, "memory(GiB)": 15.03, "step": 730, "train_speed(iter/s)": 1.391175 }, { "acc": 0.86670141, "epoch": 1.2974404236540158, "grad_norm": 21.56298065185547, "learning_rate": 9.097140302000855e-06, "loss": 0.79023056, "memory(GiB)": 15.03, "step": 735, "train_speed(iter/s)": 1.392306 }, { "acc": 0.86781406, "epoch": 1.3062665489849956, "grad_norm": 17.55681610107422, "learning_rate": 9.106485298045399e-06, "loss": 0.75457993, "memory(GiB)": 15.03, "step": 740, "train_speed(iter/s)": 1.39252 }, { "acc": 0.86887312, "epoch": 1.3150926743159752, "grad_norm": 18.56260871887207, "learning_rate": 9.115767364346424e-06, "loss": 0.70807643, "memory(GiB)": 15.03, "step": 745, "train_speed(iter/s)": 1.393062 }, { "acc": 0.8685297, "epoch": 1.323918799646955, "grad_norm": 18.306211471557617, "learning_rate": 9.124987342782891e-06, "loss": 0.74744511, "memory(GiB)": 15.03, "step": 750, "train_speed(iter/s)": 1.393404 }, { "acc": 0.8781641, "epoch": 1.3327449249779346, "grad_norm": 13.647163391113281, "learning_rate": 9.134146058451869e-06, "loss": 0.68170605, "memory(GiB)": 15.03, "step": 755, "train_speed(iter/s)": 1.393796 }, { "acc": 0.87922821, "epoch": 1.3415710503089144, "grad_norm": 13.303306579589844, "learning_rate": 9.143244320111633e-06, "loss": 0.64709396, "memory(GiB)": 15.03, "step": 760, "train_speed(iter/s)": 1.3945 }, { "acc": 0.88469067, "epoch": 1.350397175639894, "grad_norm": 16.81705093383789, "learning_rate": 9.152282920610223e-06, "loss": 0.63452024, "memory(GiB)": 15.03, "step": 765, "train_speed(iter/s)": 1.395033 }, { "acc": 0.86394882, "epoch": 1.3592233009708738, "grad_norm": 19.79634666442871, "learning_rate": 9.161262637300065e-06, "loss": 0.77901225, "memory(GiB)": 15.03, "step": 770, "train_speed(iter/s)": 1.395744 }, { "acc": 0.87729263, "epoch": 1.3680494263018534, "grad_norm": 9.955446243286133, "learning_rate": 9.170184232439148e-06, "loss": 0.73748941, "memory(GiB)": 15.03, "step": 775, "train_speed(iter/s)": 1.396616 }, { "acc": 0.87977848, "epoch": 1.3768755516328333, "grad_norm": 26.05901527404785, "learning_rate": 9.179048453579305e-06, "loss": 0.69573069, "memory(GiB)": 15.03, "step": 780, "train_speed(iter/s)": 1.396684 }, { "acc": 0.89207993, "epoch": 1.3857016769638129, "grad_norm": 8.431520462036133, "learning_rate": 9.187856033942118e-06, "loss": 0.6223176, "memory(GiB)": 15.03, "step": 785, "train_speed(iter/s)": 1.39761 }, { "acc": 0.88160381, "epoch": 1.3945278022947925, "grad_norm": 22.775667190551758, "learning_rate": 9.196607692782849e-06, "loss": 0.6586874, "memory(GiB)": 15.03, "step": 790, "train_speed(iter/s)": 1.3981 }, { "acc": 0.87976484, "epoch": 1.4033539276257723, "grad_norm": 15.035825729370117, "learning_rate": 9.205304135742909e-06, "loss": 0.72166901, "memory(GiB)": 15.03, "step": 795, "train_speed(iter/s)": 1.398853 }, { "acc": 0.88060598, "epoch": 1.412180052956752, "grad_norm": 9.954267501831055, "learning_rate": 9.213946055191287e-06, "loss": 0.65977607, "memory(GiB)": 15.03, "step": 800, "train_speed(iter/s)": 1.399873 }, { "acc": 0.88905354, "epoch": 1.4210061782877317, "grad_norm": 10.58818244934082, "learning_rate": 9.222534130555326e-06, "loss": 0.63798847, "memory(GiB)": 15.03, "step": 805, "train_speed(iter/s)": 1.400476 }, { "acc": 0.88176765, "epoch": 1.4298323036187113, "grad_norm": 12.966394424438477, "learning_rate": 9.231069028641261e-06, "loss": 0.67824659, "memory(GiB)": 15.03, "step": 810, "train_speed(iter/s)": 1.401127 }, { "acc": 0.8844079, "epoch": 1.438658428949691, "grad_norm": 12.72673511505127, "learning_rate": 9.239551403944905e-06, "loss": 0.66224747, "memory(GiB)": 15.03, "step": 815, "train_speed(iter/s)": 1.401919 }, { "acc": 0.88658886, "epoch": 1.447484554280671, "grad_norm": 12.08267879486084, "learning_rate": 9.247981898952821e-06, "loss": 0.60803099, "memory(GiB)": 15.03, "step": 820, "train_speed(iter/s)": 1.402394 }, { "acc": 0.8817709, "epoch": 1.4563106796116505, "grad_norm": 14.413646697998047, "learning_rate": 9.256361144434345e-06, "loss": 0.68519716, "memory(GiB)": 15.03, "step": 825, "train_speed(iter/s)": 1.402993 }, { "acc": 0.87616367, "epoch": 1.46513680494263, "grad_norm": 32.22182083129883, "learning_rate": 9.26468975972479e-06, "loss": 0.7470521, "memory(GiB)": 15.03, "step": 830, "train_speed(iter/s)": 1.403705 }, { "acc": 0.90904369, "epoch": 1.47396293027361, "grad_norm": 9.247931480407715, "learning_rate": 9.272968353000135e-06, "loss": 0.51509295, "memory(GiB)": 15.03, "step": 835, "train_speed(iter/s)": 1.404019 }, { "acc": 0.88075924, "epoch": 1.4827890556045895, "grad_norm": 17.574228286743164, "learning_rate": 9.281197521543531e-06, "loss": 0.67989264, "memory(GiB)": 15.03, "step": 840, "train_speed(iter/s)": 1.40421 }, { "acc": 0.89161472, "epoch": 1.4916151809355693, "grad_norm": 15.008139610290527, "learning_rate": 9.289377852003874e-06, "loss": 0.63312459, "memory(GiB)": 15.03, "step": 845, "train_speed(iter/s)": 1.404454 }, { "acc": 0.88426228, "epoch": 1.500441306266549, "grad_norm": 14.265517234802246, "learning_rate": 9.297509920646773e-06, "loss": 0.66849756, "memory(GiB)": 15.03, "step": 850, "train_speed(iter/s)": 1.405266 }, { "acc": 0.89539175, "epoch": 1.5092674315975287, "grad_norm": 10.675289154052734, "learning_rate": 9.305594293598155e-06, "loss": 0.56573772, "memory(GiB)": 15.03, "step": 855, "train_speed(iter/s)": 1.406066 }, { "acc": 0.89242611, "epoch": 1.5180935569285083, "grad_norm": 19.510107040405273, "learning_rate": 9.31363152708075e-06, "loss": 0.63375998, "memory(GiB)": 15.03, "step": 860, "train_speed(iter/s)": 1.406526 }, { "acc": 0.89230366, "epoch": 1.526919682259488, "grad_norm": 9.313959121704102, "learning_rate": 9.321622167643745e-06, "loss": 0.64916496, "memory(GiB)": 15.03, "step": 865, "train_speed(iter/s)": 1.406919 }, { "acc": 0.88901911, "epoch": 1.5357458075904677, "grad_norm": 11.9878568649292, "learning_rate": 9.3295667523858e-06, "loss": 0.62488303, "memory(GiB)": 15.03, "step": 870, "train_speed(iter/s)": 1.407436 }, { "acc": 0.88207655, "epoch": 1.5445719329214476, "grad_norm": 10.038602828979492, "learning_rate": 9.337465809171683e-06, "loss": 0.65001602, "memory(GiB)": 15.03, "step": 875, "train_speed(iter/s)": 1.407493 }, { "acc": 0.88994827, "epoch": 1.5533980582524272, "grad_norm": 12.364340782165527, "learning_rate": 9.345319856842741e-06, "loss": 0.63789039, "memory(GiB)": 15.03, "step": 880, "train_speed(iter/s)": 1.408071 }, { "acc": 0.8806591, "epoch": 1.5622241835834068, "grad_norm": 16.858013153076172, "learning_rate": 9.353129405421386e-06, "loss": 0.66663342, "memory(GiB)": 15.03, "step": 885, "train_speed(iter/s)": 1.408067 }, { "acc": 0.88406448, "epoch": 1.5710503089143866, "grad_norm": 10.093111038208008, "learning_rate": 9.360894956309846e-06, "loss": 0.6731246, "memory(GiB)": 15.03, "step": 890, "train_speed(iter/s)": 1.408568 }, { "acc": 0.87881975, "epoch": 1.5798764342453664, "grad_norm": 12.624429702758789, "learning_rate": 9.368617002483326e-06, "loss": 0.65332942, "memory(GiB)": 15.03, "step": 895, "train_speed(iter/s)": 1.409081 }, { "acc": 0.88159962, "epoch": 1.588702559576346, "grad_norm": 9.530009269714355, "learning_rate": 9.376296028677811e-06, "loss": 0.63584528, "memory(GiB)": 15.03, "step": 900, "train_speed(iter/s)": 1.409724 }, { "acc": 0.8855258, "epoch": 1.5975286849073256, "grad_norm": 10.42114543914795, "learning_rate": 9.383932511572646e-06, "loss": 0.66988831, "memory(GiB)": 15.03, "step": 905, "train_speed(iter/s)": 1.410019 }, { "acc": 0.90641155, "epoch": 1.6063548102383054, "grad_norm": 10.369808197021484, "learning_rate": 9.391526919968097e-06, "loss": 0.53250246, "memory(GiB)": 15.03, "step": 910, "train_speed(iter/s)": 1.409755 }, { "acc": 0.88352089, "epoch": 1.6151809355692852, "grad_norm": 11.006196022033691, "learning_rate": 9.399079714958053e-06, "loss": 0.65867739, "memory(GiB)": 15.03, "step": 915, "train_speed(iter/s)": 1.410435 }, { "acc": 0.88434277, "epoch": 1.6240070609002648, "grad_norm": 12.444877624511719, "learning_rate": 9.406591350098002e-06, "loss": 0.70287695, "memory(GiB)": 15.03, "step": 920, "train_speed(iter/s)": 1.410846 }, { "acc": 0.8821207, "epoch": 1.6328331862312444, "grad_norm": 12.70352840423584, "learning_rate": 9.414062271568471e-06, "loss": 0.64877195, "memory(GiB)": 15.03, "step": 925, "train_speed(iter/s)": 1.411127 }, { "acc": 0.89188652, "epoch": 1.6416593115622242, "grad_norm": 11.997596740722656, "learning_rate": 9.421492918334066e-06, "loss": 0.6488348, "memory(GiB)": 15.03, "step": 930, "train_speed(iter/s)": 1.411639 }, { "acc": 0.88455486, "epoch": 1.650485436893204, "grad_norm": 12.429160118103027, "learning_rate": 9.428883722298227e-06, "loss": 0.64072094, "memory(GiB)": 15.03, "step": 935, "train_speed(iter/s)": 1.412108 }, { "acc": 0.88315144, "epoch": 1.6593115622241836, "grad_norm": 11.372185707092285, "learning_rate": 9.436235108453877e-06, "loss": 0.67062583, "memory(GiB)": 15.03, "step": 940, "train_speed(iter/s)": 1.412648 }, { "acc": 0.89071655, "epoch": 1.6681376875551632, "grad_norm": 9.39794635772705, "learning_rate": 9.443547495030055e-06, "loss": 0.64256811, "memory(GiB)": 15.03, "step": 945, "train_speed(iter/s)": 1.413144 }, { "acc": 0.89566469, "epoch": 1.676963812886143, "grad_norm": 10.983919143676758, "learning_rate": 9.450821293634705e-06, "loss": 0.55221248, "memory(GiB)": 15.03, "step": 950, "train_speed(iter/s)": 1.413781 }, { "acc": 0.90415955, "epoch": 1.6857899382171226, "grad_norm": 13.477981567382812, "learning_rate": 9.458056909393693e-06, "loss": 0.5188931, "memory(GiB)": 15.03, "step": 955, "train_speed(iter/s)": 1.414294 }, { "acc": 0.88154793, "epoch": 1.6946160635481022, "grad_norm": 16.245464324951172, "learning_rate": 9.465254741086207e-06, "loss": 0.67751064, "memory(GiB)": 15.03, "step": 960, "train_speed(iter/s)": 1.414549 }, { "acc": 0.88493137, "epoch": 1.703442188879082, "grad_norm": 15.04032039642334, "learning_rate": 9.47241518127664e-06, "loss": 0.62156134, "memory(GiB)": 15.03, "step": 965, "train_speed(iter/s)": 1.415108 }, { "acc": 0.89369011, "epoch": 1.7122683142100619, "grad_norm": 11.915094375610352, "learning_rate": 9.47953861644306e-06, "loss": 0.59790082, "memory(GiB)": 15.03, "step": 970, "train_speed(iter/s)": 1.415488 }, { "acc": 0.89546175, "epoch": 1.7210944395410415, "grad_norm": 8.28010082244873, "learning_rate": 9.486625427102379e-06, "loss": 0.5619544, "memory(GiB)": 15.03, "step": 975, "train_speed(iter/s)": 1.415743 }, { "acc": 0.89236155, "epoch": 1.729920564872021, "grad_norm": 12.17982006072998, "learning_rate": 9.493675987932323e-06, "loss": 0.59782009, "memory(GiB)": 15.03, "step": 980, "train_speed(iter/s)": 1.415939 }, { "acc": 0.89113998, "epoch": 1.7387466902030009, "grad_norm": 33.9180793762207, "learning_rate": 9.500690667890305e-06, "loss": 0.67377186, "memory(GiB)": 15.03, "step": 985, "train_speed(iter/s)": 1.41642 }, { "acc": 0.87668629, "epoch": 1.7475728155339807, "grad_norm": 12.061941146850586, "learning_rate": 9.507669830329265e-06, "loss": 0.71430349, "memory(GiB)": 15.03, "step": 990, "train_speed(iter/s)": 1.41649 }, { "acc": 0.87580357, "epoch": 1.7563989408649603, "grad_norm": 12.536811828613281, "learning_rate": 9.514613833110618e-06, "loss": 0.69564028, "memory(GiB)": 15.03, "step": 995, "train_speed(iter/s)": 1.41702 }, { "acc": 0.87407379, "epoch": 1.7652250661959399, "grad_norm": 10.782868385314941, "learning_rate": 9.52152302871436e-06, "loss": 0.68095927, "memory(GiB)": 15.03, "step": 1000, "train_speed(iter/s)": 1.41756 }, { "acc": 0.90452003, "epoch": 1.7740511915269197, "grad_norm": 13.136680603027344, "learning_rate": 9.528397764346428e-06, "loss": 0.50396791, "memory(GiB)": 15.03, "step": 1005, "train_speed(iter/s)": 1.417694 }, { "acc": 0.88649578, "epoch": 1.7828773168578995, "grad_norm": 10.960343360900879, "learning_rate": 9.535238382043396e-06, "loss": 0.66277518, "memory(GiB)": 15.03, "step": 1010, "train_speed(iter/s)": 1.417692 }, { "acc": 0.88433723, "epoch": 1.7917034421888791, "grad_norm": 11.241856575012207, "learning_rate": 9.542045218774593e-06, "loss": 0.68447266, "memory(GiB)": 15.03, "step": 1015, "train_speed(iter/s)": 1.418007 }, { "acc": 0.88271923, "epoch": 1.8005295675198587, "grad_norm": 13.789383888244629, "learning_rate": 9.548818606541693e-06, "loss": 0.62464457, "memory(GiB)": 15.03, "step": 1020, "train_speed(iter/s)": 1.418114 }, { "acc": 0.89521027, "epoch": 1.8093556928508385, "grad_norm": 12.18415355682373, "learning_rate": 9.555558872475895e-06, "loss": 0.58274288, "memory(GiB)": 15.03, "step": 1025, "train_speed(iter/s)": 1.418547 }, { "acc": 0.89534492, "epoch": 1.8181818181818183, "grad_norm": 9.136192321777344, "learning_rate": 9.56226633893272e-06, "loss": 0.58249364, "memory(GiB)": 15.03, "step": 1030, "train_speed(iter/s)": 1.418562 }, { "acc": 0.87189541, "epoch": 1.8270079435127977, "grad_norm": 31.211952209472656, "learning_rate": 9.568941323584524e-06, "loss": 0.69691725, "memory(GiB)": 15.03, "step": 1035, "train_speed(iter/s)": 1.419256 }, { "acc": 0.89501991, "epoch": 1.8358340688437775, "grad_norm": 15.75580883026123, "learning_rate": 9.575584139510775e-06, "loss": 0.56669755, "memory(GiB)": 15.03, "step": 1040, "train_speed(iter/s)": 1.419602 }, { "acc": 0.90598373, "epoch": 1.8446601941747574, "grad_norm": 11.128259658813477, "learning_rate": 9.582195095286159e-06, "loss": 0.54364185, "memory(GiB)": 15.03, "step": 1045, "train_speed(iter/s)": 1.419783 }, { "acc": 0.88986807, "epoch": 1.853486319505737, "grad_norm": 15.21756649017334, "learning_rate": 9.588774495066605e-06, "loss": 0.65906324, "memory(GiB)": 15.03, "step": 1050, "train_speed(iter/s)": 1.420034 }, { "acc": 0.89392462, "epoch": 1.8623124448367165, "grad_norm": 10.783296585083008, "learning_rate": 9.595322638673238e-06, "loss": 0.60379801, "memory(GiB)": 15.03, "step": 1055, "train_speed(iter/s)": 1.420133 }, { "acc": 0.89213877, "epoch": 1.8711385701676964, "grad_norm": 8.503034591674805, "learning_rate": 9.601839821674378e-06, "loss": 0.61805177, "memory(GiB)": 15.03, "step": 1060, "train_speed(iter/s)": 1.420464 }, { "acc": 0.90360165, "epoch": 1.8799646954986762, "grad_norm": 11.33566665649414, "learning_rate": 9.608326335465576e-06, "loss": 0.54236746, "memory(GiB)": 15.03, "step": 1065, "train_speed(iter/s)": 1.420398 }, { "acc": 0.8924449, "epoch": 1.8887908208296558, "grad_norm": 8.497529983520508, "learning_rate": 9.614782467347809e-06, "loss": 0.58285637, "memory(GiB)": 15.03, "step": 1070, "train_speed(iter/s)": 1.420647 }, { "acc": 0.89489689, "epoch": 1.8976169461606354, "grad_norm": 9.63233470916748, "learning_rate": 9.621208500603823e-06, "loss": 0.53378448, "memory(GiB)": 15.03, "step": 1075, "train_speed(iter/s)": 1.420983 }, { "acc": 0.88891964, "epoch": 1.9064430714916152, "grad_norm": 20.11314582824707, "learning_rate": 9.62760471457273e-06, "loss": 0.66272831, "memory(GiB)": 15.03, "step": 1080, "train_speed(iter/s)": 1.421334 }, { "acc": 0.89561214, "epoch": 1.915269196822595, "grad_norm": 9.382156372070312, "learning_rate": 9.63397138472286e-06, "loss": 0.59761391, "memory(GiB)": 15.03, "step": 1085, "train_speed(iter/s)": 1.421561 }, { "acc": 0.88319712, "epoch": 1.9240953221535746, "grad_norm": 30.93239974975586, "learning_rate": 9.640308782722952e-06, "loss": 0.67434788, "memory(GiB)": 15.03, "step": 1090, "train_speed(iter/s)": 1.421852 }, { "acc": 0.89910412, "epoch": 1.9329214474845542, "grad_norm": 10.270589828491211, "learning_rate": 9.646617176511722e-06, "loss": 0.61153879, "memory(GiB)": 15.03, "step": 1095, "train_speed(iter/s)": 1.422189 }, { "acc": 0.90277824, "epoch": 1.941747572815534, "grad_norm": 16.421043395996094, "learning_rate": 9.652896830365815e-06, "loss": 0.51509657, "memory(GiB)": 15.03, "step": 1100, "train_speed(iter/s)": 1.422507 }, { "acc": 0.90009937, "epoch": 1.9505736981465138, "grad_norm": 9.790301322937012, "learning_rate": 9.65914800496626e-06, "loss": 0.59155245, "memory(GiB)": 15.03, "step": 1105, "train_speed(iter/s)": 1.422769 }, { "acc": 0.90443134, "epoch": 1.9593998234774934, "grad_norm": 10.15743350982666, "learning_rate": 9.66537095746339e-06, "loss": 0.5526217, "memory(GiB)": 15.03, "step": 1110, "train_speed(iter/s)": 1.422945 }, { "acc": 0.89292736, "epoch": 1.968225948808473, "grad_norm": 11.902684211730957, "learning_rate": 9.671565941540321e-06, "loss": 0.57819004, "memory(GiB)": 15.03, "step": 1115, "train_speed(iter/s)": 1.423191 }, { "acc": 0.87777376, "epoch": 1.9770520741394528, "grad_norm": 25.931177139282227, "learning_rate": 9.677733207475e-06, "loss": 0.67513151, "memory(GiB)": 15.03, "step": 1120, "train_speed(iter/s)": 1.423112 }, { "acc": 0.89389963, "epoch": 1.9858781994704324, "grad_norm": 11.598034858703613, "learning_rate": 9.683873002200883e-06, "loss": 0.60335078, "memory(GiB)": 15.03, "step": 1125, "train_speed(iter/s)": 1.423191 }, { "acc": 0.88208179, "epoch": 1.994704324801412, "grad_norm": 14.008034706115723, "learning_rate": 9.689985569366258e-06, "loss": 0.65341744, "memory(GiB)": 15.03, "step": 1130, "train_speed(iter/s)": 1.423561 }, { "acc": 0.91093693, "epoch": 2.003530450132392, "grad_norm": 16.504602432250977, "learning_rate": 9.696071149392264e-06, "loss": 0.4594039, "memory(GiB)": 15.03, "step": 1135, "train_speed(iter/s)": 1.423126 }, { "acc": 0.89320984, "epoch": 2.0123565754633717, "grad_norm": 17.193164825439453, "learning_rate": 9.702129979529625e-06, "loss": 0.62504878, "memory(GiB)": 15.03, "step": 1140, "train_speed(iter/s)": 1.423107 }, { "acc": 0.90270777, "epoch": 2.0211827007943515, "grad_norm": 10.173110961914062, "learning_rate": 9.708162293914162e-06, "loss": 0.56589737, "memory(GiB)": 15.03, "step": 1145, "train_speed(iter/s)": 1.423597 }, { "acc": 0.91170883, "epoch": 2.030008826125331, "grad_norm": 17.4721622467041, "learning_rate": 9.714168323621074e-06, "loss": 0.47709441, "memory(GiB)": 15.03, "step": 1150, "train_speed(iter/s)": 1.42374 }, { "acc": 0.90018482, "epoch": 2.0388349514563107, "grad_norm": 12.034164428710938, "learning_rate": 9.720148296718059e-06, "loss": 0.58367968, "memory(GiB)": 15.03, "step": 1155, "train_speed(iter/s)": 1.423854 }, { "acc": 0.89761066, "epoch": 2.0476610767872905, "grad_norm": 9.888873100280762, "learning_rate": 9.726102438317269e-06, "loss": 0.58655739, "memory(GiB)": 15.03, "step": 1160, "train_speed(iter/s)": 1.42452 }, { "acc": 0.90061598, "epoch": 2.0564872021182703, "grad_norm": 9.12770938873291, "learning_rate": 9.73203097062617e-06, "loss": 0.55484486, "memory(GiB)": 15.03, "step": 1165, "train_speed(iter/s)": 1.424946 }, { "acc": 0.9084589, "epoch": 2.0653133274492497, "grad_norm": 11.611934661865234, "learning_rate": 9.737934112997299e-06, "loss": 0.51068997, "memory(GiB)": 15.03, "step": 1170, "train_speed(iter/s)": 1.425218 }, { "acc": 0.89887505, "epoch": 2.0741394527802295, "grad_norm": 8.761435508728027, "learning_rate": 9.743812081976949e-06, "loss": 0.55125942, "memory(GiB)": 15.03, "step": 1175, "train_speed(iter/s)": 1.425337 }, { "acc": 0.89826851, "epoch": 2.0829655781112093, "grad_norm": 10.650542259216309, "learning_rate": 9.749665091352856e-06, "loss": 0.6236135, "memory(GiB)": 15.03, "step": 1180, "train_speed(iter/s)": 1.425594 }, { "acc": 0.88310032, "epoch": 2.0917917034421887, "grad_norm": 10.003929138183594, "learning_rate": 9.75549335220084e-06, "loss": 0.6795033, "memory(GiB)": 15.03, "step": 1185, "train_speed(iter/s)": 1.425636 }, { "acc": 0.91393566, "epoch": 2.1006178287731685, "grad_norm": 10.860987663269043, "learning_rate": 9.761297072930485e-06, "loss": 0.49088507, "memory(GiB)": 15.03, "step": 1190, "train_speed(iter/s)": 1.425602 }, { "acc": 0.90685883, "epoch": 2.1094439541041483, "grad_norm": 10.807658195495605, "learning_rate": 9.767076459329854e-06, "loss": 0.56696253, "memory(GiB)": 15.03, "step": 1195, "train_speed(iter/s)": 1.425988 }, { "acc": 0.91263876, "epoch": 2.118270079435128, "grad_norm": 10.464794158935547, "learning_rate": 9.772831714609279e-06, "loss": 0.45279794, "memory(GiB)": 15.03, "step": 1200, "train_speed(iter/s)": 1.42604 }, { "acc": 0.90193005, "epoch": 2.1270962047661075, "grad_norm": 15.045731544494629, "learning_rate": 9.77856303944422e-06, "loss": 0.59439569, "memory(GiB)": 15.03, "step": 1205, "train_speed(iter/s)": 1.426159 }, { "acc": 0.90878582, "epoch": 2.1359223300970873, "grad_norm": 7.888996601104736, "learning_rate": 9.784270632017267e-06, "loss": 0.48742652, "memory(GiB)": 15.03, "step": 1210, "train_speed(iter/s)": 1.426612 }, { "acc": 0.88868141, "epoch": 2.144748455428067, "grad_norm": 13.094260215759277, "learning_rate": 9.789954688059253e-06, "loss": 0.61020746, "memory(GiB)": 15.03, "step": 1215, "train_speed(iter/s)": 1.42683 }, { "acc": 0.9051054, "epoch": 2.153574580759047, "grad_norm": 10.036234855651855, "learning_rate": 9.795615400889522e-06, "loss": 0.49365778, "memory(GiB)": 15.03, "step": 1220, "train_speed(iter/s)": 1.426955 }, { "acc": 0.90231075, "epoch": 2.1624007060900263, "grad_norm": 12.29619312286377, "learning_rate": 9.801252961455397e-06, "loss": 0.58524351, "memory(GiB)": 15.03, "step": 1225, "train_speed(iter/s)": 1.427517 }, { "acc": 0.90875416, "epoch": 2.171226831421006, "grad_norm": 9.258191108703613, "learning_rate": 9.806867558370813e-06, "loss": 0.53002496, "memory(GiB)": 15.03, "step": 1230, "train_speed(iter/s)": 1.427893 }, { "acc": 0.90600929, "epoch": 2.180052956751986, "grad_norm": 10.734365463256836, "learning_rate": 9.812459377954191e-06, "loss": 0.51976771, "memory(GiB)": 15.03, "step": 1235, "train_speed(iter/s)": 1.428137 }, { "acc": 0.90658321, "epoch": 2.1888790820829658, "grad_norm": 12.375234603881836, "learning_rate": 9.818028604265535e-06, "loss": 0.52808528, "memory(GiB)": 15.03, "step": 1240, "train_speed(iter/s)": 1.428257 }, { "acc": 0.90715809, "epoch": 2.197705207413945, "grad_norm": 12.734635353088379, "learning_rate": 9.823575419142782e-06, "loss": 0.47110147, "memory(GiB)": 15.03, "step": 1245, "train_speed(iter/s)": 1.428452 }, { "acc": 0.90526409, "epoch": 2.206531332744925, "grad_norm": 7.555601596832275, "learning_rate": 9.829100002237431e-06, "loss": 0.51687703, "memory(GiB)": 15.03, "step": 1250, "train_speed(iter/s)": 1.428704 }, { "acc": 0.90856056, "epoch": 2.215357458075905, "grad_norm": 64.70488739013672, "learning_rate": 9.834602531049457e-06, "loss": 0.56342926, "memory(GiB)": 15.03, "step": 1255, "train_speed(iter/s)": 1.428839 }, { "acc": 0.90410919, "epoch": 2.2241835834068846, "grad_norm": 9.30129623413086, "learning_rate": 9.840083180961523e-06, "loss": 0.52348576, "memory(GiB)": 15.03, "step": 1260, "train_speed(iter/s)": 1.428899 }, { "acc": 0.91737423, "epoch": 2.233009708737864, "grad_norm": 9.280898094177246, "learning_rate": 9.845542125272528e-06, "loss": 0.457551, "memory(GiB)": 15.03, "step": 1265, "train_speed(iter/s)": 1.428884 }, { "acc": 0.91135292, "epoch": 2.241835834068844, "grad_norm": 18.865543365478516, "learning_rate": 9.850979535230489e-06, "loss": 0.55818, "memory(GiB)": 15.03, "step": 1270, "train_speed(iter/s)": 1.429155 }, { "acc": 0.90930262, "epoch": 2.2506619593998236, "grad_norm": 10.205679893493652, "learning_rate": 9.856395580064765e-06, "loss": 0.51842232, "memory(GiB)": 15.03, "step": 1275, "train_speed(iter/s)": 1.429243 }, { "acc": 0.8965498, "epoch": 2.259488084730803, "grad_norm": 13.221919059753418, "learning_rate": 9.861790427017676e-06, "loss": 0.63396263, "memory(GiB)": 15.03, "step": 1280, "train_speed(iter/s)": 1.429696 }, { "acc": 0.89874535, "epoch": 2.268314210061783, "grad_norm": 9.255766868591309, "learning_rate": 9.867164241375475e-06, "loss": 0.53985863, "memory(GiB)": 15.03, "step": 1285, "train_speed(iter/s)": 1.430049 }, { "acc": 0.90246325, "epoch": 2.2771403353927626, "grad_norm": 14.346580505371094, "learning_rate": 9.872517186498744e-06, "loss": 0.5807147, "memory(GiB)": 15.03, "step": 1290, "train_speed(iter/s)": 1.430432 }, { "acc": 0.91761456, "epoch": 2.2859664607237424, "grad_norm": 7.036569595336914, "learning_rate": 9.877849423852183e-06, "loss": 0.42423973, "memory(GiB)": 15.03, "step": 1295, "train_speed(iter/s)": 1.430794 }, { "acc": 0.90731106, "epoch": 2.294792586054722, "grad_norm": 10.987264633178711, "learning_rate": 9.883161113033847e-06, "loss": 0.50618978, "memory(GiB)": 15.03, "step": 1300, "train_speed(iter/s)": 1.431154 }, { "acc": 0.90925388, "epoch": 2.3036187113857016, "grad_norm": 11.737427711486816, "learning_rate": 9.888452411803792e-06, "loss": 0.5499877, "memory(GiB)": 15.03, "step": 1305, "train_speed(iter/s)": 1.431308 }, { "acc": 0.89539261, "epoch": 2.3124448367166814, "grad_norm": 16.05523681640625, "learning_rate": 9.893723476112203e-06, "loss": 0.59871988, "memory(GiB)": 15.03, "step": 1310, "train_speed(iter/s)": 1.431687 }, { "acc": 0.91072311, "epoch": 2.3212709620476613, "grad_norm": 11.775300025939941, "learning_rate": 9.898974460126969e-06, "loss": 0.53940406, "memory(GiB)": 15.03, "step": 1315, "train_speed(iter/s)": 1.431563 }, { "acc": 0.90041723, "epoch": 2.3300970873786406, "grad_norm": 10.455345153808594, "learning_rate": 9.904205516260733e-06, "loss": 0.53496099, "memory(GiB)": 15.03, "step": 1320, "train_speed(iter/s)": 1.431558 }, { "acc": 0.91113758, "epoch": 2.3389232127096204, "grad_norm": 7.756198406219482, "learning_rate": 9.909416795197449e-06, "loss": 0.49310832, "memory(GiB)": 15.03, "step": 1325, "train_speed(iter/s)": 1.431959 }, { "acc": 0.90454254, "epoch": 2.3477493380406003, "grad_norm": 8.34156322479248, "learning_rate": 9.914608445918417e-06, "loss": 0.54202747, "memory(GiB)": 15.03, "step": 1330, "train_speed(iter/s)": 1.432282 }, { "acc": 0.91801443, "epoch": 2.3565754633715796, "grad_norm": 9.080329895019531, "learning_rate": 9.919780615727838e-06, "loss": 0.45209417, "memory(GiB)": 15.03, "step": 1335, "train_speed(iter/s)": 1.432276 }, { "acc": 0.91402245, "epoch": 2.3654015887025595, "grad_norm": 13.115955352783203, "learning_rate": 9.924933450277898e-06, "loss": 0.5182528, "memory(GiB)": 15.03, "step": 1340, "train_speed(iter/s)": 1.432439 }, { "acc": 0.90965252, "epoch": 2.3742277140335393, "grad_norm": 12.215981483459473, "learning_rate": 9.930067093593376e-06, "loss": 0.48638868, "memory(GiB)": 15.03, "step": 1345, "train_speed(iter/s)": 1.432669 }, { "acc": 0.92375946, "epoch": 2.383053839364519, "grad_norm": 11.16447925567627, "learning_rate": 9.935181688095803e-06, "loss": 0.45179071, "memory(GiB)": 15.03, "step": 1350, "train_speed(iter/s)": 1.432986 }, { "acc": 0.90522795, "epoch": 2.391879964695499, "grad_norm": 8.998376846313477, "learning_rate": 9.940277374627159e-06, "loss": 0.52132626, "memory(GiB)": 15.03, "step": 1355, "train_speed(iter/s)": 1.433309 }, { "acc": 0.90350838, "epoch": 2.4007060900264783, "grad_norm": 15.337845802307129, "learning_rate": 9.94535429247316e-06, "loss": 0.54949584, "memory(GiB)": 15.03, "step": 1360, "train_speed(iter/s)": 1.43371 }, { "acc": 0.92455578, "epoch": 2.409532215357458, "grad_norm": 8.579594612121582, "learning_rate": 9.95041257938609e-06, "loss": 0.43822021, "memory(GiB)": 15.03, "step": 1365, "train_speed(iter/s)": 1.433908 }, { "acc": 0.91102133, "epoch": 2.418358340688438, "grad_norm": 18.262666702270508, "learning_rate": 9.955452371607232e-06, "loss": 0.54052696, "memory(GiB)": 15.03, "step": 1370, "train_speed(iter/s)": 1.434114 }, { "acc": 0.91117821, "epoch": 2.4271844660194173, "grad_norm": 10.7975492477417, "learning_rate": 9.960473803888885e-06, "loss": 0.46693506, "memory(GiB)": 15.03, "step": 1375, "train_speed(iter/s)": 1.434191 }, { "acc": 0.91250458, "epoch": 2.436010591350397, "grad_norm": 15.292963027954102, "learning_rate": 9.965477009515992e-06, "loss": 0.50179811, "memory(GiB)": 15.03, "step": 1380, "train_speed(iter/s)": 1.434378 }, { "acc": 0.91072559, "epoch": 2.444836716681377, "grad_norm": 12.853521347045898, "learning_rate": 9.970462120327354e-06, "loss": 0.51297169, "memory(GiB)": 15.03, "step": 1385, "train_speed(iter/s)": 1.43437 }, { "acc": 0.9085741, "epoch": 2.4536628420123567, "grad_norm": 10.283921241760254, "learning_rate": 9.975429266736478e-06, "loss": 0.54332867, "memory(GiB)": 15.03, "step": 1390, "train_speed(iter/s)": 1.434644 }, { "acc": 0.91670628, "epoch": 2.462488967343336, "grad_norm": 14.655674934387207, "learning_rate": 9.980378577752058e-06, "loss": 0.46605387, "memory(GiB)": 15.03, "step": 1395, "train_speed(iter/s)": 1.434481 }, { "acc": 0.90454617, "epoch": 2.471315092674316, "grad_norm": 9.654297828674316, "learning_rate": 9.985310180998072e-06, "loss": 0.55439892, "memory(GiB)": 15.03, "step": 1400, "train_speed(iter/s)": 1.434381 }, { "acc": 0.91288948, "epoch": 2.4801412180052957, "grad_norm": 12.461824417114258, "learning_rate": 9.99022420273353e-06, "loss": 0.45627351, "memory(GiB)": 15.03, "step": 1405, "train_speed(iter/s)": 1.43459 }, { "acc": 0.91507378, "epoch": 2.4889673433362756, "grad_norm": 8.104724884033203, "learning_rate": 9.995120767871867e-06, "loss": 0.47773552, "memory(GiB)": 15.03, "step": 1410, "train_speed(iter/s)": 1.434778 }, { "acc": 0.93443203, "epoch": 2.497793468667255, "grad_norm": 7.503716468811035, "learning_rate": 1e-05, "loss": 0.41085472, "memory(GiB)": 15.03, "step": 1415, "train_speed(iter/s)": 1.435124 }, { "acc": 0.91479282, "epoch": 2.5066195939982348, "grad_norm": 10.845869064331055, "learning_rate": 9.999999146671527e-06, "loss": 0.47397275, "memory(GiB)": 15.03, "step": 1420, "train_speed(iter/s)": 1.435336 }, { "acc": 0.91439838, "epoch": 2.5154457193292146, "grad_norm": 9.410380363464355, "learning_rate": 9.999996586686398e-06, "loss": 0.4462194, "memory(GiB)": 15.03, "step": 1425, "train_speed(iter/s)": 1.435394 }, { "acc": 0.91747313, "epoch": 2.524271844660194, "grad_norm": 9.56289291381836, "learning_rate": 9.999992320045489e-06, "loss": 0.46370864, "memory(GiB)": 15.03, "step": 1430, "train_speed(iter/s)": 1.435857 }, { "acc": 0.90350361, "epoch": 2.5330979699911738, "grad_norm": 13.972111701965332, "learning_rate": 9.999986346750253e-06, "loss": 0.50518618, "memory(GiB)": 15.03, "step": 1435, "train_speed(iter/s)": 1.436032 }, { "acc": 0.90905018, "epoch": 2.5419240953221536, "grad_norm": 10.242715835571289, "learning_rate": 9.999978666802733e-06, "loss": 0.53044767, "memory(GiB)": 15.03, "step": 1440, "train_speed(iter/s)": 1.435895 }, { "acc": 0.90639534, "epoch": 2.5507502206531334, "grad_norm": 9.2958345413208, "learning_rate": 9.999969280205549e-06, "loss": 0.52485456, "memory(GiB)": 15.03, "step": 1445, "train_speed(iter/s)": 1.436137 }, { "acc": 0.93085041, "epoch": 2.559576345984113, "grad_norm": 12.533552169799805, "learning_rate": 9.999958186961902e-06, "loss": 0.40665669, "memory(GiB)": 15.03, "step": 1450, "train_speed(iter/s)": 1.436474 }, { "acc": 0.92664146, "epoch": 2.5684024713150926, "grad_norm": 8.871023178100586, "learning_rate": 9.999945387075585e-06, "loss": 0.39843211, "memory(GiB)": 15.03, "step": 1455, "train_speed(iter/s)": 1.436662 }, { "acc": 0.91081667, "epoch": 2.5772285966460724, "grad_norm": 12.978819847106934, "learning_rate": 9.999930880550962e-06, "loss": 0.49279604, "memory(GiB)": 15.03, "step": 1460, "train_speed(iter/s)": 1.436367 }, { "acc": 0.90601749, "epoch": 2.586054721977052, "grad_norm": 9.939981460571289, "learning_rate": 9.99991466739299e-06, "loss": 0.5186614, "memory(GiB)": 15.03, "step": 1465, "train_speed(iter/s)": 1.436492 }, { "acc": 0.92673521, "epoch": 2.5948808473080316, "grad_norm": 11.815101623535156, "learning_rate": 9.9998967476072e-06, "loss": 0.44794822, "memory(GiB)": 15.03, "step": 1470, "train_speed(iter/s)": 1.436841 }, { "acc": 0.91323814, "epoch": 2.6037069726390114, "grad_norm": 12.08131217956543, "learning_rate": 9.999877121199708e-06, "loss": 0.47158709, "memory(GiB)": 15.03, "step": 1475, "train_speed(iter/s)": 1.437109 }, { "acc": 0.9161231, "epoch": 2.6125330979699912, "grad_norm": 13.018186569213867, "learning_rate": 9.999855788177218e-06, "loss": 0.49788733, "memory(GiB)": 15.03, "step": 1480, "train_speed(iter/s)": 1.437186 }, { "acc": 0.92422495, "epoch": 2.6213592233009706, "grad_norm": 8.610344886779785, "learning_rate": 9.99983274854701e-06, "loss": 0.44242597, "memory(GiB)": 15.03, "step": 1485, "train_speed(iter/s)": 1.437384 }, { "acc": 0.91069012, "epoch": 2.6301853486319504, "grad_norm": 10.765470504760742, "learning_rate": 9.999808002316949e-06, "loss": 0.48285084, "memory(GiB)": 15.03, "step": 1490, "train_speed(iter/s)": 1.437419 }, { "acc": 0.9235095, "epoch": 2.6390114739629302, "grad_norm": 7.9351983070373535, "learning_rate": 9.999781549495484e-06, "loss": 0.45113478, "memory(GiB)": 15.03, "step": 1495, "train_speed(iter/s)": 1.437389 }, { "acc": 0.92511139, "epoch": 2.64783759929391, "grad_norm": 9.12990951538086, "learning_rate": 9.999753390091642e-06, "loss": 0.44021358, "memory(GiB)": 15.03, "step": 1500, "train_speed(iter/s)": 1.437571 }, { "acc": 0.92693291, "epoch": 2.65666372462489, "grad_norm": 9.218193054199219, "learning_rate": 9.999723524115038e-06, "loss": 0.41648812, "memory(GiB)": 15.03, "step": 1505, "train_speed(iter/s)": 1.437415 }, { "acc": 0.91290226, "epoch": 2.6654898499558692, "grad_norm": 9.858386993408203, "learning_rate": 9.999691951575867e-06, "loss": 0.51713762, "memory(GiB)": 15.03, "step": 1510, "train_speed(iter/s)": 1.437305 }, { "acc": 0.91186714, "epoch": 2.674315975286849, "grad_norm": 13.941583633422852, "learning_rate": 9.999658672484907e-06, "loss": 0.52526731, "memory(GiB)": 15.03, "step": 1515, "train_speed(iter/s)": 1.437338 }, { "acc": 0.91082239, "epoch": 2.683142100617829, "grad_norm": 11.50890064239502, "learning_rate": 9.999623686853518e-06, "loss": 0.52128792, "memory(GiB)": 15.03, "step": 1520, "train_speed(iter/s)": 1.437434 }, { "acc": 0.92564583, "epoch": 2.6919682259488082, "grad_norm": 10.629166603088379, "learning_rate": 9.999586994693641e-06, "loss": 0.4109426, "memory(GiB)": 15.03, "step": 1525, "train_speed(iter/s)": 1.437485 }, { "acc": 0.90744705, "epoch": 2.700794351279788, "grad_norm": 9.501011848449707, "learning_rate": 9.999548596017806e-06, "loss": 0.54890652, "memory(GiB)": 15.03, "step": 1530, "train_speed(iter/s)": 1.437534 }, { "acc": 0.89879904, "epoch": 2.709620476610768, "grad_norm": 11.034645080566406, "learning_rate": 9.999508490839115e-06, "loss": 0.55875921, "memory(GiB)": 15.03, "step": 1535, "train_speed(iter/s)": 1.437615 }, { "acc": 0.92507906, "epoch": 2.7184466019417477, "grad_norm": 9.483128547668457, "learning_rate": 9.999466679171263e-06, "loss": 0.41303616, "memory(GiB)": 15.03, "step": 1540, "train_speed(iter/s)": 1.437854 }, { "acc": 0.91601171, "epoch": 2.7272727272727275, "grad_norm": 9.552450180053711, "learning_rate": 9.999423161028523e-06, "loss": 0.47476473, "memory(GiB)": 15.03, "step": 1545, "train_speed(iter/s)": 1.438047 }, { "acc": 0.91517248, "epoch": 2.736098852603707, "grad_norm": 15.098411560058594, "learning_rate": 9.999377936425747e-06, "loss": 0.48443136, "memory(GiB)": 15.03, "step": 1550, "train_speed(iter/s)": 1.438199 }, { "acc": 0.91606855, "epoch": 2.7449249779346867, "grad_norm": 9.451160430908203, "learning_rate": 9.999331005378377e-06, "loss": 0.46983757, "memory(GiB)": 15.03, "step": 1555, "train_speed(iter/s)": 1.438342 }, { "acc": 0.92634563, "epoch": 2.7537511032656665, "grad_norm": 38.79680633544922, "learning_rate": 9.999282367902432e-06, "loss": 0.42826118, "memory(GiB)": 15.03, "step": 1560, "train_speed(iter/s)": 1.438559 }, { "acc": 0.92093725, "epoch": 2.762577228596646, "grad_norm": 11.102478981018066, "learning_rate": 9.999232024014514e-06, "loss": 0.41878071, "memory(GiB)": 15.03, "step": 1565, "train_speed(iter/s)": 1.438721 }, { "acc": 0.91261034, "epoch": 2.7714033539276257, "grad_norm": 10.575754165649414, "learning_rate": 9.99917997373181e-06, "loss": 0.50506535, "memory(GiB)": 15.03, "step": 1570, "train_speed(iter/s)": 1.438879 }, { "acc": 0.92461147, "epoch": 2.7802294792586055, "grad_norm": 12.712259292602539, "learning_rate": 9.99912621707209e-06, "loss": 0.41079521, "memory(GiB)": 15.03, "step": 1575, "train_speed(iter/s)": 1.439075 }, { "acc": 0.90895329, "epoch": 2.789055604589585, "grad_norm": 25.274887084960938, "learning_rate": 9.999070754053702e-06, "loss": 0.52976913, "memory(GiB)": 15.03, "step": 1580, "train_speed(iter/s)": 1.439022 }, { "acc": 0.92069702, "epoch": 2.7978817299205647, "grad_norm": 13.451714515686035, "learning_rate": 9.999013584695579e-06, "loss": 0.44877977, "memory(GiB)": 15.03, "step": 1585, "train_speed(iter/s)": 1.439157 }, { "acc": 0.92738361, "epoch": 2.8067078552515445, "grad_norm": 11.272367477416992, "learning_rate": 9.998954709017237e-06, "loss": 0.41436558, "memory(GiB)": 15.03, "step": 1590, "train_speed(iter/s)": 1.439122 }, { "acc": 0.92893848, "epoch": 2.8155339805825244, "grad_norm": 10.496988296508789, "learning_rate": 9.998894127038776e-06, "loss": 0.40633864, "memory(GiB)": 15.03, "step": 1595, "train_speed(iter/s)": 1.439426 }, { "acc": 0.91919746, "epoch": 2.824360105913504, "grad_norm": 8.555822372436523, "learning_rate": 9.998831838780876e-06, "loss": 0.45057583, "memory(GiB)": 15.03, "step": 1600, "train_speed(iter/s)": 1.439582 }, { "acc": 0.92423344, "epoch": 2.8331862312444835, "grad_norm": 9.885980606079102, "learning_rate": 9.998767844264799e-06, "loss": 0.40626993, "memory(GiB)": 15.03, "step": 1605, "train_speed(iter/s)": 1.439724 }, { "acc": 0.92240124, "epoch": 2.8420123565754634, "grad_norm": 9.088223457336426, "learning_rate": 9.99870214351239e-06, "loss": 0.451612, "memory(GiB)": 15.03, "step": 1610, "train_speed(iter/s)": 1.439899 }, { "acc": 0.9265111, "epoch": 2.850838481906443, "grad_norm": 10.530967712402344, "learning_rate": 9.998634736546077e-06, "loss": 0.40844021, "memory(GiB)": 15.03, "step": 1615, "train_speed(iter/s)": 1.440018 }, { "acc": 0.91232758, "epoch": 2.8596646072374226, "grad_norm": 24.119035720825195, "learning_rate": 9.998565623388872e-06, "loss": 0.49725871, "memory(GiB)": 15.03, "step": 1620, "train_speed(iter/s)": 1.440163 }, { "acc": 0.91808786, "epoch": 2.8684907325684024, "grad_norm": 10.45950984954834, "learning_rate": 9.998494804064366e-06, "loss": 0.48162956, "memory(GiB)": 15.03, "step": 1625, "train_speed(iter/s)": 1.44008 }, { "acc": 0.92230215, "epoch": 2.877316857899382, "grad_norm": 14.900110244750977, "learning_rate": 9.998422278596739e-06, "loss": 0.45389352, "memory(GiB)": 15.03, "step": 1630, "train_speed(iter/s)": 1.440217 }, { "acc": 0.91437464, "epoch": 2.886142983230362, "grad_norm": 9.795614242553711, "learning_rate": 9.998348047010742e-06, "loss": 0.46637745, "memory(GiB)": 15.03, "step": 1635, "train_speed(iter/s)": 1.440582 }, { "acc": 0.91935959, "epoch": 2.894969108561342, "grad_norm": 5.393486976623535, "learning_rate": 9.998272109331718e-06, "loss": 0.42381935, "memory(GiB)": 15.03, "step": 1640, "train_speed(iter/s)": 1.440729 }, { "acc": 0.93192205, "epoch": 2.903795233892321, "grad_norm": 7.824235916137695, "learning_rate": 9.998194465585592e-06, "loss": 0.3914639, "memory(GiB)": 15.03, "step": 1645, "train_speed(iter/s)": 1.440885 }, { "acc": 0.93138628, "epoch": 2.912621359223301, "grad_norm": 9.248165130615234, "learning_rate": 9.998115115798865e-06, "loss": 0.38761487, "memory(GiB)": 15.03, "step": 1650, "train_speed(iter/s)": 1.441162 }, { "acc": 0.92396736, "epoch": 2.921447484554281, "grad_norm": 10.363149642944336, "learning_rate": 9.998034059998626e-06, "loss": 0.42818699, "memory(GiB)": 15.03, "step": 1655, "train_speed(iter/s)": 1.44129 }, { "acc": 0.92142315, "epoch": 2.93027360988526, "grad_norm": 12.772117614746094, "learning_rate": 9.997951298212544e-06, "loss": 0.44225616, "memory(GiB)": 15.03, "step": 1660, "train_speed(iter/s)": 1.44114 }, { "acc": 0.92935648, "epoch": 2.93909973521624, "grad_norm": 7.617005825042725, "learning_rate": 9.997866830468872e-06, "loss": 0.40684738, "memory(GiB)": 15.03, "step": 1665, "train_speed(iter/s)": 1.441324 }, { "acc": 0.92730713, "epoch": 2.94792586054722, "grad_norm": 10.592954635620117, "learning_rate": 9.997780656796445e-06, "loss": 0.41676388, "memory(GiB)": 15.03, "step": 1670, "train_speed(iter/s)": 1.441214 }, { "acc": 0.92717705, "epoch": 2.956751985878199, "grad_norm": 6.836135387420654, "learning_rate": 9.997692777224677e-06, "loss": 0.45691786, "memory(GiB)": 15.03, "step": 1675, "train_speed(iter/s)": 1.441341 }, { "acc": 0.9327837, "epoch": 2.965578111209179, "grad_norm": 8.894441604614258, "learning_rate": 9.997603191783569e-06, "loss": 0.37835226, "memory(GiB)": 15.03, "step": 1680, "train_speed(iter/s)": 1.441542 }, { "acc": 0.91993542, "epoch": 2.974404236540159, "grad_norm": 9.201231002807617, "learning_rate": 9.997511900503702e-06, "loss": 0.44014149, "memory(GiB)": 15.03, "step": 1685, "train_speed(iter/s)": 1.44168 }, { "acc": 0.92887487, "epoch": 2.9832303618711387, "grad_norm": 12.81490707397461, "learning_rate": 9.997418903416242e-06, "loss": 0.40680294, "memory(GiB)": 15.03, "step": 1690, "train_speed(iter/s)": 1.441851 }, { "acc": 0.91335526, "epoch": 2.9920564872021185, "grad_norm": 15.414380073547363, "learning_rate": 9.997324200552932e-06, "loss": 0.4620306, "memory(GiB)": 15.03, "step": 1695, "train_speed(iter/s)": 1.442085 }, { "acc": 0.91596775, "epoch": 3.000882612533098, "grad_norm": 10.496133804321289, "learning_rate": 9.997227791946099e-06, "loss": 0.41073246, "memory(GiB)": 15.03, "step": 1700, "train_speed(iter/s)": 1.441861 }, { "acc": 0.93006306, "epoch": 3.0097087378640777, "grad_norm": 8.612944602966309, "learning_rate": 9.997129677628656e-06, "loss": 0.36347353, "memory(GiB)": 15.03, "step": 1705, "train_speed(iter/s)": 1.442176 }, { "acc": 0.92991104, "epoch": 3.0185348631950575, "grad_norm": 12.620110511779785, "learning_rate": 9.997029857634095e-06, "loss": 0.36178682, "memory(GiB)": 15.03, "step": 1710, "train_speed(iter/s)": 1.442321 }, { "acc": 0.92197113, "epoch": 3.027360988526037, "grad_norm": 25.084766387939453, "learning_rate": 9.996928331996492e-06, "loss": 0.44765911, "memory(GiB)": 15.03, "step": 1715, "train_speed(iter/s)": 1.442476 }, { "acc": 0.91672621, "epoch": 3.0361871138570167, "grad_norm": 11.473336219787598, "learning_rate": 9.996825100750506e-06, "loss": 0.50118823, "memory(GiB)": 15.03, "step": 1720, "train_speed(iter/s)": 1.442556 }, { "acc": 0.92360458, "epoch": 3.0450132391879965, "grad_norm": 13.121673583984375, "learning_rate": 9.996720163931374e-06, "loss": 0.39821296, "memory(GiB)": 15.03, "step": 1725, "train_speed(iter/s)": 1.442537 }, { "acc": 0.93307228, "epoch": 3.0538393645189763, "grad_norm": 6.411123752593994, "learning_rate": 9.996613521574917e-06, "loss": 0.3612381, "memory(GiB)": 15.03, "step": 1730, "train_speed(iter/s)": 1.442789 }, { "acc": 0.93234053, "epoch": 3.0626654898499557, "grad_norm": 9.961936950683594, "learning_rate": 9.996505173717542e-06, "loss": 0.40973635, "memory(GiB)": 15.03, "step": 1735, "train_speed(iter/s)": 1.44283 }, { "acc": 0.93364954, "epoch": 3.0714916151809355, "grad_norm": 7.657005786895752, "learning_rate": 9.996395120396232e-06, "loss": 0.40301476, "memory(GiB)": 15.03, "step": 1740, "train_speed(iter/s)": 1.4431 }, { "acc": 0.93448582, "epoch": 3.0803177405119153, "grad_norm": 7.878607749938965, "learning_rate": 9.996283361648558e-06, "loss": 0.34267349, "memory(GiB)": 15.03, "step": 1745, "train_speed(iter/s)": 1.443276 }, { "acc": 0.92159805, "epoch": 3.089143865842895, "grad_norm": 8.601284980773926, "learning_rate": 9.996169897512668e-06, "loss": 0.4696209, "memory(GiB)": 15.03, "step": 1750, "train_speed(iter/s)": 1.443659 }, { "acc": 0.91161327, "epoch": 3.0979699911738745, "grad_norm": 10.683494567871094, "learning_rate": 9.996054728027298e-06, "loss": 0.49714108, "memory(GiB)": 15.03, "step": 1755, "train_speed(iter/s)": 1.443829 }, { "acc": 0.92917929, "epoch": 3.1067961165048543, "grad_norm": 8.245281219482422, "learning_rate": 9.995937853231761e-06, "loss": 0.40738068, "memory(GiB)": 15.03, "step": 1760, "train_speed(iter/s)": 1.444056 }, { "acc": 0.91678982, "epoch": 3.115622241835834, "grad_norm": 8.363460540771484, "learning_rate": 9.995819273165952e-06, "loss": 0.46374044, "memory(GiB)": 15.03, "step": 1765, "train_speed(iter/s)": 1.444086 }, { "acc": 0.93560295, "epoch": 3.124448367166814, "grad_norm": 10.532877922058105, "learning_rate": 9.995698987870355e-06, "loss": 0.37170763, "memory(GiB)": 15.03, "step": 1770, "train_speed(iter/s)": 1.44411 }, { "acc": 0.93965921, "epoch": 3.1332744924977933, "grad_norm": 11.120150566101074, "learning_rate": 9.995576997386028e-06, "loss": 0.36875865, "memory(GiB)": 15.03, "step": 1775, "train_speed(iter/s)": 1.444024 }, { "acc": 0.92877274, "epoch": 3.142100617828773, "grad_norm": 20.721811294555664, "learning_rate": 9.995453301754616e-06, "loss": 0.41674538, "memory(GiB)": 15.03, "step": 1780, "train_speed(iter/s)": 1.444337 }, { "acc": 0.9250926, "epoch": 3.150926743159753, "grad_norm": 10.784285545349121, "learning_rate": 9.995327901018341e-06, "loss": 0.44530206, "memory(GiB)": 15.03, "step": 1785, "train_speed(iter/s)": 1.4444 }, { "acc": 0.93565578, "epoch": 3.159752868490733, "grad_norm": 7.925683975219727, "learning_rate": 9.995200795220015e-06, "loss": 0.37676444, "memory(GiB)": 15.03, "step": 1790, "train_speed(iter/s)": 1.444724 }, { "acc": 0.92495365, "epoch": 3.168578993821712, "grad_norm": 11.855972290039062, "learning_rate": 9.995071984403024e-06, "loss": 0.40637865, "memory(GiB)": 15.03, "step": 1795, "train_speed(iter/s)": 1.445038 }, { "acc": 0.94980812, "epoch": 3.177405119152692, "grad_norm": 6.406301975250244, "learning_rate": 9.994941468611343e-06, "loss": 0.2946867, "memory(GiB)": 15.03, "step": 1800, "train_speed(iter/s)": 1.445109 }, { "acc": 0.93205032, "epoch": 3.186231244483672, "grad_norm": 10.18293285369873, "learning_rate": 9.99480924788952e-06, "loss": 0.36972222, "memory(GiB)": 15.03, "step": 1805, "train_speed(iter/s)": 1.445215 }, { "acc": 0.93153667, "epoch": 3.195057369814651, "grad_norm": 10.638223648071289, "learning_rate": 9.994675322282698e-06, "loss": 0.37495389, "memory(GiB)": 15.03, "step": 1810, "train_speed(iter/s)": 1.445342 }, { "acc": 0.92794304, "epoch": 3.203883495145631, "grad_norm": 22.45987892150879, "learning_rate": 9.99453969183659e-06, "loss": 0.44019284, "memory(GiB)": 15.03, "step": 1815, "train_speed(iter/s)": 1.445601 }, { "acc": 0.92340298, "epoch": 3.212709620476611, "grad_norm": 12.667778015136719, "learning_rate": 9.994402356597495e-06, "loss": 0.44926476, "memory(GiB)": 15.03, "step": 1820, "train_speed(iter/s)": 1.445804 }, { "acc": 0.93981352, "epoch": 3.2215357458075906, "grad_norm": 7.947700023651123, "learning_rate": 9.994263316612295e-06, "loss": 0.34779224, "memory(GiB)": 15.03, "step": 1825, "train_speed(iter/s)": 1.446005 }, { "acc": 0.93049936, "epoch": 3.23036187113857, "grad_norm": 9.938447952270508, "learning_rate": 9.994122571928457e-06, "loss": 0.38192043, "memory(GiB)": 15.03, "step": 1830, "train_speed(iter/s)": 1.446246 }, { "acc": 0.94139709, "epoch": 3.23918799646955, "grad_norm": 9.439772605895996, "learning_rate": 9.993980122594024e-06, "loss": 0.33498843, "memory(GiB)": 15.03, "step": 1835, "train_speed(iter/s)": 1.44634 }, { "acc": 0.92645683, "epoch": 3.2480141218005296, "grad_norm": 10.064387321472168, "learning_rate": 9.993835968657619e-06, "loss": 0.38324258, "memory(GiB)": 15.03, "step": 1840, "train_speed(iter/s)": 1.446597 }, { "acc": 0.92988558, "epoch": 3.2568402471315094, "grad_norm": 11.949687004089355, "learning_rate": 9.99369011016846e-06, "loss": 0.40098391, "memory(GiB)": 15.03, "step": 1845, "train_speed(iter/s)": 1.446712 }, { "acc": 0.92839489, "epoch": 3.265666372462489, "grad_norm": 12.320834159851074, "learning_rate": 9.99354254717633e-06, "loss": 0.43135424, "memory(GiB)": 15.03, "step": 1850, "train_speed(iter/s)": 1.44667 }, { "acc": 0.93079338, "epoch": 3.2744924977934686, "grad_norm": 9.834044456481934, "learning_rate": 9.993393279731605e-06, "loss": 0.35781455, "memory(GiB)": 15.03, "step": 1855, "train_speed(iter/s)": 1.446597 }, { "acc": 0.92115841, "epoch": 3.2833186231244484, "grad_norm": 13.446786880493164, "learning_rate": 9.99324230788524e-06, "loss": 0.45759153, "memory(GiB)": 15.03, "step": 1860, "train_speed(iter/s)": 1.44683 }, { "acc": 0.93713207, "epoch": 3.2921447484554283, "grad_norm": 7.847641944885254, "learning_rate": 9.993089631688772e-06, "loss": 0.36037784, "memory(GiB)": 15.03, "step": 1865, "train_speed(iter/s)": 1.446777 }, { "acc": 0.93007889, "epoch": 3.3009708737864076, "grad_norm": 8.881616592407227, "learning_rate": 9.992935251194322e-06, "loss": 0.35024102, "memory(GiB)": 15.03, "step": 1870, "train_speed(iter/s)": 1.446684 }, { "acc": 0.93119621, "epoch": 3.3097969991173875, "grad_norm": 7.268507480621338, "learning_rate": 9.992779166454584e-06, "loss": 0.40884485, "memory(GiB)": 15.03, "step": 1875, "train_speed(iter/s)": 1.446786 }, { "acc": 0.92583427, "epoch": 3.3186231244483673, "grad_norm": 10.706664085388184, "learning_rate": 9.992621377522843e-06, "loss": 0.37923436, "memory(GiB)": 15.03, "step": 1880, "train_speed(iter/s)": 1.446839 }, { "acc": 0.93032026, "epoch": 3.327449249779347, "grad_norm": 6.269292831420898, "learning_rate": 9.992461884452962e-06, "loss": 0.36880486, "memory(GiB)": 15.03, "step": 1885, "train_speed(iter/s)": 1.44692 }, { "acc": 0.93108664, "epoch": 3.3362753751103265, "grad_norm": 16.116668701171875, "learning_rate": 9.992300687299389e-06, "loss": 0.41503286, "memory(GiB)": 15.03, "step": 1890, "train_speed(iter/s)": 1.447113 }, { "acc": 0.93694553, "epoch": 3.3451015004413063, "grad_norm": 8.66443920135498, "learning_rate": 9.992137786117148e-06, "loss": 0.35943022, "memory(GiB)": 15.03, "step": 1895, "train_speed(iter/s)": 1.447307 }, { "acc": 0.93709908, "epoch": 3.353927625772286, "grad_norm": 10.041414260864258, "learning_rate": 9.99197318096185e-06, "loss": 0.34848833, "memory(GiB)": 15.03, "step": 1900, "train_speed(iter/s)": 1.447349 }, { "acc": 0.94369736, "epoch": 3.3627537511032655, "grad_norm": 8.689764022827148, "learning_rate": 9.991806871889686e-06, "loss": 0.33718417, "memory(GiB)": 15.03, "step": 1905, "train_speed(iter/s)": 1.447582 }, { "acc": 0.93276186, "epoch": 3.3715798764342453, "grad_norm": 8.173494338989258, "learning_rate": 9.991638858957425e-06, "loss": 0.41728215, "memory(GiB)": 15.03, "step": 1910, "train_speed(iter/s)": 1.447752 }, { "acc": 0.93069305, "epoch": 3.380406001765225, "grad_norm": 8.409926414489746, "learning_rate": 9.991469142222422e-06, "loss": 0.35446727, "memory(GiB)": 15.03, "step": 1915, "train_speed(iter/s)": 1.447677 }, { "acc": 0.9335125, "epoch": 3.389232127096205, "grad_norm": 5.451147079467773, "learning_rate": 9.991297721742612e-06, "loss": 0.3911902, "memory(GiB)": 15.03, "step": 1920, "train_speed(iter/s)": 1.447787 }, { "acc": 0.93243074, "epoch": 3.3980582524271843, "grad_norm": 6.618557453155518, "learning_rate": 9.991124597576517e-06, "loss": 0.37362843, "memory(GiB)": 15.03, "step": 1925, "train_speed(iter/s)": 1.447649 }, { "acc": 0.93783436, "epoch": 3.406884377758164, "grad_norm": 10.176216125488281, "learning_rate": 9.99094976978323e-06, "loss": 0.35632768, "memory(GiB)": 15.03, "step": 1930, "train_speed(iter/s)": 1.447676 }, { "acc": 0.93860359, "epoch": 3.415710503089144, "grad_norm": 8.94262981414795, "learning_rate": 9.990773238422433e-06, "loss": 0.35273445, "memory(GiB)": 15.03, "step": 1935, "train_speed(iter/s)": 1.447883 }, { "acc": 0.93940506, "epoch": 3.4245366284201237, "grad_norm": 8.16425895690918, "learning_rate": 9.990595003554388e-06, "loss": 0.32691717, "memory(GiB)": 15.03, "step": 1940, "train_speed(iter/s)": 1.447889 }, { "acc": 0.93767681, "epoch": 3.433362753751103, "grad_norm": 6.031472206115723, "learning_rate": 9.990415065239937e-06, "loss": 0.38795612, "memory(GiB)": 15.03, "step": 1945, "train_speed(iter/s)": 1.447897 }, { "acc": 0.92888899, "epoch": 3.442188879082083, "grad_norm": 10.497140884399414, "learning_rate": 9.990233423540506e-06, "loss": 0.38416359, "memory(GiB)": 15.03, "step": 1950, "train_speed(iter/s)": 1.448008 }, { "acc": 0.95056953, "epoch": 3.4510150044130627, "grad_norm": 9.19324016571045, "learning_rate": 9.9900500785181e-06, "loss": 0.29163241, "memory(GiB)": 15.03, "step": 1955, "train_speed(iter/s)": 1.448005 }, { "acc": 0.92808342, "epoch": 3.459841129744042, "grad_norm": 8.903212547302246, "learning_rate": 9.98986503023531e-06, "loss": 0.41854801, "memory(GiB)": 15.03, "step": 1960, "train_speed(iter/s)": 1.448039 }, { "acc": 0.93619547, "epoch": 3.468667255075022, "grad_norm": 14.323844909667969, "learning_rate": 9.989678278755302e-06, "loss": 0.35968173, "memory(GiB)": 15.03, "step": 1965, "train_speed(iter/s)": 1.448068 }, { "acc": 0.93726959, "epoch": 3.4774933804060018, "grad_norm": 6.66330623626709, "learning_rate": 9.989489824141826e-06, "loss": 0.34058928, "memory(GiB)": 15.03, "step": 1970, "train_speed(iter/s)": 1.448298 }, { "acc": 0.93845043, "epoch": 3.4863195057369816, "grad_norm": 6.9664459228515625, "learning_rate": 9.989299666459216e-06, "loss": 0.3447557, "memory(GiB)": 15.03, "step": 1975, "train_speed(iter/s)": 1.448263 }, { "acc": 0.94192743, "epoch": 3.4951456310679614, "grad_norm": 8.48476505279541, "learning_rate": 9.989107805772383e-06, "loss": 0.30749023, "memory(GiB)": 15.03, "step": 1980, "train_speed(iter/s)": 1.448228 }, { "acc": 0.9339571, "epoch": 3.5039717563989408, "grad_norm": 7.95899772644043, "learning_rate": 9.988914242146825e-06, "loss": 0.37668972, "memory(GiB)": 15.03, "step": 1985, "train_speed(iter/s)": 1.448181 }, { "acc": 0.93510284, "epoch": 3.5127978817299206, "grad_norm": 13.49086856842041, "learning_rate": 9.988718975648614e-06, "loss": 0.3647517, "memory(GiB)": 15.03, "step": 1990, "train_speed(iter/s)": 1.44827 }, { "acc": 0.92819691, "epoch": 3.5216240070609004, "grad_norm": 10.567143440246582, "learning_rate": 9.988522006344412e-06, "loss": 0.39109797, "memory(GiB)": 15.03, "step": 1995, "train_speed(iter/s)": 1.448363 }, { "acc": 0.94350758, "epoch": 3.5304501323918798, "grad_norm": 6.434574604034424, "learning_rate": 9.988323334301451e-06, "loss": 0.30188451, "memory(GiB)": 15.03, "step": 2000, "train_speed(iter/s)": 1.448491 }, { "acc": 0.93268843, "epoch": 3.5392762577228596, "grad_norm": 8.850269317626953, "learning_rate": 9.988122959587558e-06, "loss": 0.37216716, "memory(GiB)": 15.03, "step": 2005, "train_speed(iter/s)": 1.448591 }, { "acc": 0.9268961, "epoch": 3.5481023830538394, "grad_norm": 9.690704345703125, "learning_rate": 9.987920882271128e-06, "loss": 0.42502141, "memory(GiB)": 15.03, "step": 2010, "train_speed(iter/s)": 1.448771 }, { "acc": 0.9358077, "epoch": 3.556928508384819, "grad_norm": 7.753601551055908, "learning_rate": 9.987717102421148e-06, "loss": 0.37521212, "memory(GiB)": 15.03, "step": 2015, "train_speed(iter/s)": 1.448777 }, { "acc": 0.93560925, "epoch": 3.565754633715799, "grad_norm": 6.726711273193359, "learning_rate": 9.987511620107177e-06, "loss": 0.34460487, "memory(GiB)": 15.03, "step": 2020, "train_speed(iter/s)": 1.448933 }, { "acc": 0.93183765, "epoch": 3.5745807590467784, "grad_norm": 10.697713851928711, "learning_rate": 9.987304435399363e-06, "loss": 0.36716137, "memory(GiB)": 15.03, "step": 2025, "train_speed(iter/s)": 1.449016 }, { "acc": 0.94547424, "epoch": 3.5834068843777582, "grad_norm": 8.238561630249023, "learning_rate": 9.987095548368428e-06, "loss": 0.2723983, "memory(GiB)": 15.03, "step": 2030, "train_speed(iter/s)": 1.44898 }, { "acc": 0.9294857, "epoch": 3.592233009708738, "grad_norm": 8.923322677612305, "learning_rate": 9.986884959085684e-06, "loss": 0.39490113, "memory(GiB)": 15.03, "step": 2035, "train_speed(iter/s)": 1.449 }, { "acc": 0.93530645, "epoch": 3.6010591350397174, "grad_norm": 10.014941215515137, "learning_rate": 9.986672667623014e-06, "loss": 0.37584848, "memory(GiB)": 15.03, "step": 2040, "train_speed(iter/s)": 1.44896 }, { "acc": 0.94162388, "epoch": 3.6098852603706972, "grad_norm": 9.228662490844727, "learning_rate": 9.986458674052892e-06, "loss": 0.34758587, "memory(GiB)": 15.03, "step": 2045, "train_speed(iter/s)": 1.449322 }, { "acc": 0.94669075, "epoch": 3.618711385701677, "grad_norm": 9.324403762817383, "learning_rate": 9.986242978448363e-06, "loss": 0.31798372, "memory(GiB)": 15.03, "step": 2050, "train_speed(iter/s)": 1.449438 }, { "acc": 0.9382865, "epoch": 3.6275375110326564, "grad_norm": 8.432581901550293, "learning_rate": 9.986025580883063e-06, "loss": 0.34622166, "memory(GiB)": 15.03, "step": 2055, "train_speed(iter/s)": 1.449419 }, { "acc": 0.93154411, "epoch": 3.6363636363636362, "grad_norm": 9.631586074829102, "learning_rate": 9.985806481431198e-06, "loss": 0.41391735, "memory(GiB)": 15.03, "step": 2060, "train_speed(iter/s)": 1.449787 }, { "acc": 0.93705235, "epoch": 3.645189761694616, "grad_norm": 11.179079055786133, "learning_rate": 9.985585680167567e-06, "loss": 0.3315134, "memory(GiB)": 15.03, "step": 2065, "train_speed(iter/s)": 1.449885 }, { "acc": 0.93655891, "epoch": 3.654015887025596, "grad_norm": 16.42244529724121, "learning_rate": 9.985363177167539e-06, "loss": 0.3871038, "memory(GiB)": 15.03, "step": 2070, "train_speed(iter/s)": 1.449816 }, { "acc": 0.94565544, "epoch": 3.6628420123565757, "grad_norm": 6.83647346496582, "learning_rate": 9.985138972507073e-06, "loss": 0.27672806, "memory(GiB)": 15.03, "step": 2075, "train_speed(iter/s)": 1.449891 }, { "acc": 0.93317947, "epoch": 3.671668137687555, "grad_norm": 8.646677017211914, "learning_rate": 9.984913066262702e-06, "loss": 0.36612716, "memory(GiB)": 15.03, "step": 2080, "train_speed(iter/s)": 1.450002 }, { "acc": 0.92963133, "epoch": 3.680494263018535, "grad_norm": 16.891843795776367, "learning_rate": 9.984685458511543e-06, "loss": 0.39781644, "memory(GiB)": 15.03, "step": 2085, "train_speed(iter/s)": 1.450129 }, { "acc": 0.94397411, "epoch": 3.6893203883495147, "grad_norm": 7.045755863189697, "learning_rate": 9.984456149331294e-06, "loss": 0.31007366, "memory(GiB)": 15.03, "step": 2090, "train_speed(iter/s)": 1.450338 }, { "acc": 0.95009279, "epoch": 3.698146513680494, "grad_norm": 15.341543197631836, "learning_rate": 9.984225138800235e-06, "loss": 0.2899507, "memory(GiB)": 15.03, "step": 2095, "train_speed(iter/s)": 1.450416 }, { "acc": 0.93729029, "epoch": 3.706972639011474, "grad_norm": 11.889809608459473, "learning_rate": 9.98399242699722e-06, "loss": 0.3717833, "memory(GiB)": 15.03, "step": 2100, "train_speed(iter/s)": 1.450522 }, { "acc": 0.94671021, "epoch": 3.7157987643424537, "grad_norm": 8.337148666381836, "learning_rate": 9.983758014001693e-06, "loss": 0.31005697, "memory(GiB)": 15.03, "step": 2105, "train_speed(iter/s)": 1.450336 }, { "acc": 0.93042679, "epoch": 3.7246248896734335, "grad_norm": 11.223953247070312, "learning_rate": 9.983521899893674e-06, "loss": 0.38562946, "memory(GiB)": 15.03, "step": 2110, "train_speed(iter/s)": 1.450292 }, { "acc": 0.94258099, "epoch": 3.733451015004413, "grad_norm": 11.590808868408203, "learning_rate": 9.983284084753764e-06, "loss": 0.32557845, "memory(GiB)": 15.03, "step": 2115, "train_speed(iter/s)": 1.450347 }, { "acc": 0.93942394, "epoch": 3.7422771403353927, "grad_norm": 10.736227989196777, "learning_rate": 9.983044568663144e-06, "loss": 0.34112463, "memory(GiB)": 15.03, "step": 2120, "train_speed(iter/s)": 1.450373 }, { "acc": 0.93017559, "epoch": 3.7511032656663725, "grad_norm": 9.187634468078613, "learning_rate": 9.982803351703579e-06, "loss": 0.36631005, "memory(GiB)": 15.03, "step": 2125, "train_speed(iter/s)": 1.450593 }, { "acc": 0.93616123, "epoch": 3.7599293909973523, "grad_norm": 7.576179027557373, "learning_rate": 9.982560433957407e-06, "loss": 0.37766187, "memory(GiB)": 15.03, "step": 2130, "train_speed(iter/s)": 1.450662 }, { "acc": 0.92599773, "epoch": 3.7687555163283317, "grad_norm": 9.633357048034668, "learning_rate": 9.982315815507557e-06, "loss": 0.45776358, "memory(GiB)": 15.03, "step": 2135, "train_speed(iter/s)": 1.450733 }, { "acc": 0.94480715, "epoch": 3.7775816416593115, "grad_norm": 6.668014049530029, "learning_rate": 9.98206949643753e-06, "loss": 0.30696549, "memory(GiB)": 15.03, "step": 2140, "train_speed(iter/s)": 1.450791 }, { "acc": 0.94593382, "epoch": 3.7864077669902914, "grad_norm": 7.045835494995117, "learning_rate": 9.981821476831413e-06, "loss": 0.27948227, "memory(GiB)": 15.03, "step": 2145, "train_speed(iter/s)": 1.450625 }, { "acc": 0.93838778, "epoch": 3.7952338923212707, "grad_norm": 10.308120727539062, "learning_rate": 9.981571756773872e-06, "loss": 0.32509913, "memory(GiB)": 15.03, "step": 2150, "train_speed(iter/s)": 1.45084 }, { "acc": 0.9486846, "epoch": 3.8040600176522505, "grad_norm": 7.812724590301514, "learning_rate": 9.98132033635015e-06, "loss": 0.27981424, "memory(GiB)": 15.03, "step": 2155, "train_speed(iter/s)": 1.450693 }, { "acc": 0.9342329, "epoch": 3.8128861429832304, "grad_norm": 5.950748443603516, "learning_rate": 9.981067215646074e-06, "loss": 0.36350698, "memory(GiB)": 15.03, "step": 2160, "train_speed(iter/s)": 1.450935 }, { "acc": 0.93163137, "epoch": 3.82171226831421, "grad_norm": 7.924899578094482, "learning_rate": 9.980812394748052e-06, "loss": 0.34848399, "memory(GiB)": 15.03, "step": 2165, "train_speed(iter/s)": 1.451045 }, { "acc": 0.93358498, "epoch": 3.83053839364519, "grad_norm": 17.364398956298828, "learning_rate": 9.98055587374307e-06, "loss": 0.41398993, "memory(GiB)": 15.03, "step": 2170, "train_speed(iter/s)": 1.451046 }, { "acc": 0.94582224, "epoch": 3.8393645189761694, "grad_norm": 9.263498306274414, "learning_rate": 9.980297652718695e-06, "loss": 0.28851457, "memory(GiB)": 15.03, "step": 2175, "train_speed(iter/s)": 1.451019 }, { "acc": 0.92531204, "epoch": 3.848190644307149, "grad_norm": 11.733774185180664, "learning_rate": 9.980037731763077e-06, "loss": 0.47173033, "memory(GiB)": 15.03, "step": 2180, "train_speed(iter/s)": 1.450909 }, { "acc": 0.94763737, "epoch": 3.857016769638129, "grad_norm": 6.657760143280029, "learning_rate": 9.979776110964941e-06, "loss": 0.27988451, "memory(GiB)": 15.03, "step": 2185, "train_speed(iter/s)": 1.451124 }, { "acc": 0.94457998, "epoch": 3.8658428949691084, "grad_norm": 9.497953414916992, "learning_rate": 9.979512790413598e-06, "loss": 0.32475986, "memory(GiB)": 15.03, "step": 2190, "train_speed(iter/s)": 1.451022 }, { "acc": 0.93833637, "epoch": 3.874669020300088, "grad_norm": 11.01041030883789, "learning_rate": 9.979247770198937e-06, "loss": 0.36780593, "memory(GiB)": 15.03, "step": 2195, "train_speed(iter/s)": 1.451168 }, { "acc": 0.95207644, "epoch": 3.883495145631068, "grad_norm": 29.846393585205078, "learning_rate": 9.978981050411422e-06, "loss": 0.27997556, "memory(GiB)": 15.03, "step": 2200, "train_speed(iter/s)": 1.451264 }, { "acc": 0.93569727, "epoch": 3.8923212709620474, "grad_norm": 16.643718719482422, "learning_rate": 9.978712631142107e-06, "loss": 0.37598341, "memory(GiB)": 15.03, "step": 2205, "train_speed(iter/s)": 1.451432 }, { "acc": 0.94042149, "epoch": 3.901147396293027, "grad_norm": 14.059160232543945, "learning_rate": 9.978442512482618e-06, "loss": 0.35242703, "memory(GiB)": 15.03, "step": 2210, "train_speed(iter/s)": 1.451479 }, { "acc": 0.9380125, "epoch": 3.909973521624007, "grad_norm": 8.852974891662598, "learning_rate": 9.978170694525168e-06, "loss": 0.366099, "memory(GiB)": 15.03, "step": 2215, "train_speed(iter/s)": 1.451474 }, { "acc": 0.94673157, "epoch": 3.918799646954987, "grad_norm": 8.188931465148926, "learning_rate": 9.977897177362541e-06, "loss": 0.31311717, "memory(GiB)": 15.03, "step": 2220, "train_speed(iter/s)": 1.451592 }, { "acc": 0.93377323, "epoch": 3.9276257722859667, "grad_norm": 11.441035270690918, "learning_rate": 9.97762196108811e-06, "loss": 0.39126635, "memory(GiB)": 15.03, "step": 2225, "train_speed(iter/s)": 1.451667 }, { "acc": 0.94491367, "epoch": 3.936451897616946, "grad_norm": 9.766624450683594, "learning_rate": 9.977345045795824e-06, "loss": 0.31463122, "memory(GiB)": 15.03, "step": 2230, "train_speed(iter/s)": 1.451719 }, { "acc": 0.94517441, "epoch": 3.945278022947926, "grad_norm": 8.786040306091309, "learning_rate": 9.977066431580212e-06, "loss": 0.34136262, "memory(GiB)": 15.03, "step": 2235, "train_speed(iter/s)": 1.451827 }, { "acc": 0.94598513, "epoch": 3.9541041482789057, "grad_norm": 8.186838150024414, "learning_rate": 9.976786118536382e-06, "loss": 0.31374192, "memory(GiB)": 15.03, "step": 2240, "train_speed(iter/s)": 1.451959 }, { "acc": 0.94927216, "epoch": 3.962930273609885, "grad_norm": 7.016834735870361, "learning_rate": 9.976504106760024e-06, "loss": 0.28411503, "memory(GiB)": 15.03, "step": 2245, "train_speed(iter/s)": 1.451799 }, { "acc": 0.94466734, "epoch": 3.971756398940865, "grad_norm": 7.712978839874268, "learning_rate": 9.976220396347406e-06, "loss": 0.31819024, "memory(GiB)": 15.03, "step": 2250, "train_speed(iter/s)": 1.451999 }, { "acc": 0.93759527, "epoch": 3.9805825242718447, "grad_norm": 11.348342895507812, "learning_rate": 9.975934987395382e-06, "loss": 0.38504801, "memory(GiB)": 15.03, "step": 2255, "train_speed(iter/s)": 1.452133 }, { "acc": 0.93606739, "epoch": 3.9894086496028245, "grad_norm": 8.484078407287598, "learning_rate": 9.975647880001374e-06, "loss": 0.39043746, "memory(GiB)": 15.03, "step": 2260, "train_speed(iter/s)": 1.452307 }, { "acc": 0.94718084, "epoch": 3.9982347749338043, "grad_norm": 10.411121368408203, "learning_rate": 9.975359074263396e-06, "loss": 0.30305748, "memory(GiB)": 15.03, "step": 2265, "train_speed(iter/s)": 1.452288 }, { "acc": 0.9435051, "epoch": 4.007060900264784, "grad_norm": 7.71547269821167, "learning_rate": 9.975068570280032e-06, "loss": 0.3041559, "memory(GiB)": 15.03, "step": 2270, "train_speed(iter/s)": 1.452058 }, { "acc": 0.94648037, "epoch": 4.015887025595763, "grad_norm": 10.329819679260254, "learning_rate": 9.974776368150452e-06, "loss": 0.30379939, "memory(GiB)": 15.03, "step": 2275, "train_speed(iter/s)": 1.4522 }, { "acc": 0.9479311, "epoch": 4.024713150926743, "grad_norm": 6.286107063293457, "learning_rate": 9.974482467974406e-06, "loss": 0.29272232, "memory(GiB)": 15.03, "step": 2280, "train_speed(iter/s)": 1.452098 }, { "acc": 0.95113354, "epoch": 4.033539276257723, "grad_norm": 7.296119689941406, "learning_rate": 9.974186869852218e-06, "loss": 0.270626, "memory(GiB)": 15.03, "step": 2285, "train_speed(iter/s)": 1.452235 }, { "acc": 0.95375395, "epoch": 4.042365401588703, "grad_norm": 5.868255138397217, "learning_rate": 9.973889573884794e-06, "loss": 0.28094878, "memory(GiB)": 15.03, "step": 2290, "train_speed(iter/s)": 1.45221 }, { "acc": 0.95006533, "epoch": 4.051191526919682, "grad_norm": 11.629537582397461, "learning_rate": 9.973590580173625e-06, "loss": 0.28958757, "memory(GiB)": 15.03, "step": 2295, "train_speed(iter/s)": 1.452172 }, { "acc": 0.95209246, "epoch": 4.060017652250662, "grad_norm": 13.971942901611328, "learning_rate": 9.973289888820774e-06, "loss": 0.26458087, "memory(GiB)": 15.03, "step": 2300, "train_speed(iter/s)": 1.452297 }, { "acc": 0.94166994, "epoch": 4.068843777581642, "grad_norm": 9.324275016784668, "learning_rate": 9.972987499928888e-06, "loss": 0.31910219, "memory(GiB)": 15.03, "step": 2305, "train_speed(iter/s)": 1.45227 }, { "acc": 0.93814611, "epoch": 4.077669902912621, "grad_norm": 8.166544914245605, "learning_rate": 9.972683413601191e-06, "loss": 0.38942082, "memory(GiB)": 15.03, "step": 2310, "train_speed(iter/s)": 1.452453 }, { "acc": 0.94114628, "epoch": 4.086496028243601, "grad_norm": 7.318504333496094, "learning_rate": 9.97237762994149e-06, "loss": 0.33835878, "memory(GiB)": 15.03, "step": 2315, "train_speed(iter/s)": 1.452598 }, { "acc": 0.94946842, "epoch": 4.095322153574581, "grad_norm": 7.230982780456543, "learning_rate": 9.972070149054165e-06, "loss": 0.28253841, "memory(GiB)": 15.03, "step": 2320, "train_speed(iter/s)": 1.452601 }, { "acc": 0.94167747, "epoch": 4.10414827890556, "grad_norm": 14.483214378356934, "learning_rate": 9.971760971044182e-06, "loss": 0.34756012, "memory(GiB)": 15.03, "step": 2325, "train_speed(iter/s)": 1.452668 }, { "acc": 0.95287247, "epoch": 4.112974404236541, "grad_norm": 6.371538162231445, "learning_rate": 9.971450096017084e-06, "loss": 0.28174186, "memory(GiB)": 15.03, "step": 2330, "train_speed(iter/s)": 1.452767 }, { "acc": 0.94936199, "epoch": 4.12180052956752, "grad_norm": 5.4877519607543945, "learning_rate": 9.971137524078993e-06, "loss": 0.27956462, "memory(GiB)": 15.03, "step": 2335, "train_speed(iter/s)": 1.452848 }, { "acc": 0.94989576, "epoch": 4.130626654898499, "grad_norm": 10.321344375610352, "learning_rate": 9.970823255336609e-06, "loss": 0.28006225, "memory(GiB)": 15.03, "step": 2340, "train_speed(iter/s)": 1.452946 }, { "acc": 0.95724287, "epoch": 4.13945278022948, "grad_norm": 8.259578704833984, "learning_rate": 9.970507289897214e-06, "loss": 0.26039498, "memory(GiB)": 15.03, "step": 2345, "train_speed(iter/s)": 1.452997 }, { "acc": 0.95693817, "epoch": 4.148278905560459, "grad_norm": 6.171472549438477, "learning_rate": 9.970189627868664e-06, "loss": 0.22587276, "memory(GiB)": 15.03, "step": 2350, "train_speed(iter/s)": 1.453055 }, { "acc": 0.9576088, "epoch": 4.157105030891438, "grad_norm": 6.691245079040527, "learning_rate": 9.969870269359403e-06, "loss": 0.2505249, "memory(GiB)": 15.03, "step": 2355, "train_speed(iter/s)": 1.453206 }, { "acc": 0.94827347, "epoch": 4.165931156222419, "grad_norm": 6.152040481567383, "learning_rate": 9.969549214478447e-06, "loss": 0.27251048, "memory(GiB)": 15.03, "step": 2360, "train_speed(iter/s)": 1.453273 }, { "acc": 0.9518754, "epoch": 4.174757281553398, "grad_norm": 17.10832977294922, "learning_rate": 9.969226463335391e-06, "loss": 0.27108324, "memory(GiB)": 15.03, "step": 2365, "train_speed(iter/s)": 1.45338 }, { "acc": 0.9506731, "epoch": 4.183583406884377, "grad_norm": 7.420350551605225, "learning_rate": 9.968902016040415e-06, "loss": 0.28457713, "memory(GiB)": 15.03, "step": 2370, "train_speed(iter/s)": 1.45333 }, { "acc": 0.95456047, "epoch": 4.192409532215358, "grad_norm": 6.412008762359619, "learning_rate": 9.968575872704271e-06, "loss": 0.25900297, "memory(GiB)": 15.03, "step": 2375, "train_speed(iter/s)": 1.453364 }, { "acc": 0.94933062, "epoch": 4.201235657546337, "grad_norm": 12.302229881286621, "learning_rate": 9.968248033438294e-06, "loss": 0.29013147, "memory(GiB)": 15.03, "step": 2380, "train_speed(iter/s)": 1.453398 }, { "acc": 0.95658407, "epoch": 4.210061782877317, "grad_norm": 6.212785720825195, "learning_rate": 9.967918498354397e-06, "loss": 0.21835859, "memory(GiB)": 15.03, "step": 2385, "train_speed(iter/s)": 1.453246 }, { "acc": 0.95129242, "epoch": 4.218887908208297, "grad_norm": 9.061625480651855, "learning_rate": 9.967587267565072e-06, "loss": 0.29959946, "memory(GiB)": 15.03, "step": 2390, "train_speed(iter/s)": 1.453243 }, { "acc": 0.94024639, "epoch": 4.227714033539276, "grad_norm": 8.313437461853027, "learning_rate": 9.967254341183392e-06, "loss": 0.36296177, "memory(GiB)": 15.03, "step": 2395, "train_speed(iter/s)": 1.453368 }, { "acc": 0.9404686, "epoch": 4.236540158870256, "grad_norm": 9.960396766662598, "learning_rate": 9.966919719323001e-06, "loss": 0.33676062, "memory(GiB)": 15.03, "step": 2400, "train_speed(iter/s)": 1.453392 }, { "acc": 0.95372505, "epoch": 4.245366284201236, "grad_norm": 6.622322082519531, "learning_rate": 9.966583402098131e-06, "loss": 0.24655163, "memory(GiB)": 15.03, "step": 2405, "train_speed(iter/s)": 1.453362 }, { "acc": 0.94309959, "epoch": 4.254192409532215, "grad_norm": 11.688154220581055, "learning_rate": 9.966245389623593e-06, "loss": 0.35374861, "memory(GiB)": 15.03, "step": 2410, "train_speed(iter/s)": 1.453577 }, { "acc": 0.94721956, "epoch": 4.263018534863195, "grad_norm": 8.215126037597656, "learning_rate": 9.965905682014765e-06, "loss": 0.334023, "memory(GiB)": 15.03, "step": 2415, "train_speed(iter/s)": 1.453693 }, { "acc": 0.94458761, "epoch": 4.271844660194175, "grad_norm": 9.543998718261719, "learning_rate": 9.965564279387617e-06, "loss": 0.31019869, "memory(GiB)": 15.03, "step": 2420, "train_speed(iter/s)": 1.453781 }, { "acc": 0.95345764, "epoch": 4.280670785525155, "grad_norm": 24.632150650024414, "learning_rate": 9.96522118185869e-06, "loss": 0.25234075, "memory(GiB)": 15.03, "step": 2425, "train_speed(iter/s)": 1.45372 }, { "acc": 0.95564165, "epoch": 4.289496910856134, "grad_norm": 7.4133076667785645, "learning_rate": 9.964876389545104e-06, "loss": 0.28770103, "memory(GiB)": 15.03, "step": 2430, "train_speed(iter/s)": 1.45359 }, { "acc": 0.95118828, "epoch": 4.298323036187114, "grad_norm": 7.117057800292969, "learning_rate": 9.964529902564563e-06, "loss": 0.27143567, "memory(GiB)": 15.03, "step": 2435, "train_speed(iter/s)": 1.453635 }, { "acc": 0.95610113, "epoch": 4.307149161518094, "grad_norm": 9.025492668151855, "learning_rate": 9.964181721035343e-06, "loss": 0.27309616, "memory(GiB)": 15.03, "step": 2440, "train_speed(iter/s)": 1.453719 }, { "acc": 0.9530983, "epoch": 4.315975286849073, "grad_norm": 9.101954460144043, "learning_rate": 9.963831845076302e-06, "loss": 0.28474705, "memory(GiB)": 15.03, "step": 2445, "train_speed(iter/s)": 1.453879 }, { "acc": 0.95939474, "epoch": 4.324801412180053, "grad_norm": 7.221519947052002, "learning_rate": 9.963480274806877e-06, "loss": 0.23118377, "memory(GiB)": 15.03, "step": 2450, "train_speed(iter/s)": 1.453957 }, { "acc": 0.95216322, "epoch": 4.333627537511033, "grad_norm": 7.679984092712402, "learning_rate": 9.963127010347078e-06, "loss": 0.26199081, "memory(GiB)": 15.03, "step": 2455, "train_speed(iter/s)": 1.453978 }, { "acc": 0.95247288, "epoch": 4.342453662842012, "grad_norm": 8.205707550048828, "learning_rate": 9.962772051817502e-06, "loss": 0.26661558, "memory(GiB)": 15.03, "step": 2460, "train_speed(iter/s)": 1.454082 }, { "acc": 0.94248238, "epoch": 4.351279788172992, "grad_norm": 24.695724487304688, "learning_rate": 9.962415399339316e-06, "loss": 0.36438227, "memory(GiB)": 15.03, "step": 2465, "train_speed(iter/s)": 1.454246 }, { "acc": 0.94882813, "epoch": 4.360105913503972, "grad_norm": 5.417104244232178, "learning_rate": 9.962057053034272e-06, "loss": 0.31933362, "memory(GiB)": 15.03, "step": 2470, "train_speed(iter/s)": 1.454354 }, { "acc": 0.95779953, "epoch": 4.368932038834951, "grad_norm": 6.5271100997924805, "learning_rate": 9.961697013024695e-06, "loss": 0.23744988, "memory(GiB)": 15.03, "step": 2475, "train_speed(iter/s)": 1.454446 }, { "acc": 0.95828571, "epoch": 4.3777581641659316, "grad_norm": 6.341363906860352, "learning_rate": 9.96133527943349e-06, "loss": 0.25762384, "memory(GiB)": 15.03, "step": 2480, "train_speed(iter/s)": 1.454491 }, { "acc": 0.94448872, "epoch": 4.386584289496911, "grad_norm": 7.749311923980713, "learning_rate": 9.960971852384141e-06, "loss": 0.31854582, "memory(GiB)": 15.03, "step": 2485, "train_speed(iter/s)": 1.454536 }, { "acc": 0.94361162, "epoch": 4.39541041482789, "grad_norm": 10.290121078491211, "learning_rate": 9.96060673200071e-06, "loss": 0.32936561, "memory(GiB)": 15.03, "step": 2490, "train_speed(iter/s)": 1.45455 }, { "acc": 0.947859, "epoch": 4.404236540158871, "grad_norm": 6.519283294677734, "learning_rate": 9.960239918407836e-06, "loss": 0.31235886, "memory(GiB)": 15.03, "step": 2495, "train_speed(iter/s)": 1.454494 }, { "acc": 0.96403809, "epoch": 4.41306266548985, "grad_norm": 7.348769187927246, "learning_rate": 9.959871411730737e-06, "loss": 0.21080439, "memory(GiB)": 15.03, "step": 2500, "train_speed(iter/s)": 1.454686 }, { "acc": 0.95505352, "epoch": 4.421888790820829, "grad_norm": 8.624689102172852, "learning_rate": 9.959501212095208e-06, "loss": 0.25547311, "memory(GiB)": 15.03, "step": 2505, "train_speed(iter/s)": 1.454634 }, { "acc": 0.95385895, "epoch": 4.43071491615181, "grad_norm": 7.038119792938232, "learning_rate": 9.95912931962762e-06, "loss": 0.27136614, "memory(GiB)": 15.03, "step": 2510, "train_speed(iter/s)": 1.4548 }, { "acc": 0.9655364, "epoch": 4.439541041482789, "grad_norm": 5.472497463226318, "learning_rate": 9.958755734454929e-06, "loss": 0.20156732, "memory(GiB)": 15.03, "step": 2515, "train_speed(iter/s)": 1.454677 }, { "acc": 0.94806414, "epoch": 4.448367166813769, "grad_norm": 11.569894790649414, "learning_rate": 9.958380456704662e-06, "loss": 0.31120868, "memory(GiB)": 15.03, "step": 2520, "train_speed(iter/s)": 1.454759 }, { "acc": 0.94849596, "epoch": 4.457193292144749, "grad_norm": 18.63300895690918, "learning_rate": 9.958003486504925e-06, "loss": 0.31181648, "memory(GiB)": 15.03, "step": 2525, "train_speed(iter/s)": 1.454837 }, { "acc": 0.95276756, "epoch": 4.466019417475728, "grad_norm": 9.761725425720215, "learning_rate": 9.957624823984401e-06, "loss": 0.28994775, "memory(GiB)": 15.03, "step": 2530, "train_speed(iter/s)": 1.454688 }, { "acc": 0.95929279, "epoch": 4.474845542806708, "grad_norm": 9.830881118774414, "learning_rate": 9.957244469272359e-06, "loss": 0.23710299, "memory(GiB)": 15.03, "step": 2535, "train_speed(iter/s)": 1.454871 }, { "acc": 0.94808083, "epoch": 4.483671668137688, "grad_norm": 5.861537456512451, "learning_rate": 9.956862422498632e-06, "loss": 0.29095936, "memory(GiB)": 15.03, "step": 2540, "train_speed(iter/s)": 1.454747 }, { "acc": 0.94644794, "epoch": 4.492497793468667, "grad_norm": 10.240289688110352, "learning_rate": 9.95647868379364e-06, "loss": 0.31686709, "memory(GiB)": 15.03, "step": 2545, "train_speed(iter/s)": 1.454826 }, { "acc": 0.96074715, "epoch": 4.501323918799647, "grad_norm": 6.983892917633057, "learning_rate": 9.956093253288381e-06, "loss": 0.2161886, "memory(GiB)": 15.03, "step": 2550, "train_speed(iter/s)": 1.454893 }, { "acc": 0.95755711, "epoch": 4.510150044130627, "grad_norm": 6.969470977783203, "learning_rate": 9.955706131114422e-06, "loss": 0.24622221, "memory(GiB)": 15.03, "step": 2555, "train_speed(iter/s)": 1.454904 }, { "acc": 0.96285763, "epoch": 4.518976169461606, "grad_norm": 7.886877536773682, "learning_rate": 9.955317317403917e-06, "loss": 0.21033974, "memory(GiB)": 15.03, "step": 2560, "train_speed(iter/s)": 1.454993 }, { "acc": 0.95011864, "epoch": 4.527802294792586, "grad_norm": 9.81740951538086, "learning_rate": 9.954926812289593e-06, "loss": 0.29648571, "memory(GiB)": 15.03, "step": 2565, "train_speed(iter/s)": 1.455198 }, { "acc": 0.95745754, "epoch": 4.536628420123566, "grad_norm": 10.420695304870605, "learning_rate": 9.954534615904753e-06, "loss": 0.27124996, "memory(GiB)": 15.03, "step": 2570, "train_speed(iter/s)": 1.455321 }, { "acc": 0.95360088, "epoch": 4.545454545454545, "grad_norm": 7.898953437805176, "learning_rate": 9.954140728383285e-06, "loss": 0.28436804, "memory(GiB)": 15.03, "step": 2575, "train_speed(iter/s)": 1.455448 }, { "acc": 0.94966316, "epoch": 4.554280670785525, "grad_norm": 17.082504272460938, "learning_rate": 9.953745149859643e-06, "loss": 0.30028443, "memory(GiB)": 15.03, "step": 2580, "train_speed(iter/s)": 1.455437 }, { "acc": 0.94467144, "epoch": 4.563106796116505, "grad_norm": 7.647613525390625, "learning_rate": 9.953347880468865e-06, "loss": 0.33703775, "memory(GiB)": 15.03, "step": 2585, "train_speed(iter/s)": 1.455562 }, { "acc": 0.95167084, "epoch": 4.571932921447485, "grad_norm": 8.778693199157715, "learning_rate": 9.952948920346567e-06, "loss": 0.26197152, "memory(GiB)": 15.03, "step": 2590, "train_speed(iter/s)": 1.455519 }, { "acc": 0.95392981, "epoch": 4.580759046778464, "grad_norm": 5.183188438415527, "learning_rate": 9.952548269628937e-06, "loss": 0.28058662, "memory(GiB)": 15.03, "step": 2595, "train_speed(iter/s)": 1.455628 }, { "acc": 0.95227757, "epoch": 4.589585172109444, "grad_norm": 11.897231101989746, "learning_rate": 9.952145928452746e-06, "loss": 0.28573675, "memory(GiB)": 15.03, "step": 2600, "train_speed(iter/s)": 1.455611 }, { "acc": 0.95740499, "epoch": 4.598411297440424, "grad_norm": 6.970617294311523, "learning_rate": 9.95174189695534e-06, "loss": 0.23273048, "memory(GiB)": 15.03, "step": 2605, "train_speed(iter/s)": 1.455591 }, { "acc": 0.95190248, "epoch": 4.607237422771403, "grad_norm": 7.811476707458496, "learning_rate": 9.951336175274638e-06, "loss": 0.29578736, "memory(GiB)": 15.03, "step": 2610, "train_speed(iter/s)": 1.455583 }, { "acc": 0.95036144, "epoch": 4.6160635481023835, "grad_norm": 8.821516990661621, "learning_rate": 9.950928763549144e-06, "loss": 0.27307365, "memory(GiB)": 15.03, "step": 2615, "train_speed(iter/s)": 1.455722 }, { "acc": 0.94754715, "epoch": 4.624889673433363, "grad_norm": 9.80310344696045, "learning_rate": 9.95051966191793e-06, "loss": 0.26825464, "memory(GiB)": 15.03, "step": 2620, "train_speed(iter/s)": 1.455736 }, { "acc": 0.95963678, "epoch": 4.633715798764342, "grad_norm": 4.8686628341674805, "learning_rate": 9.95010887052065e-06, "loss": 0.22348704, "memory(GiB)": 15.03, "step": 2625, "train_speed(iter/s)": 1.455878 }, { "acc": 0.95339155, "epoch": 4.6425419240953225, "grad_norm": 7.078245162963867, "learning_rate": 9.949696389497536e-06, "loss": 0.25512629, "memory(GiB)": 15.03, "step": 2630, "train_speed(iter/s)": 1.455971 }, { "acc": 0.94597092, "epoch": 4.651368049426302, "grad_norm": 7.566152095794678, "learning_rate": 9.949282218989395e-06, "loss": 0.32219687, "memory(GiB)": 15.03, "step": 2635, "train_speed(iter/s)": 1.456096 }, { "acc": 0.95862389, "epoch": 4.660194174757281, "grad_norm": 5.044859409332275, "learning_rate": 9.94886635913761e-06, "loss": 0.23519142, "memory(GiB)": 15.03, "step": 2640, "train_speed(iter/s)": 1.456143 }, { "acc": 0.95227375, "epoch": 4.6690203000882615, "grad_norm": 5.407356262207031, "learning_rate": 9.948448810084138e-06, "loss": 0.29148459, "memory(GiB)": 15.03, "step": 2645, "train_speed(iter/s)": 1.456173 }, { "acc": 0.95721998, "epoch": 4.677846425419241, "grad_norm": 6.724000930786133, "learning_rate": 9.94802957197152e-06, "loss": 0.24914296, "memory(GiB)": 15.03, "step": 2650, "train_speed(iter/s)": 1.45613 }, { "acc": 0.94818649, "epoch": 4.68667255075022, "grad_norm": 10.073929786682129, "learning_rate": 9.947608644942866e-06, "loss": 0.31240144, "memory(GiB)": 15.03, "step": 2655, "train_speed(iter/s)": 1.456157 }, { "acc": 0.953617, "epoch": 4.6954986760812005, "grad_norm": 7.673678874969482, "learning_rate": 9.947186029141869e-06, "loss": 0.22745986, "memory(GiB)": 15.03, "step": 2660, "train_speed(iter/s)": 1.456229 }, { "acc": 0.95890293, "epoch": 4.70432480141218, "grad_norm": 7.9076690673828125, "learning_rate": 9.946761724712795e-06, "loss": 0.23822155, "memory(GiB)": 15.03, "step": 2665, "train_speed(iter/s)": 1.456452 }, { "acc": 0.95099144, "epoch": 4.713150926743159, "grad_norm": 7.197851181030273, "learning_rate": 9.946335731800487e-06, "loss": 0.29876471, "memory(GiB)": 15.03, "step": 2670, "train_speed(iter/s)": 1.456493 }, { "acc": 0.95867462, "epoch": 4.7219770520741395, "grad_norm": 22.1085205078125, "learning_rate": 9.945908050550361e-06, "loss": 0.23151522, "memory(GiB)": 15.03, "step": 2675, "train_speed(iter/s)": 1.456623 }, { "acc": 0.95565376, "epoch": 4.730803177405119, "grad_norm": 7.94164514541626, "learning_rate": 9.945478681108417e-06, "loss": 0.23973765, "memory(GiB)": 15.03, "step": 2680, "train_speed(iter/s)": 1.456597 }, { "acc": 0.93865194, "epoch": 4.739629302736099, "grad_norm": 7.179780006408691, "learning_rate": 9.945047623621225e-06, "loss": 0.33746417, "memory(GiB)": 15.03, "step": 2685, "train_speed(iter/s)": 1.456669 }, { "acc": 0.95661869, "epoch": 4.7484554280670785, "grad_norm": 6.754254341125488, "learning_rate": 9.944614878235931e-06, "loss": 0.27477582, "memory(GiB)": 15.03, "step": 2690, "train_speed(iter/s)": 1.456726 }, { "acc": 0.96107645, "epoch": 4.757281553398058, "grad_norm": 6.484004974365234, "learning_rate": 9.944180445100265e-06, "loss": 0.22401438, "memory(GiB)": 15.03, "step": 2695, "train_speed(iter/s)": 1.456844 }, { "acc": 0.96235962, "epoch": 4.766107678729038, "grad_norm": 7.17456579208374, "learning_rate": 9.943744324362524e-06, "loss": 0.21103954, "memory(GiB)": 15.03, "step": 2700, "train_speed(iter/s)": 1.456881 }, { "acc": 0.95050144, "epoch": 4.7749338040600176, "grad_norm": 12.103436470031738, "learning_rate": 9.943306516171581e-06, "loss": 0.27964025, "memory(GiB)": 15.03, "step": 2705, "train_speed(iter/s)": 1.457051 }, { "acc": 0.96226053, "epoch": 4.783759929390998, "grad_norm": 5.941858291625977, "learning_rate": 9.942867020676895e-06, "loss": 0.21517711, "memory(GiB)": 15.03, "step": 2710, "train_speed(iter/s)": 1.457097 }, { "acc": 0.95962486, "epoch": 4.792586054721977, "grad_norm": 7.77577543258667, "learning_rate": 9.94242583802849e-06, "loss": 0.22689824, "memory(GiB)": 15.03, "step": 2715, "train_speed(iter/s)": 1.457094 }, { "acc": 0.95733871, "epoch": 4.801412180052957, "grad_norm": 15.456400871276855, "learning_rate": 9.941982968376974e-06, "loss": 0.24934988, "memory(GiB)": 15.03, "step": 2720, "train_speed(iter/s)": 1.457153 }, { "acc": 0.94900665, "epoch": 4.810238305383937, "grad_norm": 11.27192211151123, "learning_rate": 9.941538411873527e-06, "loss": 0.33550599, "memory(GiB)": 15.03, "step": 2725, "train_speed(iter/s)": 1.457187 }, { "acc": 0.95949621, "epoch": 4.819064430714916, "grad_norm": 7.358863353729248, "learning_rate": 9.941092168669903e-06, "loss": 0.22803726, "memory(GiB)": 15.03, "step": 2730, "train_speed(iter/s)": 1.457336 }, { "acc": 0.95571918, "epoch": 4.827890556045896, "grad_norm": 6.782904624938965, "learning_rate": 9.940644238918434e-06, "loss": 0.25851884, "memory(GiB)": 15.03, "step": 2735, "train_speed(iter/s)": 1.457377 }, { "acc": 0.9626852, "epoch": 4.836716681376876, "grad_norm": 5.639491558074951, "learning_rate": 9.940194622772032e-06, "loss": 0.23794026, "memory(GiB)": 15.03, "step": 2740, "train_speed(iter/s)": 1.457561 }, { "acc": 0.95561562, "epoch": 4.845542806707855, "grad_norm": 6.627401351928711, "learning_rate": 9.939743320384173e-06, "loss": 0.25254743, "memory(GiB)": 15.03, "step": 2745, "train_speed(iter/s)": 1.457552 }, { "acc": 0.94838905, "epoch": 4.854368932038835, "grad_norm": 9.950433731079102, "learning_rate": 9.939290331908924e-06, "loss": 0.29593277, "memory(GiB)": 15.03, "step": 2750, "train_speed(iter/s)": 1.457648 }, { "acc": 0.96342564, "epoch": 4.863195057369815, "grad_norm": 5.889432907104492, "learning_rate": 9.938835657500915e-06, "loss": 0.20188272, "memory(GiB)": 15.03, "step": 2755, "train_speed(iter/s)": 1.457877 }, { "acc": 0.95565891, "epoch": 4.872021182700794, "grad_norm": 6.489813327789307, "learning_rate": 9.938379297315356e-06, "loss": 0.27044868, "memory(GiB)": 15.03, "step": 2760, "train_speed(iter/s)": 1.457784 }, { "acc": 0.96317997, "epoch": 4.880847308031774, "grad_norm": 6.941957950592041, "learning_rate": 9.937921251508037e-06, "loss": 0.2346103, "memory(GiB)": 15.03, "step": 2765, "train_speed(iter/s)": 1.457688 }, { "acc": 0.96266403, "epoch": 4.889673433362754, "grad_norm": 10.423484802246094, "learning_rate": 9.937461520235315e-06, "loss": 0.22318115, "memory(GiB)": 15.03, "step": 2770, "train_speed(iter/s)": 1.457641 }, { "acc": 0.95703316, "epoch": 4.898499558693733, "grad_norm": 9.297316551208496, "learning_rate": 9.937000103654127e-06, "loss": 0.23628032, "memory(GiB)": 15.03, "step": 2775, "train_speed(iter/s)": 1.457599 }, { "acc": 0.95408211, "epoch": 4.9073256840247135, "grad_norm": 27.076980590820312, "learning_rate": 9.936537001921984e-06, "loss": 0.26286807, "memory(GiB)": 15.03, "step": 2780, "train_speed(iter/s)": 1.457702 }, { "acc": 0.96629391, "epoch": 4.916151809355693, "grad_norm": 5.2883782386779785, "learning_rate": 9.936072215196975e-06, "loss": 0.21739926, "memory(GiB)": 15.03, "step": 2785, "train_speed(iter/s)": 1.457788 }, { "acc": 0.95772543, "epoch": 4.924977934686672, "grad_norm": 7.134164333343506, "learning_rate": 9.93560574363776e-06, "loss": 0.27040792, "memory(GiB)": 15.03, "step": 2790, "train_speed(iter/s)": 1.457816 }, { "acc": 0.95764771, "epoch": 4.9338040600176525, "grad_norm": 6.568739891052246, "learning_rate": 9.93513758740358e-06, "loss": 0.23189082, "memory(GiB)": 15.03, "step": 2795, "train_speed(iter/s)": 1.457891 }, { "acc": 0.96655579, "epoch": 4.942630185348632, "grad_norm": 9.741599082946777, "learning_rate": 9.934667746654244e-06, "loss": 0.20842149, "memory(GiB)": 15.03, "step": 2800, "train_speed(iter/s)": 1.45811 }, { "acc": 0.95553398, "epoch": 4.951456310679612, "grad_norm": 4.308427810668945, "learning_rate": 9.93419622155014e-06, "loss": 0.24069483, "memory(GiB)": 15.03, "step": 2805, "train_speed(iter/s)": 1.458216 }, { "acc": 0.96095285, "epoch": 4.9602824360105915, "grad_norm": 9.193133354187012, "learning_rate": 9.93372301225223e-06, "loss": 0.23180938, "memory(GiB)": 15.03, "step": 2810, "train_speed(iter/s)": 1.458155 }, { "acc": 0.96161461, "epoch": 4.969108561341571, "grad_norm": 5.967164039611816, "learning_rate": 9.933248118922053e-06, "loss": 0.22086644, "memory(GiB)": 15.03, "step": 2815, "train_speed(iter/s)": 1.458312 }, { "acc": 0.96054249, "epoch": 4.977934686672551, "grad_norm": 10.045555114746094, "learning_rate": 9.93277154172172e-06, "loss": 0.21658187, "memory(GiB)": 15.03, "step": 2820, "train_speed(iter/s)": 1.458301 }, { "acc": 0.95030174, "epoch": 4.9867608120035305, "grad_norm": 9.851446151733398, "learning_rate": 9.93229328081392e-06, "loss": 0.29286373, "memory(GiB)": 15.03, "step": 2825, "train_speed(iter/s)": 1.458374 }, { "acc": 0.95938005, "epoch": 4.99558693733451, "grad_norm": 6.212740898132324, "learning_rate": 9.93181333636191e-06, "loss": 0.22334375, "memory(GiB)": 15.03, "step": 2830, "train_speed(iter/s)": 1.458385 }, { "acc": 0.96136608, "epoch": 5.00441306266549, "grad_norm": 7.102124214172363, "learning_rate": 9.931331708529532e-06, "loss": 0.22554502, "memory(GiB)": 15.03, "step": 2835, "train_speed(iter/s)": 1.458192 }, { "acc": 0.95267344, "epoch": 5.0132391879964695, "grad_norm": 9.255271911621094, "learning_rate": 9.930848397481196e-06, "loss": 0.23222003, "memory(GiB)": 15.03, "step": 2840, "train_speed(iter/s)": 1.458146 }, { "acc": 0.96112537, "epoch": 5.022065313327449, "grad_norm": 7.275452613830566, "learning_rate": 9.930363403381883e-06, "loss": 0.20642526, "memory(GiB)": 15.03, "step": 2845, "train_speed(iter/s)": 1.45822 }, { "acc": 0.95925102, "epoch": 5.030891438658429, "grad_norm": 4.475017547607422, "learning_rate": 9.92987672639716e-06, "loss": 0.23783765, "memory(GiB)": 15.03, "step": 2850, "train_speed(iter/s)": 1.458418 }, { "acc": 0.96239367, "epoch": 5.0397175639894085, "grad_norm": 10.786535263061523, "learning_rate": 9.929388366693157e-06, "loss": 0.23039305, "memory(GiB)": 15.03, "step": 2855, "train_speed(iter/s)": 1.458482 }, { "acc": 0.9542345, "epoch": 5.048543689320389, "grad_norm": 8.236555099487305, "learning_rate": 9.928898324436584e-06, "loss": 0.28167572, "memory(GiB)": 15.03, "step": 2860, "train_speed(iter/s)": 1.458508 }, { "acc": 0.95977764, "epoch": 5.057369814651368, "grad_norm": 7.027607440948486, "learning_rate": 9.928406599794727e-06, "loss": 0.25067267, "memory(GiB)": 15.03, "step": 2865, "train_speed(iter/s)": 1.458563 }, { "acc": 0.95859938, "epoch": 5.0661959399823475, "grad_norm": 5.945766925811768, "learning_rate": 9.927913192935443e-06, "loss": 0.25271187, "memory(GiB)": 15.03, "step": 2870, "train_speed(iter/s)": 1.458589 }, { "acc": 0.96676712, "epoch": 5.075022065313328, "grad_norm": 8.781973838806152, "learning_rate": 9.92741810402716e-06, "loss": 0.19940479, "memory(GiB)": 15.03, "step": 2875, "train_speed(iter/s)": 1.458579 }, { "acc": 0.95827274, "epoch": 5.083848190644307, "grad_norm": 6.718268394470215, "learning_rate": 9.92692133323889e-06, "loss": 0.20581045, "memory(GiB)": 15.03, "step": 2880, "train_speed(iter/s)": 1.458562 }, { "acc": 0.96052332, "epoch": 5.0926743159752865, "grad_norm": 8.680930137634277, "learning_rate": 9.926422880740209e-06, "loss": 0.23939321, "memory(GiB)": 15.03, "step": 2885, "train_speed(iter/s)": 1.458625 }, { "acc": 0.96763687, "epoch": 5.101500441306267, "grad_norm": 5.222537994384766, "learning_rate": 9.925922746701276e-06, "loss": 0.19797628, "memory(GiB)": 15.03, "step": 2890, "train_speed(iter/s)": 1.458615 }, { "acc": 0.95936594, "epoch": 5.110326566637246, "grad_norm": 6.874533176422119, "learning_rate": 9.925420931292813e-06, "loss": 0.20381694, "memory(GiB)": 15.03, "step": 2895, "train_speed(iter/s)": 1.458634 }, { "acc": 0.95923462, "epoch": 5.1191526919682255, "grad_norm": 6.786142349243164, "learning_rate": 9.92491743468613e-06, "loss": 0.24903226, "memory(GiB)": 15.03, "step": 2900, "train_speed(iter/s)": 1.45868 }, { "acc": 0.95748711, "epoch": 5.127978817299206, "grad_norm": 9.147269248962402, "learning_rate": 9.924412257053097e-06, "loss": 0.23977871, "memory(GiB)": 15.03, "step": 2905, "train_speed(iter/s)": 1.458666 }, { "acc": 0.9498764, "epoch": 5.136804942630185, "grad_norm": 6.857240200042725, "learning_rate": 9.92390539856617e-06, "loss": 0.31009216, "memory(GiB)": 15.03, "step": 2910, "train_speed(iter/s)": 1.458714 }, { "acc": 0.95531378, "epoch": 5.145631067961165, "grad_norm": 6.65079927444458, "learning_rate": 9.923396859398368e-06, "loss": 0.25982099, "memory(GiB)": 15.03, "step": 2915, "train_speed(iter/s)": 1.458668 }, { "acc": 0.95894823, "epoch": 5.154457193292145, "grad_norm": 8.637700080871582, "learning_rate": 9.922886639723291e-06, "loss": 0.25512354, "memory(GiB)": 15.03, "step": 2920, "train_speed(iter/s)": 1.458615 }, { "acc": 0.96156025, "epoch": 5.163283318623124, "grad_norm": 10.11424732208252, "learning_rate": 9.92237473971511e-06, "loss": 0.22729607, "memory(GiB)": 15.03, "step": 2925, "train_speed(iter/s)": 1.458721 }, { "acc": 0.96064291, "epoch": 5.172109443954104, "grad_norm": 7.139074325561523, "learning_rate": 9.92186115954857e-06, "loss": 0.24821122, "memory(GiB)": 15.03, "step": 2930, "train_speed(iter/s)": 1.458753 }, { "acc": 0.95921555, "epoch": 5.180935569285084, "grad_norm": 5.037779331207275, "learning_rate": 9.92134589939899e-06, "loss": 0.24583161, "memory(GiB)": 15.03, "step": 2935, "train_speed(iter/s)": 1.458633 }, { "acc": 0.96070538, "epoch": 5.189761694616063, "grad_norm": 3.6432435512542725, "learning_rate": 9.920828959442264e-06, "loss": 0.2505111, "memory(GiB)": 15.03, "step": 2940, "train_speed(iter/s)": 1.458752 }, { "acc": 0.95951214, "epoch": 5.198587819947043, "grad_norm": 6.090794086456299, "learning_rate": 9.920310339854851e-06, "loss": 0.27246752, "memory(GiB)": 15.03, "step": 2945, "train_speed(iter/s)": 1.458802 }, { "acc": 0.96541462, "epoch": 5.207413945278023, "grad_norm": 5.845771312713623, "learning_rate": 9.919790040813795e-06, "loss": 0.20088706, "memory(GiB)": 15.03, "step": 2950, "train_speed(iter/s)": 1.458796 }, { "acc": 0.95780659, "epoch": 5.216240070609003, "grad_norm": 8.94002914428711, "learning_rate": 9.91926806249671e-06, "loss": 0.24739788, "memory(GiB)": 15.03, "step": 2955, "train_speed(iter/s)": 1.458882 }, { "acc": 0.96791992, "epoch": 5.2250661959399824, "grad_norm": 7.209283351898193, "learning_rate": 9.918744405081776e-06, "loss": 0.18528726, "memory(GiB)": 15.03, "step": 2960, "train_speed(iter/s)": 1.458843 }, { "acc": 0.96549034, "epoch": 5.233892321270962, "grad_norm": 6.161664962768555, "learning_rate": 9.918219068747757e-06, "loss": 0.19029546, "memory(GiB)": 15.03, "step": 2965, "train_speed(iter/s)": 1.459003 }, { "acc": 0.96089029, "epoch": 5.242718446601942, "grad_norm": 6.483946800231934, "learning_rate": 9.91769205367398e-06, "loss": 0.24180562, "memory(GiB)": 15.03, "step": 2970, "train_speed(iter/s)": 1.459011 }, { "acc": 0.97011471, "epoch": 5.2515445719329215, "grad_norm": 4.41859245300293, "learning_rate": 9.917163360040354e-06, "loss": 0.19432884, "memory(GiB)": 15.03, "step": 2975, "train_speed(iter/s)": 1.459042 }, { "acc": 0.95835838, "epoch": 5.260370697263901, "grad_norm": 7.561718463897705, "learning_rate": 9.916632988027351e-06, "loss": 0.22048419, "memory(GiB)": 15.03, "step": 2980, "train_speed(iter/s)": 1.459005 }, { "acc": 0.9575469, "epoch": 5.269196822594881, "grad_norm": 11.062121391296387, "learning_rate": 9.916100937816027e-06, "loss": 0.25808859, "memory(GiB)": 15.03, "step": 2985, "train_speed(iter/s)": 1.459045 }, { "acc": 0.95597324, "epoch": 5.2780229479258605, "grad_norm": 7.390096187591553, "learning_rate": 9.915567209588005e-06, "loss": 0.26805921, "memory(GiB)": 15.03, "step": 2990, "train_speed(iter/s)": 1.459175 }, { "acc": 0.970123, "epoch": 5.28684907325684, "grad_norm": 12.17945671081543, "learning_rate": 9.915031803525478e-06, "loss": 0.18847357, "memory(GiB)": 15.03, "step": 2995, "train_speed(iter/s)": 1.45935 }, { "acc": 0.95640221, "epoch": 5.29567519858782, "grad_norm": 8.737335205078125, "learning_rate": 9.914494719811221e-06, "loss": 0.26770344, "memory(GiB)": 15.03, "step": 3000, "train_speed(iter/s)": 1.459408 }, { "acc": 0.96373587, "epoch": 5.3045013239187995, "grad_norm": 6.653964996337891, "learning_rate": 9.913955958628568e-06, "loss": 0.22062166, "memory(GiB)": 15.03, "step": 3005, "train_speed(iter/s)": 1.459387 }, { "acc": 0.96746693, "epoch": 5.31332744924978, "grad_norm": 4.1352858543396, "learning_rate": 9.91341552016144e-06, "loss": 0.1932224, "memory(GiB)": 15.03, "step": 3010, "train_speed(iter/s)": 1.459377 }, { "acc": 0.96645432, "epoch": 5.322153574580759, "grad_norm": 5.324366569519043, "learning_rate": 9.912873404594322e-06, "loss": 0.19796573, "memory(GiB)": 15.03, "step": 3015, "train_speed(iter/s)": 1.459323 }, { "acc": 0.96610432, "epoch": 5.3309796999117385, "grad_norm": 5.169738292694092, "learning_rate": 9.912329612112272e-06, "loss": 0.21726012, "memory(GiB)": 15.03, "step": 3020, "train_speed(iter/s)": 1.45946 }, { "acc": 0.9683671, "epoch": 5.339805825242719, "grad_norm": 4.973207473754883, "learning_rate": 9.911784142900925e-06, "loss": 0.16620979, "memory(GiB)": 15.03, "step": 3025, "train_speed(iter/s)": 1.45937 }, { "acc": 0.96756935, "epoch": 5.348631950573698, "grad_norm": 7.409580230712891, "learning_rate": 9.911236997146485e-06, "loss": 0.18403714, "memory(GiB)": 15.03, "step": 3030, "train_speed(iter/s)": 1.459365 }, { "acc": 0.96924877, "epoch": 5.3574580759046775, "grad_norm": 7.712068557739258, "learning_rate": 9.910688175035725e-06, "loss": 0.19775486, "memory(GiB)": 15.03, "step": 3035, "train_speed(iter/s)": 1.459329 }, { "acc": 0.96874504, "epoch": 5.366284201235658, "grad_norm": 5.824734687805176, "learning_rate": 9.910137676755997e-06, "loss": 0.21049676, "memory(GiB)": 15.03, "step": 3040, "train_speed(iter/s)": 1.459327 }, { "acc": 0.95673695, "epoch": 5.375110326566637, "grad_norm": 6.582108974456787, "learning_rate": 9.909585502495223e-06, "loss": 0.27462144, "memory(GiB)": 15.03, "step": 3045, "train_speed(iter/s)": 1.459358 }, { "acc": 0.9682539, "epoch": 5.3839364518976165, "grad_norm": 3.955803871154785, "learning_rate": 9.909031652441892e-06, "loss": 0.18456925, "memory(GiB)": 15.03, "step": 3050, "train_speed(iter/s)": 1.45935 }, { "acc": 0.96503267, "epoch": 5.392762577228597, "grad_norm": 8.494742393493652, "learning_rate": 9.908476126785075e-06, "loss": 0.20171645, "memory(GiB)": 15.03, "step": 3055, "train_speed(iter/s)": 1.459526 }, { "acc": 0.96539917, "epoch": 5.401588702559576, "grad_norm": 5.881004810333252, "learning_rate": 9.907918925714407e-06, "loss": 0.21811543, "memory(GiB)": 15.03, "step": 3060, "train_speed(iter/s)": 1.459649 }, { "acc": 0.95660286, "epoch": 5.410414827890556, "grad_norm": 7.0092854499816895, "learning_rate": 9.907360049420093e-06, "loss": 0.26228652, "memory(GiB)": 15.03, "step": 3065, "train_speed(iter/s)": 1.459751 }, { "acc": 0.9729866, "epoch": 5.419240953221536, "grad_norm": 5.535867214202881, "learning_rate": 9.906799498092922e-06, "loss": 0.17481145, "memory(GiB)": 15.03, "step": 3070, "train_speed(iter/s)": 1.459811 }, { "acc": 0.9577281, "epoch": 5.428067078552515, "grad_norm": 10.796890258789062, "learning_rate": 9.90623727192424e-06, "loss": 0.22796197, "memory(GiB)": 15.03, "step": 3075, "train_speed(iter/s)": 1.459857 }, { "acc": 0.96504889, "epoch": 5.436893203883495, "grad_norm": 9.713578224182129, "learning_rate": 9.905673371105973e-06, "loss": 0.18961357, "memory(GiB)": 15.03, "step": 3080, "train_speed(iter/s)": 1.459825 }, { "acc": 0.96526718, "epoch": 5.445719329214475, "grad_norm": 12.686463356018066, "learning_rate": 9.905107795830622e-06, "loss": 0.22011738, "memory(GiB)": 15.03, "step": 3085, "train_speed(iter/s)": 1.459947 }, { "acc": 0.95567551, "epoch": 5.454545454545454, "grad_norm": 6.805801868438721, "learning_rate": 9.90454054629125e-06, "loss": 0.26008418, "memory(GiB)": 15.03, "step": 3090, "train_speed(iter/s)": 1.460011 }, { "acc": 0.96485548, "epoch": 5.463371579876434, "grad_norm": 6.182193756103516, "learning_rate": 9.903971622681498e-06, "loss": 0.22443993, "memory(GiB)": 15.03, "step": 3095, "train_speed(iter/s)": 1.460135 }, { "acc": 0.9641448, "epoch": 5.472197705207414, "grad_norm": 5.995253562927246, "learning_rate": 9.903401025195578e-06, "loss": 0.21070478, "memory(GiB)": 15.03, "step": 3100, "train_speed(iter/s)": 1.460268 }, { "acc": 0.97012634, "epoch": 5.481023830538394, "grad_norm": 6.134598255157471, "learning_rate": 9.90282875402827e-06, "loss": 0.20684838, "memory(GiB)": 15.03, "step": 3105, "train_speed(iter/s)": 1.460357 }, { "acc": 0.96258945, "epoch": 5.489849955869373, "grad_norm": 6.7752366065979, "learning_rate": 9.902254809374929e-06, "loss": 0.25690956, "memory(GiB)": 15.03, "step": 3110, "train_speed(iter/s)": 1.460369 }, { "acc": 0.96798058, "epoch": 5.498676081200353, "grad_norm": 5.67679500579834, "learning_rate": 9.90167919143148e-06, "loss": 0.19349493, "memory(GiB)": 15.03, "step": 3115, "train_speed(iter/s)": 1.460412 }, { "acc": 0.97016211, "epoch": 5.507502206531333, "grad_norm": 7.724889755249023, "learning_rate": 9.90110190039442e-06, "loss": 0.15805256, "memory(GiB)": 15.03, "step": 3120, "train_speed(iter/s)": 1.460462 }, { "acc": 0.96848392, "epoch": 5.516328331862312, "grad_norm": 15.463862419128418, "learning_rate": 9.900522936460814e-06, "loss": 0.18812046, "memory(GiB)": 15.03, "step": 3125, "train_speed(iter/s)": 1.460473 }, { "acc": 0.96374283, "epoch": 5.525154457193292, "grad_norm": 3.1568658351898193, "learning_rate": 9.899942299828301e-06, "loss": 0.19745121, "memory(GiB)": 15.03, "step": 3130, "train_speed(iter/s)": 1.460558 }, { "acc": 0.96831207, "epoch": 5.533980582524272, "grad_norm": 9.205622673034668, "learning_rate": 9.899359990695092e-06, "loss": 0.17553681, "memory(GiB)": 15.03, "step": 3135, "train_speed(iter/s)": 1.460608 }, { "acc": 0.95882607, "epoch": 5.542806707855251, "grad_norm": 9.263728141784668, "learning_rate": 9.898776009259964e-06, "loss": 0.25710907, "memory(GiB)": 15.03, "step": 3140, "train_speed(iter/s)": 1.46067 }, { "acc": 0.9707716, "epoch": 5.551632833186231, "grad_norm": 4.538776874542236, "learning_rate": 9.898190355722273e-06, "loss": 0.17631458, "memory(GiB)": 15.03, "step": 3145, "train_speed(iter/s)": 1.460712 }, { "acc": 0.96327839, "epoch": 5.560458958517211, "grad_norm": 9.908318519592285, "learning_rate": 9.897603030281936e-06, "loss": 0.25069942, "memory(GiB)": 15.03, "step": 3150, "train_speed(iter/s)": 1.46078 }, { "acc": 0.97016563, "epoch": 5.56928508384819, "grad_norm": 6.323144435882568, "learning_rate": 9.897014033139448e-06, "loss": 0.18773868, "memory(GiB)": 15.03, "step": 3155, "train_speed(iter/s)": 1.460745 }, { "acc": 0.96283779, "epoch": 5.578111209179171, "grad_norm": 6.585527420043945, "learning_rate": 9.896423364495874e-06, "loss": 0.22865107, "memory(GiB)": 15.03, "step": 3160, "train_speed(iter/s)": 1.460891 }, { "acc": 0.96513176, "epoch": 5.58693733451015, "grad_norm": 6.46642541885376, "learning_rate": 9.895831024552845e-06, "loss": 0.18894553, "memory(GiB)": 15.03, "step": 3165, "train_speed(iter/s)": 1.46086 }, { "acc": 0.96771917, "epoch": 5.595763459841129, "grad_norm": 7.056471347808838, "learning_rate": 9.895237013512567e-06, "loss": 0.2054852, "memory(GiB)": 15.03, "step": 3170, "train_speed(iter/s)": 1.461036 }, { "acc": 0.97642879, "epoch": 5.60458958517211, "grad_norm": 6.038671016693115, "learning_rate": 9.89464133157781e-06, "loss": 0.15032374, "memory(GiB)": 15.03, "step": 3175, "train_speed(iter/s)": 1.461133 }, { "acc": 0.97377329, "epoch": 5.613415710503089, "grad_norm": 8.217158317565918, "learning_rate": 9.89404397895193e-06, "loss": 0.16643558, "memory(GiB)": 15.03, "step": 3180, "train_speed(iter/s)": 1.461257 }, { "acc": 0.97794037, "epoch": 5.622241835834069, "grad_norm": 4.41248083114624, "learning_rate": 9.893444955838832e-06, "loss": 0.14443944, "memory(GiB)": 15.03, "step": 3185, "train_speed(iter/s)": 1.461361 }, { "acc": 0.96606016, "epoch": 5.631067961165049, "grad_norm": 7.548863410949707, "learning_rate": 9.892844262443006e-06, "loss": 0.20204682, "memory(GiB)": 15.03, "step": 3190, "train_speed(iter/s)": 1.461463 }, { "acc": 0.95100927, "epoch": 5.639894086496028, "grad_norm": 11.598603248596191, "learning_rate": 9.892241898969511e-06, "loss": 0.28911791, "memory(GiB)": 15.03, "step": 3195, "train_speed(iter/s)": 1.461451 }, { "acc": 0.96938744, "epoch": 5.648720211827008, "grad_norm": 7.353464603424072, "learning_rate": 9.891637865623968e-06, "loss": 0.17864969, "memory(GiB)": 15.03, "step": 3200, "train_speed(iter/s)": 1.461467 }, { "acc": 0.96757774, "epoch": 5.657546337157988, "grad_norm": 9.20964241027832, "learning_rate": 9.891032162612577e-06, "loss": 0.18102056, "memory(GiB)": 15.03, "step": 3205, "train_speed(iter/s)": 1.461596 }, { "acc": 0.9682991, "epoch": 5.666372462488967, "grad_norm": 7.993354797363281, "learning_rate": 9.890424790142101e-06, "loss": 0.21145864, "memory(GiB)": 15.03, "step": 3210, "train_speed(iter/s)": 1.461523 }, { "acc": 0.96417923, "epoch": 5.675198587819947, "grad_norm": 6.230317115783691, "learning_rate": 9.889815748419878e-06, "loss": 0.21028926, "memory(GiB)": 15.03, "step": 3215, "train_speed(iter/s)": 1.46151 }, { "acc": 0.96447487, "epoch": 5.684024713150927, "grad_norm": 4.462335586547852, "learning_rate": 9.889205037653813e-06, "loss": 0.2316673, "memory(GiB)": 15.03, "step": 3220, "train_speed(iter/s)": 1.461505 }, { "acc": 0.97407303, "epoch": 5.692850838481906, "grad_norm": 4.029748916625977, "learning_rate": 9.888592658052382e-06, "loss": 0.15457826, "memory(GiB)": 15.03, "step": 3225, "train_speed(iter/s)": 1.461507 }, { "acc": 0.96541958, "epoch": 5.701676963812886, "grad_norm": 8.23068618774414, "learning_rate": 9.887978609824632e-06, "loss": 0.21286769, "memory(GiB)": 15.03, "step": 3230, "train_speed(iter/s)": 1.461506 }, { "acc": 0.96921005, "epoch": 5.710503089143866, "grad_norm": 6.182116985321045, "learning_rate": 9.887362893180175e-06, "loss": 0.20467772, "memory(GiB)": 15.03, "step": 3235, "train_speed(iter/s)": 1.4615 }, { "acc": 0.95689812, "epoch": 5.719329214474845, "grad_norm": 4.235690116882324, "learning_rate": 9.886745508329196e-06, "loss": 0.24990602, "memory(GiB)": 15.03, "step": 3240, "train_speed(iter/s)": 1.461497 }, { "acc": 0.96634741, "epoch": 5.728155339805825, "grad_norm": 6.346073627471924, "learning_rate": 9.886126455482448e-06, "loss": 0.18534248, "memory(GiB)": 15.03, "step": 3245, "train_speed(iter/s)": 1.461478 }, { "acc": 0.9630434, "epoch": 5.736981465136805, "grad_norm": 9.546823501586914, "learning_rate": 9.885505734851259e-06, "loss": 0.2121954, "memory(GiB)": 15.03, "step": 3250, "train_speed(iter/s)": 1.461505 }, { "acc": 0.96514091, "epoch": 5.745807590467785, "grad_norm": 5.027979373931885, "learning_rate": 9.884883346647516e-06, "loss": 0.21909847, "memory(GiB)": 15.03, "step": 3255, "train_speed(iter/s)": 1.461557 }, { "acc": 0.96804886, "epoch": 5.754633715798764, "grad_norm": 7.601296901702881, "learning_rate": 9.884259291083686e-06, "loss": 0.18840766, "memory(GiB)": 15.03, "step": 3260, "train_speed(iter/s)": 1.461671 }, { "acc": 0.96536083, "epoch": 5.763459841129744, "grad_norm": 4.782262802124023, "learning_rate": 9.883633568372796e-06, "loss": 0.21371379, "memory(GiB)": 15.03, "step": 3265, "train_speed(iter/s)": 1.461708 }, { "acc": 0.96409397, "epoch": 5.772285966460724, "grad_norm": 6.596679210662842, "learning_rate": 9.883006178728448e-06, "loss": 0.23057957, "memory(GiB)": 15.03, "step": 3270, "train_speed(iter/s)": 1.461735 }, { "acc": 0.96900558, "epoch": 5.781112091791703, "grad_norm": 5.581786632537842, "learning_rate": 9.882377122364808e-06, "loss": 0.16647373, "memory(GiB)": 15.03, "step": 3275, "train_speed(iter/s)": 1.461757 }, { "acc": 0.95975981, "epoch": 5.789938217122684, "grad_norm": 8.288549423217773, "learning_rate": 9.88174639949662e-06, "loss": 0.21453633, "memory(GiB)": 15.03, "step": 3280, "train_speed(iter/s)": 1.461725 }, { "acc": 0.96714077, "epoch": 5.798764342453663, "grad_norm": 5.791404724121094, "learning_rate": 9.881114010339185e-06, "loss": 0.17878098, "memory(GiB)": 15.03, "step": 3285, "train_speed(iter/s)": 1.46166 }, { "acc": 0.96368694, "epoch": 5.807590467784642, "grad_norm": 8.04486083984375, "learning_rate": 9.880479955108384e-06, "loss": 0.21904523, "memory(GiB)": 15.03, "step": 3290, "train_speed(iter/s)": 1.46167 }, { "acc": 0.96470852, "epoch": 5.816416593115623, "grad_norm": 5.092523574829102, "learning_rate": 9.879844234020658e-06, "loss": 0.22780242, "memory(GiB)": 15.03, "step": 3295, "train_speed(iter/s)": 1.46168 }, { "acc": 0.96939573, "epoch": 5.825242718446602, "grad_norm": 7.5891499519348145, "learning_rate": 9.879206847293022e-06, "loss": 0.1685048, "memory(GiB)": 15.03, "step": 3300, "train_speed(iter/s)": 1.461705 }, { "acc": 0.96951656, "epoch": 5.834068843777581, "grad_norm": 4.851561069488525, "learning_rate": 9.878567795143057e-06, "loss": 0.1993763, "memory(GiB)": 15.03, "step": 3305, "train_speed(iter/s)": 1.461812 }, { "acc": 0.96654568, "epoch": 5.842894969108562, "grad_norm": 7.472423553466797, "learning_rate": 9.877927077788915e-06, "loss": 0.18275471, "memory(GiB)": 15.03, "step": 3310, "train_speed(iter/s)": 1.461837 }, { "acc": 0.96206646, "epoch": 5.851721094439541, "grad_norm": 6.39948034286499, "learning_rate": 9.877284695449316e-06, "loss": 0.22306528, "memory(GiB)": 15.03, "step": 3315, "train_speed(iter/s)": 1.461941 }, { "acc": 0.97279053, "epoch": 5.86054721977052, "grad_norm": 6.386104106903076, "learning_rate": 9.87664064834354e-06, "loss": 0.1715438, "memory(GiB)": 15.03, "step": 3320, "train_speed(iter/s)": 1.461948 }, { "acc": 0.96851797, "epoch": 5.869373345101501, "grad_norm": 5.896956920623779, "learning_rate": 9.87599493669145e-06, "loss": 0.19266167, "memory(GiB)": 15.03, "step": 3325, "train_speed(iter/s)": 1.461854 }, { "acc": 0.96841345, "epoch": 5.87819947043248, "grad_norm": 5.574155330657959, "learning_rate": 9.875347560713467e-06, "loss": 0.17597585, "memory(GiB)": 15.03, "step": 3330, "train_speed(iter/s)": 1.461853 }, { "acc": 0.96213818, "epoch": 5.887025595763459, "grad_norm": 5.291718482971191, "learning_rate": 9.874698520630582e-06, "loss": 0.20942922, "memory(GiB)": 15.03, "step": 3335, "train_speed(iter/s)": 1.461857 }, { "acc": 0.97075024, "epoch": 5.89585172109444, "grad_norm": 4.2998046875, "learning_rate": 9.874047816664356e-06, "loss": 0.18446898, "memory(GiB)": 15.03, "step": 3340, "train_speed(iter/s)": 1.461856 }, { "acc": 0.96590967, "epoch": 5.904677846425419, "grad_norm": 6.644496917724609, "learning_rate": 9.873395449036916e-06, "loss": 0.20124893, "memory(GiB)": 15.03, "step": 3345, "train_speed(iter/s)": 1.461909 }, { "acc": 0.97446442, "epoch": 5.913503971756399, "grad_norm": 5.310258388519287, "learning_rate": 9.872741417970959e-06, "loss": 0.16527309, "memory(GiB)": 15.03, "step": 3350, "train_speed(iter/s)": 1.462012 }, { "acc": 0.97426214, "epoch": 5.922330097087379, "grad_norm": 5.585916996002197, "learning_rate": 9.87208572368975e-06, "loss": 0.16657225, "memory(GiB)": 15.03, "step": 3355, "train_speed(iter/s)": 1.46192 }, { "acc": 0.97496176, "epoch": 5.931156222418358, "grad_norm": 6.255163192749023, "learning_rate": 9.871428366417115e-06, "loss": 0.17066467, "memory(GiB)": 15.03, "step": 3360, "train_speed(iter/s)": 1.461931 }, { "acc": 0.96620483, "epoch": 5.939982347749338, "grad_norm": 6.175842761993408, "learning_rate": 9.87076934637746e-06, "loss": 0.21256123, "memory(GiB)": 15.03, "step": 3365, "train_speed(iter/s)": 1.462 }, { "acc": 0.97381268, "epoch": 5.948808473080318, "grad_norm": 6.097362518310547, "learning_rate": 9.870108663795747e-06, "loss": 0.16777658, "memory(GiB)": 15.03, "step": 3370, "train_speed(iter/s)": 1.462068 }, { "acc": 0.96275139, "epoch": 5.957634598411297, "grad_norm": 7.616461277008057, "learning_rate": 9.869446318897511e-06, "loss": 0.22415738, "memory(GiB)": 15.03, "step": 3375, "train_speed(iter/s)": 1.462067 }, { "acc": 0.97454853, "epoch": 5.966460723742277, "grad_norm": 6.449368000030518, "learning_rate": 9.868782311908855e-06, "loss": 0.14398859, "memory(GiB)": 15.03, "step": 3380, "train_speed(iter/s)": 1.462067 }, { "acc": 0.96356277, "epoch": 5.975286849073257, "grad_norm": 7.698802471160889, "learning_rate": 9.868116643056448e-06, "loss": 0.19656202, "memory(GiB)": 15.03, "step": 3385, "train_speed(iter/s)": 1.462117 }, { "acc": 0.96262331, "epoch": 5.984112974404237, "grad_norm": 7.512460708618164, "learning_rate": 9.867449312567525e-06, "loss": 0.21820633, "memory(GiB)": 15.03, "step": 3390, "train_speed(iter/s)": 1.462087 }, { "acc": 0.96442194, "epoch": 5.992939099735216, "grad_norm": 6.338830947875977, "learning_rate": 9.866780320669891e-06, "loss": 0.22509024, "memory(GiB)": 15.03, "step": 3395, "train_speed(iter/s)": 1.462198 }, { "acc": 0.96878071, "epoch": 6.001765225066196, "grad_norm": 6.602928638458252, "learning_rate": 9.866109667591916e-06, "loss": 0.19613136, "memory(GiB)": 15.03, "step": 3400, "train_speed(iter/s)": 1.461959 }, { "acc": 0.96916418, "epoch": 6.010591350397176, "grad_norm": 7.301319122314453, "learning_rate": 9.86543735356254e-06, "loss": 0.15830817, "memory(GiB)": 15.03, "step": 3405, "train_speed(iter/s)": 1.462049 }, { "acc": 0.96753807, "epoch": 6.019417475728155, "grad_norm": 6.4732136726379395, "learning_rate": 9.864763378811263e-06, "loss": 0.1918059, "memory(GiB)": 15.03, "step": 3410, "train_speed(iter/s)": 1.462048 }, { "acc": 0.96545868, "epoch": 6.028243601059135, "grad_norm": 7.973384857177734, "learning_rate": 9.864087743568161e-06, "loss": 0.20355525, "memory(GiB)": 15.03, "step": 3415, "train_speed(iter/s)": 1.462122 }, { "acc": 0.96871262, "epoch": 6.037069726390115, "grad_norm": 6.617256164550781, "learning_rate": 9.86341044806387e-06, "loss": 0.17812997, "memory(GiB)": 15.03, "step": 3420, "train_speed(iter/s)": 1.462182 }, { "acc": 0.97164974, "epoch": 6.045895851721094, "grad_norm": 5.174384117126465, "learning_rate": 9.862731492529599e-06, "loss": 0.17810515, "memory(GiB)": 15.03, "step": 3425, "train_speed(iter/s)": 1.462267 }, { "acc": 0.96717463, "epoch": 6.054721977052074, "grad_norm": 4.771603107452393, "learning_rate": 9.862050877197117e-06, "loss": 0.2120091, "memory(GiB)": 15.03, "step": 3430, "train_speed(iter/s)": 1.4622 }, { "acc": 0.96647377, "epoch": 6.063548102383054, "grad_norm": 10.964253425598145, "learning_rate": 9.861368602298762e-06, "loss": 0.22115576, "memory(GiB)": 15.03, "step": 3435, "train_speed(iter/s)": 1.462254 }, { "acc": 0.96901865, "epoch": 6.072374227714033, "grad_norm": 7.0796356201171875, "learning_rate": 9.860684668067442e-06, "loss": 0.20764518, "memory(GiB)": 15.03, "step": 3440, "train_speed(iter/s)": 1.462264 }, { "acc": 0.97206287, "epoch": 6.081200353045014, "grad_norm": 4.703484058380127, "learning_rate": 9.859999074736625e-06, "loss": 0.16258296, "memory(GiB)": 15.03, "step": 3445, "train_speed(iter/s)": 1.462343 }, { "acc": 0.97394943, "epoch": 6.090026478375993, "grad_norm": 7.742722034454346, "learning_rate": 9.859311822540353e-06, "loss": 0.16034372, "memory(GiB)": 15.03, "step": 3450, "train_speed(iter/s)": 1.462411 }, { "acc": 0.96903152, "epoch": 6.098852603706972, "grad_norm": 5.895169734954834, "learning_rate": 9.858622911713226e-06, "loss": 0.18091623, "memory(GiB)": 15.03, "step": 3455, "train_speed(iter/s)": 1.462413 }, { "acc": 0.9647789, "epoch": 6.107678729037953, "grad_norm": 5.2998833656311035, "learning_rate": 9.857932342490417e-06, "loss": 0.21544695, "memory(GiB)": 15.03, "step": 3460, "train_speed(iter/s)": 1.462544 }, { "acc": 0.97065678, "epoch": 6.116504854368932, "grad_norm": 8.228965759277344, "learning_rate": 9.857240115107663e-06, "loss": 0.16774726, "memory(GiB)": 15.03, "step": 3465, "train_speed(iter/s)": 1.462675 }, { "acc": 0.97190504, "epoch": 6.125330979699911, "grad_norm": 5.508465766906738, "learning_rate": 9.856546229801266e-06, "loss": 0.16559716, "memory(GiB)": 15.03, "step": 3470, "train_speed(iter/s)": 1.462741 }, { "acc": 0.96185732, "epoch": 6.134157105030892, "grad_norm": 6.427427291870117, "learning_rate": 9.855850686808091e-06, "loss": 0.21036639, "memory(GiB)": 15.03, "step": 3475, "train_speed(iter/s)": 1.462666 }, { "acc": 0.96608009, "epoch": 6.142983230361871, "grad_norm": 6.687475681304932, "learning_rate": 9.855153486365578e-06, "loss": 0.21822019, "memory(GiB)": 15.03, "step": 3480, "train_speed(iter/s)": 1.462807 }, { "acc": 0.97349396, "epoch": 6.151809355692851, "grad_norm": 5.654672622680664, "learning_rate": 9.854454628711724e-06, "loss": 0.15946544, "memory(GiB)": 15.03, "step": 3485, "train_speed(iter/s)": 1.462837 }, { "acc": 0.97507114, "epoch": 6.160635481023831, "grad_norm": 4.543931484222412, "learning_rate": 9.853754114085095e-06, "loss": 0.1640697, "memory(GiB)": 15.03, "step": 3490, "train_speed(iter/s)": 1.462852 }, { "acc": 0.96853962, "epoch": 6.16946160635481, "grad_norm": 5.165155410766602, "learning_rate": 9.853051942724826e-06, "loss": 0.17066979, "memory(GiB)": 15.03, "step": 3495, "train_speed(iter/s)": 1.462897 }, { "acc": 0.96620464, "epoch": 6.17828773168579, "grad_norm": 9.140493392944336, "learning_rate": 9.852348114870607e-06, "loss": 0.19598982, "memory(GiB)": 15.03, "step": 3500, "train_speed(iter/s)": 1.463015 }, { "acc": 0.97051754, "epoch": 6.18711385701677, "grad_norm": 9.01025390625, "learning_rate": 9.851642630762707e-06, "loss": 0.17753077, "memory(GiB)": 15.03, "step": 3505, "train_speed(iter/s)": 1.46311 }, { "acc": 0.96771364, "epoch": 6.195939982347749, "grad_norm": 7.083347797393799, "learning_rate": 9.850935490641952e-06, "loss": 0.19380356, "memory(GiB)": 15.03, "step": 3510, "train_speed(iter/s)": 1.46323 }, { "acc": 0.96266413, "epoch": 6.204766107678729, "grad_norm": 9.907690048217773, "learning_rate": 9.850226694749736e-06, "loss": 0.22711782, "memory(GiB)": 15.03, "step": 3515, "train_speed(iter/s)": 1.463228 }, { "acc": 0.97467575, "epoch": 6.213592233009709, "grad_norm": 6.452168941497803, "learning_rate": 9.849516243328014e-06, "loss": 0.14521629, "memory(GiB)": 15.03, "step": 3520, "train_speed(iter/s)": 1.463166 }, { "acc": 0.96680775, "epoch": 6.222418358340688, "grad_norm": 6.2983880043029785, "learning_rate": 9.848804136619313e-06, "loss": 0.17086717, "memory(GiB)": 15.03, "step": 3525, "train_speed(iter/s)": 1.463133 }, { "acc": 0.9674015, "epoch": 6.231244483671668, "grad_norm": 6.5996270179748535, "learning_rate": 9.84809037486672e-06, "loss": 0.16331189, "memory(GiB)": 15.03, "step": 3530, "train_speed(iter/s)": 1.463218 }, { "acc": 0.97135277, "epoch": 6.240070609002648, "grad_norm": 7.039328098297119, "learning_rate": 9.847374958313892e-06, "loss": 0.18764704, "memory(GiB)": 15.03, "step": 3535, "train_speed(iter/s)": 1.463284 }, { "acc": 0.97538414, "epoch": 6.248896734333628, "grad_norm": 5.13048791885376, "learning_rate": 9.846657887205042e-06, "loss": 0.15556755, "memory(GiB)": 15.03, "step": 3540, "train_speed(iter/s)": 1.463398 }, { "acc": 0.97641945, "epoch": 6.257722859664607, "grad_norm": 5.879028797149658, "learning_rate": 9.845939161784958e-06, "loss": 0.14477594, "memory(GiB)": 15.03, "step": 3545, "train_speed(iter/s)": 1.463419 }, { "acc": 0.97539082, "epoch": 6.266548984995587, "grad_norm": 5.634106636047363, "learning_rate": 9.845218782298986e-06, "loss": 0.15525434, "memory(GiB)": 15.03, "step": 3550, "train_speed(iter/s)": 1.463326 }, { "acc": 0.96797867, "epoch": 6.275375110326567, "grad_norm": 7.408880710601807, "learning_rate": 9.84449674899304e-06, "loss": 0.21146359, "memory(GiB)": 15.03, "step": 3555, "train_speed(iter/s)": 1.463332 }, { "acc": 0.97032194, "epoch": 6.284201235657546, "grad_norm": 4.520908832550049, "learning_rate": 9.843773062113595e-06, "loss": 0.18019599, "memory(GiB)": 15.03, "step": 3560, "train_speed(iter/s)": 1.46332 }, { "acc": 0.96792011, "epoch": 6.293027360988526, "grad_norm": 6.967878341674805, "learning_rate": 9.843047721907694e-06, "loss": 0.18265424, "memory(GiB)": 15.03, "step": 3565, "train_speed(iter/s)": 1.463382 }, { "acc": 0.97067032, "epoch": 6.301853486319506, "grad_norm": 4.886819362640381, "learning_rate": 9.842320728622946e-06, "loss": 0.16552269, "memory(GiB)": 15.03, "step": 3570, "train_speed(iter/s)": 1.463456 }, { "acc": 0.96947098, "epoch": 6.310679611650485, "grad_norm": 6.460099697113037, "learning_rate": 9.841592082507517e-06, "loss": 0.15458834, "memory(GiB)": 15.03, "step": 3575, "train_speed(iter/s)": 1.463361 }, { "acc": 0.96851006, "epoch": 6.319505736981466, "grad_norm": 6.06338357925415, "learning_rate": 9.840861783810143e-06, "loss": 0.20309105, "memory(GiB)": 15.03, "step": 3580, "train_speed(iter/s)": 1.463431 }, { "acc": 0.97249546, "epoch": 6.328331862312445, "grad_norm": 5.6442742347717285, "learning_rate": 9.840129832780124e-06, "loss": 0.18181851, "memory(GiB)": 15.03, "step": 3585, "train_speed(iter/s)": 1.463493 }, { "acc": 0.97257347, "epoch": 6.337157987643424, "grad_norm": 8.186223983764648, "learning_rate": 9.83939622966732e-06, "loss": 0.17424023, "memory(GiB)": 15.03, "step": 3590, "train_speed(iter/s)": 1.463588 }, { "acc": 0.97022228, "epoch": 6.345984112974405, "grad_norm": 4.396054744720459, "learning_rate": 9.838660974722165e-06, "loss": 0.18452905, "memory(GiB)": 15.03, "step": 3595, "train_speed(iter/s)": 1.463582 }, { "acc": 0.96725521, "epoch": 6.354810238305384, "grad_norm": 5.589823246002197, "learning_rate": 9.837924068195639e-06, "loss": 0.21969619, "memory(GiB)": 15.03, "step": 3600, "train_speed(iter/s)": 1.463544 }, { "acc": 0.97863941, "epoch": 6.363636363636363, "grad_norm": 4.480112075805664, "learning_rate": 9.837185510339306e-06, "loss": 0.14280463, "memory(GiB)": 15.03, "step": 3605, "train_speed(iter/s)": 1.463469 }, { "acc": 0.97822838, "epoch": 6.372462488967344, "grad_norm": 6.872964859008789, "learning_rate": 9.836445301405278e-06, "loss": 0.14818182, "memory(GiB)": 15.03, "step": 3610, "train_speed(iter/s)": 1.463482 }, { "acc": 0.97204094, "epoch": 6.381288614298323, "grad_norm": 3.6577205657958984, "learning_rate": 9.835703441646238e-06, "loss": 0.14643584, "memory(GiB)": 15.03, "step": 3615, "train_speed(iter/s)": 1.463512 }, { "acc": 0.96984901, "epoch": 6.390114739629302, "grad_norm": 5.63392448425293, "learning_rate": 9.834959931315434e-06, "loss": 0.18588499, "memory(GiB)": 15.03, "step": 3620, "train_speed(iter/s)": 1.463601 }, { "acc": 0.97360497, "epoch": 6.398940864960283, "grad_norm": 5.851679801940918, "learning_rate": 9.834214770666674e-06, "loss": 0.14116452, "memory(GiB)": 15.03, "step": 3625, "train_speed(iter/s)": 1.463588 }, { "acc": 0.97427254, "epoch": 6.407766990291262, "grad_norm": 5.656660079956055, "learning_rate": 9.833467959954328e-06, "loss": 0.13960446, "memory(GiB)": 15.03, "step": 3630, "train_speed(iter/s)": 1.463561 }, { "acc": 0.96747131, "epoch": 6.416593115622242, "grad_norm": 7.916586399078369, "learning_rate": 9.832719499433335e-06, "loss": 0.17853031, "memory(GiB)": 15.03, "step": 3635, "train_speed(iter/s)": 1.46359 }, { "acc": 0.9721014, "epoch": 6.425419240953222, "grad_norm": 5.588857173919678, "learning_rate": 9.831969389359188e-06, "loss": 0.16161902, "memory(GiB)": 15.03, "step": 3640, "train_speed(iter/s)": 1.463555 }, { "acc": 0.96815205, "epoch": 6.434245366284201, "grad_norm": 5.713095188140869, "learning_rate": 9.831217629987954e-06, "loss": 0.20783231, "memory(GiB)": 15.03, "step": 3645, "train_speed(iter/s)": 1.463582 }, { "acc": 0.97191381, "epoch": 6.443071491615181, "grad_norm": 5.949958324432373, "learning_rate": 9.830464221576257e-06, "loss": 0.16117878, "memory(GiB)": 15.03, "step": 3650, "train_speed(iter/s)": 1.463602 }, { "acc": 0.97399988, "epoch": 6.451897616946161, "grad_norm": 11.921993255615234, "learning_rate": 9.829709164381283e-06, "loss": 0.15314236, "memory(GiB)": 15.03, "step": 3655, "train_speed(iter/s)": 1.463542 }, { "acc": 0.98048878, "epoch": 6.46072374227714, "grad_norm": 5.308621406555176, "learning_rate": 9.828952458660781e-06, "loss": 0.12191594, "memory(GiB)": 15.03, "step": 3660, "train_speed(iter/s)": 1.463556 }, { "acc": 0.96713848, "epoch": 6.46954986760812, "grad_norm": 5.593404293060303, "learning_rate": 9.828194104673068e-06, "loss": 0.18777621, "memory(GiB)": 15.03, "step": 3665, "train_speed(iter/s)": 1.463606 }, { "acc": 0.97385502, "epoch": 6.4783759929391, "grad_norm": 6.941951751708984, "learning_rate": 9.827434102677017e-06, "loss": 0.15558854, "memory(GiB)": 15.03, "step": 3670, "train_speed(iter/s)": 1.463601 }, { "acc": 0.97471924, "epoch": 6.48720211827008, "grad_norm": 5.999978065490723, "learning_rate": 9.826672452932068e-06, "loss": 0.14228966, "memory(GiB)": 15.03, "step": 3675, "train_speed(iter/s)": 1.463625 }, { "acc": 0.96858444, "epoch": 6.496028243601059, "grad_norm": 4.82997465133667, "learning_rate": 9.825909155698223e-06, "loss": 0.18665462, "memory(GiB)": 15.03, "step": 3680, "train_speed(iter/s)": 1.463708 }, { "acc": 0.97002087, "epoch": 6.504854368932039, "grad_norm": 3.8762457370758057, "learning_rate": 9.825144211236043e-06, "loss": 0.19259851, "memory(GiB)": 15.03, "step": 3685, "train_speed(iter/s)": 1.463708 }, { "acc": 0.97141485, "epoch": 6.513680494263019, "grad_norm": 5.661272048950195, "learning_rate": 9.824377619806655e-06, "loss": 0.17167549, "memory(GiB)": 15.03, "step": 3690, "train_speed(iter/s)": 1.463675 }, { "acc": 0.97301731, "epoch": 6.522506619593998, "grad_norm": 5.667910099029541, "learning_rate": 9.823609381671747e-06, "loss": 0.16097934, "memory(GiB)": 15.03, "step": 3695, "train_speed(iter/s)": 1.463663 }, { "acc": 0.97705221, "epoch": 6.531332744924978, "grad_norm": 7.060276031494141, "learning_rate": 9.822839497093567e-06, "loss": 0.16519277, "memory(GiB)": 15.03, "step": 3700, "train_speed(iter/s)": 1.463774 }, { "acc": 0.97261477, "epoch": 6.540158870255958, "grad_norm": 4.64961576461792, "learning_rate": 9.82206796633493e-06, "loss": 0.18370113, "memory(GiB)": 15.03, "step": 3705, "train_speed(iter/s)": 1.463924 }, { "acc": 0.97307892, "epoch": 6.548984995586937, "grad_norm": 4.709315776824951, "learning_rate": 9.821294789659208e-06, "loss": 0.15898798, "memory(GiB)": 15.03, "step": 3710, "train_speed(iter/s)": 1.464047 }, { "acc": 0.97735672, "epoch": 6.557811120917917, "grad_norm": 4.91945219039917, "learning_rate": 9.820519967330339e-06, "loss": 0.14268646, "memory(GiB)": 15.03, "step": 3715, "train_speed(iter/s)": 1.464082 }, { "acc": 0.97748432, "epoch": 6.566637246248897, "grad_norm": 5.114956378936768, "learning_rate": 9.819743499612817e-06, "loss": 0.13646142, "memory(GiB)": 15.03, "step": 3720, "train_speed(iter/s)": 1.464098 }, { "acc": 0.97563877, "epoch": 6.575463371579876, "grad_norm": 12.786428451538086, "learning_rate": 9.818965386771703e-06, "loss": 0.163557, "memory(GiB)": 15.03, "step": 3725, "train_speed(iter/s)": 1.464219 }, { "acc": 0.97304745, "epoch": 6.5842894969108565, "grad_norm": 4.023158550262451, "learning_rate": 9.81818562907262e-06, "loss": 0.16599528, "memory(GiB)": 15.03, "step": 3730, "train_speed(iter/s)": 1.464245 }, { "acc": 0.97159405, "epoch": 6.593115622241836, "grad_norm": 5.126593112945557, "learning_rate": 9.817404226781748e-06, "loss": 0.16939447, "memory(GiB)": 15.03, "step": 3735, "train_speed(iter/s)": 1.464283 }, { "acc": 0.97497759, "epoch": 6.601941747572815, "grad_norm": 5.553883075714111, "learning_rate": 9.816621180165833e-06, "loss": 0.15512643, "memory(GiB)": 15.03, "step": 3740, "train_speed(iter/s)": 1.464337 }, { "acc": 0.97151985, "epoch": 6.6107678729037955, "grad_norm": 5.65995454788208, "learning_rate": 9.815836489492177e-06, "loss": 0.17805895, "memory(GiB)": 15.03, "step": 3745, "train_speed(iter/s)": 1.464389 }, { "acc": 0.97410316, "epoch": 6.619593998234775, "grad_norm": 5.554707050323486, "learning_rate": 9.815050155028647e-06, "loss": 0.15064867, "memory(GiB)": 15.03, "step": 3750, "train_speed(iter/s)": 1.464411 }, { "acc": 0.960145, "epoch": 6.628420123565754, "grad_norm": 5.91768741607666, "learning_rate": 9.814262177043674e-06, "loss": 0.25109415, "memory(GiB)": 15.03, "step": 3755, "train_speed(iter/s)": 1.464437 }, { "acc": 0.9659029, "epoch": 6.6372462488967345, "grad_norm": 4.187118053436279, "learning_rate": 9.813472555806242e-06, "loss": 0.26206017, "memory(GiB)": 15.03, "step": 3760, "train_speed(iter/s)": 1.464572 }, { "acc": 0.96720467, "epoch": 6.646072374227714, "grad_norm": 5.282883167266846, "learning_rate": 9.812681291585903e-06, "loss": 0.18998146, "memory(GiB)": 15.03, "step": 3765, "train_speed(iter/s)": 1.464596 }, { "acc": 0.97343197, "epoch": 6.654898499558694, "grad_norm": 4.297295093536377, "learning_rate": 9.811888384652768e-06, "loss": 0.16168454, "memory(GiB)": 15.03, "step": 3770, "train_speed(iter/s)": 1.46472 }, { "acc": 0.97105036, "epoch": 6.6637246248896735, "grad_norm": 5.142823219299316, "learning_rate": 9.811093835277506e-06, "loss": 0.17646422, "memory(GiB)": 15.03, "step": 3775, "train_speed(iter/s)": 1.464723 }, { "acc": 0.97459297, "epoch": 6.672550750220653, "grad_norm": 7.745919227600098, "learning_rate": 9.81029764373135e-06, "loss": 0.1568006, "memory(GiB)": 15.03, "step": 3780, "train_speed(iter/s)": 1.464845 }, { "acc": 0.96853857, "epoch": 6.681376875551633, "grad_norm": 7.404074668884277, "learning_rate": 9.80949981028609e-06, "loss": 0.19422791, "memory(GiB)": 15.03, "step": 3785, "train_speed(iter/s)": 1.464855 }, { "acc": 0.96537056, "epoch": 6.6902030008826125, "grad_norm": 8.224377632141113, "learning_rate": 9.808700335214081e-06, "loss": 0.21766295, "memory(GiB)": 15.03, "step": 3790, "train_speed(iter/s)": 1.464828 }, { "acc": 0.97632561, "epoch": 6.699029126213592, "grad_norm": 18.835956573486328, "learning_rate": 9.807899218788241e-06, "loss": 0.15907071, "memory(GiB)": 15.03, "step": 3795, "train_speed(iter/s)": 1.464845 }, { "acc": 0.97197886, "epoch": 6.707855251544572, "grad_norm": 6.191768646240234, "learning_rate": 9.807096461282033e-06, "loss": 0.16936405, "memory(GiB)": 15.03, "step": 3800, "train_speed(iter/s)": 1.464839 }, { "acc": 0.96831884, "epoch": 6.716681376875552, "grad_norm": 7.818177700042725, "learning_rate": 9.8062920629695e-06, "loss": 0.19131317, "memory(GiB)": 15.03, "step": 3805, "train_speed(iter/s)": 1.464853 }, { "acc": 0.97275352, "epoch": 6.725507502206531, "grad_norm": 5.5519914627075195, "learning_rate": 9.80548602412523e-06, "loss": 0.16453705, "memory(GiB)": 15.03, "step": 3810, "train_speed(iter/s)": 1.464879 }, { "acc": 0.96563959, "epoch": 6.734333627537511, "grad_norm": 6.030156135559082, "learning_rate": 9.80467834502438e-06, "loss": 0.19187307, "memory(GiB)": 15.03, "step": 3815, "train_speed(iter/s)": 1.46495 }, { "acc": 0.97558908, "epoch": 6.743159752868491, "grad_norm": 10.64682674407959, "learning_rate": 9.803869025942662e-06, "loss": 0.12621793, "memory(GiB)": 15.03, "step": 3820, "train_speed(iter/s)": 1.46486 }, { "acc": 0.97700844, "epoch": 6.75198587819947, "grad_norm": 4.712008476257324, "learning_rate": 9.80305806715635e-06, "loss": 0.12413369, "memory(GiB)": 15.03, "step": 3825, "train_speed(iter/s)": 1.464917 }, { "acc": 0.96768503, "epoch": 6.76081200353045, "grad_norm": 6.152114391326904, "learning_rate": 9.802245468942277e-06, "loss": 0.22149091, "memory(GiB)": 15.03, "step": 3830, "train_speed(iter/s)": 1.465013 }, { "acc": 0.97000504, "epoch": 6.76963812886143, "grad_norm": 5.81306791305542, "learning_rate": 9.801431231577841e-06, "loss": 0.18951459, "memory(GiB)": 15.03, "step": 3835, "train_speed(iter/s)": 1.465165 }, { "acc": 0.97291641, "epoch": 6.77846425419241, "grad_norm": 7.798123836517334, "learning_rate": 9.800615355340987e-06, "loss": 0.17746874, "memory(GiB)": 15.03, "step": 3840, "train_speed(iter/s)": 1.465237 }, { "acc": 0.97133493, "epoch": 6.787290379523389, "grad_norm": 4.034450531005859, "learning_rate": 9.799797840510232e-06, "loss": 0.183079, "memory(GiB)": 15.03, "step": 3845, "train_speed(iter/s)": 1.465333 }, { "acc": 0.96492329, "epoch": 6.796116504854369, "grad_norm": 8.292844772338867, "learning_rate": 9.798978687364643e-06, "loss": 0.21040006, "memory(GiB)": 15.03, "step": 3850, "train_speed(iter/s)": 1.465381 }, { "acc": 0.97070417, "epoch": 6.804942630185349, "grad_norm": 7.806837558746338, "learning_rate": 9.798157896183858e-06, "loss": 0.165569, "memory(GiB)": 15.03, "step": 3855, "train_speed(iter/s)": 1.465428 }, { "acc": 0.97468443, "epoch": 6.813768755516328, "grad_norm": 7.152675628662109, "learning_rate": 9.797335467248059e-06, "loss": 0.17338738, "memory(GiB)": 15.03, "step": 3860, "train_speed(iter/s)": 1.465476 }, { "acc": 0.9755537, "epoch": 6.8225948808473085, "grad_norm": 4.06453800201416, "learning_rate": 9.796511400837998e-06, "loss": 0.17772341, "memory(GiB)": 15.03, "step": 3865, "train_speed(iter/s)": 1.465524 }, { "acc": 0.96264801, "epoch": 6.831421006178288, "grad_norm": 6.702805519104004, "learning_rate": 9.795685697234985e-06, "loss": 0.21787624, "memory(GiB)": 15.03, "step": 3870, "train_speed(iter/s)": 1.465549 }, { "acc": 0.97400169, "epoch": 6.840247131509267, "grad_norm": 7.842660427093506, "learning_rate": 9.794858356720884e-06, "loss": 0.13476212, "memory(GiB)": 15.03, "step": 3875, "train_speed(iter/s)": 1.4656 }, { "acc": 0.97265091, "epoch": 6.8490732568402475, "grad_norm": 5.55590295791626, "learning_rate": 9.794029379578123e-06, "loss": 0.18130087, "memory(GiB)": 15.03, "step": 3880, "train_speed(iter/s)": 1.46559 }, { "acc": 0.97544422, "epoch": 6.857899382171227, "grad_norm": 5.7380523681640625, "learning_rate": 9.793198766089682e-06, "loss": 0.14541512, "memory(GiB)": 15.03, "step": 3885, "train_speed(iter/s)": 1.4656 }, { "acc": 0.97079601, "epoch": 6.866725507502206, "grad_norm": 14.027680397033691, "learning_rate": 9.792366516539107e-06, "loss": 0.17474136, "memory(GiB)": 15.03, "step": 3890, "train_speed(iter/s)": 1.465648 }, { "acc": 0.97176571, "epoch": 6.8755516328331865, "grad_norm": 4.541036605834961, "learning_rate": 9.7915326312105e-06, "loss": 0.20224273, "memory(GiB)": 15.03, "step": 3895, "train_speed(iter/s)": 1.465705 }, { "acc": 0.97593708, "epoch": 6.884377758164166, "grad_norm": 4.729853630065918, "learning_rate": 9.790697110388518e-06, "loss": 0.14596429, "memory(GiB)": 15.03, "step": 3900, "train_speed(iter/s)": 1.465726 }, { "acc": 0.97063847, "epoch": 6.893203883495145, "grad_norm": 7.530506610870361, "learning_rate": 9.78985995435838e-06, "loss": 0.17983183, "memory(GiB)": 15.03, "step": 3905, "train_speed(iter/s)": 1.465865 }, { "acc": 0.97157507, "epoch": 6.9020300088261255, "grad_norm": 4.861341953277588, "learning_rate": 9.789021163405862e-06, "loss": 0.17271166, "memory(GiB)": 15.03, "step": 3910, "train_speed(iter/s)": 1.465773 }, { "acc": 0.97042141, "epoch": 6.910856134157105, "grad_norm": 7.221363067626953, "learning_rate": 9.7881807378173e-06, "loss": 0.16081216, "memory(GiB)": 15.03, "step": 3915, "train_speed(iter/s)": 1.465904 }, { "acc": 0.97386932, "epoch": 6.919682259488084, "grad_norm": 7.894848346710205, "learning_rate": 9.787338677879586e-06, "loss": 0.16449869, "memory(GiB)": 15.03, "step": 3920, "train_speed(iter/s)": 1.465905 }, { "acc": 0.97458038, "epoch": 6.9285083848190645, "grad_norm": 8.836786270141602, "learning_rate": 9.786494983880167e-06, "loss": 0.16751462, "memory(GiB)": 15.03, "step": 3925, "train_speed(iter/s)": 1.466019 }, { "acc": 0.97022934, "epoch": 6.937334510150044, "grad_norm": 7.052101135253906, "learning_rate": 9.785649656107056e-06, "loss": 0.17001345, "memory(GiB)": 15.03, "step": 3930, "train_speed(iter/s)": 1.466069 }, { "acc": 0.97344122, "epoch": 6.946160635481024, "grad_norm": 7.273437023162842, "learning_rate": 9.784802694848815e-06, "loss": 0.16032242, "memory(GiB)": 15.03, "step": 3935, "train_speed(iter/s)": 1.466174 }, { "acc": 0.97758474, "epoch": 6.9549867608120035, "grad_norm": 4.540959358215332, "learning_rate": 9.783954100394569e-06, "loss": 0.12887988, "memory(GiB)": 15.03, "step": 3940, "train_speed(iter/s)": 1.466239 }, { "acc": 0.97943172, "epoch": 6.963812886142983, "grad_norm": 14.332303047180176, "learning_rate": 9.783103873033999e-06, "loss": 0.12719306, "memory(GiB)": 15.03, "step": 3945, "train_speed(iter/s)": 1.46629 }, { "acc": 0.97847576, "epoch": 6.972639011473963, "grad_norm": 4.3686418533325195, "learning_rate": 9.782252013057341e-06, "loss": 0.14430101, "memory(GiB)": 15.03, "step": 3950, "train_speed(iter/s)": 1.466374 }, { "acc": 0.97962379, "epoch": 6.9814651368049425, "grad_norm": 3.5471079349517822, "learning_rate": 9.781398520755394e-06, "loss": 0.12964729, "memory(GiB)": 15.03, "step": 3955, "train_speed(iter/s)": 1.466447 }, { "acc": 0.96860685, "epoch": 6.990291262135923, "grad_norm": 5.742743968963623, "learning_rate": 9.78054339641951e-06, "loss": 0.18672926, "memory(GiB)": 15.03, "step": 3960, "train_speed(iter/s)": 1.466402 }, { "acc": 0.97997179, "epoch": 6.999117387466902, "grad_norm": 4.915040493011475, "learning_rate": 9.779686640341596e-06, "loss": 0.13108799, "memory(GiB)": 15.03, "step": 3965, "train_speed(iter/s)": 1.466502 }, { "acc": 0.97954979, "epoch": 7.0079435127978815, "grad_norm": 6.651116371154785, "learning_rate": 9.778828252814122e-06, "loss": 0.13783989, "memory(GiB)": 15.03, "step": 3970, "train_speed(iter/s)": 1.466325 }, { "acc": 0.98336229, "epoch": 7.016769638128862, "grad_norm": 4.123208522796631, "learning_rate": 9.777968234130111e-06, "loss": 0.11572157, "memory(GiB)": 15.03, "step": 3975, "train_speed(iter/s)": 1.466299 }, { "acc": 0.97423801, "epoch": 7.025595763459841, "grad_norm": 7.389788627624512, "learning_rate": 9.777106584583146e-06, "loss": 0.14631047, "memory(GiB)": 15.03, "step": 3980, "train_speed(iter/s)": 1.466328 }, { "acc": 0.97762213, "epoch": 7.0344218887908205, "grad_norm": 7.473908424377441, "learning_rate": 9.77624330446736e-06, "loss": 0.127522, "memory(GiB)": 15.03, "step": 3985, "train_speed(iter/s)": 1.46639 }, { "acc": 0.97362719, "epoch": 7.043248014121801, "grad_norm": 5.29202938079834, "learning_rate": 9.775378394077452e-06, "loss": 0.15075657, "memory(GiB)": 15.03, "step": 3990, "train_speed(iter/s)": 1.46632 }, { "acc": 0.97760983, "epoch": 7.05207413945278, "grad_norm": 6.835725784301758, "learning_rate": 9.774511853708667e-06, "loss": 0.12623527, "memory(GiB)": 15.03, "step": 3995, "train_speed(iter/s)": 1.466232 }, { "acc": 0.97309866, "epoch": 7.0609002647837595, "grad_norm": 6.230284690856934, "learning_rate": 9.773643683656817e-06, "loss": 0.18601971, "memory(GiB)": 15.03, "step": 4000, "train_speed(iter/s)": 1.466366 }, { "acc": 0.97752552, "epoch": 7.06972639011474, "grad_norm": 2.919517755508423, "learning_rate": 9.772773884218263e-06, "loss": 0.13775705, "memory(GiB)": 15.03, "step": 4005, "train_speed(iter/s)": 1.466412 }, { "acc": 0.97516651, "epoch": 7.078552515445719, "grad_norm": 3.1090049743652344, "learning_rate": 9.771902455689925e-06, "loss": 0.15589247, "memory(GiB)": 15.03, "step": 4010, "train_speed(iter/s)": 1.466392 }, { "acc": 0.97824955, "epoch": 7.087378640776699, "grad_norm": 4.173274993896484, "learning_rate": 9.771029398369278e-06, "loss": 0.13717202, "memory(GiB)": 15.03, "step": 4015, "train_speed(iter/s)": 1.466423 }, { "acc": 0.97621689, "epoch": 7.096204766107679, "grad_norm": 5.402634620666504, "learning_rate": 9.770154712554354e-06, "loss": 0.15604575, "memory(GiB)": 15.03, "step": 4020, "train_speed(iter/s)": 1.466481 }, { "acc": 0.98435278, "epoch": 7.105030891438658, "grad_norm": 4.402596950531006, "learning_rate": 9.76927839854374e-06, "loss": 0.09814033, "memory(GiB)": 15.03, "step": 4025, "train_speed(iter/s)": 1.466535 }, { "acc": 0.97798605, "epoch": 7.113857016769638, "grad_norm": 16.100637435913086, "learning_rate": 9.768400456636581e-06, "loss": 0.18023326, "memory(GiB)": 15.03, "step": 4030, "train_speed(iter/s)": 1.466575 }, { "acc": 0.97918806, "epoch": 7.122683142100618, "grad_norm": 4.121559143066406, "learning_rate": 9.767520887132577e-06, "loss": 0.12359972, "memory(GiB)": 15.03, "step": 4035, "train_speed(iter/s)": 1.466639 }, { "acc": 0.97749548, "epoch": 7.131509267431597, "grad_norm": 4.178430080413818, "learning_rate": 9.766639690331978e-06, "loss": 0.15756319, "memory(GiB)": 15.03, "step": 4040, "train_speed(iter/s)": 1.466693 }, { "acc": 0.97337408, "epoch": 7.1403353927625774, "grad_norm": 8.438490867614746, "learning_rate": 9.7657568665356e-06, "loss": 0.1480582, "memory(GiB)": 15.03, "step": 4045, "train_speed(iter/s)": 1.466753 }, { "acc": 0.98485479, "epoch": 7.149161518093557, "grad_norm": 9.386825561523438, "learning_rate": 9.764872416044805e-06, "loss": 0.11246498, "memory(GiB)": 15.03, "step": 4050, "train_speed(iter/s)": 1.46681 }, { "acc": 0.97203302, "epoch": 7.157987643424537, "grad_norm": 5.868162155151367, "learning_rate": 9.763986339161514e-06, "loss": 0.16797638, "memory(GiB)": 15.03, "step": 4055, "train_speed(iter/s)": 1.466862 }, { "acc": 0.97647057, "epoch": 7.1668137687555165, "grad_norm": 5.983351707458496, "learning_rate": 9.763098636188204e-06, "loss": 0.14341037, "memory(GiB)": 15.03, "step": 4060, "train_speed(iter/s)": 1.46683 }, { "acc": 0.98145618, "epoch": 7.175639894086496, "grad_norm": 3.1942780017852783, "learning_rate": 9.762209307427905e-06, "loss": 0.13737965, "memory(GiB)": 15.03, "step": 4065, "train_speed(iter/s)": 1.46696 }, { "acc": 0.97190628, "epoch": 7.184466019417476, "grad_norm": 6.958121299743652, "learning_rate": 9.761318353184206e-06, "loss": 0.17976205, "memory(GiB)": 15.03, "step": 4070, "train_speed(iter/s)": 1.46696 }, { "acc": 0.97147779, "epoch": 7.1932921447484555, "grad_norm": 7.185634136199951, "learning_rate": 9.760425773761245e-06, "loss": 0.18060596, "memory(GiB)": 15.03, "step": 4075, "train_speed(iter/s)": 1.467029 }, { "acc": 0.97480412, "epoch": 7.202118270079435, "grad_norm": 4.83156156539917, "learning_rate": 9.75953156946372e-06, "loss": 0.16287677, "memory(GiB)": 15.03, "step": 4080, "train_speed(iter/s)": 1.46699 }, { "acc": 0.970047, "epoch": 7.210944395410415, "grad_norm": 4.020845890045166, "learning_rate": 9.758635740596882e-06, "loss": 0.15425205, "memory(GiB)": 15.03, "step": 4085, "train_speed(iter/s)": 1.466962 }, { "acc": 0.97913475, "epoch": 7.2197705207413945, "grad_norm": 4.904809951782227, "learning_rate": 9.757738287466532e-06, "loss": 0.1392329, "memory(GiB)": 15.03, "step": 4090, "train_speed(iter/s)": 1.467012 }, { "acc": 0.97633371, "epoch": 7.228596646072374, "grad_norm": 4.34108304977417, "learning_rate": 9.756839210379033e-06, "loss": 0.16685638, "memory(GiB)": 15.03, "step": 4095, "train_speed(iter/s)": 1.467006 }, { "acc": 0.97880802, "epoch": 7.237422771403354, "grad_norm": 6.1107964515686035, "learning_rate": 9.755938509641298e-06, "loss": 0.12619407, "memory(GiB)": 15.03, "step": 4100, "train_speed(iter/s)": 1.467044 }, { "acc": 0.97770576, "epoch": 7.2462488967343335, "grad_norm": 6.061568260192871, "learning_rate": 9.755036185560796e-06, "loss": 0.14478559, "memory(GiB)": 15.03, "step": 4105, "train_speed(iter/s)": 1.467084 }, { "acc": 0.97692852, "epoch": 7.255075022065314, "grad_norm": 5.449748516082764, "learning_rate": 9.75413223844555e-06, "loss": 0.15157639, "memory(GiB)": 15.03, "step": 4110, "train_speed(iter/s)": 1.467073 }, { "acc": 0.97848186, "epoch": 7.263901147396293, "grad_norm": 3.864969491958618, "learning_rate": 9.753226668604132e-06, "loss": 0.13246437, "memory(GiB)": 15.03, "step": 4115, "train_speed(iter/s)": 1.467137 }, { "acc": 0.97535667, "epoch": 7.2727272727272725, "grad_norm": 6.914940357208252, "learning_rate": 9.752319476345678e-06, "loss": 0.16454086, "memory(GiB)": 15.03, "step": 4120, "train_speed(iter/s)": 1.467069 }, { "acc": 0.98035288, "epoch": 7.281553398058253, "grad_norm": 6.057742595672607, "learning_rate": 9.751410661979869e-06, "loss": 0.12974104, "memory(GiB)": 15.03, "step": 4125, "train_speed(iter/s)": 1.467132 }, { "acc": 0.96243048, "epoch": 7.290379523389232, "grad_norm": 4.671562671661377, "learning_rate": 9.750500225816943e-06, "loss": 0.23827763, "memory(GiB)": 15.03, "step": 4130, "train_speed(iter/s)": 1.467192 }, { "acc": 0.97867146, "epoch": 7.2992056487202115, "grad_norm": 5.838350772857666, "learning_rate": 9.749588168167691e-06, "loss": 0.1377627, "memory(GiB)": 15.03, "step": 4135, "train_speed(iter/s)": 1.467204 }, { "acc": 0.97923498, "epoch": 7.308031774051192, "grad_norm": 3.3657336235046387, "learning_rate": 9.748674489343462e-06, "loss": 0.12390119, "memory(GiB)": 15.03, "step": 4140, "train_speed(iter/s)": 1.467128 }, { "acc": 0.9826416, "epoch": 7.316857899382171, "grad_norm": 4.1938042640686035, "learning_rate": 9.747759189656148e-06, "loss": 0.10332088, "memory(GiB)": 15.03, "step": 4145, "train_speed(iter/s)": 1.467224 }, { "acc": 0.97816401, "epoch": 7.325684024713151, "grad_norm": 7.172029972076416, "learning_rate": 9.746842269418207e-06, "loss": 0.13747089, "memory(GiB)": 15.03, "step": 4150, "train_speed(iter/s)": 1.467299 }, { "acc": 0.97278099, "epoch": 7.334510150044131, "grad_norm": 6.1291913986206055, "learning_rate": 9.74592372894264e-06, "loss": 0.14959476, "memory(GiB)": 15.03, "step": 4155, "train_speed(iter/s)": 1.467395 }, { "acc": 0.97768841, "epoch": 7.34333627537511, "grad_norm": 4.69340181350708, "learning_rate": 9.745003568543006e-06, "loss": 0.12764344, "memory(GiB)": 15.03, "step": 4160, "train_speed(iter/s)": 1.467378 }, { "acc": 0.98022766, "epoch": 7.35216240070609, "grad_norm": 4.881111145019531, "learning_rate": 9.744081788533416e-06, "loss": 0.13950715, "memory(GiB)": 15.03, "step": 4165, "train_speed(iter/s)": 1.467436 }, { "acc": 0.9771946, "epoch": 7.36098852603707, "grad_norm": 7.7263970375061035, "learning_rate": 9.743158389228534e-06, "loss": 0.12823375, "memory(GiB)": 15.03, "step": 4170, "train_speed(iter/s)": 1.467476 }, { "acc": 0.98259916, "epoch": 7.369814651368049, "grad_norm": 6.574479103088379, "learning_rate": 9.74223337094358e-06, "loss": 0.11419473, "memory(GiB)": 15.03, "step": 4175, "train_speed(iter/s)": 1.467577 }, { "acc": 0.97647457, "epoch": 7.378640776699029, "grad_norm": 5.902623176574707, "learning_rate": 9.741306733994318e-06, "loss": 0.14674128, "memory(GiB)": 15.03, "step": 4180, "train_speed(iter/s)": 1.467558 }, { "acc": 0.97487831, "epoch": 7.387466902030009, "grad_norm": 4.3700690269470215, "learning_rate": 9.740378478697071e-06, "loss": 0.15583037, "memory(GiB)": 15.03, "step": 4185, "train_speed(iter/s)": 1.467589 }, { "acc": 0.97369804, "epoch": 7.396293027360988, "grad_norm": 5.648822784423828, "learning_rate": 9.739448605368715e-06, "loss": 0.17972007, "memory(GiB)": 15.03, "step": 4190, "train_speed(iter/s)": 1.467568 }, { "acc": 0.97345181, "epoch": 7.405119152691968, "grad_norm": 8.76723575592041, "learning_rate": 9.73851711432668e-06, "loss": 0.16937921, "memory(GiB)": 15.03, "step": 4195, "train_speed(iter/s)": 1.467667 }, { "acc": 0.97867737, "epoch": 7.413945278022948, "grad_norm": 7.373640537261963, "learning_rate": 9.737584005888937e-06, "loss": 0.14012086, "memory(GiB)": 15.03, "step": 4200, "train_speed(iter/s)": 1.46771 }, { "acc": 0.97992496, "epoch": 7.422771403353927, "grad_norm": 4.022878170013428, "learning_rate": 9.736649280374021e-06, "loss": 0.13751106, "memory(GiB)": 15.03, "step": 4205, "train_speed(iter/s)": 1.467765 }, { "acc": 0.97665596, "epoch": 7.431597528684907, "grad_norm": 4.980714797973633, "learning_rate": 9.735712938101017e-06, "loss": 0.15570583, "memory(GiB)": 15.03, "step": 4210, "train_speed(iter/s)": 1.46781 }, { "acc": 0.97511721, "epoch": 7.440423654015887, "grad_norm": 7.825179576873779, "learning_rate": 9.734774979389558e-06, "loss": 0.16144613, "memory(GiB)": 15.03, "step": 4215, "train_speed(iter/s)": 1.467833 }, { "acc": 0.97755585, "epoch": 7.449249779346867, "grad_norm": 5.622488975524902, "learning_rate": 9.733835404559831e-06, "loss": 0.15244308, "memory(GiB)": 15.03, "step": 4220, "train_speed(iter/s)": 1.467838 }, { "acc": 0.97394695, "epoch": 7.458075904677846, "grad_norm": 5.212650299072266, "learning_rate": 9.732894213932574e-06, "loss": 0.15793748, "memory(GiB)": 15.03, "step": 4225, "train_speed(iter/s)": 1.467818 }, { "acc": 0.9782815, "epoch": 7.466902030008826, "grad_norm": 3.2398345470428467, "learning_rate": 9.731951407829078e-06, "loss": 0.15943323, "memory(GiB)": 15.03, "step": 4230, "train_speed(iter/s)": 1.467862 }, { "acc": 0.98288555, "epoch": 7.475728155339806, "grad_norm": 2.9303910732269287, "learning_rate": 9.731006986571185e-06, "loss": 0.11151319, "memory(GiB)": 15.03, "step": 4235, "train_speed(iter/s)": 1.46797 }, { "acc": 0.97769089, "epoch": 7.484554280670785, "grad_norm": 4.921606063842773, "learning_rate": 9.730060950481284e-06, "loss": 0.13288517, "memory(GiB)": 15.03, "step": 4240, "train_speed(iter/s)": 1.467956 }, { "acc": 0.97582531, "epoch": 7.493380406001766, "grad_norm": 3.736227512359619, "learning_rate": 9.729113299882324e-06, "loss": 0.16069937, "memory(GiB)": 15.03, "step": 4245, "train_speed(iter/s)": 1.468007 }, { "acc": 0.97607117, "epoch": 7.502206531332745, "grad_norm": 4.380244731903076, "learning_rate": 9.728164035097797e-06, "loss": 0.15974851, "memory(GiB)": 15.03, "step": 4250, "train_speed(iter/s)": 1.468082 }, { "acc": 0.9752573, "epoch": 7.511032656663724, "grad_norm": 6.441835880279541, "learning_rate": 9.727213156451752e-06, "loss": 0.16795797, "memory(GiB)": 15.03, "step": 4255, "train_speed(iter/s)": 1.468198 }, { "acc": 0.96835594, "epoch": 7.519858781994705, "grad_norm": 4.285942077636719, "learning_rate": 9.726260664268785e-06, "loss": 0.17488027, "memory(GiB)": 15.03, "step": 4260, "train_speed(iter/s)": 1.468185 }, { "acc": 0.98171234, "epoch": 7.528684907325684, "grad_norm": 4.743375301361084, "learning_rate": 9.72530655887404e-06, "loss": 0.11825235, "memory(GiB)": 15.03, "step": 4265, "train_speed(iter/s)": 1.468278 }, { "acc": 0.98602123, "epoch": 7.5375110326566634, "grad_norm": 5.062646389007568, "learning_rate": 9.724350840593223e-06, "loss": 0.08361697, "memory(GiB)": 15.03, "step": 4270, "train_speed(iter/s)": 1.468283 }, { "acc": 0.98030052, "epoch": 7.546337157987644, "grad_norm": 2.5854015350341797, "learning_rate": 9.72339350975258e-06, "loss": 0.12481074, "memory(GiB)": 15.03, "step": 4275, "train_speed(iter/s)": 1.468294 }, { "acc": 0.97558823, "epoch": 7.555163283318623, "grad_norm": 4.200395584106445, "learning_rate": 9.72243456667891e-06, "loss": 0.13199121, "memory(GiB)": 15.03, "step": 4280, "train_speed(iter/s)": 1.468264 }, { "acc": 0.98024712, "epoch": 7.5639894086496025, "grad_norm": 5.4655914306640625, "learning_rate": 9.72147401169956e-06, "loss": 0.14097159, "memory(GiB)": 15.03, "step": 4285, "train_speed(iter/s)": 1.468259 }, { "acc": 0.97774925, "epoch": 7.572815533980583, "grad_norm": 6.129053115844727, "learning_rate": 9.720511845142437e-06, "loss": 0.147523, "memory(GiB)": 15.03, "step": 4290, "train_speed(iter/s)": 1.468268 }, { "acc": 0.98307629, "epoch": 7.581641659311562, "grad_norm": 5.71121072769165, "learning_rate": 9.719548067335989e-06, "loss": 0.10876029, "memory(GiB)": 15.03, "step": 4295, "train_speed(iter/s)": 1.468343 }, { "acc": 0.9798502, "epoch": 7.5904677846425415, "grad_norm": 5.473901271820068, "learning_rate": 9.718582678609214e-06, "loss": 0.1166597, "memory(GiB)": 15.03, "step": 4300, "train_speed(iter/s)": 1.468281 }, { "acc": 0.97852249, "epoch": 7.599293909973522, "grad_norm": 3.998239517211914, "learning_rate": 9.717615679291665e-06, "loss": 0.13703868, "memory(GiB)": 15.03, "step": 4305, "train_speed(iter/s)": 1.468321 }, { "acc": 0.9796299, "epoch": 7.608120035304501, "grad_norm": 5.236603260040283, "learning_rate": 9.716647069713442e-06, "loss": 0.13337867, "memory(GiB)": 15.03, "step": 4310, "train_speed(iter/s)": 1.468369 }, { "acc": 0.98078146, "epoch": 7.616946160635481, "grad_norm": 3.63881254196167, "learning_rate": 9.715676850205193e-06, "loss": 0.12363307, "memory(GiB)": 15.03, "step": 4315, "train_speed(iter/s)": 1.468392 }, { "acc": 0.97557583, "epoch": 7.625772285966461, "grad_norm": 5.446394443511963, "learning_rate": 9.71470502109812e-06, "loss": 0.14488807, "memory(GiB)": 15.03, "step": 4320, "train_speed(iter/s)": 1.468461 }, { "acc": 0.98060551, "epoch": 7.63459841129744, "grad_norm": 7.205985069274902, "learning_rate": 9.71373158272397e-06, "loss": 0.13567809, "memory(GiB)": 15.03, "step": 4325, "train_speed(iter/s)": 1.468462 }, { "acc": 0.97822237, "epoch": 7.64342453662842, "grad_norm": 6.537880897521973, "learning_rate": 9.712756535415043e-06, "loss": 0.13166834, "memory(GiB)": 15.03, "step": 4330, "train_speed(iter/s)": 1.468554 }, { "acc": 0.98105927, "epoch": 7.6522506619594, "grad_norm": 6.519103050231934, "learning_rate": 9.711779879504186e-06, "loss": 0.11593565, "memory(GiB)": 15.03, "step": 4335, "train_speed(iter/s)": 1.468663 }, { "acc": 0.97721043, "epoch": 7.66107678729038, "grad_norm": 4.776328086853027, "learning_rate": 9.710801615324794e-06, "loss": 0.12788968, "memory(GiB)": 15.03, "step": 4340, "train_speed(iter/s)": 1.468686 }, { "acc": 0.98163624, "epoch": 7.669902912621359, "grad_norm": 5.058078765869141, "learning_rate": 9.709821743210815e-06, "loss": 0.11915751, "memory(GiB)": 15.03, "step": 4345, "train_speed(iter/s)": 1.468764 }, { "acc": 0.97685862, "epoch": 7.678729037952339, "grad_norm": 11.300054550170898, "learning_rate": 9.708840263496741e-06, "loss": 0.14170654, "memory(GiB)": 15.03, "step": 4350, "train_speed(iter/s)": 1.46886 }, { "acc": 0.97941742, "epoch": 7.687555163283319, "grad_norm": 4.905223369598389, "learning_rate": 9.70785717651762e-06, "loss": 0.14144856, "memory(GiB)": 15.03, "step": 4355, "train_speed(iter/s)": 1.468819 }, { "acc": 0.97871647, "epoch": 7.696381288614298, "grad_norm": 3.4345664978027344, "learning_rate": 9.70687248260904e-06, "loss": 0.12738779, "memory(GiB)": 15.03, "step": 4360, "train_speed(iter/s)": 1.468819 }, { "acc": 0.98140316, "epoch": 7.705207413945278, "grad_norm": 4.682449817657471, "learning_rate": 9.70588618210714e-06, "loss": 0.11228495, "memory(GiB)": 15.03, "step": 4365, "train_speed(iter/s)": 1.468845 }, { "acc": 0.97248268, "epoch": 7.714033539276258, "grad_norm": 6.134199619293213, "learning_rate": 9.704898275348614e-06, "loss": 0.1656669, "memory(GiB)": 15.03, "step": 4370, "train_speed(iter/s)": 1.468893 }, { "acc": 0.97908268, "epoch": 7.722859664607237, "grad_norm": 4.8266921043396, "learning_rate": 9.703908762670697e-06, "loss": 0.12972388, "memory(GiB)": 15.03, "step": 4375, "train_speed(iter/s)": 1.468999 }, { "acc": 0.97499256, "epoch": 7.731685789938217, "grad_norm": 6.818910598754883, "learning_rate": 9.702917644411173e-06, "loss": 0.16377864, "memory(GiB)": 15.03, "step": 4380, "train_speed(iter/s)": 1.469097 }, { "acc": 0.97547913, "epoch": 7.740511915269197, "grad_norm": 5.746918201446533, "learning_rate": 9.701924920908378e-06, "loss": 0.14347999, "memory(GiB)": 15.03, "step": 4385, "train_speed(iter/s)": 1.469053 }, { "acc": 0.97973576, "epoch": 7.749338040600176, "grad_norm": 6.424937725067139, "learning_rate": 9.70093059250119e-06, "loss": 0.13533881, "memory(GiB)": 15.03, "step": 4390, "train_speed(iter/s)": 1.469054 }, { "acc": 0.97511892, "epoch": 7.758164165931156, "grad_norm": 4.663536548614502, "learning_rate": 9.699934659529043e-06, "loss": 0.15673716, "memory(GiB)": 15.03, "step": 4395, "train_speed(iter/s)": 1.469145 }, { "acc": 0.97743568, "epoch": 7.766990291262136, "grad_norm": 9.02021598815918, "learning_rate": 9.698937122331913e-06, "loss": 0.13420472, "memory(GiB)": 15.03, "step": 4400, "train_speed(iter/s)": 1.469137 }, { "acc": 0.97576523, "epoch": 7.775816416593115, "grad_norm": 10.151236534118652, "learning_rate": 9.697937981250324e-06, "loss": 0.15754929, "memory(GiB)": 15.03, "step": 4405, "train_speed(iter/s)": 1.469125 }, { "acc": 0.97952423, "epoch": 7.784642541924096, "grad_norm": 3.7734618186950684, "learning_rate": 9.696937236625345e-06, "loss": 0.1266537, "memory(GiB)": 15.03, "step": 4410, "train_speed(iter/s)": 1.469143 }, { "acc": 0.97795305, "epoch": 7.793468667255075, "grad_norm": 3.227010488510132, "learning_rate": 9.695934888798603e-06, "loss": 0.15525159, "memory(GiB)": 15.03, "step": 4415, "train_speed(iter/s)": 1.469155 }, { "acc": 0.97742157, "epoch": 7.802294792586054, "grad_norm": 5.889753341674805, "learning_rate": 9.694930938112258e-06, "loss": 0.1199729, "memory(GiB)": 15.03, "step": 4420, "train_speed(iter/s)": 1.469205 }, { "acc": 0.97599792, "epoch": 7.811120917917035, "grad_norm": 5.094205856323242, "learning_rate": 9.69392538490903e-06, "loss": 0.15505946, "memory(GiB)": 15.03, "step": 4425, "train_speed(iter/s)": 1.469252 }, { "acc": 0.98504677, "epoch": 7.819947043248014, "grad_norm": 3.420966863632202, "learning_rate": 9.692918229532174e-06, "loss": 0.09520102, "memory(GiB)": 15.03, "step": 4430, "train_speed(iter/s)": 1.469338 }, { "acc": 0.97962694, "epoch": 7.828773168578994, "grad_norm": 5.763525485992432, "learning_rate": 9.691909472325503e-06, "loss": 0.1419737, "memory(GiB)": 15.03, "step": 4435, "train_speed(iter/s)": 1.469338 }, { "acc": 0.97729797, "epoch": 7.837599293909974, "grad_norm": 4.2080535888671875, "learning_rate": 9.690899113633371e-06, "loss": 0.15383794, "memory(GiB)": 15.03, "step": 4440, "train_speed(iter/s)": 1.469444 }, { "acc": 0.97521467, "epoch": 7.846425419240953, "grad_norm": 5.233713626861572, "learning_rate": 9.689887153800678e-06, "loss": 0.17374724, "memory(GiB)": 15.03, "step": 4445, "train_speed(iter/s)": 1.469468 }, { "acc": 0.98611507, "epoch": 7.855251544571933, "grad_norm": 3.4369845390319824, "learning_rate": 9.688873593172873e-06, "loss": 0.10446981, "memory(GiB)": 15.03, "step": 4450, "train_speed(iter/s)": 1.469551 }, { "acc": 0.98296032, "epoch": 7.864077669902913, "grad_norm": 3.4128317832946777, "learning_rate": 9.687858432095951e-06, "loss": 0.11739469, "memory(GiB)": 15.03, "step": 4455, "train_speed(iter/s)": 1.469498 }, { "acc": 0.97402658, "epoch": 7.872903795233892, "grad_norm": 8.056379318237305, "learning_rate": 9.686841670916453e-06, "loss": 0.15779463, "memory(GiB)": 15.03, "step": 4460, "train_speed(iter/s)": 1.469663 }, { "acc": 0.97515993, "epoch": 7.881729920564872, "grad_norm": 6.402256011962891, "learning_rate": 9.685823309981467e-06, "loss": 0.15907444, "memory(GiB)": 15.03, "step": 4465, "train_speed(iter/s)": 1.469754 }, { "acc": 0.97506523, "epoch": 7.890556045895852, "grad_norm": 4.466578483581543, "learning_rate": 9.684803349638623e-06, "loss": 0.13247392, "memory(GiB)": 15.03, "step": 4470, "train_speed(iter/s)": 1.469796 }, { "acc": 0.9824708, "epoch": 7.899382171226831, "grad_norm": 5.922434329986572, "learning_rate": 9.683781790236105e-06, "loss": 0.10726254, "memory(GiB)": 15.03, "step": 4475, "train_speed(iter/s)": 1.469724 }, { "acc": 0.97793751, "epoch": 7.908208296557811, "grad_norm": 4.638969421386719, "learning_rate": 9.682758632122637e-06, "loss": 0.16260016, "memory(GiB)": 15.03, "step": 4480, "train_speed(iter/s)": 1.46975 }, { "acc": 0.98341885, "epoch": 7.917034421888791, "grad_norm": 5.153562068939209, "learning_rate": 9.681733875647485e-06, "loss": 0.09943285, "memory(GiB)": 15.03, "step": 4485, "train_speed(iter/s)": 1.469754 }, { "acc": 0.97892418, "epoch": 7.92586054721977, "grad_norm": 6.403397560119629, "learning_rate": 9.680707521160471e-06, "loss": 0.12430495, "memory(GiB)": 15.03, "step": 4490, "train_speed(iter/s)": 1.469781 }, { "acc": 0.97374125, "epoch": 7.93468667255075, "grad_norm": 5.260499954223633, "learning_rate": 9.679679569011956e-06, "loss": 0.16350969, "memory(GiB)": 15.03, "step": 4495, "train_speed(iter/s)": 1.469774 }, { "acc": 0.98099041, "epoch": 7.94351279788173, "grad_norm": 5.459330081939697, "learning_rate": 9.678650019552848e-06, "loss": 0.11769654, "memory(GiB)": 15.03, "step": 4500, "train_speed(iter/s)": 1.469788 }, { "acc": 0.97600412, "epoch": 7.95233892321271, "grad_norm": 6.697353839874268, "learning_rate": 9.677618873134596e-06, "loss": 0.14163359, "memory(GiB)": 15.03, "step": 4505, "train_speed(iter/s)": 1.469838 }, { "acc": 0.97690792, "epoch": 7.961165048543689, "grad_norm": 3.856858968734741, "learning_rate": 9.6765861301092e-06, "loss": 0.14775393, "memory(GiB)": 15.03, "step": 4510, "train_speed(iter/s)": 1.46982 }, { "acc": 0.97866459, "epoch": 7.969991173874669, "grad_norm": 4.36188268661499, "learning_rate": 9.675551790829205e-06, "loss": 0.12716533, "memory(GiB)": 15.03, "step": 4515, "train_speed(iter/s)": 1.469825 }, { "acc": 0.98420429, "epoch": 7.978817299205649, "grad_norm": 4.8647260665893555, "learning_rate": 9.674515855647695e-06, "loss": 0.11445339, "memory(GiB)": 15.03, "step": 4520, "train_speed(iter/s)": 1.469775 }, { "acc": 0.97826138, "epoch": 7.987643424536628, "grad_norm": 4.23262357711792, "learning_rate": 9.673478324918306e-06, "loss": 0.13098426, "memory(GiB)": 15.03, "step": 4525, "train_speed(iter/s)": 1.469827 }, { "acc": 0.97638464, "epoch": 7.996469549867609, "grad_norm": 6.38602876663208, "learning_rate": 9.672439198995215e-06, "loss": 0.14240258, "memory(GiB)": 15.03, "step": 4530, "train_speed(iter/s)": 1.469873 }, { "acc": 0.9822854, "epoch": 8.005295675198587, "grad_norm": 3.9979026317596436, "learning_rate": 9.67139847823314e-06, "loss": 0.09252361, "memory(GiB)": 15.03, "step": 4535, "train_speed(iter/s)": 1.469811 }, { "acc": 0.97941799, "epoch": 8.014121800529567, "grad_norm": 5.849185943603516, "learning_rate": 9.67035616298735e-06, "loss": 0.14244325, "memory(GiB)": 15.03, "step": 4540, "train_speed(iter/s)": 1.469831 }, { "acc": 0.97863264, "epoch": 8.022947925860548, "grad_norm": 5.956216812133789, "learning_rate": 9.669312253613655e-06, "loss": 0.14135294, "memory(GiB)": 15.03, "step": 4545, "train_speed(iter/s)": 1.469904 }, { "acc": 0.98912649, "epoch": 8.031774051191526, "grad_norm": 2.9200491905212402, "learning_rate": 9.668266750468412e-06, "loss": 0.07283123, "memory(GiB)": 15.03, "step": 4550, "train_speed(iter/s)": 1.469885 }, { "acc": 0.98215199, "epoch": 8.040600176522506, "grad_norm": 4.531777381896973, "learning_rate": 9.667219653908515e-06, "loss": 0.12810378, "memory(GiB)": 15.03, "step": 4555, "train_speed(iter/s)": 1.469917 }, { "acc": 0.98668613, "epoch": 8.049426301853487, "grad_norm": 5.731302261352539, "learning_rate": 9.66617096429141e-06, "loss": 0.0822894, "memory(GiB)": 15.03, "step": 4560, "train_speed(iter/s)": 1.469857 }, { "acc": 0.98017998, "epoch": 8.058252427184467, "grad_norm": 2.4476728439331055, "learning_rate": 9.665120681975082e-06, "loss": 0.11450273, "memory(GiB)": 15.03, "step": 4565, "train_speed(iter/s)": 1.469816 }, { "acc": 0.98074245, "epoch": 8.067078552515445, "grad_norm": 14.528837203979492, "learning_rate": 9.664068807318063e-06, "loss": 0.12670206, "memory(GiB)": 15.03, "step": 4570, "train_speed(iter/s)": 1.469878 }, { "acc": 0.98249893, "epoch": 8.075904677846426, "grad_norm": 6.192163467407227, "learning_rate": 9.663015340679424e-06, "loss": 0.12029817, "memory(GiB)": 15.03, "step": 4575, "train_speed(iter/s)": 1.46988 }, { "acc": 0.97816191, "epoch": 8.084730803177406, "grad_norm": 7.162300109863281, "learning_rate": 9.661960282418786e-06, "loss": 0.13202834, "memory(GiB)": 15.03, "step": 4580, "train_speed(iter/s)": 1.469814 }, { "acc": 0.97583351, "epoch": 8.093556928508384, "grad_norm": 6.644062042236328, "learning_rate": 9.660903632896307e-06, "loss": 0.12781544, "memory(GiB)": 15.03, "step": 4585, "train_speed(iter/s)": 1.46985 }, { "acc": 0.98053017, "epoch": 8.102383053839365, "grad_norm": 3.457294464111328, "learning_rate": 9.65984539247269e-06, "loss": 0.13675777, "memory(GiB)": 15.03, "step": 4590, "train_speed(iter/s)": 1.469838 }, { "acc": 0.97910652, "epoch": 8.111209179170345, "grad_norm": 5.0355706214904785, "learning_rate": 9.658785561509185e-06, "loss": 0.11624949, "memory(GiB)": 15.03, "step": 4595, "train_speed(iter/s)": 1.469841 }, { "acc": 0.98443279, "epoch": 8.120035304501323, "grad_norm": 2.019434928894043, "learning_rate": 9.657724140367577e-06, "loss": 0.10786185, "memory(GiB)": 15.03, "step": 4600, "train_speed(iter/s)": 1.469869 }, { "acc": 0.98394718, "epoch": 8.128861429832304, "grad_norm": 7.563292503356934, "learning_rate": 9.656661129410204e-06, "loss": 0.12148569, "memory(GiB)": 15.03, "step": 4605, "train_speed(iter/s)": 1.469894 }, { "acc": 0.98443356, "epoch": 8.137687555163284, "grad_norm": 4.496523857116699, "learning_rate": 9.655596528999935e-06, "loss": 0.09366883, "memory(GiB)": 15.03, "step": 4610, "train_speed(iter/s)": 1.469973 }, { "acc": 0.97647276, "epoch": 8.146513680494262, "grad_norm": 4.814881801605225, "learning_rate": 9.654530339500193e-06, "loss": 0.13444518, "memory(GiB)": 15.03, "step": 4615, "train_speed(iter/s)": 1.470044 }, { "acc": 0.98510818, "epoch": 8.155339805825243, "grad_norm": 5.086309909820557, "learning_rate": 9.653462561274937e-06, "loss": 0.10397916, "memory(GiB)": 15.03, "step": 4620, "train_speed(iter/s)": 1.47003 }, { "acc": 0.97273369, "epoch": 8.164165931156223, "grad_norm": 13.729241371154785, "learning_rate": 9.652393194688668e-06, "loss": 0.16752317, "memory(GiB)": 15.03, "step": 4625, "train_speed(iter/s)": 1.470012 }, { "acc": 0.9823288, "epoch": 8.172992056487201, "grad_norm": 8.1738920211792, "learning_rate": 9.651322240106434e-06, "loss": 0.1123915, "memory(GiB)": 15.03, "step": 4630, "train_speed(iter/s)": 1.469984 }, { "acc": 0.9779417, "epoch": 8.181818181818182, "grad_norm": 4.662229061126709, "learning_rate": 9.650249697893819e-06, "loss": 0.14414711, "memory(GiB)": 15.03, "step": 4635, "train_speed(iter/s)": 1.470067 }, { "acc": 0.98108339, "epoch": 8.190644307149162, "grad_norm": 6.320214748382568, "learning_rate": 9.649175568416956e-06, "loss": 0.10886984, "memory(GiB)": 15.03, "step": 4640, "train_speed(iter/s)": 1.470077 }, { "acc": 0.97803097, "epoch": 8.19947043248014, "grad_norm": 7.061778545379639, "learning_rate": 9.648099852042509e-06, "loss": 0.12497249, "memory(GiB)": 15.03, "step": 4645, "train_speed(iter/s)": 1.47001 }, { "acc": 0.98066778, "epoch": 8.20829655781112, "grad_norm": 4.790492057800293, "learning_rate": 9.647022549137697e-06, "loss": 0.11560059, "memory(GiB)": 15.03, "step": 4650, "train_speed(iter/s)": 1.469953 }, { "acc": 0.97631245, "epoch": 8.217122683142101, "grad_norm": 6.330280303955078, "learning_rate": 9.64594366007027e-06, "loss": 0.15504074, "memory(GiB)": 15.03, "step": 4655, "train_speed(iter/s)": 1.469886 }, { "acc": 0.98120556, "epoch": 8.225948808473081, "grad_norm": 6.518667697906494, "learning_rate": 9.644863185208526e-06, "loss": 0.11242719, "memory(GiB)": 15.03, "step": 4660, "train_speed(iter/s)": 1.469898 }, { "acc": 0.97898989, "epoch": 8.23477493380406, "grad_norm": 6.6163201332092285, "learning_rate": 9.6437811249213e-06, "loss": 0.1258047, "memory(GiB)": 15.03, "step": 4665, "train_speed(iter/s)": 1.469866 }, { "acc": 0.97596035, "epoch": 8.24360105913504, "grad_norm": 7.661744594573975, "learning_rate": 9.642697479577973e-06, "loss": 0.14624487, "memory(GiB)": 15.03, "step": 4670, "train_speed(iter/s)": 1.469844 }, { "acc": 0.98077469, "epoch": 8.25242718446602, "grad_norm": 5.208218574523926, "learning_rate": 9.64161224954846e-06, "loss": 0.11472478, "memory(GiB)": 15.03, "step": 4675, "train_speed(iter/s)": 1.469813 }, { "acc": 0.98430481, "epoch": 8.261253309796999, "grad_norm": 3.345654249191284, "learning_rate": 9.640525435203225e-06, "loss": 0.10708601, "memory(GiB)": 15.03, "step": 4680, "train_speed(iter/s)": 1.469845 }, { "acc": 0.97784901, "epoch": 8.270079435127979, "grad_norm": 5.492842674255371, "learning_rate": 9.639437036913265e-06, "loss": 0.13861289, "memory(GiB)": 15.03, "step": 4685, "train_speed(iter/s)": 1.469872 }, { "acc": 0.97979927, "epoch": 8.27890556045896, "grad_norm": 5.5763654708862305, "learning_rate": 9.638347055050126e-06, "loss": 0.13166211, "memory(GiB)": 15.03, "step": 4690, "train_speed(iter/s)": 1.469887 }, { "acc": 0.97980614, "epoch": 8.287731685789938, "grad_norm": 4.828721046447754, "learning_rate": 9.637255489985887e-06, "loss": 0.13127148, "memory(GiB)": 15.03, "step": 4695, "train_speed(iter/s)": 1.46993 }, { "acc": 0.98037281, "epoch": 8.296557811120918, "grad_norm": 20.723602294921875, "learning_rate": 9.636162342093172e-06, "loss": 0.1296628, "memory(GiB)": 15.03, "step": 4700, "train_speed(iter/s)": 1.469941 }, { "acc": 0.9811883, "epoch": 8.305383936451898, "grad_norm": 6.146444797515869, "learning_rate": 9.635067611745145e-06, "loss": 0.10969741, "memory(GiB)": 15.03, "step": 4705, "train_speed(iter/s)": 1.469959 }, { "acc": 0.98039417, "epoch": 8.314210061782877, "grad_norm": 5.153848648071289, "learning_rate": 9.633971299315506e-06, "loss": 0.11160297, "memory(GiB)": 15.03, "step": 4710, "train_speed(iter/s)": 1.469951 }, { "acc": 0.98310251, "epoch": 8.323036187113857, "grad_norm": 3.8853042125701904, "learning_rate": 9.632873405178502e-06, "loss": 0.11340349, "memory(GiB)": 15.03, "step": 4715, "train_speed(iter/s)": 1.469934 }, { "acc": 0.98419046, "epoch": 8.331862312444837, "grad_norm": 5.349069595336914, "learning_rate": 9.631773929708912e-06, "loss": 0.09571871, "memory(GiB)": 15.03, "step": 4720, "train_speed(iter/s)": 1.469969 }, { "acc": 0.98198872, "epoch": 8.340688437775816, "grad_norm": 4.140796661376953, "learning_rate": 9.630672873282064e-06, "loss": 0.11403126, "memory(GiB)": 15.03, "step": 4725, "train_speed(iter/s)": 1.469995 }, { "acc": 0.98162174, "epoch": 8.349514563106796, "grad_norm": 6.819282531738281, "learning_rate": 9.629570236273818e-06, "loss": 0.09362194, "memory(GiB)": 15.03, "step": 4730, "train_speed(iter/s)": 1.469926 }, { "acc": 0.98139153, "epoch": 8.358340688437776, "grad_norm": 6.418247699737549, "learning_rate": 9.628466019060575e-06, "loss": 0.14073595, "memory(GiB)": 15.03, "step": 4735, "train_speed(iter/s)": 1.46991 }, { "acc": 0.985186, "epoch": 8.367166813768755, "grad_norm": 4.880292892456055, "learning_rate": 9.62736022201928e-06, "loss": 0.08964586, "memory(GiB)": 15.03, "step": 4740, "train_speed(iter/s)": 1.469905 }, { "acc": 0.97895527, "epoch": 8.375992939099735, "grad_norm": 4.758167266845703, "learning_rate": 9.62625284552741e-06, "loss": 0.12127315, "memory(GiB)": 15.03, "step": 4745, "train_speed(iter/s)": 1.46988 }, { "acc": 0.97904282, "epoch": 8.384819064430715, "grad_norm": 3.6744027137756348, "learning_rate": 9.625143889962989e-06, "loss": 0.14054215, "memory(GiB)": 15.03, "step": 4750, "train_speed(iter/s)": 1.469913 }, { "acc": 0.98740215, "epoch": 8.393645189761695, "grad_norm": 3.3447484970092773, "learning_rate": 9.624033355704574e-06, "loss": 0.08817288, "memory(GiB)": 15.03, "step": 4755, "train_speed(iter/s)": 1.469969 }, { "acc": 0.98102932, "epoch": 8.402471315092674, "grad_norm": 5.5197062492370605, "learning_rate": 9.622921243131265e-06, "loss": 0.12927728, "memory(GiB)": 15.03, "step": 4760, "train_speed(iter/s)": 1.470048 }, { "acc": 0.98443451, "epoch": 8.411297440423654, "grad_norm": 3.5600521564483643, "learning_rate": 9.621807552622693e-06, "loss": 0.0936382, "memory(GiB)": 15.03, "step": 4765, "train_speed(iter/s)": 1.470022 }, { "acc": 0.98064899, "epoch": 8.420123565754634, "grad_norm": 7.777843952178955, "learning_rate": 9.620692284559042e-06, "loss": 0.11105193, "memory(GiB)": 15.03, "step": 4770, "train_speed(iter/s)": 1.470057 }, { "acc": 0.97860041, "epoch": 8.428949691085613, "grad_norm": 3.826289415359497, "learning_rate": 9.619575439321021e-06, "loss": 0.14347131, "memory(GiB)": 15.03, "step": 4775, "train_speed(iter/s)": 1.470105 }, { "acc": 0.9808445, "epoch": 8.437775816416593, "grad_norm": 9.303421020507812, "learning_rate": 9.618457017289886e-06, "loss": 0.15269496, "memory(GiB)": 15.03, "step": 4780, "train_speed(iter/s)": 1.470146 }, { "acc": 0.97815704, "epoch": 8.446601941747574, "grad_norm": 4.569357395172119, "learning_rate": 9.617337018847423e-06, "loss": 0.13791478, "memory(GiB)": 15.03, "step": 4785, "train_speed(iter/s)": 1.470195 }, { "acc": 0.9778986, "epoch": 8.455428067078552, "grad_norm": 4.337357997894287, "learning_rate": 9.616215444375963e-06, "loss": 0.12687864, "memory(GiB)": 15.03, "step": 4790, "train_speed(iter/s)": 1.470189 }, { "acc": 0.98115101, "epoch": 8.464254192409532, "grad_norm": 3.033646821975708, "learning_rate": 9.615092294258372e-06, "loss": 0.11343614, "memory(GiB)": 15.03, "step": 4795, "train_speed(iter/s)": 1.470196 }, { "acc": 0.98200922, "epoch": 8.473080317740513, "grad_norm": 5.073472023010254, "learning_rate": 9.613967568878057e-06, "loss": 0.10619702, "memory(GiB)": 15.03, "step": 4800, "train_speed(iter/s)": 1.470194 }, { "acc": 0.98258905, "epoch": 8.481906443071491, "grad_norm": 3.6229074001312256, "learning_rate": 9.61284126861896e-06, "loss": 0.11348685, "memory(GiB)": 15.03, "step": 4805, "train_speed(iter/s)": 1.470167 }, { "acc": 0.98009319, "epoch": 8.490732568402471, "grad_norm": 5.1981425285339355, "learning_rate": 9.61171339386556e-06, "loss": 0.10826492, "memory(GiB)": 15.03, "step": 4810, "train_speed(iter/s)": 1.470164 }, { "acc": 0.98593788, "epoch": 8.499558693733452, "grad_norm": 6.091955661773682, "learning_rate": 9.610583945002874e-06, "loss": 0.08797303, "memory(GiB)": 15.03, "step": 4815, "train_speed(iter/s)": 1.470094 }, { "acc": 0.98150539, "epoch": 8.50838481906443, "grad_norm": 5.092984199523926, "learning_rate": 9.60945292241646e-06, "loss": 0.11923745, "memory(GiB)": 15.03, "step": 4820, "train_speed(iter/s)": 1.470129 }, { "acc": 0.97758369, "epoch": 8.51721094439541, "grad_norm": 5.73684024810791, "learning_rate": 9.608320326492406e-06, "loss": 0.14616734, "memory(GiB)": 15.03, "step": 4825, "train_speed(iter/s)": 1.470096 }, { "acc": 0.97659273, "epoch": 8.52603706972639, "grad_norm": 7.2787628173828125, "learning_rate": 9.607186157617343e-06, "loss": 0.15068023, "memory(GiB)": 15.03, "step": 4830, "train_speed(iter/s)": 1.470135 }, { "acc": 0.98062124, "epoch": 8.534863195057369, "grad_norm": 4.443432331085205, "learning_rate": 9.606050416178437e-06, "loss": 0.14043047, "memory(GiB)": 15.03, "step": 4835, "train_speed(iter/s)": 1.470145 }, { "acc": 0.98272743, "epoch": 8.54368932038835, "grad_norm": 5.61752462387085, "learning_rate": 9.604913102563392e-06, "loss": 0.1047785, "memory(GiB)": 15.03, "step": 4840, "train_speed(iter/s)": 1.470152 }, { "acc": 0.98740616, "epoch": 8.55251544571933, "grad_norm": 2.581777572631836, "learning_rate": 9.603774217160447e-06, "loss": 0.08437463, "memory(GiB)": 15.03, "step": 4845, "train_speed(iter/s)": 1.470152 }, { "acc": 0.97729816, "epoch": 8.56134157105031, "grad_norm": 5.278456211090088, "learning_rate": 9.602633760358377e-06, "loss": 0.13269513, "memory(GiB)": 15.03, "step": 4850, "train_speed(iter/s)": 1.470163 }, { "acc": 0.9836071, "epoch": 8.570167696381288, "grad_norm": 14.167874336242676, "learning_rate": 9.601491732546497e-06, "loss": 0.118841, "memory(GiB)": 15.03, "step": 4855, "train_speed(iter/s)": 1.47018 }, { "acc": 0.97944155, "epoch": 8.578993821712269, "grad_norm": 2.9063844680786133, "learning_rate": 9.600348134114655e-06, "loss": 0.13412528, "memory(GiB)": 15.03, "step": 4860, "train_speed(iter/s)": 1.470156 }, { "acc": 0.98191614, "epoch": 8.587819947043249, "grad_norm": 7.787407875061035, "learning_rate": 9.599202965453235e-06, "loss": 0.12093534, "memory(GiB)": 15.03, "step": 4865, "train_speed(iter/s)": 1.470197 }, { "acc": 0.98244009, "epoch": 8.596646072374227, "grad_norm": 2.865856409072876, "learning_rate": 9.598056226953158e-06, "loss": 0.10496646, "memory(GiB)": 15.03, "step": 4870, "train_speed(iter/s)": 1.470202 }, { "acc": 0.98312855, "epoch": 8.605472197705208, "grad_norm": 3.1355013847351074, "learning_rate": 9.596907919005881e-06, "loss": 0.11628475, "memory(GiB)": 15.03, "step": 4875, "train_speed(iter/s)": 1.470217 }, { "acc": 0.98450642, "epoch": 8.614298323036188, "grad_norm": 6.272003650665283, "learning_rate": 9.5957580420034e-06, "loss": 0.10462056, "memory(GiB)": 15.03, "step": 4880, "train_speed(iter/s)": 1.470202 }, { "acc": 0.98138704, "epoch": 8.623124448367166, "grad_norm": 13.468574523925781, "learning_rate": 9.59460659633824e-06, "loss": 0.1165194, "memory(GiB)": 15.03, "step": 4885, "train_speed(iter/s)": 1.470172 }, { "acc": 0.98496246, "epoch": 8.631950573698147, "grad_norm": 4.141894340515137, "learning_rate": 9.593453582403466e-06, "loss": 0.0866084, "memory(GiB)": 15.03, "step": 4890, "train_speed(iter/s)": 1.470224 }, { "acc": 0.97554989, "epoch": 8.640776699029127, "grad_norm": 6.447508335113525, "learning_rate": 9.592299000592678e-06, "loss": 0.16653564, "memory(GiB)": 15.03, "step": 4895, "train_speed(iter/s)": 1.470222 }, { "acc": 0.97898331, "epoch": 8.649602824360105, "grad_norm": 6.855510234832764, "learning_rate": 9.591142851300006e-06, "loss": 0.1489536, "memory(GiB)": 15.03, "step": 4900, "train_speed(iter/s)": 1.470283 }, { "acc": 0.98150539, "epoch": 8.658428949691086, "grad_norm": 5.506774425506592, "learning_rate": 9.589985134920126e-06, "loss": 0.10586249, "memory(GiB)": 15.03, "step": 4905, "train_speed(iter/s)": 1.470299 }, { "acc": 0.98281441, "epoch": 8.667255075022066, "grad_norm": 5.232043266296387, "learning_rate": 9.58882585184824e-06, "loss": 0.10866686, "memory(GiB)": 15.03, "step": 4910, "train_speed(iter/s)": 1.470342 }, { "acc": 0.97699108, "epoch": 8.676081200353044, "grad_norm": 5.491676330566406, "learning_rate": 9.587665002480086e-06, "loss": 0.12840903, "memory(GiB)": 15.03, "step": 4915, "train_speed(iter/s)": 1.470405 }, { "acc": 0.98661642, "epoch": 8.684907325684025, "grad_norm": 4.983754634857178, "learning_rate": 9.586502587211936e-06, "loss": 0.09881309, "memory(GiB)": 15.03, "step": 4920, "train_speed(iter/s)": 1.470382 }, { "acc": 0.98417835, "epoch": 8.693733451015005, "grad_norm": 4.689720153808594, "learning_rate": 9.585338606440605e-06, "loss": 0.10896696, "memory(GiB)": 15.03, "step": 4925, "train_speed(iter/s)": 1.470453 }, { "acc": 0.98079185, "epoch": 8.702559576345983, "grad_norm": 4.684690952301025, "learning_rate": 9.58417306056343e-06, "loss": 0.12396977, "memory(GiB)": 15.03, "step": 4930, "train_speed(iter/s)": 1.470488 }, { "acc": 0.98060093, "epoch": 8.711385701676964, "grad_norm": 4.837536334991455, "learning_rate": 9.58300594997829e-06, "loss": 0.12146959, "memory(GiB)": 15.03, "step": 4935, "train_speed(iter/s)": 1.470517 }, { "acc": 0.98288517, "epoch": 8.720211827007944, "grad_norm": 5.0455121994018555, "learning_rate": 9.581837275083597e-06, "loss": 0.10246885, "memory(GiB)": 15.03, "step": 4940, "train_speed(iter/s)": 1.470489 }, { "acc": 0.98225336, "epoch": 8.729037952338924, "grad_norm": 4.610119342803955, "learning_rate": 9.580667036278297e-06, "loss": 0.13099742, "memory(GiB)": 15.03, "step": 4945, "train_speed(iter/s)": 1.470536 }, { "acc": 0.98240051, "epoch": 8.737864077669903, "grad_norm": 4.303934097290039, "learning_rate": 9.579495233961866e-06, "loss": 0.11262778, "memory(GiB)": 15.03, "step": 4950, "train_speed(iter/s)": 1.470552 }, { "acc": 0.97671995, "epoch": 8.746690203000883, "grad_norm": 4.828369617462158, "learning_rate": 9.57832186853432e-06, "loss": 0.13805645, "memory(GiB)": 15.03, "step": 4955, "train_speed(iter/s)": 1.47055 }, { "acc": 0.98681793, "epoch": 8.755516328331863, "grad_norm": 5.049877643585205, "learning_rate": 9.577146940396205e-06, "loss": 0.09215527, "memory(GiB)": 15.03, "step": 4960, "train_speed(iter/s)": 1.470528 }, { "acc": 0.98238592, "epoch": 8.764342453662842, "grad_norm": 6.001547813415527, "learning_rate": 9.575970449948598e-06, "loss": 0.10701563, "memory(GiB)": 15.03, "step": 4965, "train_speed(iter/s)": 1.470539 }, { "acc": 0.98490753, "epoch": 8.773168578993822, "grad_norm": 3.036698341369629, "learning_rate": 9.574792397593118e-06, "loss": 0.11032348, "memory(GiB)": 15.03, "step": 4970, "train_speed(iter/s)": 1.470544 }, { "acc": 0.9799222, "epoch": 8.781994704324802, "grad_norm": 4.718025207519531, "learning_rate": 9.573612783731906e-06, "loss": 0.12796841, "memory(GiB)": 15.03, "step": 4975, "train_speed(iter/s)": 1.470597 }, { "acc": 0.98121586, "epoch": 8.79082082965578, "grad_norm": 4.866864204406738, "learning_rate": 9.572431608767644e-06, "loss": 0.13001552, "memory(GiB)": 15.03, "step": 4980, "train_speed(iter/s)": 1.470621 }, { "acc": 0.97488575, "epoch": 8.79964695498676, "grad_norm": 6.543542861938477, "learning_rate": 9.571248873103544e-06, "loss": 0.16470096, "memory(GiB)": 15.03, "step": 4985, "train_speed(iter/s)": 1.470661 }, { "acc": 0.98134956, "epoch": 8.808473080317741, "grad_norm": 4.254634380340576, "learning_rate": 9.570064577143352e-06, "loss": 0.13441327, "memory(GiB)": 15.03, "step": 4990, "train_speed(iter/s)": 1.470734 }, { "acc": 0.98322983, "epoch": 8.81729920564872, "grad_norm": 3.7964789867401123, "learning_rate": 9.568878721291344e-06, "loss": 0.10590303, "memory(GiB)": 15.03, "step": 4995, "train_speed(iter/s)": 1.47076 }, { "acc": 0.97424011, "epoch": 8.8261253309797, "grad_norm": 11.032280921936035, "learning_rate": 9.567691305952333e-06, "loss": 0.16066126, "memory(GiB)": 15.03, "step": 5000, "train_speed(iter/s)": 1.470748 }, { "acc": 0.98652182, "epoch": 8.83495145631068, "grad_norm": 5.170904636383057, "learning_rate": 9.56650233153166e-06, "loss": 0.09312518, "memory(GiB)": 15.03, "step": 5005, "train_speed(iter/s)": 1.470726 }, { "acc": 0.97945919, "epoch": 8.843777581641659, "grad_norm": 2.8165698051452637, "learning_rate": 9.565311798435199e-06, "loss": 0.1277871, "memory(GiB)": 15.03, "step": 5010, "train_speed(iter/s)": 1.470723 }, { "acc": 0.98298569, "epoch": 8.852603706972639, "grad_norm": 4.940521240234375, "learning_rate": 9.56411970706936e-06, "loss": 0.12894793, "memory(GiB)": 15.03, "step": 5015, "train_speed(iter/s)": 1.47074 }, { "acc": 0.98290176, "epoch": 8.86142983230362, "grad_norm": 2.4253060817718506, "learning_rate": 9.562926057841077e-06, "loss": 0.12107694, "memory(GiB)": 15.03, "step": 5020, "train_speed(iter/s)": 1.470747 }, { "acc": 0.98195286, "epoch": 8.870255957634598, "grad_norm": 5.724897861480713, "learning_rate": 9.561730851157824e-06, "loss": 0.12352828, "memory(GiB)": 15.03, "step": 5025, "train_speed(iter/s)": 1.470767 }, { "acc": 0.98240204, "epoch": 8.879082082965578, "grad_norm": 5.032639026641846, "learning_rate": 9.560534087427602e-06, "loss": 0.11427009, "memory(GiB)": 15.03, "step": 5030, "train_speed(iter/s)": 1.470721 }, { "acc": 0.97901621, "epoch": 8.887908208296558, "grad_norm": 6.040504455566406, "learning_rate": 9.559335767058948e-06, "loss": 0.12666464, "memory(GiB)": 15.03, "step": 5035, "train_speed(iter/s)": 1.470764 }, { "acc": 0.97697334, "epoch": 8.896734333627538, "grad_norm": 5.788447380065918, "learning_rate": 9.558135890460923e-06, "loss": 0.14981922, "memory(GiB)": 15.03, "step": 5040, "train_speed(iter/s)": 1.47078 }, { "acc": 0.98377466, "epoch": 8.905560458958517, "grad_norm": 1.853922963142395, "learning_rate": 9.556934458043129e-06, "loss": 0.10166771, "memory(GiB)": 15.03, "step": 5045, "train_speed(iter/s)": 1.470779 }, { "acc": 0.98257389, "epoch": 8.914386584289497, "grad_norm": 3.301675796508789, "learning_rate": 9.555731470215687e-06, "loss": 0.10558331, "memory(GiB)": 15.03, "step": 5050, "train_speed(iter/s)": 1.470784 }, { "acc": 0.97875605, "epoch": 8.923212709620477, "grad_norm": 3.8978638648986816, "learning_rate": 9.554526927389258e-06, "loss": 0.13692222, "memory(GiB)": 15.03, "step": 5055, "train_speed(iter/s)": 1.470766 }, { "acc": 0.9776762, "epoch": 8.932038834951456, "grad_norm": 5.515985488891602, "learning_rate": 9.553320829975034e-06, "loss": 0.12814233, "memory(GiB)": 15.03, "step": 5060, "train_speed(iter/s)": 1.470769 }, { "acc": 0.98248787, "epoch": 8.940864960282436, "grad_norm": 6.038349151611328, "learning_rate": 9.552113178384732e-06, "loss": 0.11903, "memory(GiB)": 15.03, "step": 5065, "train_speed(iter/s)": 1.470822 }, { "acc": 0.98103199, "epoch": 8.949691085613416, "grad_norm": 5.706503391265869, "learning_rate": 9.550903973030604e-06, "loss": 0.11993604, "memory(GiB)": 15.03, "step": 5070, "train_speed(iter/s)": 1.47081 }, { "acc": 0.9833292, "epoch": 8.958517210944395, "grad_norm": 3.4289755821228027, "learning_rate": 9.549693214325431e-06, "loss": 0.09601079, "memory(GiB)": 15.03, "step": 5075, "train_speed(iter/s)": 1.470801 }, { "acc": 0.98291607, "epoch": 8.967343336275375, "grad_norm": 4.83172607421875, "learning_rate": 9.548480902682524e-06, "loss": 0.0993529, "memory(GiB)": 15.03, "step": 5080, "train_speed(iter/s)": 1.470845 }, { "acc": 0.98349037, "epoch": 8.976169461606355, "grad_norm": 4.350212574005127, "learning_rate": 9.547267038515725e-06, "loss": 0.11607533, "memory(GiB)": 15.03, "step": 5085, "train_speed(iter/s)": 1.470904 }, { "acc": 0.98400497, "epoch": 8.984995586937334, "grad_norm": 4.701853275299072, "learning_rate": 9.546051622239405e-06, "loss": 0.10309038, "memory(GiB)": 15.03, "step": 5090, "train_speed(iter/s)": 1.470922 }, { "acc": 0.98166342, "epoch": 8.993821712268314, "grad_norm": 6.765443801879883, "learning_rate": 9.544834654268464e-06, "loss": 0.10757623, "memory(GiB)": 15.03, "step": 5095, "train_speed(iter/s)": 1.470904 }, { "acc": 0.98443546, "epoch": 9.002647837599294, "grad_norm": 2.9289441108703613, "learning_rate": 9.543616135018336e-06, "loss": 0.09741812, "memory(GiB)": 15.03, "step": 5100, "train_speed(iter/s)": 1.470719 }, { "acc": 0.98109112, "epoch": 9.011473962930273, "grad_norm": 32.19902038574219, "learning_rate": 9.542396064904978e-06, "loss": 0.12455893, "memory(GiB)": 15.03, "step": 5105, "train_speed(iter/s)": 1.470737 }, { "acc": 0.98487549, "epoch": 9.020300088261253, "grad_norm": 3.2501492500305176, "learning_rate": 9.54117444434488e-06, "loss": 0.09105992, "memory(GiB)": 15.03, "step": 5110, "train_speed(iter/s)": 1.470721 }, { "acc": 0.97869473, "epoch": 9.029126213592233, "grad_norm": 7.096625328063965, "learning_rate": 9.539951273755065e-06, "loss": 0.13129343, "memory(GiB)": 15.03, "step": 5115, "train_speed(iter/s)": 1.470715 }, { "acc": 0.98251133, "epoch": 9.037952338923212, "grad_norm": 4.443025588989258, "learning_rate": 9.538726553553075e-06, "loss": 0.1085711, "memory(GiB)": 15.03, "step": 5120, "train_speed(iter/s)": 1.47071 }, { "acc": 0.97783585, "epoch": 9.046778464254192, "grad_norm": 5.005191326141357, "learning_rate": 9.537500284156994e-06, "loss": 0.12131181, "memory(GiB)": 15.03, "step": 5125, "train_speed(iter/s)": 1.470743 }, { "acc": 0.98417044, "epoch": 9.055604589585172, "grad_norm": 5.216335296630859, "learning_rate": 9.536272465985425e-06, "loss": 0.13289405, "memory(GiB)": 15.03, "step": 5130, "train_speed(iter/s)": 1.470833 }, { "acc": 0.98522243, "epoch": 9.064430714916151, "grad_norm": 4.812147617340088, "learning_rate": 9.535043099457503e-06, "loss": 0.10056376, "memory(GiB)": 15.03, "step": 5135, "train_speed(iter/s)": 1.470777 }, { "acc": 0.98540077, "epoch": 9.073256840247131, "grad_norm": 4.4623188972473145, "learning_rate": 9.533812184992888e-06, "loss": 0.08852785, "memory(GiB)": 15.03, "step": 5140, "train_speed(iter/s)": 1.470773 }, { "acc": 0.98063908, "epoch": 9.082082965578111, "grad_norm": 6.259687423706055, "learning_rate": 9.53257972301178e-06, "loss": 0.12704723, "memory(GiB)": 15.03, "step": 5145, "train_speed(iter/s)": 1.470747 }, { "acc": 0.98499508, "epoch": 9.090909090909092, "grad_norm": 5.553722381591797, "learning_rate": 9.53134571393489e-06, "loss": 0.10187466, "memory(GiB)": 15.03, "step": 5150, "train_speed(iter/s)": 1.470788 }, { "acc": 0.97547302, "epoch": 9.09973521624007, "grad_norm": 6.202369689941406, "learning_rate": 9.530110158183474e-06, "loss": 0.13651257, "memory(GiB)": 15.03, "step": 5155, "train_speed(iter/s)": 1.470686 }, { "acc": 0.98366222, "epoch": 9.10856134157105, "grad_norm": 5.3386664390563965, "learning_rate": 9.5288730561793e-06, "loss": 0.10597122, "memory(GiB)": 15.03, "step": 5160, "train_speed(iter/s)": 1.470707 }, { "acc": 0.99033852, "epoch": 9.11738746690203, "grad_norm": 3.1341941356658936, "learning_rate": 9.52763440834468e-06, "loss": 0.06386057, "memory(GiB)": 15.03, "step": 5165, "train_speed(iter/s)": 1.470704 }, { "acc": 0.98241568, "epoch": 9.12621359223301, "grad_norm": 4.639533519744873, "learning_rate": 9.526394215102441e-06, "loss": 0.10875146, "memory(GiB)": 15.03, "step": 5170, "train_speed(iter/s)": 1.470666 }, { "acc": 0.98114967, "epoch": 9.13503971756399, "grad_norm": 4.96806001663208, "learning_rate": 9.525152476875943e-06, "loss": 0.13041843, "memory(GiB)": 15.03, "step": 5175, "train_speed(iter/s)": 1.47069 }, { "acc": 0.98585453, "epoch": 9.14386584289497, "grad_norm": 6.578402042388916, "learning_rate": 9.523909194089072e-06, "loss": 0.08976344, "memory(GiB)": 15.03, "step": 5180, "train_speed(iter/s)": 1.470583 }, { "acc": 0.97887621, "epoch": 9.152691968225948, "grad_norm": 4.419182777404785, "learning_rate": 9.522664367166244e-06, "loss": 0.14673949, "memory(GiB)": 15.03, "step": 5185, "train_speed(iter/s)": 1.47059 }, { "acc": 0.98478775, "epoch": 9.161518093556928, "grad_norm": 6.066734313964844, "learning_rate": 9.521417996532395e-06, "loss": 0.1038842, "memory(GiB)": 15.03, "step": 5190, "train_speed(iter/s)": 1.470611 }, { "acc": 0.98824034, "epoch": 9.170344218887909, "grad_norm": 4.973381996154785, "learning_rate": 9.520170082613002e-06, "loss": 0.0877559, "memory(GiB)": 15.03, "step": 5195, "train_speed(iter/s)": 1.470751 }, { "acc": 0.98700428, "epoch": 9.179170344218887, "grad_norm": 2.860074996948242, "learning_rate": 9.518920625834052e-06, "loss": 0.09465901, "memory(GiB)": 15.03, "step": 5200, "train_speed(iter/s)": 1.470723 }, { "acc": 0.98336639, "epoch": 9.187996469549867, "grad_norm": 3.8885602951049805, "learning_rate": 9.51766962662207e-06, "loss": 0.11920326, "memory(GiB)": 15.03, "step": 5205, "train_speed(iter/s)": 1.470741 }, { "acc": 0.98365421, "epoch": 9.196822594880848, "grad_norm": 8.507586479187012, "learning_rate": 9.5164170854041e-06, "loss": 0.11369636, "memory(GiB)": 15.03, "step": 5210, "train_speed(iter/s)": 1.470723 }, { "acc": 0.98559446, "epoch": 9.205648720211826, "grad_norm": 3.9550154209136963, "learning_rate": 9.515163002607722e-06, "loss": 0.10611575, "memory(GiB)": 15.03, "step": 5215, "train_speed(iter/s)": 1.470783 }, { "acc": 0.98017282, "epoch": 9.214474845542806, "grad_norm": 3.883721113204956, "learning_rate": 9.513907378661034e-06, "loss": 0.11998637, "memory(GiB)": 15.03, "step": 5220, "train_speed(iter/s)": 1.47079 }, { "acc": 0.98303747, "epoch": 9.223300970873787, "grad_norm": 3.905834674835205, "learning_rate": 9.512650213992662e-06, "loss": 0.11943166, "memory(GiB)": 15.03, "step": 5225, "train_speed(iter/s)": 1.470776 }, { "acc": 0.98020687, "epoch": 9.232127096204765, "grad_norm": 3.424084424972534, "learning_rate": 9.511391509031758e-06, "loss": 0.13189332, "memory(GiB)": 15.03, "step": 5230, "train_speed(iter/s)": 1.47079 }, { "acc": 0.98529701, "epoch": 9.240953221535745, "grad_norm": 4.301281929016113, "learning_rate": 9.510131264208003e-06, "loss": 0.09775892, "memory(GiB)": 15.03, "step": 5235, "train_speed(iter/s)": 1.470813 }, { "acc": 0.98363552, "epoch": 9.249779346866726, "grad_norm": 4.030009746551514, "learning_rate": 9.508869479951599e-06, "loss": 0.11203816, "memory(GiB)": 15.03, "step": 5240, "train_speed(iter/s)": 1.470848 }, { "acc": 0.98316813, "epoch": 9.258605472197706, "grad_norm": 5.3123459815979, "learning_rate": 9.507606156693279e-06, "loss": 0.12000511, "memory(GiB)": 15.03, "step": 5245, "train_speed(iter/s)": 1.470879 }, { "acc": 0.9824585, "epoch": 9.267431597528685, "grad_norm": 4.860470294952393, "learning_rate": 9.506341294864295e-06, "loss": 0.12124267, "memory(GiB)": 15.03, "step": 5250, "train_speed(iter/s)": 1.47089 }, { "acc": 0.98422775, "epoch": 9.276257722859665, "grad_norm": 3.7650866508483887, "learning_rate": 9.505074894896425e-06, "loss": 0.10356534, "memory(GiB)": 15.03, "step": 5255, "train_speed(iter/s)": 1.470931 }, { "acc": 0.98170662, "epoch": 9.285083848190645, "grad_norm": 3.9184577465057373, "learning_rate": 9.503806957221981e-06, "loss": 0.13467866, "memory(GiB)": 15.03, "step": 5260, "train_speed(iter/s)": 1.47098 }, { "acc": 0.98760996, "epoch": 9.293909973521624, "grad_norm": 2.904289722442627, "learning_rate": 9.502537482273788e-06, "loss": 0.08941088, "memory(GiB)": 15.03, "step": 5265, "train_speed(iter/s)": 1.471011 }, { "acc": 0.98435326, "epoch": 9.302736098852604, "grad_norm": 4.72812557220459, "learning_rate": 9.501266470485201e-06, "loss": 0.11510471, "memory(GiB)": 15.03, "step": 5270, "train_speed(iter/s)": 1.471045 }, { "acc": 0.9863306, "epoch": 9.311562224183584, "grad_norm": 3.6051747798919678, "learning_rate": 9.499993922290103e-06, "loss": 0.10630503, "memory(GiB)": 15.03, "step": 5275, "train_speed(iter/s)": 1.471078 }, { "acc": 0.9845273, "epoch": 9.320388349514563, "grad_norm": 3.6851930618286133, "learning_rate": 9.498719838122896e-06, "loss": 0.10311644, "memory(GiB)": 15.03, "step": 5280, "train_speed(iter/s)": 1.471054 }, { "acc": 0.98707237, "epoch": 9.329214474845543, "grad_norm": 6.580712795257568, "learning_rate": 9.497444218418506e-06, "loss": 0.10115873, "memory(GiB)": 15.03, "step": 5285, "train_speed(iter/s)": 1.471135 }, { "acc": 0.98293295, "epoch": 9.338040600176523, "grad_norm": 4.331326007843018, "learning_rate": 9.49616706361239e-06, "loss": 0.12079887, "memory(GiB)": 15.03, "step": 5290, "train_speed(iter/s)": 1.471167 }, { "acc": 0.98383551, "epoch": 9.346866725507502, "grad_norm": 4.338402271270752, "learning_rate": 9.494888374140521e-06, "loss": 0.10471306, "memory(GiB)": 15.03, "step": 5295, "train_speed(iter/s)": 1.471197 }, { "acc": 0.9885294, "epoch": 9.355692850838482, "grad_norm": 5.45211935043335, "learning_rate": 9.493608150439401e-06, "loss": 0.07713531, "memory(GiB)": 15.03, "step": 5300, "train_speed(iter/s)": 1.47113 }, { "acc": 0.9852541, "epoch": 9.364518976169462, "grad_norm": 5.0218329429626465, "learning_rate": 9.492326392946057e-06, "loss": 0.09907146, "memory(GiB)": 15.03, "step": 5305, "train_speed(iter/s)": 1.471161 }, { "acc": 0.98070164, "epoch": 9.37334510150044, "grad_norm": 13.707798957824707, "learning_rate": 9.491043102098033e-06, "loss": 0.12154562, "memory(GiB)": 15.03, "step": 5310, "train_speed(iter/s)": 1.471168 }, { "acc": 0.98645687, "epoch": 9.38217122683142, "grad_norm": 4.983213424682617, "learning_rate": 9.489758278333402e-06, "loss": 0.1065473, "memory(GiB)": 15.03, "step": 5315, "train_speed(iter/s)": 1.471169 }, { "acc": 0.98368645, "epoch": 9.390997352162401, "grad_norm": 2.6752936840057373, "learning_rate": 9.488471922090757e-06, "loss": 0.13876144, "memory(GiB)": 15.03, "step": 5320, "train_speed(iter/s)": 1.47118 }, { "acc": 0.9904397, "epoch": 9.399823477493381, "grad_norm": 2.8576860427856445, "learning_rate": 9.487184033809216e-06, "loss": 0.06761025, "memory(GiB)": 15.03, "step": 5325, "train_speed(iter/s)": 1.471106 }, { "acc": 0.98310699, "epoch": 9.40864960282436, "grad_norm": 7.583826065063477, "learning_rate": 9.485894613928423e-06, "loss": 0.10673773, "memory(GiB)": 15.03, "step": 5330, "train_speed(iter/s)": 1.471121 }, { "acc": 0.98650589, "epoch": 9.41747572815534, "grad_norm": 4.1614670753479, "learning_rate": 9.484603662888537e-06, "loss": 0.08565624, "memory(GiB)": 15.03, "step": 5335, "train_speed(iter/s)": 1.47115 }, { "acc": 0.98019552, "epoch": 9.42630185348632, "grad_norm": 5.64926815032959, "learning_rate": 9.483311181130247e-06, "loss": 0.12192373, "memory(GiB)": 15.03, "step": 5340, "train_speed(iter/s)": 1.4712 }, { "acc": 0.98044977, "epoch": 9.435127978817299, "grad_norm": 2.8795247077941895, "learning_rate": 9.48201716909476e-06, "loss": 0.12615345, "memory(GiB)": 15.03, "step": 5345, "train_speed(iter/s)": 1.471174 }, { "acc": 0.97980537, "epoch": 9.443954104148279, "grad_norm": 6.874777317047119, "learning_rate": 9.480721627223806e-06, "loss": 0.15063243, "memory(GiB)": 15.03, "step": 5350, "train_speed(iter/s)": 1.471219 }, { "acc": 0.97689819, "epoch": 9.45278022947926, "grad_norm": 6.375941276550293, "learning_rate": 9.479424555959645e-06, "loss": 0.13502028, "memory(GiB)": 15.03, "step": 5355, "train_speed(iter/s)": 1.471248 }, { "acc": 0.98693314, "epoch": 9.461606354810238, "grad_norm": 2.889423370361328, "learning_rate": 9.478125955745044e-06, "loss": 0.09437811, "memory(GiB)": 15.03, "step": 5360, "train_speed(iter/s)": 1.471258 }, { "acc": 0.98551254, "epoch": 9.470432480141218, "grad_norm": 6.323914051055908, "learning_rate": 9.476825827023304e-06, "loss": 0.07921948, "memory(GiB)": 15.03, "step": 5365, "train_speed(iter/s)": 1.471299 }, { "acc": 0.98456879, "epoch": 9.479258605472198, "grad_norm": 3.11187481880188, "learning_rate": 9.475524170238244e-06, "loss": 0.11656997, "memory(GiB)": 15.03, "step": 5370, "train_speed(iter/s)": 1.471313 }, { "acc": 0.98566246, "epoch": 9.488084730803177, "grad_norm": 4.420257091522217, "learning_rate": 9.474220985834205e-06, "loss": 0.08528044, "memory(GiB)": 15.03, "step": 5375, "train_speed(iter/s)": 1.471313 }, { "acc": 0.98276176, "epoch": 9.496910856134157, "grad_norm": 3.5465073585510254, "learning_rate": 9.47291627425605e-06, "loss": 0.10833173, "memory(GiB)": 15.03, "step": 5380, "train_speed(iter/s)": 1.471306 }, { "acc": 0.98475142, "epoch": 9.505736981465137, "grad_norm": 6.639709949493408, "learning_rate": 9.47161003594916e-06, "loss": 0.10459168, "memory(GiB)": 15.03, "step": 5385, "train_speed(iter/s)": 1.47129 }, { "acc": 0.98644142, "epoch": 9.514563106796116, "grad_norm": 5.924108982086182, "learning_rate": 9.470302271359444e-06, "loss": 0.07787685, "memory(GiB)": 15.03, "step": 5390, "train_speed(iter/s)": 1.471264 }, { "acc": 0.98807755, "epoch": 9.523389232127096, "grad_norm": 13.060989379882812, "learning_rate": 9.468992980933322e-06, "loss": 0.08990206, "memory(GiB)": 15.03, "step": 5395, "train_speed(iter/s)": 1.471266 }, { "acc": 0.98429346, "epoch": 9.532215357458076, "grad_norm": 5.631528854370117, "learning_rate": 9.467682165117748e-06, "loss": 0.10060582, "memory(GiB)": 15.03, "step": 5400, "train_speed(iter/s)": 1.471314 }, { "acc": 0.98779917, "epoch": 9.541041482789055, "grad_norm": 5.685439586639404, "learning_rate": 9.46636982436018e-06, "loss": 0.0846401, "memory(GiB)": 15.03, "step": 5405, "train_speed(iter/s)": 1.471362 }, { "acc": 0.98761425, "epoch": 9.549867608120035, "grad_norm": 3.2483768463134766, "learning_rate": 9.465055959108614e-06, "loss": 0.08226025, "memory(GiB)": 15.03, "step": 5410, "train_speed(iter/s)": 1.471386 }, { "acc": 0.98445873, "epoch": 9.558693733451015, "grad_norm": 4.11806058883667, "learning_rate": 9.463740569811554e-06, "loss": 0.11194841, "memory(GiB)": 15.03, "step": 5415, "train_speed(iter/s)": 1.471394 }, { "acc": 0.98183651, "epoch": 9.567519858781996, "grad_norm": 4.0970611572265625, "learning_rate": 9.462423656918031e-06, "loss": 0.11987615, "memory(GiB)": 15.03, "step": 5420, "train_speed(iter/s)": 1.471308 }, { "acc": 0.98904743, "epoch": 9.576345984112974, "grad_norm": 4.558534145355225, "learning_rate": 9.46110522087759e-06, "loss": 0.07582824, "memory(GiB)": 15.03, "step": 5425, "train_speed(iter/s)": 1.471316 }, { "acc": 0.98226891, "epoch": 9.585172109443954, "grad_norm": 8.21090316772461, "learning_rate": 9.459785262140305e-06, "loss": 0.11591315, "memory(GiB)": 15.03, "step": 5430, "train_speed(iter/s)": 1.471304 }, { "acc": 0.9837492, "epoch": 9.593998234774935, "grad_norm": 3.4258241653442383, "learning_rate": 9.458463781156759e-06, "loss": 0.11313517, "memory(GiB)": 15.03, "step": 5435, "train_speed(iter/s)": 1.471337 }, { "acc": 0.9891305, "epoch": 9.602824360105913, "grad_norm": 2.6109657287597656, "learning_rate": 9.457140778378063e-06, "loss": 0.0922451, "memory(GiB)": 15.03, "step": 5440, "train_speed(iter/s)": 1.471435 }, { "acc": 0.98723946, "epoch": 9.611650485436893, "grad_norm": 2.313499689102173, "learning_rate": 9.455816254255846e-06, "loss": 0.09157287, "memory(GiB)": 15.03, "step": 5445, "train_speed(iter/s)": 1.471342 }, { "acc": 0.98523626, "epoch": 9.620476610767874, "grad_norm": 3.9264087677001953, "learning_rate": 9.45449020924225e-06, "loss": 0.10521219, "memory(GiB)": 15.03, "step": 5450, "train_speed(iter/s)": 1.471322 }, { "acc": 0.98003683, "epoch": 9.629302736098852, "grad_norm": 4.978489398956299, "learning_rate": 9.453162643789947e-06, "loss": 0.13596305, "memory(GiB)": 15.03, "step": 5455, "train_speed(iter/s)": 1.471268 }, { "acc": 0.98921566, "epoch": 9.638128861429832, "grad_norm": 4.880705833435059, "learning_rate": 9.451833558352117e-06, "loss": 0.08215414, "memory(GiB)": 15.03, "step": 5460, "train_speed(iter/s)": 1.47126 }, { "acc": 0.98163443, "epoch": 9.646954986760813, "grad_norm": 6.624598026275635, "learning_rate": 9.450502953382466e-06, "loss": 0.10555689, "memory(GiB)": 15.03, "step": 5465, "train_speed(iter/s)": 1.471256 }, { "acc": 0.98765278, "epoch": 9.655781112091791, "grad_norm": 8.511521339416504, "learning_rate": 9.449170829335218e-06, "loss": 0.08948265, "memory(GiB)": 15.03, "step": 5470, "train_speed(iter/s)": 1.471277 }, { "acc": 0.98701315, "epoch": 9.664607237422771, "grad_norm": 5.214874267578125, "learning_rate": 9.447837186665112e-06, "loss": 0.09507197, "memory(GiB)": 15.03, "step": 5475, "train_speed(iter/s)": 1.471312 }, { "acc": 0.98404293, "epoch": 9.673433362753752, "grad_norm": 7.818774700164795, "learning_rate": 9.446502025827409e-06, "loss": 0.10428448, "memory(GiB)": 15.03, "step": 5480, "train_speed(iter/s)": 1.471356 }, { "acc": 0.98083611, "epoch": 9.68225948808473, "grad_norm": 5.4727582931518555, "learning_rate": 9.445165347277886e-06, "loss": 0.11966401, "memory(GiB)": 15.03, "step": 5485, "train_speed(iter/s)": 1.471416 }, { "acc": 0.98831301, "epoch": 9.69108561341571, "grad_norm": 4.743756294250488, "learning_rate": 9.443827151472837e-06, "loss": 0.09401097, "memory(GiB)": 15.03, "step": 5490, "train_speed(iter/s)": 1.471486 }, { "acc": 0.98317165, "epoch": 9.69991173874669, "grad_norm": 3.9438424110412598, "learning_rate": 9.44248743886908e-06, "loss": 0.10806714, "memory(GiB)": 15.03, "step": 5495, "train_speed(iter/s)": 1.471505 }, { "acc": 0.98329535, "epoch": 9.70873786407767, "grad_norm": 3.9044711589813232, "learning_rate": 9.441146209923949e-06, "loss": 0.11365751, "memory(GiB)": 15.03, "step": 5500, "train_speed(iter/s)": 1.471533 }, { "acc": 0.9859848, "epoch": 9.71756398940865, "grad_norm": 3.8580684661865234, "learning_rate": 9.439803465095284e-06, "loss": 0.1003885, "memory(GiB)": 15.03, "step": 5505, "train_speed(iter/s)": 1.471499 }, { "acc": 0.98030701, "epoch": 9.72639011473963, "grad_norm": 3.712869882583618, "learning_rate": 9.43845920484146e-06, "loss": 0.1415503, "memory(GiB)": 15.03, "step": 5510, "train_speed(iter/s)": 1.471527 }, { "acc": 0.98266926, "epoch": 9.73521624007061, "grad_norm": 4.319155693054199, "learning_rate": 9.437113429621357e-06, "loss": 0.12116125, "memory(GiB)": 15.03, "step": 5515, "train_speed(iter/s)": 1.471561 }, { "acc": 0.98775749, "epoch": 9.744042365401588, "grad_norm": 5.767709255218506, "learning_rate": 9.435766139894377e-06, "loss": 0.08968593, "memory(GiB)": 15.03, "step": 5520, "train_speed(iter/s)": 1.471572 }, { "acc": 0.98490486, "epoch": 9.752868490732569, "grad_norm": 6.518494606018066, "learning_rate": 9.43441733612044e-06, "loss": 0.0990405, "memory(GiB)": 15.03, "step": 5525, "train_speed(iter/s)": 1.47161 }, { "acc": 0.98440351, "epoch": 9.761694616063549, "grad_norm": 6.150426387786865, "learning_rate": 9.43306701875998e-06, "loss": 0.10020602, "memory(GiB)": 15.03, "step": 5530, "train_speed(iter/s)": 1.471567 }, { "acc": 0.98509274, "epoch": 9.770520741394527, "grad_norm": 4.55529260635376, "learning_rate": 9.431715188273948e-06, "loss": 0.10636775, "memory(GiB)": 15.03, "step": 5535, "train_speed(iter/s)": 1.471625 }, { "acc": 0.98347797, "epoch": 9.779346866725508, "grad_norm": 4.526733875274658, "learning_rate": 9.430361845123812e-06, "loss": 0.11173277, "memory(GiB)": 15.03, "step": 5540, "train_speed(iter/s)": 1.471605 }, { "acc": 0.98944359, "epoch": 9.788172992056488, "grad_norm": 2.9868428707122803, "learning_rate": 9.429006989771559e-06, "loss": 0.07854258, "memory(GiB)": 15.03, "step": 5545, "train_speed(iter/s)": 1.471622 }, { "acc": 0.98066273, "epoch": 9.796999117387466, "grad_norm": 3.3669261932373047, "learning_rate": 9.427650622679687e-06, "loss": 0.13198116, "memory(GiB)": 15.03, "step": 5550, "train_speed(iter/s)": 1.471659 }, { "acc": 0.99017963, "epoch": 9.805825242718447, "grad_norm": 4.778109073638916, "learning_rate": 9.426292744311217e-06, "loss": 0.06583869, "memory(GiB)": 15.03, "step": 5555, "train_speed(iter/s)": 1.47165 }, { "acc": 0.98222847, "epoch": 9.814651368049427, "grad_norm": 5.164674282073975, "learning_rate": 9.424933355129678e-06, "loss": 0.12371104, "memory(GiB)": 15.03, "step": 5560, "train_speed(iter/s)": 1.471605 }, { "acc": 0.98020401, "epoch": 9.823477493380405, "grad_norm": 4.696728229522705, "learning_rate": 9.42357245559912e-06, "loss": 0.14200857, "memory(GiB)": 15.03, "step": 5565, "train_speed(iter/s)": 1.471652 }, { "acc": 0.98103371, "epoch": 9.832303618711386, "grad_norm": 6.6608052253723145, "learning_rate": 9.422210046184109e-06, "loss": 0.12250583, "memory(GiB)": 15.03, "step": 5570, "train_speed(iter/s)": 1.471705 }, { "acc": 0.98361855, "epoch": 9.841129744042366, "grad_norm": 5.9443135261535645, "learning_rate": 9.42084612734972e-06, "loss": 0.10557089, "memory(GiB)": 15.03, "step": 5575, "train_speed(iter/s)": 1.471749 }, { "acc": 0.98484001, "epoch": 9.849955869373344, "grad_norm": 12.675237655639648, "learning_rate": 9.419480699561552e-06, "loss": 0.11265949, "memory(GiB)": 15.03, "step": 5580, "train_speed(iter/s)": 1.471733 }, { "acc": 0.98623762, "epoch": 9.858781994704325, "grad_norm": 6.0256266593933105, "learning_rate": 9.418113763285712e-06, "loss": 0.08485516, "memory(GiB)": 15.03, "step": 5585, "train_speed(iter/s)": 1.471791 }, { "acc": 0.984624, "epoch": 9.867608120035305, "grad_norm": 4.205758094787598, "learning_rate": 9.416745318988829e-06, "loss": 0.08983501, "memory(GiB)": 15.03, "step": 5590, "train_speed(iter/s)": 1.47179 }, { "acc": 0.98586283, "epoch": 9.876434245366283, "grad_norm": 4.7376179695129395, "learning_rate": 9.415375367138038e-06, "loss": 0.09202372, "memory(GiB)": 15.03, "step": 5595, "train_speed(iter/s)": 1.47183 }, { "acc": 0.98725376, "epoch": 9.885260370697264, "grad_norm": 4.1387248039245605, "learning_rate": 9.414003908200998e-06, "loss": 0.08499388, "memory(GiB)": 15.03, "step": 5600, "train_speed(iter/s)": 1.47187 }, { "acc": 0.98605938, "epoch": 9.894086496028244, "grad_norm": 3.930490732192993, "learning_rate": 9.412630942645875e-06, "loss": 0.08704255, "memory(GiB)": 15.03, "step": 5605, "train_speed(iter/s)": 1.471934 }, { "acc": 0.98254871, "epoch": 9.902912621359224, "grad_norm": 4.8064985275268555, "learning_rate": 9.411256470941352e-06, "loss": 0.10987492, "memory(GiB)": 15.03, "step": 5610, "train_speed(iter/s)": 1.471952 }, { "acc": 0.98441048, "epoch": 9.911738746690203, "grad_norm": 3.1638293266296387, "learning_rate": 9.409880493556627e-06, "loss": 0.10821506, "memory(GiB)": 15.03, "step": 5615, "train_speed(iter/s)": 1.471978 }, { "acc": 0.9844698, "epoch": 9.920564872021183, "grad_norm": 2.708716869354248, "learning_rate": 9.40850301096141e-06, "loss": 0.10390047, "memory(GiB)": 15.03, "step": 5620, "train_speed(iter/s)": 1.471971 }, { "acc": 0.98448067, "epoch": 9.929390997352161, "grad_norm": 5.905113220214844, "learning_rate": 9.407124023625928e-06, "loss": 0.10885055, "memory(GiB)": 15.03, "step": 5625, "train_speed(iter/s)": 1.471991 }, { "acc": 0.98687439, "epoch": 9.938217122683142, "grad_norm": 7.314137935638428, "learning_rate": 9.40574353202092e-06, "loss": 0.0799708, "memory(GiB)": 15.03, "step": 5630, "train_speed(iter/s)": 1.472042 }, { "acc": 0.98863487, "epoch": 9.947043248014122, "grad_norm": 4.185338973999023, "learning_rate": 9.404361536617635e-06, "loss": 0.08254148, "memory(GiB)": 15.03, "step": 5635, "train_speed(iter/s)": 1.472089 }, { "acc": 0.98573008, "epoch": 9.955869373345102, "grad_norm": 6.38752555847168, "learning_rate": 9.402978037887842e-06, "loss": 0.08768109, "memory(GiB)": 15.03, "step": 5640, "train_speed(iter/s)": 1.472117 }, { "acc": 0.98574209, "epoch": 9.96469549867608, "grad_norm": 3.7632436752319336, "learning_rate": 9.401593036303816e-06, "loss": 0.09534254, "memory(GiB)": 15.03, "step": 5645, "train_speed(iter/s)": 1.472209 }, { "acc": 0.98348227, "epoch": 9.973521624007061, "grad_norm": 3.4877195358276367, "learning_rate": 9.400206532338354e-06, "loss": 0.10655179, "memory(GiB)": 15.03, "step": 5650, "train_speed(iter/s)": 1.472223 }, { "acc": 0.98313332, "epoch": 9.982347749338041, "grad_norm": 5.513731956481934, "learning_rate": 9.398818526464756e-06, "loss": 0.11226536, "memory(GiB)": 15.03, "step": 5655, "train_speed(iter/s)": 1.472185 }, { "acc": 0.98532867, "epoch": 9.99117387466902, "grad_norm": 3.094939708709717, "learning_rate": 9.397429019156841e-06, "loss": 0.08437028, "memory(GiB)": 15.03, "step": 5660, "train_speed(iter/s)": 1.472204 }, { "acc": 0.98708706, "epoch": 10.0, "grad_norm": 7.672934055328369, "learning_rate": 9.39603801088894e-06, "loss": 0.09575011, "memory(GiB)": 15.03, "step": 5665, "train_speed(iter/s)": 1.472207 }, { "acc": 0.98642826, "epoch": 10.00882612533098, "grad_norm": 7.108858585357666, "learning_rate": 9.394645502135898e-06, "loss": 0.08851074, "memory(GiB)": 15.03, "step": 5670, "train_speed(iter/s)": 1.472159 }, { "acc": 0.98955135, "epoch": 10.017652250661959, "grad_norm": 2.3297624588012695, "learning_rate": 9.393251493373061e-06, "loss": 0.07484924, "memory(GiB)": 15.03, "step": 5675, "train_speed(iter/s)": 1.472137 }, { "acc": 0.97865877, "epoch": 10.026478375992939, "grad_norm": 5.0639262199401855, "learning_rate": 9.391855985076303e-06, "loss": 0.13419809, "memory(GiB)": 15.03, "step": 5680, "train_speed(iter/s)": 1.47222 }, { "acc": 0.98811321, "epoch": 10.03530450132392, "grad_norm": 1.794277548789978, "learning_rate": 9.390458977721997e-06, "loss": 0.08703249, "memory(GiB)": 15.03, "step": 5685, "train_speed(iter/s)": 1.472266 }, { "acc": 0.98541384, "epoch": 10.044130626654898, "grad_norm": 2.829962968826294, "learning_rate": 9.38906047178704e-06, "loss": 0.09384823, "memory(GiB)": 15.03, "step": 5690, "train_speed(iter/s)": 1.472314 }, { "acc": 0.98766775, "epoch": 10.052956751985878, "grad_norm": 6.025633811950684, "learning_rate": 9.387660467748826e-06, "loss": 0.09031301, "memory(GiB)": 15.03, "step": 5695, "train_speed(iter/s)": 1.472321 }, { "acc": 0.98637524, "epoch": 10.061782877316858, "grad_norm": 4.46652364730835, "learning_rate": 9.386258966085273e-06, "loss": 0.08197641, "memory(GiB)": 15.03, "step": 5700, "train_speed(iter/s)": 1.472343 }, { "acc": 0.98127966, "epoch": 10.070609002647837, "grad_norm": 3.484283208847046, "learning_rate": 9.384855967274804e-06, "loss": 0.11013401, "memory(GiB)": 15.03, "step": 5705, "train_speed(iter/s)": 1.472418 }, { "acc": 0.98867559, "epoch": 10.079435127978817, "grad_norm": 5.187534332275391, "learning_rate": 9.383451471796356e-06, "loss": 0.07876477, "memory(GiB)": 15.03, "step": 5710, "train_speed(iter/s)": 1.472424 }, { "acc": 0.98746901, "epoch": 10.088261253309797, "grad_norm": 3.632976770401001, "learning_rate": 9.382045480129373e-06, "loss": 0.07443787, "memory(GiB)": 15.03, "step": 5715, "train_speed(iter/s)": 1.472421 }, { "acc": 0.98504381, "epoch": 10.097087378640778, "grad_norm": 10.980177879333496, "learning_rate": 9.380637992753815e-06, "loss": 0.08679685, "memory(GiB)": 15.03, "step": 5720, "train_speed(iter/s)": 1.472435 }, { "acc": 0.9873291, "epoch": 10.105913503971756, "grad_norm": 3.8500068187713623, "learning_rate": 9.379229010150144e-06, "loss": 0.08320627, "memory(GiB)": 15.03, "step": 5725, "train_speed(iter/s)": 1.472468 }, { "acc": 0.9883297, "epoch": 10.114739629302736, "grad_norm": 3.3022847175598145, "learning_rate": 9.377818532799344e-06, "loss": 0.07762804, "memory(GiB)": 15.03, "step": 5730, "train_speed(iter/s)": 1.472475 }, { "acc": 0.98470411, "epoch": 10.123565754633717, "grad_norm": 5.809149742126465, "learning_rate": 9.3764065611829e-06, "loss": 0.08845789, "memory(GiB)": 15.03, "step": 5735, "train_speed(iter/s)": 1.472496 }, { "acc": 0.98262482, "epoch": 10.132391879964695, "grad_norm": 4.618743896484375, "learning_rate": 9.374993095782813e-06, "loss": 0.11154673, "memory(GiB)": 15.03, "step": 5740, "train_speed(iter/s)": 1.472531 }, { "acc": 0.98389196, "epoch": 10.141218005295675, "grad_norm": 2.8942131996154785, "learning_rate": 9.373578137081589e-06, "loss": 0.10203723, "memory(GiB)": 15.03, "step": 5745, "train_speed(iter/s)": 1.472566 }, { "acc": 0.98572016, "epoch": 10.150044130626656, "grad_norm": 5.072423458099365, "learning_rate": 9.372161685562245e-06, "loss": 0.09165628, "memory(GiB)": 15.03, "step": 5750, "train_speed(iter/s)": 1.472608 }, { "acc": 0.98606186, "epoch": 10.158870255957634, "grad_norm": 4.305849552154541, "learning_rate": 9.370743741708316e-06, "loss": 0.08823658, "memory(GiB)": 15.03, "step": 5755, "train_speed(iter/s)": 1.472636 }, { "acc": 0.98379459, "epoch": 10.167696381288614, "grad_norm": 2.8969335556030273, "learning_rate": 9.36932430600383e-06, "loss": 0.10887702, "memory(GiB)": 15.03, "step": 5760, "train_speed(iter/s)": 1.472588 }, { "acc": 0.99051914, "epoch": 10.176522506619595, "grad_norm": 4.226154327392578, "learning_rate": 9.367903378933339e-06, "loss": 0.08110992, "memory(GiB)": 15.03, "step": 5765, "train_speed(iter/s)": 1.472632 }, { "acc": 0.98747005, "epoch": 10.185348631950573, "grad_norm": 3.131417751312256, "learning_rate": 9.366480960981894e-06, "loss": 0.06998911, "memory(GiB)": 15.03, "step": 5770, "train_speed(iter/s)": 1.472684 }, { "acc": 0.99154739, "epoch": 10.194174757281553, "grad_norm": 2.411997079849243, "learning_rate": 9.365057052635065e-06, "loss": 0.067269, "memory(GiB)": 15.03, "step": 5775, "train_speed(iter/s)": 1.472771 }, { "acc": 0.98221645, "epoch": 10.203000882612534, "grad_norm": 9.206354141235352, "learning_rate": 9.363631654378921e-06, "loss": 0.10346701, "memory(GiB)": 15.03, "step": 5780, "train_speed(iter/s)": 1.472771 }, { "acc": 0.98521738, "epoch": 10.211827007943512, "grad_norm": 6.091846466064453, "learning_rate": 9.362204766700047e-06, "loss": 0.11093712, "memory(GiB)": 15.03, "step": 5785, "train_speed(iter/s)": 1.472747 }, { "acc": 0.98227015, "epoch": 10.220653133274492, "grad_norm": 6.988794803619385, "learning_rate": 9.36077639008553e-06, "loss": 0.11812407, "memory(GiB)": 15.03, "step": 5790, "train_speed(iter/s)": 1.472753 }, { "acc": 0.98925247, "epoch": 10.229479258605473, "grad_norm": 2.854398250579834, "learning_rate": 9.35934652502297e-06, "loss": 0.07957177, "memory(GiB)": 15.03, "step": 5795, "train_speed(iter/s)": 1.472769 }, { "acc": 0.98854895, "epoch": 10.238305383936451, "grad_norm": 2.882568120956421, "learning_rate": 9.357915172000475e-06, "loss": 0.06556962, "memory(GiB)": 15.03, "step": 5800, "train_speed(iter/s)": 1.47272 }, { "acc": 0.98693428, "epoch": 10.247131509267431, "grad_norm": 3.9877054691314697, "learning_rate": 9.356482331506657e-06, "loss": 0.1009064, "memory(GiB)": 15.03, "step": 5805, "train_speed(iter/s)": 1.472684 }, { "acc": 0.98529625, "epoch": 10.255957634598412, "grad_norm": 3.6654775142669678, "learning_rate": 9.355048004030641e-06, "loss": 0.09368179, "memory(GiB)": 15.03, "step": 5810, "train_speed(iter/s)": 1.472684 }, { "acc": 0.98808498, "epoch": 10.264783759929392, "grad_norm": 3.973811626434326, "learning_rate": 9.353612190062056e-06, "loss": 0.08999655, "memory(GiB)": 15.03, "step": 5815, "train_speed(iter/s)": 1.472677 }, { "acc": 0.98441925, "epoch": 10.27360988526037, "grad_norm": 4.035444736480713, "learning_rate": 9.352174890091038e-06, "loss": 0.09800732, "memory(GiB)": 15.03, "step": 5820, "train_speed(iter/s)": 1.472673 }, { "acc": 0.98254118, "epoch": 10.28243601059135, "grad_norm": 7.923323631286621, "learning_rate": 9.350736104608232e-06, "loss": 0.10888486, "memory(GiB)": 15.03, "step": 5825, "train_speed(iter/s)": 1.472683 }, { "acc": 0.98260632, "epoch": 10.29126213592233, "grad_norm": 4.425653457641602, "learning_rate": 9.349295834104792e-06, "loss": 0.12140162, "memory(GiB)": 15.03, "step": 5830, "train_speed(iter/s)": 1.472742 }, { "acc": 0.98657017, "epoch": 10.30008826125331, "grad_norm": 2.8475828170776367, "learning_rate": 9.347854079072375e-06, "loss": 0.09403374, "memory(GiB)": 15.03, "step": 5835, "train_speed(iter/s)": 1.472728 }, { "acc": 0.98825397, "epoch": 10.30891438658429, "grad_norm": 4.338202953338623, "learning_rate": 9.346410840003146e-06, "loss": 0.08142152, "memory(GiB)": 15.03, "step": 5840, "train_speed(iter/s)": 1.472753 }, { "acc": 0.99103107, "epoch": 10.31774051191527, "grad_norm": 2.868048906326294, "learning_rate": 9.344966117389778e-06, "loss": 0.07122272, "memory(GiB)": 15.03, "step": 5845, "train_speed(iter/s)": 1.472749 }, { "acc": 0.98481884, "epoch": 10.326566637246248, "grad_norm": 4.127039909362793, "learning_rate": 9.343519911725448e-06, "loss": 0.1064754, "memory(GiB)": 15.03, "step": 5850, "train_speed(iter/s)": 1.472702 }, { "acc": 0.98736515, "epoch": 10.335392762577229, "grad_norm": 4.1518378257751465, "learning_rate": 9.342072223503844e-06, "loss": 0.07888302, "memory(GiB)": 15.03, "step": 5855, "train_speed(iter/s)": 1.472729 }, { "acc": 0.98527184, "epoch": 10.344218887908209, "grad_norm": 2.8583195209503174, "learning_rate": 9.340623053219152e-06, "loss": 0.10469272, "memory(GiB)": 15.03, "step": 5860, "train_speed(iter/s)": 1.472763 }, { "acc": 0.98767033, "epoch": 10.353045013239187, "grad_norm": 3.38993239402771, "learning_rate": 9.339172401366075e-06, "loss": 0.11542602, "memory(GiB)": 15.03, "step": 5865, "train_speed(iter/s)": 1.472778 }, { "acc": 0.98367519, "epoch": 10.361871138570168, "grad_norm": 4.289000511169434, "learning_rate": 9.33772026843981e-06, "loss": 0.09509541, "memory(GiB)": 15.03, "step": 5870, "train_speed(iter/s)": 1.472736 }, { "acc": 0.98737354, "epoch": 10.370697263901148, "grad_norm": 4.830870151519775, "learning_rate": 9.336266654936067e-06, "loss": 0.09016787, "memory(GiB)": 15.03, "step": 5875, "train_speed(iter/s)": 1.472818 }, { "acc": 0.97980919, "epoch": 10.379523389232126, "grad_norm": 4.967458248138428, "learning_rate": 9.334811561351061e-06, "loss": 0.12163301, "memory(GiB)": 15.03, "step": 5880, "train_speed(iter/s)": 1.47287 }, { "acc": 0.98592529, "epoch": 10.388349514563107, "grad_norm": 5.155917167663574, "learning_rate": 9.333354988181508e-06, "loss": 0.09486455, "memory(GiB)": 15.03, "step": 5885, "train_speed(iter/s)": 1.472896 }, { "acc": 0.98374825, "epoch": 10.397175639894087, "grad_norm": 6.113171577453613, "learning_rate": 9.331896935924636e-06, "loss": 0.09958209, "memory(GiB)": 15.03, "step": 5890, "train_speed(iter/s)": 1.472926 }, { "acc": 0.98577166, "epoch": 10.406001765225065, "grad_norm": 4.022655487060547, "learning_rate": 9.330437405078168e-06, "loss": 0.08526064, "memory(GiB)": 15.03, "step": 5895, "train_speed(iter/s)": 1.47295 }, { "acc": 0.98659306, "epoch": 10.414827890556046, "grad_norm": 4.953057289123535, "learning_rate": 9.328976396140343e-06, "loss": 0.08451171, "memory(GiB)": 15.03, "step": 5900, "train_speed(iter/s)": 1.472966 }, { "acc": 0.98607168, "epoch": 10.423654015887026, "grad_norm": 3.536909341812134, "learning_rate": 9.327513909609897e-06, "loss": 0.08009501, "memory(GiB)": 15.03, "step": 5905, "train_speed(iter/s)": 1.472978 }, { "acc": 0.98604498, "epoch": 10.432480141218006, "grad_norm": 4.137839317321777, "learning_rate": 9.326049945986071e-06, "loss": 0.09180957, "memory(GiB)": 15.03, "step": 5910, "train_speed(iter/s)": 1.47297 }, { "acc": 0.98302498, "epoch": 10.441306266548985, "grad_norm": 4.229802131652832, "learning_rate": 9.324584505768613e-06, "loss": 0.09785059, "memory(GiB)": 15.03, "step": 5915, "train_speed(iter/s)": 1.472964 }, { "acc": 0.9889883, "epoch": 10.450132391879965, "grad_norm": 5.266892910003662, "learning_rate": 9.323117589457774e-06, "loss": 0.07512638, "memory(GiB)": 15.03, "step": 5920, "train_speed(iter/s)": 1.472967 }, { "acc": 0.9877965, "epoch": 10.458958517210945, "grad_norm": 4.083326816558838, "learning_rate": 9.321649197554307e-06, "loss": 0.07163476, "memory(GiB)": 15.03, "step": 5925, "train_speed(iter/s)": 1.472957 }, { "acc": 0.98764286, "epoch": 10.467784642541924, "grad_norm": 3.6910977363586426, "learning_rate": 9.320179330559473e-06, "loss": 0.08099468, "memory(GiB)": 15.03, "step": 5930, "train_speed(iter/s)": 1.472956 }, { "acc": 0.98046675, "epoch": 10.476610767872904, "grad_norm": 4.076327800750732, "learning_rate": 9.318707988975032e-06, "loss": 0.12799311, "memory(GiB)": 15.03, "step": 5935, "train_speed(iter/s)": 1.47299 }, { "acc": 0.98960705, "epoch": 10.485436893203884, "grad_norm": 5.786803722381592, "learning_rate": 9.317235173303255e-06, "loss": 0.07086267, "memory(GiB)": 15.03, "step": 5940, "train_speed(iter/s)": 1.473061 }, { "acc": 0.98668385, "epoch": 10.494263018534863, "grad_norm": 4.7861762046813965, "learning_rate": 9.3157608840469e-06, "loss": 0.0834937, "memory(GiB)": 15.03, "step": 5945, "train_speed(iter/s)": 1.473076 }, { "acc": 0.98307409, "epoch": 10.503089143865843, "grad_norm": 5.941732883453369, "learning_rate": 9.314285121709245e-06, "loss": 0.1079487, "memory(GiB)": 15.03, "step": 5950, "train_speed(iter/s)": 1.473122 }, { "acc": 0.98351612, "epoch": 10.511915269196823, "grad_norm": 5.379951477050781, "learning_rate": 9.312807886794064e-06, "loss": 0.10641131, "memory(GiB)": 15.03, "step": 5955, "train_speed(iter/s)": 1.473127 }, { "acc": 0.9885602, "epoch": 10.520741394527802, "grad_norm": 2.8530983924865723, "learning_rate": 9.311329179805633e-06, "loss": 0.07697845, "memory(GiB)": 15.03, "step": 5960, "train_speed(iter/s)": 1.47309 }, { "acc": 0.98021145, "epoch": 10.529567519858782, "grad_norm": 7.1788129806518555, "learning_rate": 9.30984900124873e-06, "loss": 0.15077684, "memory(GiB)": 15.03, "step": 5965, "train_speed(iter/s)": 1.473151 }, { "acc": 0.98622408, "epoch": 10.538393645189762, "grad_norm": 3.357778787612915, "learning_rate": 9.308367351628644e-06, "loss": 0.0999231, "memory(GiB)": 15.03, "step": 5970, "train_speed(iter/s)": 1.473139 }, { "acc": 0.99145479, "epoch": 10.54721977052074, "grad_norm": 2.859290361404419, "learning_rate": 9.30688423145115e-06, "loss": 0.07199301, "memory(GiB)": 15.03, "step": 5975, "train_speed(iter/s)": 1.473068 }, { "acc": 0.98571415, "epoch": 10.556045895851721, "grad_norm": 3.3460655212402344, "learning_rate": 9.30539964122254e-06, "loss": 0.09217423, "memory(GiB)": 15.03, "step": 5980, "train_speed(iter/s)": 1.473082 }, { "acc": 0.99115705, "epoch": 10.564872021182701, "grad_norm": 5.81924295425415, "learning_rate": 9.3039135814496e-06, "loss": 0.06748508, "memory(GiB)": 15.03, "step": 5985, "train_speed(iter/s)": 1.473062 }, { "acc": 0.98738832, "epoch": 10.57369814651368, "grad_norm": 2.9748404026031494, "learning_rate": 9.30242605263962e-06, "loss": 0.09258108, "memory(GiB)": 15.03, "step": 5990, "train_speed(iter/s)": 1.473145 }, { "acc": 0.99074364, "epoch": 10.58252427184466, "grad_norm": 3.4002344608306885, "learning_rate": 9.30093705530039e-06, "loss": 0.0621986, "memory(GiB)": 15.03, "step": 5995, "train_speed(iter/s)": 1.473119 }, { "acc": 0.98531237, "epoch": 10.59135039717564, "grad_norm": 3.7787516117095947, "learning_rate": 9.299446589940203e-06, "loss": 0.10397656, "memory(GiB)": 15.03, "step": 6000, "train_speed(iter/s)": 1.473112 }, { "acc": 0.98631439, "epoch": 10.600176522506619, "grad_norm": 4.395920276641846, "learning_rate": 9.297954657067852e-06, "loss": 0.08577977, "memory(GiB)": 15.03, "step": 6005, "train_speed(iter/s)": 1.47304 }, { "acc": 0.98438015, "epoch": 10.609002647837599, "grad_norm": 6.847498893737793, "learning_rate": 9.296461257192633e-06, "loss": 0.11094575, "memory(GiB)": 15.03, "step": 6010, "train_speed(iter/s)": 1.473071 }, { "acc": 0.99026241, "epoch": 10.61782877316858, "grad_norm": 2.5228984355926514, "learning_rate": 9.294966390824341e-06, "loss": 0.07488299, "memory(GiB)": 15.03, "step": 6015, "train_speed(iter/s)": 1.473119 }, { "acc": 0.9865653, "epoch": 10.62665489849956, "grad_norm": 7.068607330322266, "learning_rate": 9.29347005847327e-06, "loss": 0.09128741, "memory(GiB)": 15.03, "step": 6020, "train_speed(iter/s)": 1.473134 }, { "acc": 0.98557177, "epoch": 10.635481023830538, "grad_norm": 3.9423632621765137, "learning_rate": 9.291972260650219e-06, "loss": 0.0942736, "memory(GiB)": 15.03, "step": 6025, "train_speed(iter/s)": 1.47313 }, { "acc": 0.98385372, "epoch": 10.644307149161518, "grad_norm": 5.701142311096191, "learning_rate": 9.290472997866481e-06, "loss": 0.1060406, "memory(GiB)": 15.03, "step": 6030, "train_speed(iter/s)": 1.47309 }, { "acc": 0.98872185, "epoch": 10.653133274492498, "grad_norm": 4.709616661071777, "learning_rate": 9.288972270633854e-06, "loss": 0.07669001, "memory(GiB)": 15.03, "step": 6035, "train_speed(iter/s)": 1.473065 }, { "acc": 0.98654613, "epoch": 10.661959399823477, "grad_norm": 6.212655067443848, "learning_rate": 9.287470079464638e-06, "loss": 0.09206779, "memory(GiB)": 15.03, "step": 6040, "train_speed(iter/s)": 1.473041 }, { "acc": 0.98562613, "epoch": 10.670785525154457, "grad_norm": 4.331943511962891, "learning_rate": 9.285966424871624e-06, "loss": 0.0840147, "memory(GiB)": 15.03, "step": 6045, "train_speed(iter/s)": 1.473003 }, { "acc": 0.99128284, "epoch": 10.679611650485437, "grad_norm": 6.825706481933594, "learning_rate": 9.284461307368111e-06, "loss": 0.06720691, "memory(GiB)": 15.03, "step": 6050, "train_speed(iter/s)": 1.472959 }, { "acc": 0.98732128, "epoch": 10.688437775816416, "grad_norm": 3.058628559112549, "learning_rate": 9.282954727467894e-06, "loss": 0.08195144, "memory(GiB)": 15.03, "step": 6055, "train_speed(iter/s)": 1.473015 }, { "acc": 0.98933792, "epoch": 10.697263901147396, "grad_norm": 3.922504186630249, "learning_rate": 9.281446685685266e-06, "loss": 0.08036928, "memory(GiB)": 15.03, "step": 6060, "train_speed(iter/s)": 1.473034 }, { "acc": 0.98937645, "epoch": 10.706090026478376, "grad_norm": 4.65108060836792, "learning_rate": 9.279937182535023e-06, "loss": 0.06348395, "memory(GiB)": 15.03, "step": 6065, "train_speed(iter/s)": 1.473054 }, { "acc": 0.98542299, "epoch": 10.714916151809355, "grad_norm": 4.882759094238281, "learning_rate": 9.278426218532454e-06, "loss": 0.08567482, "memory(GiB)": 15.03, "step": 6070, "train_speed(iter/s)": 1.473055 }, { "acc": 0.98558426, "epoch": 10.723742277140335, "grad_norm": 5.279613494873047, "learning_rate": 9.276913794193353e-06, "loss": 0.10334659, "memory(GiB)": 15.03, "step": 6075, "train_speed(iter/s)": 1.472977 }, { "acc": 0.99504433, "epoch": 10.732568402471315, "grad_norm": 3.428067684173584, "learning_rate": 9.275399910034008e-06, "loss": 0.03240651, "memory(GiB)": 15.03, "step": 6080, "train_speed(iter/s)": 1.472986 }, { "acc": 0.98867922, "epoch": 10.741394527802294, "grad_norm": 2.865978479385376, "learning_rate": 9.273884566571206e-06, "loss": 0.07587025, "memory(GiB)": 15.03, "step": 6085, "train_speed(iter/s)": 1.472993 }, { "acc": 0.98715973, "epoch": 10.750220653133274, "grad_norm": 6.288811683654785, "learning_rate": 9.272367764322236e-06, "loss": 0.07375463, "memory(GiB)": 15.03, "step": 6090, "train_speed(iter/s)": 1.473082 }, { "acc": 0.98506956, "epoch": 10.759046778464255, "grad_norm": 9.481131553649902, "learning_rate": 9.270849503804879e-06, "loss": 0.12402512, "memory(GiB)": 15.03, "step": 6095, "train_speed(iter/s)": 1.473107 }, { "acc": 0.98588772, "epoch": 10.767872903795233, "grad_norm": 4.328495979309082, "learning_rate": 9.269329785537417e-06, "loss": 0.08382407, "memory(GiB)": 15.03, "step": 6100, "train_speed(iter/s)": 1.47316 }, { "acc": 0.98939333, "epoch": 10.776699029126213, "grad_norm": 6.300540447235107, "learning_rate": 9.26780861003863e-06, "loss": 0.07918627, "memory(GiB)": 15.03, "step": 6105, "train_speed(iter/s)": 1.473228 }, { "acc": 0.98009987, "epoch": 10.785525154457194, "grad_norm": 3.5057761669158936, "learning_rate": 9.266285977827798e-06, "loss": 0.11825297, "memory(GiB)": 15.03, "step": 6110, "train_speed(iter/s)": 1.473223 }, { "acc": 0.9854187, "epoch": 10.794351279788174, "grad_norm": 6.7936601638793945, "learning_rate": 9.264761889424691e-06, "loss": 0.08844635, "memory(GiB)": 15.03, "step": 6115, "train_speed(iter/s)": 1.473208 }, { "acc": 0.98792934, "epoch": 10.803177405119152, "grad_norm": 4.417413711547852, "learning_rate": 9.263236345349583e-06, "loss": 0.08030308, "memory(GiB)": 15.03, "step": 6120, "train_speed(iter/s)": 1.473215 }, { "acc": 0.98595467, "epoch": 10.812003530450133, "grad_norm": 2.1729838848114014, "learning_rate": 9.261709346123237e-06, "loss": 0.10301428, "memory(GiB)": 15.03, "step": 6125, "train_speed(iter/s)": 1.473292 }, { "acc": 0.98678436, "epoch": 10.820829655781113, "grad_norm": 3.8060216903686523, "learning_rate": 9.260180892266926e-06, "loss": 0.08875495, "memory(GiB)": 15.03, "step": 6130, "train_speed(iter/s)": 1.473327 }, { "acc": 0.98838882, "epoch": 10.829655781112091, "grad_norm": 2.597945213317871, "learning_rate": 9.258650984302405e-06, "loss": 0.07888869, "memory(GiB)": 15.03, "step": 6135, "train_speed(iter/s)": 1.47325 }, { "acc": 0.98378201, "epoch": 10.838481906443072, "grad_norm": 8.108708381652832, "learning_rate": 9.257119622751935e-06, "loss": 0.11600158, "memory(GiB)": 15.03, "step": 6140, "train_speed(iter/s)": 1.473246 }, { "acc": 0.98845329, "epoch": 10.847308031774052, "grad_norm": 5.067374229431152, "learning_rate": 9.255586808138267e-06, "loss": 0.06411499, "memory(GiB)": 15.03, "step": 6145, "train_speed(iter/s)": 1.473245 }, { "acc": 0.98816128, "epoch": 10.85613415710503, "grad_norm": 5.226415634155273, "learning_rate": 9.254052540984657e-06, "loss": 0.09201419, "memory(GiB)": 15.03, "step": 6150, "train_speed(iter/s)": 1.473215 }, { "acc": 0.98930588, "epoch": 10.86496028243601, "grad_norm": 5.328153133392334, "learning_rate": 9.252516821814842e-06, "loss": 0.06674972, "memory(GiB)": 15.03, "step": 6155, "train_speed(iter/s)": 1.473179 }, { "acc": 0.98831558, "epoch": 10.87378640776699, "grad_norm": 4.941039562225342, "learning_rate": 9.25097965115307e-06, "loss": 0.08218503, "memory(GiB)": 15.03, "step": 6160, "train_speed(iter/s)": 1.473161 }, { "acc": 0.98553133, "epoch": 10.88261253309797, "grad_norm": 3.333026647567749, "learning_rate": 9.24944102952408e-06, "loss": 0.09679517, "memory(GiB)": 15.03, "step": 6165, "train_speed(iter/s)": 1.4732 }, { "acc": 0.98256168, "epoch": 10.89143865842895, "grad_norm": 3.9853527545928955, "learning_rate": 9.247900957453099e-06, "loss": 0.09832886, "memory(GiB)": 15.03, "step": 6170, "train_speed(iter/s)": 1.473216 }, { "acc": 0.98761845, "epoch": 10.90026478375993, "grad_norm": 4.1533660888671875, "learning_rate": 9.246359435465855e-06, "loss": 0.06705806, "memory(GiB)": 15.03, "step": 6175, "train_speed(iter/s)": 1.473266 }, { "acc": 0.98352146, "epoch": 10.909090909090908, "grad_norm": 16.816434860229492, "learning_rate": 9.244816464088571e-06, "loss": 0.1049669, "memory(GiB)": 15.03, "step": 6180, "train_speed(iter/s)": 1.473334 }, { "acc": 0.98814449, "epoch": 10.917917034421889, "grad_norm": 6.55294942855835, "learning_rate": 9.243272043847968e-06, "loss": 0.06587927, "memory(GiB)": 15.03, "step": 6185, "train_speed(iter/s)": 1.473376 }, { "acc": 0.98493061, "epoch": 10.926743159752869, "grad_norm": 4.853147029876709, "learning_rate": 9.241726175271255e-06, "loss": 0.10711125, "memory(GiB)": 15.03, "step": 6190, "train_speed(iter/s)": 1.473378 }, { "acc": 0.9853385, "epoch": 10.935569285083847, "grad_norm": 6.040640354156494, "learning_rate": 9.240178858886138e-06, "loss": 0.07894597, "memory(GiB)": 15.03, "step": 6195, "train_speed(iter/s)": 1.473307 }, { "acc": 0.98784771, "epoch": 10.944395410414828, "grad_norm": 2.802337884902954, "learning_rate": 9.238630095220818e-06, "loss": 0.07485791, "memory(GiB)": 15.03, "step": 6200, "train_speed(iter/s)": 1.473344 }, { "acc": 0.98793926, "epoch": 10.953221535745808, "grad_norm": 7.236006736755371, "learning_rate": 9.237079884803989e-06, "loss": 0.08334476, "memory(GiB)": 15.03, "step": 6205, "train_speed(iter/s)": 1.473434 }, { "acc": 0.98473511, "epoch": 10.962047661076788, "grad_norm": 3.329988956451416, "learning_rate": 9.235528228164841e-06, "loss": 0.10141348, "memory(GiB)": 15.03, "step": 6210, "train_speed(iter/s)": 1.473488 }, { "acc": 0.98406782, "epoch": 10.970873786407767, "grad_norm": 1.23566734790802, "learning_rate": 9.233975125833053e-06, "loss": 0.11369786, "memory(GiB)": 15.03, "step": 6215, "train_speed(iter/s)": 1.473472 }, { "acc": 0.98611641, "epoch": 10.979699911738747, "grad_norm": 3.0984926223754883, "learning_rate": 9.232420578338804e-06, "loss": 0.07624426, "memory(GiB)": 15.03, "step": 6220, "train_speed(iter/s)": 1.473432 }, { "acc": 0.98381414, "epoch": 10.988526037069727, "grad_norm": 7.573570251464844, "learning_rate": 9.230864586212758e-06, "loss": 0.10678487, "memory(GiB)": 15.03, "step": 6225, "train_speed(iter/s)": 1.473448 }, { "acc": 0.98593884, "epoch": 10.997352162400706, "grad_norm": 3.1448171138763428, "learning_rate": 9.229307149986083e-06, "loss": 0.09246154, "memory(GiB)": 15.03, "step": 6230, "train_speed(iter/s)": 1.473477 }, { "acc": 0.9859663, "epoch": 11.006178287731686, "grad_norm": 1.3756227493286133, "learning_rate": 9.22774827019043e-06, "loss": 0.10116946, "memory(GiB)": 15.03, "step": 6235, "train_speed(iter/s)": 1.47337 }, { "acc": 0.98559895, "epoch": 11.015004413062666, "grad_norm": 2.6211018562316895, "learning_rate": 9.226187947357948e-06, "loss": 0.08116509, "memory(GiB)": 15.03, "step": 6240, "train_speed(iter/s)": 1.473355 }, { "acc": 0.99110069, "epoch": 11.023830538393645, "grad_norm": 5.285623550415039, "learning_rate": 9.224626182021281e-06, "loss": 0.05903034, "memory(GiB)": 15.03, "step": 6245, "train_speed(iter/s)": 1.473401 }, { "acc": 0.98721266, "epoch": 11.032656663724625, "grad_norm": 4.680665016174316, "learning_rate": 9.223062974713556e-06, "loss": 0.08417792, "memory(GiB)": 15.03, "step": 6250, "train_speed(iter/s)": 1.473411 }, { "acc": 0.98581352, "epoch": 11.041482789055605, "grad_norm": 4.488255977630615, "learning_rate": 9.2214983259684e-06, "loss": 0.0952508, "memory(GiB)": 15.03, "step": 6255, "train_speed(iter/s)": 1.473433 }, { "acc": 0.98756943, "epoch": 11.050308914386584, "grad_norm": 3.101759433746338, "learning_rate": 9.219932236319931e-06, "loss": 0.08995317, "memory(GiB)": 15.03, "step": 6260, "train_speed(iter/s)": 1.47346 }, { "acc": 0.99053688, "epoch": 11.059135039717564, "grad_norm": 4.129998207092285, "learning_rate": 9.218364706302761e-06, "loss": 0.05837607, "memory(GiB)": 15.03, "step": 6265, "train_speed(iter/s)": 1.473498 }, { "acc": 0.98467178, "epoch": 11.067961165048544, "grad_norm": 3.6402573585510254, "learning_rate": 9.216795736451984e-06, "loss": 0.10673529, "memory(GiB)": 15.03, "step": 6270, "train_speed(iter/s)": 1.473501 }, { "acc": 0.98552017, "epoch": 11.076787290379523, "grad_norm": 5.164542198181152, "learning_rate": 9.215225327303195e-06, "loss": 0.09727818, "memory(GiB)": 15.03, "step": 6275, "train_speed(iter/s)": 1.473495 }, { "acc": 0.98524876, "epoch": 11.085613415710503, "grad_norm": 1.5025404691696167, "learning_rate": 9.213653479392479e-06, "loss": 0.09694282, "memory(GiB)": 15.03, "step": 6280, "train_speed(iter/s)": 1.473479 }, { "acc": 0.98225937, "epoch": 11.094439541041483, "grad_norm": 3.8298747539520264, "learning_rate": 9.212080193256411e-06, "loss": 0.13028218, "memory(GiB)": 15.03, "step": 6285, "train_speed(iter/s)": 1.473495 }, { "acc": 0.98854036, "epoch": 11.103265666372462, "grad_norm": 3.0559895038604736, "learning_rate": 9.210505469432056e-06, "loss": 0.0807041, "memory(GiB)": 15.03, "step": 6290, "train_speed(iter/s)": 1.473544 }, { "acc": 0.9829792, "epoch": 11.112091791703442, "grad_norm": 7.132787704467773, "learning_rate": 9.208929308456968e-06, "loss": 0.11302708, "memory(GiB)": 15.03, "step": 6295, "train_speed(iter/s)": 1.473494 }, { "acc": 0.99070568, "epoch": 11.120917917034422, "grad_norm": 1.2792922258377075, "learning_rate": 9.207351710869196e-06, "loss": 0.06612358, "memory(GiB)": 15.03, "step": 6300, "train_speed(iter/s)": 1.473473 }, { "acc": 0.9895751, "epoch": 11.129744042365402, "grad_norm": 2.7515621185302734, "learning_rate": 9.20577267720728e-06, "loss": 0.06767665, "memory(GiB)": 15.03, "step": 6305, "train_speed(iter/s)": 1.473457 }, { "acc": 0.98764877, "epoch": 11.13857016769638, "grad_norm": 9.552902221679688, "learning_rate": 9.204192208010244e-06, "loss": 0.08043027, "memory(GiB)": 15.03, "step": 6310, "train_speed(iter/s)": 1.473492 }, { "acc": 0.98999557, "epoch": 11.147396293027361, "grad_norm": 7.554165840148926, "learning_rate": 9.202610303817606e-06, "loss": 0.07987174, "memory(GiB)": 15.03, "step": 6315, "train_speed(iter/s)": 1.473529 }, { "acc": 0.99044256, "epoch": 11.156222418358341, "grad_norm": 1.4221450090408325, "learning_rate": 9.201026965169374e-06, "loss": 0.06688339, "memory(GiB)": 15.03, "step": 6320, "train_speed(iter/s)": 1.473493 }, { "acc": 0.98946953, "epoch": 11.16504854368932, "grad_norm": 5.46376371383667, "learning_rate": 9.199442192606048e-06, "loss": 0.06722629, "memory(GiB)": 15.03, "step": 6325, "train_speed(iter/s)": 1.4735 }, { "acc": 0.99240742, "epoch": 11.1738746690203, "grad_norm": 3.973273754119873, "learning_rate": 9.19785598666861e-06, "loss": 0.0549745, "memory(GiB)": 15.03, "step": 6330, "train_speed(iter/s)": 1.473493 }, { "acc": 0.98361778, "epoch": 11.18270079435128, "grad_norm": 3.9062135219573975, "learning_rate": 9.196268347898538e-06, "loss": 0.11657804, "memory(GiB)": 15.03, "step": 6335, "train_speed(iter/s)": 1.473488 }, { "acc": 0.98962479, "epoch": 11.191526919682259, "grad_norm": 2.329115390777588, "learning_rate": 9.1946792768378e-06, "loss": 0.08457098, "memory(GiB)": 15.03, "step": 6340, "train_speed(iter/s)": 1.473473 }, { "acc": 0.98441925, "epoch": 11.20035304501324, "grad_norm": 4.01609468460083, "learning_rate": 9.193088774028845e-06, "loss": 0.13605494, "memory(GiB)": 15.03, "step": 6345, "train_speed(iter/s)": 1.473465 }, { "acc": 0.98499374, "epoch": 11.20917917034422, "grad_norm": 3.256894588470459, "learning_rate": 9.19149684001462e-06, "loss": 0.1206918, "memory(GiB)": 15.03, "step": 6350, "train_speed(iter/s)": 1.473458 }, { "acc": 0.98502893, "epoch": 11.218005295675198, "grad_norm": 5.369540691375732, "learning_rate": 9.189903475338554e-06, "loss": 0.09859518, "memory(GiB)": 15.03, "step": 6355, "train_speed(iter/s)": 1.473494 }, { "acc": 0.98720818, "epoch": 11.226831421006178, "grad_norm": 3.8439059257507324, "learning_rate": 9.188308680544566e-06, "loss": 0.08440309, "memory(GiB)": 15.03, "step": 6360, "train_speed(iter/s)": 1.473597 }, { "acc": 0.98919659, "epoch": 11.235657546337158, "grad_norm": 4.081446170806885, "learning_rate": 9.186712456177069e-06, "loss": 0.08250089, "memory(GiB)": 15.03, "step": 6365, "train_speed(iter/s)": 1.473598 }, { "acc": 0.98783455, "epoch": 11.244483671668137, "grad_norm": 1.9111446142196655, "learning_rate": 9.185114802780952e-06, "loss": 0.09095634, "memory(GiB)": 15.03, "step": 6370, "train_speed(iter/s)": 1.473671 }, { "acc": 0.98492355, "epoch": 11.253309796999117, "grad_norm": 3.747558116912842, "learning_rate": 9.183515720901605e-06, "loss": 0.09858695, "memory(GiB)": 15.03, "step": 6375, "train_speed(iter/s)": 1.473649 }, { "acc": 0.98938046, "epoch": 11.262135922330097, "grad_norm": 3.2600440979003906, "learning_rate": 9.181915211084895e-06, "loss": 0.06065716, "memory(GiB)": 15.03, "step": 6380, "train_speed(iter/s)": 1.473637 }, { "acc": 0.98650227, "epoch": 11.270962047661076, "grad_norm": 3.6919853687286377, "learning_rate": 9.180313273877183e-06, "loss": 0.10382895, "memory(GiB)": 15.03, "step": 6385, "train_speed(iter/s)": 1.473666 }, { "acc": 0.99058418, "epoch": 11.279788172992056, "grad_norm": 6.23285436630249, "learning_rate": 9.178709909825316e-06, "loss": 0.05565741, "memory(GiB)": 15.03, "step": 6390, "train_speed(iter/s)": 1.473692 }, { "acc": 0.98483391, "epoch": 11.288614298323036, "grad_norm": 3.2960245609283447, "learning_rate": 9.177105119476622e-06, "loss": 0.09432721, "memory(GiB)": 15.03, "step": 6395, "train_speed(iter/s)": 1.473715 }, { "acc": 0.98859348, "epoch": 11.297440423654017, "grad_norm": 5.744123458862305, "learning_rate": 9.175498903378929e-06, "loss": 0.07121538, "memory(GiB)": 15.03, "step": 6400, "train_speed(iter/s)": 1.473771 }, { "acc": 0.98356981, "epoch": 11.306266548984995, "grad_norm": 4.83273458480835, "learning_rate": 9.173891262080538e-06, "loss": 0.09665654, "memory(GiB)": 15.03, "step": 6405, "train_speed(iter/s)": 1.473711 }, { "acc": 0.9908452, "epoch": 11.315092674315975, "grad_norm": 2.7157626152038574, "learning_rate": 9.172282196130244e-06, "loss": 0.0644004, "memory(GiB)": 15.03, "step": 6410, "train_speed(iter/s)": 1.473746 }, { "acc": 0.98637266, "epoch": 11.323918799646956, "grad_norm": 5.893570423126221, "learning_rate": 9.170671706077328e-06, "loss": 0.08699585, "memory(GiB)": 15.03, "step": 6415, "train_speed(iter/s)": 1.473789 }, { "acc": 0.98740816, "epoch": 11.332744924977934, "grad_norm": 3.1302108764648438, "learning_rate": 9.169059792471553e-06, "loss": 0.08276633, "memory(GiB)": 15.03, "step": 6420, "train_speed(iter/s)": 1.47386 }, { "acc": 0.9819994, "epoch": 11.341571050308914, "grad_norm": 3.5102622509002686, "learning_rate": 9.167446455863173e-06, "loss": 0.13629963, "memory(GiB)": 15.03, "step": 6425, "train_speed(iter/s)": 1.473852 }, { "acc": 0.98863049, "epoch": 11.350397175639895, "grad_norm": 5.211555480957031, "learning_rate": 9.165831696802926e-06, "loss": 0.08274974, "memory(GiB)": 15.03, "step": 6430, "train_speed(iter/s)": 1.473837 }, { "acc": 0.98809977, "epoch": 11.359223300970873, "grad_norm": 5.104304790496826, "learning_rate": 9.164215515842034e-06, "loss": 0.08273081, "memory(GiB)": 15.03, "step": 6435, "train_speed(iter/s)": 1.473861 }, { "acc": 0.98872576, "epoch": 11.368049426301853, "grad_norm": 4.298994541168213, "learning_rate": 9.162597913532202e-06, "loss": 0.08553579, "memory(GiB)": 15.03, "step": 6440, "train_speed(iter/s)": 1.473892 }, { "acc": 0.98118391, "epoch": 11.376875551632834, "grad_norm": 3.6735105514526367, "learning_rate": 9.160978890425631e-06, "loss": 0.11998863, "memory(GiB)": 15.03, "step": 6445, "train_speed(iter/s)": 1.473924 }, { "acc": 0.98427067, "epoch": 11.385701676963812, "grad_norm": 3.1968913078308105, "learning_rate": 9.159358447074992e-06, "loss": 0.08563492, "memory(GiB)": 15.03, "step": 6450, "train_speed(iter/s)": 1.4739 }, { "acc": 0.988692, "epoch": 11.394527802294792, "grad_norm": 2.6301162242889404, "learning_rate": 9.157736584033455e-06, "loss": 0.08185316, "memory(GiB)": 15.03, "step": 6455, "train_speed(iter/s)": 1.473894 }, { "acc": 0.99189177, "epoch": 11.403353927625773, "grad_norm": 2.621974468231201, "learning_rate": 9.156113301854664e-06, "loss": 0.06842655, "memory(GiB)": 15.03, "step": 6460, "train_speed(iter/s)": 1.473872 }, { "acc": 0.98750906, "epoch": 11.412180052956751, "grad_norm": 3.723113536834717, "learning_rate": 9.154488601092752e-06, "loss": 0.08607856, "memory(GiB)": 15.03, "step": 6465, "train_speed(iter/s)": 1.473901 }, { "acc": 0.9862278, "epoch": 11.421006178287731, "grad_norm": 4.408423900604248, "learning_rate": 9.152862482302338e-06, "loss": 0.0992218, "memory(GiB)": 15.03, "step": 6470, "train_speed(iter/s)": 1.47388 }, { "acc": 0.98687239, "epoch": 11.429832303618712, "grad_norm": 3.901848316192627, "learning_rate": 9.151234946038521e-06, "loss": 0.08924947, "memory(GiB)": 15.03, "step": 6475, "train_speed(iter/s)": 1.473903 }, { "acc": 0.99156179, "epoch": 11.43865842894969, "grad_norm": 4.677976608276367, "learning_rate": 9.149605992856887e-06, "loss": 0.05667843, "memory(GiB)": 15.03, "step": 6480, "train_speed(iter/s)": 1.473856 }, { "acc": 0.98883038, "epoch": 11.44748455428067, "grad_norm": 4.158396244049072, "learning_rate": 9.147975623313505e-06, "loss": 0.06938443, "memory(GiB)": 15.03, "step": 6485, "train_speed(iter/s)": 1.473855 }, { "acc": 0.98877764, "epoch": 11.45631067961165, "grad_norm": 4.594664096832275, "learning_rate": 9.146343837964924e-06, "loss": 0.08226143, "memory(GiB)": 15.03, "step": 6490, "train_speed(iter/s)": 1.473865 }, { "acc": 0.98995094, "epoch": 11.465136804942631, "grad_norm": 4.672100067138672, "learning_rate": 9.144710637368181e-06, "loss": 0.06491897, "memory(GiB)": 15.03, "step": 6495, "train_speed(iter/s)": 1.473842 }, { "acc": 0.98990583, "epoch": 11.47396293027361, "grad_norm": 7.323070526123047, "learning_rate": 9.143076022080795e-06, "loss": 0.06497197, "memory(GiB)": 15.03, "step": 6500, "train_speed(iter/s)": 1.473856 }, { "acc": 0.98786469, "epoch": 11.48278905560459, "grad_norm": 2.265296220779419, "learning_rate": 9.141439992660766e-06, "loss": 0.07226844, "memory(GiB)": 15.03, "step": 6505, "train_speed(iter/s)": 1.473861 }, { "acc": 0.98916245, "epoch": 11.49161518093557, "grad_norm": 4.314673900604248, "learning_rate": 9.13980254966658e-06, "loss": 0.07511581, "memory(GiB)": 15.03, "step": 6510, "train_speed(iter/s)": 1.473896 }, { "acc": 0.98684464, "epoch": 11.500441306266548, "grad_norm": 9.115246772766113, "learning_rate": 9.138163693657202e-06, "loss": 0.07975211, "memory(GiB)": 15.03, "step": 6515, "train_speed(iter/s)": 1.473868 }, { "acc": 0.99015903, "epoch": 11.509267431597529, "grad_norm": 4.46726655960083, "learning_rate": 9.13652342519208e-06, "loss": 0.06316785, "memory(GiB)": 15.03, "step": 6520, "train_speed(iter/s)": 1.473893 }, { "acc": 0.99129543, "epoch": 11.518093556928509, "grad_norm": 3.3767545223236084, "learning_rate": 9.134881744831149e-06, "loss": 0.07769831, "memory(GiB)": 15.03, "step": 6525, "train_speed(iter/s)": 1.473847 }, { "acc": 0.98710098, "epoch": 11.526919682259487, "grad_norm": 2.663440704345703, "learning_rate": 9.133238653134817e-06, "loss": 0.07544488, "memory(GiB)": 15.03, "step": 6530, "train_speed(iter/s)": 1.473838 }, { "acc": 0.99165506, "epoch": 11.535745807590468, "grad_norm": 4.695313930511475, "learning_rate": 9.131594150663982e-06, "loss": 0.05181928, "memory(GiB)": 15.03, "step": 6535, "train_speed(iter/s)": 1.473836 }, { "acc": 0.98791714, "epoch": 11.544571932921448, "grad_norm": 6.974271297454834, "learning_rate": 9.129948237980022e-06, "loss": 0.06768941, "memory(GiB)": 15.03, "step": 6540, "train_speed(iter/s)": 1.473872 }, { "acc": 0.9905159, "epoch": 11.553398058252426, "grad_norm": 2.8444066047668457, "learning_rate": 9.12830091564479e-06, "loss": 0.06963209, "memory(GiB)": 15.03, "step": 6545, "train_speed(iter/s)": 1.473852 }, { "acc": 0.98959255, "epoch": 11.562224183583407, "grad_norm": 6.358232021331787, "learning_rate": 9.126652184220629e-06, "loss": 0.06218749, "memory(GiB)": 15.03, "step": 6550, "train_speed(iter/s)": 1.473855 }, { "acc": 0.98813419, "epoch": 11.571050308914387, "grad_norm": 4.016536235809326, "learning_rate": 9.125002044270359e-06, "loss": 0.07179525, "memory(GiB)": 15.03, "step": 6555, "train_speed(iter/s)": 1.47386 }, { "acc": 0.98653984, "epoch": 11.579876434245366, "grad_norm": 4.2989091873168945, "learning_rate": 9.123350496357277e-06, "loss": 0.08478927, "memory(GiB)": 15.03, "step": 6560, "train_speed(iter/s)": 1.473899 }, { "acc": 0.98867264, "epoch": 11.588702559576346, "grad_norm": 3.036964178085327, "learning_rate": 9.121697541045168e-06, "loss": 0.07224551, "memory(GiB)": 15.03, "step": 6565, "train_speed(iter/s)": 1.473893 }, { "acc": 0.99068279, "epoch": 11.597528684907326, "grad_norm": 3.5922627449035645, "learning_rate": 9.120043178898292e-06, "loss": 0.05736054, "memory(GiB)": 15.03, "step": 6570, "train_speed(iter/s)": 1.473875 }, { "acc": 0.98986931, "epoch": 11.606354810238305, "grad_norm": 3.422973155975342, "learning_rate": 9.118387410481394e-06, "loss": 0.07605982, "memory(GiB)": 15.03, "step": 6575, "train_speed(iter/s)": 1.473868 }, { "acc": 0.98508339, "epoch": 11.615180935569285, "grad_norm": 5.12109899520874, "learning_rate": 9.116730236359695e-06, "loss": 0.08279511, "memory(GiB)": 15.03, "step": 6580, "train_speed(iter/s)": 1.473921 }, { "acc": 0.98802004, "epoch": 11.624007060900265, "grad_norm": 4.547296524047852, "learning_rate": 9.115071657098895e-06, "loss": 0.07174374, "memory(GiB)": 15.03, "step": 6585, "train_speed(iter/s)": 1.47389 }, { "acc": 0.98953238, "epoch": 11.632833186231245, "grad_norm": 4.767600059509277, "learning_rate": 9.113411673265178e-06, "loss": 0.07272013, "memory(GiB)": 15.03, "step": 6590, "train_speed(iter/s)": 1.473955 }, { "acc": 0.99233665, "epoch": 11.641659311562224, "grad_norm": 1.3255735635757446, "learning_rate": 9.111750285425207e-06, "loss": 0.0487478, "memory(GiB)": 15.03, "step": 6595, "train_speed(iter/s)": 1.473894 }, { "acc": 0.98660049, "epoch": 11.650485436893204, "grad_norm": 5.044189453125, "learning_rate": 9.110087494146118e-06, "loss": 0.10013897, "memory(GiB)": 15.03, "step": 6600, "train_speed(iter/s)": 1.473888 }, { "acc": 0.99040642, "epoch": 11.659311562224184, "grad_norm": 4.946611404418945, "learning_rate": 9.108423299995533e-06, "loss": 0.06790757, "memory(GiB)": 15.03, "step": 6605, "train_speed(iter/s)": 1.47385 }, { "acc": 0.98916712, "epoch": 11.668137687555163, "grad_norm": 5.413703441619873, "learning_rate": 9.10675770354155e-06, "loss": 0.08405078, "memory(GiB)": 15.03, "step": 6610, "train_speed(iter/s)": 1.473882 }, { "acc": 0.9852066, "epoch": 11.676963812886143, "grad_norm": 4.15528678894043, "learning_rate": 9.10509070535275e-06, "loss": 0.09332814, "memory(GiB)": 15.03, "step": 6615, "train_speed(iter/s)": 1.473877 }, { "acc": 0.98804998, "epoch": 11.685789938217123, "grad_norm": 8.424203872680664, "learning_rate": 9.103422305998182e-06, "loss": 0.07439014, "memory(GiB)": 15.03, "step": 6620, "train_speed(iter/s)": 1.473865 }, { "acc": 0.98585701, "epoch": 11.694616063548102, "grad_norm": 5.320619583129883, "learning_rate": 9.101752506047383e-06, "loss": 0.09921399, "memory(GiB)": 15.03, "step": 6625, "train_speed(iter/s)": 1.473829 }, { "acc": 0.98619251, "epoch": 11.703442188879082, "grad_norm": 6.671489238739014, "learning_rate": 9.100081306070369e-06, "loss": 0.08668411, "memory(GiB)": 15.03, "step": 6630, "train_speed(iter/s)": 1.473797 }, { "acc": 0.98838339, "epoch": 11.712268314210062, "grad_norm": 4.137324810028076, "learning_rate": 9.098408706637624e-06, "loss": 0.08544806, "memory(GiB)": 15.03, "step": 6635, "train_speed(iter/s)": 1.473807 }, { "acc": 0.98623924, "epoch": 11.72109443954104, "grad_norm": 5.192039489746094, "learning_rate": 9.096734708320118e-06, "loss": 0.07284256, "memory(GiB)": 15.03, "step": 6640, "train_speed(iter/s)": 1.47383 }, { "acc": 0.99116344, "epoch": 11.729920564872021, "grad_norm": 8.041074752807617, "learning_rate": 9.095059311689298e-06, "loss": 0.07163856, "memory(GiB)": 15.03, "step": 6645, "train_speed(iter/s)": 1.473793 }, { "acc": 0.98204107, "epoch": 11.738746690203001, "grad_norm": 3.711676836013794, "learning_rate": 9.093382517317086e-06, "loss": 0.10565729, "memory(GiB)": 15.03, "step": 6650, "train_speed(iter/s)": 1.473789 }, { "acc": 0.98672409, "epoch": 11.74757281553398, "grad_norm": 6.3812336921691895, "learning_rate": 9.091704325775879e-06, "loss": 0.09377109, "memory(GiB)": 15.03, "step": 6655, "train_speed(iter/s)": 1.473837 }, { "acc": 0.98463812, "epoch": 11.75639894086496, "grad_norm": 4.139703750610352, "learning_rate": 9.090024737638556e-06, "loss": 0.11027112, "memory(GiB)": 15.03, "step": 6660, "train_speed(iter/s)": 1.473879 }, { "acc": 0.9911664, "epoch": 11.76522506619594, "grad_norm": 6.469309329986572, "learning_rate": 9.088343753478472e-06, "loss": 0.05844013, "memory(GiB)": 15.03, "step": 6665, "train_speed(iter/s)": 1.473923 }, { "acc": 0.98969116, "epoch": 11.774051191526919, "grad_norm": 5.10896110534668, "learning_rate": 9.086661373869454e-06, "loss": 0.07324568, "memory(GiB)": 15.03, "step": 6670, "train_speed(iter/s)": 1.473912 }, { "acc": 0.98873072, "epoch": 11.782877316857899, "grad_norm": 3.8766541481018066, "learning_rate": 9.08497759938581e-06, "loss": 0.05982419, "memory(GiB)": 15.03, "step": 6675, "train_speed(iter/s)": 1.473898 }, { "acc": 0.98963375, "epoch": 11.79170344218888, "grad_norm": 4.707247257232666, "learning_rate": 9.083292430602322e-06, "loss": 0.06915681, "memory(GiB)": 15.03, "step": 6680, "train_speed(iter/s)": 1.473884 }, { "acc": 0.98467655, "epoch": 11.80052956751986, "grad_norm": 5.626936435699463, "learning_rate": 9.08160586809425e-06, "loss": 0.11368883, "memory(GiB)": 15.03, "step": 6685, "train_speed(iter/s)": 1.473915 }, { "acc": 0.98772259, "epoch": 11.809355692850838, "grad_norm": 4.55743932723999, "learning_rate": 9.079917912437325e-06, "loss": 0.07913449, "memory(GiB)": 15.03, "step": 6690, "train_speed(iter/s)": 1.473949 }, { "acc": 0.99096775, "epoch": 11.818181818181818, "grad_norm": 5.377889156341553, "learning_rate": 9.078228564207762e-06, "loss": 0.07000401, "memory(GiB)": 15.03, "step": 6695, "train_speed(iter/s)": 1.473993 }, { "acc": 0.98800716, "epoch": 11.827007943512799, "grad_norm": 2.2605741024017334, "learning_rate": 9.076537823982243e-06, "loss": 0.09267549, "memory(GiB)": 15.03, "step": 6700, "train_speed(iter/s)": 1.474009 }, { "acc": 0.98853998, "epoch": 11.835834068843777, "grad_norm": 2.380063056945801, "learning_rate": 9.074845692337927e-06, "loss": 0.08710135, "memory(GiB)": 15.03, "step": 6705, "train_speed(iter/s)": 1.474035 }, { "acc": 0.98467979, "epoch": 11.844660194174757, "grad_norm": 5.277482986450195, "learning_rate": 9.073152169852453e-06, "loss": 0.09005474, "memory(GiB)": 15.03, "step": 6710, "train_speed(iter/s)": 1.474053 }, { "acc": 0.98871078, "epoch": 11.853486319505738, "grad_norm": 4.481370449066162, "learning_rate": 9.071457257103927e-06, "loss": 0.06137453, "memory(GiB)": 15.03, "step": 6715, "train_speed(iter/s)": 1.474103 }, { "acc": 0.9913662, "epoch": 11.862312444836716, "grad_norm": 3.604565382003784, "learning_rate": 9.069760954670938e-06, "loss": 0.0704564, "memory(GiB)": 15.03, "step": 6720, "train_speed(iter/s)": 1.47417 }, { "acc": 0.99125891, "epoch": 11.871138570167696, "grad_norm": 2.830512523651123, "learning_rate": 9.068063263132541e-06, "loss": 0.0535026, "memory(GiB)": 15.03, "step": 6725, "train_speed(iter/s)": 1.474225 }, { "acc": 0.98811798, "epoch": 11.879964695498677, "grad_norm": 4.649196624755859, "learning_rate": 9.066364183068275e-06, "loss": 0.07574505, "memory(GiB)": 15.03, "step": 6730, "train_speed(iter/s)": 1.474244 }, { "acc": 0.98956366, "epoch": 11.888790820829655, "grad_norm": 5.587145805358887, "learning_rate": 9.06466371505814e-06, "loss": 0.08031173, "memory(GiB)": 15.03, "step": 6735, "train_speed(iter/s)": 1.474266 }, { "acc": 0.99023237, "epoch": 11.897616946160635, "grad_norm": 4.518322944641113, "learning_rate": 9.062961859682625e-06, "loss": 0.06050015, "memory(GiB)": 15.03, "step": 6740, "train_speed(iter/s)": 1.474229 }, { "acc": 0.99061852, "epoch": 11.906443071491616, "grad_norm": 5.599876403808594, "learning_rate": 9.06125861752268e-06, "loss": 0.05569254, "memory(GiB)": 15.03, "step": 6745, "train_speed(iter/s)": 1.474254 }, { "acc": 0.98412228, "epoch": 11.915269196822594, "grad_norm": 3.1458792686462402, "learning_rate": 9.059553989159731e-06, "loss": 0.11536756, "memory(GiB)": 15.03, "step": 6750, "train_speed(iter/s)": 1.474225 }, { "acc": 0.9859436, "epoch": 11.924095322153574, "grad_norm": 7.665375232696533, "learning_rate": 9.057847975175686e-06, "loss": 0.08180622, "memory(GiB)": 15.03, "step": 6755, "train_speed(iter/s)": 1.474255 }, { "acc": 0.98989153, "epoch": 11.932921447484555, "grad_norm": 1.885918140411377, "learning_rate": 9.056140576152916e-06, "loss": 0.06673539, "memory(GiB)": 15.03, "step": 6760, "train_speed(iter/s)": 1.474252 }, { "acc": 0.99124451, "epoch": 11.941747572815533, "grad_norm": 4.893038272857666, "learning_rate": 9.054431792674265e-06, "loss": 0.07103257, "memory(GiB)": 15.03, "step": 6765, "train_speed(iter/s)": 1.47429 }, { "acc": 0.99045706, "epoch": 11.950573698146513, "grad_norm": 4.223880290985107, "learning_rate": 9.052721625323059e-06, "loss": 0.0724562, "memory(GiB)": 15.03, "step": 6770, "train_speed(iter/s)": 1.474332 }, { "acc": 0.98376446, "epoch": 11.959399823477494, "grad_norm": 6.272837162017822, "learning_rate": 9.051010074683084e-06, "loss": 0.1004196, "memory(GiB)": 15.03, "step": 6775, "train_speed(iter/s)": 1.474274 }, { "acc": 0.99203529, "epoch": 11.968225948808474, "grad_norm": 6.809324741363525, "learning_rate": 9.04929714133861e-06, "loss": 0.06481164, "memory(GiB)": 15.03, "step": 6780, "train_speed(iter/s)": 1.474228 }, { "acc": 0.99166384, "epoch": 11.977052074139452, "grad_norm": 4.851701259613037, "learning_rate": 9.047582825874368e-06, "loss": 0.07579094, "memory(GiB)": 15.03, "step": 6785, "train_speed(iter/s)": 1.474246 }, { "acc": 0.9863884, "epoch": 11.985878199470433, "grad_norm": 5.414185047149658, "learning_rate": 9.045867128875569e-06, "loss": 0.09904598, "memory(GiB)": 15.03, "step": 6790, "train_speed(iter/s)": 1.474241 }, { "acc": 0.99236107, "epoch": 11.994704324801413, "grad_norm": 3.435497999191284, "learning_rate": 9.044150050927894e-06, "loss": 0.05278137, "memory(GiB)": 15.03, "step": 6795, "train_speed(iter/s)": 1.474189 }, { "acc": 0.99031754, "epoch": 12.003530450132391, "grad_norm": 2.9312686920166016, "learning_rate": 9.042431592617492e-06, "loss": 0.07900066, "memory(GiB)": 15.03, "step": 6800, "train_speed(iter/s)": 1.474147 }, { "acc": 0.98952408, "epoch": 12.012356575463372, "grad_norm": 5.544587135314941, "learning_rate": 9.040711754530988e-06, "loss": 0.07567236, "memory(GiB)": 15.03, "step": 6805, "train_speed(iter/s)": 1.474158 }, { "acc": 0.99085693, "epoch": 12.021182700794352, "grad_norm": 2.1558902263641357, "learning_rate": 9.038990537255472e-06, "loss": 0.0625818, "memory(GiB)": 15.03, "step": 6810, "train_speed(iter/s)": 1.474091 }, { "acc": 0.98659821, "epoch": 12.03000882612533, "grad_norm": 4.7702317237854, "learning_rate": 9.03726794137851e-06, "loss": 0.09917334, "memory(GiB)": 15.03, "step": 6815, "train_speed(iter/s)": 1.474071 }, { "acc": 0.98782482, "epoch": 12.03883495145631, "grad_norm": 4.97424840927124, "learning_rate": 9.035543967488137e-06, "loss": 0.06811545, "memory(GiB)": 15.03, "step": 6820, "train_speed(iter/s)": 1.474095 }, { "acc": 0.98659096, "epoch": 12.047661076787291, "grad_norm": 2.2982563972473145, "learning_rate": 9.03381861617286e-06, "loss": 0.09795151, "memory(GiB)": 15.03, "step": 6825, "train_speed(iter/s)": 1.474113 }, { "acc": 0.99017887, "epoch": 12.05648720211827, "grad_norm": 3.6344525814056396, "learning_rate": 9.03209188802165e-06, "loss": 0.07341094, "memory(GiB)": 15.03, "step": 6830, "train_speed(iter/s)": 1.474117 }, { "acc": 0.98896255, "epoch": 12.06531332744925, "grad_norm": 4.992835998535156, "learning_rate": 9.030363783623957e-06, "loss": 0.06496568, "memory(GiB)": 15.03, "step": 6835, "train_speed(iter/s)": 1.474144 }, { "acc": 0.99016857, "epoch": 12.07413945278023, "grad_norm": 5.696025371551514, "learning_rate": 9.028634303569693e-06, "loss": 0.06276612, "memory(GiB)": 15.03, "step": 6840, "train_speed(iter/s)": 1.474122 }, { "acc": 0.98978138, "epoch": 12.082965578111208, "grad_norm": 3.026210069656372, "learning_rate": 9.026903448449244e-06, "loss": 0.07180237, "memory(GiB)": 15.03, "step": 6845, "train_speed(iter/s)": 1.474102 }, { "acc": 0.99013586, "epoch": 12.091791703442189, "grad_norm": 2.213670253753662, "learning_rate": 9.025171218853466e-06, "loss": 0.06525594, "memory(GiB)": 15.03, "step": 6850, "train_speed(iter/s)": 1.474045 }, { "acc": 0.98459797, "epoch": 12.100617828773169, "grad_norm": 4.555992603302002, "learning_rate": 9.02343761537368e-06, "loss": 0.11839361, "memory(GiB)": 15.03, "step": 6855, "train_speed(iter/s)": 1.474023 }, { "acc": 0.99000816, "epoch": 12.109443954104147, "grad_norm": 4.287460803985596, "learning_rate": 9.021702638601679e-06, "loss": 0.06840624, "memory(GiB)": 15.03, "step": 6860, "train_speed(iter/s)": 1.474043 }, { "acc": 0.99035635, "epoch": 12.118270079435128, "grad_norm": 2.2687737941741943, "learning_rate": 9.019966289129725e-06, "loss": 0.06997643, "memory(GiB)": 15.03, "step": 6865, "train_speed(iter/s)": 1.474072 }, { "acc": 0.98897724, "epoch": 12.127096204766108, "grad_norm": 4.299405097961426, "learning_rate": 9.018228567550546e-06, "loss": 0.0764275, "memory(GiB)": 15.03, "step": 6870, "train_speed(iter/s)": 1.474102 }, { "acc": 0.99087162, "epoch": 12.135922330097088, "grad_norm": 3.4965484142303467, "learning_rate": 9.016489474457344e-06, "loss": 0.06214081, "memory(GiB)": 15.03, "step": 6875, "train_speed(iter/s)": 1.474088 }, { "acc": 0.98894844, "epoch": 12.144748455428067, "grad_norm": 3.639805316925049, "learning_rate": 9.014749010443782e-06, "loss": 0.08711236, "memory(GiB)": 15.03, "step": 6880, "train_speed(iter/s)": 1.474088 }, { "acc": 0.98905106, "epoch": 12.153574580759047, "grad_norm": 4.173666954040527, "learning_rate": 9.013007176103997e-06, "loss": 0.07557715, "memory(GiB)": 15.03, "step": 6885, "train_speed(iter/s)": 1.474105 }, { "acc": 0.98801651, "epoch": 12.162400706090027, "grad_norm": 7.618515968322754, "learning_rate": 9.01126397203259e-06, "loss": 0.07650589, "memory(GiB)": 15.03, "step": 6890, "train_speed(iter/s)": 1.474162 }, { "acc": 0.9873909, "epoch": 12.171226831421006, "grad_norm": 5.726360321044922, "learning_rate": 9.009519398824627e-06, "loss": 0.09380078, "memory(GiB)": 15.03, "step": 6895, "train_speed(iter/s)": 1.474162 }, { "acc": 0.99162579, "epoch": 12.180052956751986, "grad_norm": 2.3069467544555664, "learning_rate": 9.00777345707565e-06, "loss": 0.05934474, "memory(GiB)": 15.03, "step": 6900, "train_speed(iter/s)": 1.474163 }, { "acc": 0.99026165, "epoch": 12.188879082082966, "grad_norm": 4.461423873901367, "learning_rate": 9.006026147381662e-06, "loss": 0.06581931, "memory(GiB)": 15.03, "step": 6905, "train_speed(iter/s)": 1.474193 }, { "acc": 0.98989849, "epoch": 12.197705207413945, "grad_norm": 4.9914326667785645, "learning_rate": 9.004277470339137e-06, "loss": 0.06287822, "memory(GiB)": 15.03, "step": 6910, "train_speed(iter/s)": 1.474234 }, { "acc": 0.99036684, "epoch": 12.206531332744925, "grad_norm": 3.80911922454834, "learning_rate": 9.002527426545007e-06, "loss": 0.05906237, "memory(GiB)": 15.03, "step": 6915, "train_speed(iter/s)": 1.474237 }, { "acc": 0.98886003, "epoch": 12.215357458075905, "grad_norm": 2.6903398036956787, "learning_rate": 9.000776016596682e-06, "loss": 0.0759068, "memory(GiB)": 15.03, "step": 6920, "train_speed(iter/s)": 1.474266 }, { "acc": 0.99012966, "epoch": 12.224183583406884, "grad_norm": 3.607797384262085, "learning_rate": 8.999023241092029e-06, "loss": 0.07763752, "memory(GiB)": 15.03, "step": 6925, "train_speed(iter/s)": 1.474288 }, { "acc": 0.99176292, "epoch": 12.233009708737864, "grad_norm": 7.081466197967529, "learning_rate": 8.997269100629387e-06, "loss": 0.05307791, "memory(GiB)": 15.03, "step": 6930, "train_speed(iter/s)": 1.474312 }, { "acc": 0.99031487, "epoch": 12.241835834068844, "grad_norm": 3.0807430744171143, "learning_rate": 8.995513595807562e-06, "loss": 0.06230152, "memory(GiB)": 15.03, "step": 6935, "train_speed(iter/s)": 1.474355 }, { "acc": 0.98744373, "epoch": 12.250661959399823, "grad_norm": 4.223307132720947, "learning_rate": 8.993756727225817e-06, "loss": 0.08460751, "memory(GiB)": 15.03, "step": 6940, "train_speed(iter/s)": 1.474338 }, { "acc": 0.98833971, "epoch": 12.259488084730803, "grad_norm": 5.323685169219971, "learning_rate": 8.991998495483892e-06, "loss": 0.06851321, "memory(GiB)": 15.03, "step": 6945, "train_speed(iter/s)": 1.474318 }, { "acc": 0.98906746, "epoch": 12.268314210061783, "grad_norm": 3.737914800643921, "learning_rate": 8.990238901181983e-06, "loss": 0.06807251, "memory(GiB)": 15.03, "step": 6950, "train_speed(iter/s)": 1.474296 }, { "acc": 0.99028301, "epoch": 12.277140335392762, "grad_norm": 2.209517478942871, "learning_rate": 8.988477944920756e-06, "loss": 0.06182972, "memory(GiB)": 15.03, "step": 6955, "train_speed(iter/s)": 1.474335 }, { "acc": 0.98723011, "epoch": 12.285966460723742, "grad_norm": 4.631396293640137, "learning_rate": 8.98671562730134e-06, "loss": 0.08523266, "memory(GiB)": 15.03, "step": 6960, "train_speed(iter/s)": 1.474359 }, { "acc": 0.99167156, "epoch": 12.294792586054722, "grad_norm": 3.005908966064453, "learning_rate": 8.98495194892533e-06, "loss": 0.07161878, "memory(GiB)": 15.03, "step": 6965, "train_speed(iter/s)": 1.474356 }, { "acc": 0.98878288, "epoch": 12.303618711385703, "grad_norm": 5.561332702636719, "learning_rate": 8.983186910394787e-06, "loss": 0.07607215, "memory(GiB)": 15.03, "step": 6970, "train_speed(iter/s)": 1.474338 }, { "acc": 0.98937321, "epoch": 12.312444836716681, "grad_norm": 4.350594997406006, "learning_rate": 8.98142051231223e-06, "loss": 0.07371872, "memory(GiB)": 15.03, "step": 6975, "train_speed(iter/s)": 1.474297 }, { "acc": 0.98751202, "epoch": 12.321270962047661, "grad_norm": 4.836043834686279, "learning_rate": 8.97965275528065e-06, "loss": 0.09085312, "memory(GiB)": 15.03, "step": 6980, "train_speed(iter/s)": 1.474369 }, { "acc": 0.99075623, "epoch": 12.330097087378642, "grad_norm": 2.923732042312622, "learning_rate": 8.977883639903498e-06, "loss": 0.06772406, "memory(GiB)": 15.03, "step": 6985, "train_speed(iter/s)": 1.474355 }, { "acc": 0.99242268, "epoch": 12.33892321270962, "grad_norm": 2.303133010864258, "learning_rate": 8.976113166784686e-06, "loss": 0.04572789, "memory(GiB)": 15.03, "step": 6990, "train_speed(iter/s)": 1.474395 }, { "acc": 0.99424925, "epoch": 12.3477493380406, "grad_norm": 3.508016586303711, "learning_rate": 8.974341336528596e-06, "loss": 0.05216947, "memory(GiB)": 15.03, "step": 6995, "train_speed(iter/s)": 1.474439 }, { "acc": 0.9866188, "epoch": 12.35657546337158, "grad_norm": 3.331818103790283, "learning_rate": 8.972568149740067e-06, "loss": 0.10384302, "memory(GiB)": 15.03, "step": 7000, "train_speed(iter/s)": 1.474434 }, { "acc": 0.98709364, "epoch": 12.365401588702559, "grad_norm": 4.869511604309082, "learning_rate": 8.970793607024406e-06, "loss": 0.06692719, "memory(GiB)": 15.03, "step": 7005, "train_speed(iter/s)": 1.474419 }, { "acc": 0.98921242, "epoch": 12.37422771403354, "grad_norm": 6.7074785232543945, "learning_rate": 8.969017708987382e-06, "loss": 0.0675912, "memory(GiB)": 15.03, "step": 7010, "train_speed(iter/s)": 1.474394 }, { "acc": 0.9912632, "epoch": 12.38305383936452, "grad_norm": 5.720054626464844, "learning_rate": 8.96724045623522e-06, "loss": 0.0615644, "memory(GiB)": 15.03, "step": 7015, "train_speed(iter/s)": 1.474406 }, { "acc": 0.98912354, "epoch": 12.391879964695498, "grad_norm": 4.5392913818359375, "learning_rate": 8.96546184937462e-06, "loss": 0.08403287, "memory(GiB)": 15.03, "step": 7020, "train_speed(iter/s)": 1.474438 }, { "acc": 0.98882799, "epoch": 12.400706090026478, "grad_norm": 3.1315650939941406, "learning_rate": 8.96368188901273e-06, "loss": 0.07339678, "memory(GiB)": 15.03, "step": 7025, "train_speed(iter/s)": 1.474457 }, { "acc": 0.99176598, "epoch": 12.409532215357459, "grad_norm": 1.3322722911834717, "learning_rate": 8.961900575757171e-06, "loss": 0.06639999, "memory(GiB)": 15.03, "step": 7030, "train_speed(iter/s)": 1.474523 }, { "acc": 0.98819962, "epoch": 12.418358340688437, "grad_norm": 3.341250419616699, "learning_rate": 8.96011791021602e-06, "loss": 0.08096555, "memory(GiB)": 15.03, "step": 7035, "train_speed(iter/s)": 1.474565 }, { "acc": 0.9926816, "epoch": 12.427184466019417, "grad_norm": 2.0087037086486816, "learning_rate": 8.958333892997821e-06, "loss": 0.0500495, "memory(GiB)": 15.03, "step": 7040, "train_speed(iter/s)": 1.474614 }, { "acc": 0.99105721, "epoch": 12.436010591350398, "grad_norm": 3.962311029434204, "learning_rate": 8.956548524711574e-06, "loss": 0.05711586, "memory(GiB)": 15.03, "step": 7045, "train_speed(iter/s)": 1.474617 }, { "acc": 0.99001369, "epoch": 12.444836716681376, "grad_norm": 8.980878829956055, "learning_rate": 8.95476180596674e-06, "loss": 0.07392451, "memory(GiB)": 15.03, "step": 7050, "train_speed(iter/s)": 1.474689 }, { "acc": 0.98801174, "epoch": 12.453662842012356, "grad_norm": 5.17653226852417, "learning_rate": 8.952973737373248e-06, "loss": 0.08839643, "memory(GiB)": 15.03, "step": 7055, "train_speed(iter/s)": 1.47474 }, { "acc": 0.99125328, "epoch": 12.462488967343337, "grad_norm": 7.8228607177734375, "learning_rate": 8.951184319541476e-06, "loss": 0.0615773, "memory(GiB)": 15.03, "step": 7060, "train_speed(iter/s)": 1.474767 }, { "acc": 0.98468609, "epoch": 12.471315092674317, "grad_norm": 4.5631794929504395, "learning_rate": 8.949393553082277e-06, "loss": 0.11307528, "memory(GiB)": 15.03, "step": 7065, "train_speed(iter/s)": 1.474819 }, { "acc": 0.98705883, "epoch": 12.480141218005295, "grad_norm": 3.7927138805389404, "learning_rate": 8.947601438606953e-06, "loss": 0.0900806, "memory(GiB)": 15.03, "step": 7070, "train_speed(iter/s)": 1.474782 }, { "acc": 0.9876729, "epoch": 12.488967343336276, "grad_norm": 4.556002616882324, "learning_rate": 8.94580797672727e-06, "loss": 0.08057003, "memory(GiB)": 15.03, "step": 7075, "train_speed(iter/s)": 1.474792 }, { "acc": 0.99274874, "epoch": 12.497793468667256, "grad_norm": 3.1905758380889893, "learning_rate": 8.944013168055457e-06, "loss": 0.0480328, "memory(GiB)": 15.03, "step": 7080, "train_speed(iter/s)": 1.474798 }, { "acc": 0.99027252, "epoch": 12.506619593998234, "grad_norm": 1.4138131141662598, "learning_rate": 8.942217013204194e-06, "loss": 0.05852402, "memory(GiB)": 15.03, "step": 7085, "train_speed(iter/s)": 1.474824 }, { "acc": 0.99135113, "epoch": 12.515445719329215, "grad_norm": 3.2208025455474854, "learning_rate": 8.940419512786631e-06, "loss": 0.06504519, "memory(GiB)": 15.03, "step": 7090, "train_speed(iter/s)": 1.47481 }, { "acc": 0.99166813, "epoch": 12.524271844660195, "grad_norm": 4.942503452301025, "learning_rate": 8.938620667416373e-06, "loss": 0.05685909, "memory(GiB)": 15.03, "step": 7095, "train_speed(iter/s)": 1.474817 }, { "acc": 0.99071522, "epoch": 12.533097969991173, "grad_norm": 4.2536139488220215, "learning_rate": 8.936820477707482e-06, "loss": 0.0547006, "memory(GiB)": 15.03, "step": 7100, "train_speed(iter/s)": 1.474813 }, { "acc": 0.9890007, "epoch": 12.541924095322154, "grad_norm": 4.484463691711426, "learning_rate": 8.93501894427448e-06, "loss": 0.06657485, "memory(GiB)": 15.03, "step": 7105, "train_speed(iter/s)": 1.474813 }, { "acc": 0.98965168, "epoch": 12.550750220653134, "grad_norm": 3.0840909481048584, "learning_rate": 8.933216067732351e-06, "loss": 0.05602008, "memory(GiB)": 15.03, "step": 7110, "train_speed(iter/s)": 1.47483 }, { "acc": 0.99180241, "epoch": 12.559576345984112, "grad_norm": 3.939249277114868, "learning_rate": 8.931411848696532e-06, "loss": 0.04760731, "memory(GiB)": 15.03, "step": 7115, "train_speed(iter/s)": 1.474848 }, { "acc": 0.9922946, "epoch": 12.568402471315093, "grad_norm": 3.3029897212982178, "learning_rate": 8.929606287782923e-06, "loss": 0.04435669, "memory(GiB)": 15.03, "step": 7120, "train_speed(iter/s)": 1.47489 }, { "acc": 0.9938015, "epoch": 12.577228596646073, "grad_norm": 3.2479166984558105, "learning_rate": 8.927799385607882e-06, "loss": 0.04498982, "memory(GiB)": 15.03, "step": 7125, "train_speed(iter/s)": 1.474885 }, { "acc": 0.98949909, "epoch": 12.586054721977051, "grad_norm": 1.9849196672439575, "learning_rate": 8.92599114278822e-06, "loss": 0.06653167, "memory(GiB)": 15.03, "step": 7130, "train_speed(iter/s)": 1.474946 }, { "acc": 0.98905926, "epoch": 12.594880847308032, "grad_norm": 2.149968385696411, "learning_rate": 8.924181559941206e-06, "loss": 0.07584857, "memory(GiB)": 15.03, "step": 7135, "train_speed(iter/s)": 1.474907 }, { "acc": 0.98781033, "epoch": 12.603706972639012, "grad_norm": 4.49107027053833, "learning_rate": 8.922370637684576e-06, "loss": 0.08609766, "memory(GiB)": 15.03, "step": 7140, "train_speed(iter/s)": 1.474951 }, { "acc": 0.99159946, "epoch": 12.61253309796999, "grad_norm": 3.2915682792663574, "learning_rate": 8.920558376636513e-06, "loss": 0.05250264, "memory(GiB)": 15.03, "step": 7145, "train_speed(iter/s)": 1.474958 }, { "acc": 0.9884532, "epoch": 12.62135922330097, "grad_norm": 4.486784934997559, "learning_rate": 8.918744777415662e-06, "loss": 0.08258048, "memory(GiB)": 15.03, "step": 7150, "train_speed(iter/s)": 1.474984 }, { "acc": 0.99140024, "epoch": 12.63018534863195, "grad_norm": 5.0987019538879395, "learning_rate": 8.91692984064112e-06, "loss": 0.05372116, "memory(GiB)": 15.03, "step": 7155, "train_speed(iter/s)": 1.474948 }, { "acc": 0.98786736, "epoch": 12.639011473962931, "grad_norm": 6.339476108551025, "learning_rate": 8.915113566932448e-06, "loss": 0.08056297, "memory(GiB)": 15.03, "step": 7160, "train_speed(iter/s)": 1.474949 }, { "acc": 0.99048901, "epoch": 12.64783759929391, "grad_norm": 2.809921979904175, "learning_rate": 8.913295956909654e-06, "loss": 0.0739803, "memory(GiB)": 15.03, "step": 7165, "train_speed(iter/s)": 1.474958 }, { "acc": 0.98799992, "epoch": 12.65666372462489, "grad_norm": 1.9549803733825684, "learning_rate": 8.911477011193213e-06, "loss": 0.07270979, "memory(GiB)": 15.03, "step": 7170, "train_speed(iter/s)": 1.474926 }, { "acc": 0.98849335, "epoch": 12.66548984995587, "grad_norm": 6.290802955627441, "learning_rate": 8.909656730404047e-06, "loss": 0.0731584, "memory(GiB)": 15.03, "step": 7175, "train_speed(iter/s)": 1.474933 }, { "acc": 0.98933544, "epoch": 12.674315975286849, "grad_norm": 3.380514621734619, "learning_rate": 8.907835115163538e-06, "loss": 0.08409728, "memory(GiB)": 15.03, "step": 7180, "train_speed(iter/s)": 1.474889 }, { "acc": 0.98872108, "epoch": 12.683142100617829, "grad_norm": 4.10494327545166, "learning_rate": 8.906012166093522e-06, "loss": 0.07236729, "memory(GiB)": 15.03, "step": 7185, "train_speed(iter/s)": 1.474915 }, { "acc": 0.98737946, "epoch": 12.69196822594881, "grad_norm": 9.050786972045898, "learning_rate": 8.904187883816295e-06, "loss": 0.06603123, "memory(GiB)": 15.03, "step": 7190, "train_speed(iter/s)": 1.474987 }, { "acc": 0.99226837, "epoch": 12.700794351279788, "grad_norm": 4.0437211990356445, "learning_rate": 8.902362268954598e-06, "loss": 0.05318367, "memory(GiB)": 15.03, "step": 7195, "train_speed(iter/s)": 1.474954 }, { "acc": 0.99140511, "epoch": 12.709620476610768, "grad_norm": 2.048007011413574, "learning_rate": 8.900535322131635e-06, "loss": 0.06121974, "memory(GiB)": 15.03, "step": 7200, "train_speed(iter/s)": 1.474924 }, { "acc": 0.99079542, "epoch": 12.718446601941748, "grad_norm": 4.529441833496094, "learning_rate": 8.898707043971065e-06, "loss": 0.07464469, "memory(GiB)": 15.03, "step": 7205, "train_speed(iter/s)": 1.474913 }, { "acc": 0.99068422, "epoch": 12.727272727272727, "grad_norm": 5.573198318481445, "learning_rate": 8.896877435096996e-06, "loss": 0.04633795, "memory(GiB)": 15.03, "step": 7210, "train_speed(iter/s)": 1.474907 }, { "acc": 0.99068623, "epoch": 12.736098852603707, "grad_norm": 4.778669357299805, "learning_rate": 8.895046496133995e-06, "loss": 0.057331, "memory(GiB)": 15.03, "step": 7215, "train_speed(iter/s)": 1.474869 }, { "acc": 0.99090481, "epoch": 12.744924977934687, "grad_norm": 3.948298215866089, "learning_rate": 8.893214227707083e-06, "loss": 0.06012453, "memory(GiB)": 15.03, "step": 7220, "train_speed(iter/s)": 1.474909 }, { "acc": 0.99041681, "epoch": 12.753751103265666, "grad_norm": 3.306701421737671, "learning_rate": 8.89138063044173e-06, "loss": 0.06519302, "memory(GiB)": 15.03, "step": 7225, "train_speed(iter/s)": 1.474946 }, { "acc": 0.99005966, "epoch": 12.762577228596646, "grad_norm": 4.747673034667969, "learning_rate": 8.889545704963865e-06, "loss": 0.07582012, "memory(GiB)": 15.03, "step": 7230, "train_speed(iter/s)": 1.474939 }, { "acc": 0.99036932, "epoch": 12.771403353927626, "grad_norm": 2.7198073863983154, "learning_rate": 8.887709451899868e-06, "loss": 0.05978848, "memory(GiB)": 15.03, "step": 7235, "train_speed(iter/s)": 1.474936 }, { "acc": 0.98589954, "epoch": 12.780229479258605, "grad_norm": 1.6888234615325928, "learning_rate": 8.88587187187657e-06, "loss": 0.09506624, "memory(GiB)": 15.03, "step": 7240, "train_speed(iter/s)": 1.474936 }, { "acc": 0.99070473, "epoch": 12.789055604589585, "grad_norm": 5.399086952209473, "learning_rate": 8.884032965521262e-06, "loss": 0.08734972, "memory(GiB)": 15.03, "step": 7245, "train_speed(iter/s)": 1.474944 }, { "acc": 0.98975029, "epoch": 12.797881729920565, "grad_norm": 3.8586792945861816, "learning_rate": 8.88219273346168e-06, "loss": 0.05900968, "memory(GiB)": 15.03, "step": 7250, "train_speed(iter/s)": 1.474972 }, { "acc": 0.99046497, "epoch": 12.806707855251545, "grad_norm": 4.361014366149902, "learning_rate": 8.880351176326017e-06, "loss": 0.05444121, "memory(GiB)": 15.03, "step": 7255, "train_speed(iter/s)": 1.474988 }, { "acc": 0.99079866, "epoch": 12.815533980582524, "grad_norm": 3.7932686805725098, "learning_rate": 8.878508294742917e-06, "loss": 0.05569339, "memory(GiB)": 15.03, "step": 7260, "train_speed(iter/s)": 1.474973 }, { "acc": 0.99096699, "epoch": 12.824360105913504, "grad_norm": 2.1361207962036133, "learning_rate": 8.876664089341477e-06, "loss": 0.06234008, "memory(GiB)": 15.03, "step": 7265, "train_speed(iter/s)": 1.47502 }, { "acc": 0.99563932, "epoch": 12.833186231244484, "grad_norm": 2.3483800888061523, "learning_rate": 8.874818560751243e-06, "loss": 0.03290366, "memory(GiB)": 15.03, "step": 7270, "train_speed(iter/s)": 1.475082 }, { "acc": 0.98838453, "epoch": 12.842012356575463, "grad_norm": 5.212656021118164, "learning_rate": 8.872971709602218e-06, "loss": 0.06845979, "memory(GiB)": 15.03, "step": 7275, "train_speed(iter/s)": 1.475105 }, { "acc": 0.99010334, "epoch": 12.850838481906443, "grad_norm": 4.164439678192139, "learning_rate": 8.87112353652485e-06, "loss": 0.06310531, "memory(GiB)": 15.03, "step": 7280, "train_speed(iter/s)": 1.475123 }, { "acc": 0.9903286, "epoch": 12.859664607237423, "grad_norm": 2.201235294342041, "learning_rate": 8.86927404215004e-06, "loss": 0.05151259, "memory(GiB)": 15.03, "step": 7285, "train_speed(iter/s)": 1.475115 }, { "acc": 0.99179077, "epoch": 12.868490732568402, "grad_norm": 5.088080406188965, "learning_rate": 8.86742322710915e-06, "loss": 0.05317283, "memory(GiB)": 15.03, "step": 7290, "train_speed(iter/s)": 1.475176 }, { "acc": 0.98519325, "epoch": 12.877316857899382, "grad_norm": 3.5929551124572754, "learning_rate": 8.865571092033976e-06, "loss": 0.10318317, "memory(GiB)": 15.03, "step": 7295, "train_speed(iter/s)": 1.4752 }, { "acc": 0.99123468, "epoch": 12.886142983230362, "grad_norm": 3.443042516708374, "learning_rate": 8.863717637556777e-06, "loss": 0.06379066, "memory(GiB)": 15.03, "step": 7300, "train_speed(iter/s)": 1.475192 }, { "acc": 0.99413548, "epoch": 12.894969108561341, "grad_norm": 3.791921615600586, "learning_rate": 8.861862864310256e-06, "loss": 0.04617709, "memory(GiB)": 15.03, "step": 7305, "train_speed(iter/s)": 1.47518 }, { "acc": 0.98758802, "epoch": 12.903795233892321, "grad_norm": 21.52509307861328, "learning_rate": 8.860006772927573e-06, "loss": 0.07670084, "memory(GiB)": 15.03, "step": 7310, "train_speed(iter/s)": 1.475236 }, { "acc": 0.99035549, "epoch": 12.912621359223301, "grad_norm": 2.7875163555145264, "learning_rate": 8.85814936404233e-06, "loss": 0.07773386, "memory(GiB)": 15.03, "step": 7315, "train_speed(iter/s)": 1.475209 }, { "acc": 0.98819494, "epoch": 12.92144748455428, "grad_norm": 13.032513618469238, "learning_rate": 8.856290638288582e-06, "loss": 0.06636255, "memory(GiB)": 15.03, "step": 7320, "train_speed(iter/s)": 1.475164 }, { "acc": 0.99083366, "epoch": 12.93027360988526, "grad_norm": 2.1438815593719482, "learning_rate": 8.854430596300834e-06, "loss": 0.05767808, "memory(GiB)": 15.03, "step": 7325, "train_speed(iter/s)": 1.475159 }, { "acc": 0.9876667, "epoch": 12.93909973521624, "grad_norm": 3.765188455581665, "learning_rate": 8.852569238714043e-06, "loss": 0.07682603, "memory(GiB)": 15.03, "step": 7330, "train_speed(iter/s)": 1.475171 }, { "acc": 0.99117384, "epoch": 12.947925860547219, "grad_norm": 4.876363277435303, "learning_rate": 8.850706566163611e-06, "loss": 0.05812109, "memory(GiB)": 15.03, "step": 7335, "train_speed(iter/s)": 1.475193 }, { "acc": 0.98950348, "epoch": 12.9567519858782, "grad_norm": 2.9024529457092285, "learning_rate": 8.84884257928539e-06, "loss": 0.07100599, "memory(GiB)": 15.03, "step": 7340, "train_speed(iter/s)": 1.475224 }, { "acc": 0.98885956, "epoch": 12.96557811120918, "grad_norm": 3.415717363357544, "learning_rate": 8.846977278715679e-06, "loss": 0.06782874, "memory(GiB)": 15.03, "step": 7345, "train_speed(iter/s)": 1.475245 }, { "acc": 0.98915787, "epoch": 12.97440423654016, "grad_norm": 5.102143287658691, "learning_rate": 8.84511066509123e-06, "loss": 0.07977984, "memory(GiB)": 15.03, "step": 7350, "train_speed(iter/s)": 1.475266 }, { "acc": 0.98896141, "epoch": 12.983230361871138, "grad_norm": 2.4850754737854004, "learning_rate": 8.843242739049238e-06, "loss": 0.07583296, "memory(GiB)": 15.03, "step": 7355, "train_speed(iter/s)": 1.47531 }, { "acc": 0.99263535, "epoch": 12.992056487202118, "grad_norm": 3.152411460876465, "learning_rate": 8.841373501227353e-06, "loss": 0.05413851, "memory(GiB)": 15.03, "step": 7360, "train_speed(iter/s)": 1.475303 }, { "acc": 0.99129076, "epoch": 13.000882612533099, "grad_norm": 2.6294734477996826, "learning_rate": 8.839502952263665e-06, "loss": 0.05489618, "memory(GiB)": 15.03, "step": 7365, "train_speed(iter/s)": 1.475254 }, { "acc": 0.99300671, "epoch": 13.009708737864077, "grad_norm": 2.075719118118286, "learning_rate": 8.837631092796713e-06, "loss": 0.05583717, "memory(GiB)": 15.03, "step": 7370, "train_speed(iter/s)": 1.475239 }, { "acc": 0.98977509, "epoch": 13.018534863195057, "grad_norm": 3.767500162124634, "learning_rate": 8.835757923465489e-06, "loss": 0.06280243, "memory(GiB)": 15.03, "step": 7375, "train_speed(iter/s)": 1.475335 }, { "acc": 0.99000015, "epoch": 13.027360988526038, "grad_norm": 3.46524715423584, "learning_rate": 8.833883444909428e-06, "loss": 0.05972114, "memory(GiB)": 15.03, "step": 7380, "train_speed(iter/s)": 1.47534 }, { "acc": 0.99492645, "epoch": 13.036187113857016, "grad_norm": 3.9911296367645264, "learning_rate": 8.83200765776841e-06, "loss": 0.03662339, "memory(GiB)": 15.03, "step": 7385, "train_speed(iter/s)": 1.475301 }, { "acc": 0.98920708, "epoch": 13.045013239187996, "grad_norm": 2.877119779586792, "learning_rate": 8.830130562682766e-06, "loss": 0.07282912, "memory(GiB)": 15.03, "step": 7390, "train_speed(iter/s)": 1.475359 }, { "acc": 0.98826609, "epoch": 13.053839364518977, "grad_norm": 2.952810049057007, "learning_rate": 8.82825216029327e-06, "loss": 0.0735388, "memory(GiB)": 15.03, "step": 7395, "train_speed(iter/s)": 1.475388 }, { "acc": 0.99006529, "epoch": 13.062665489849955, "grad_norm": 1.7803139686584473, "learning_rate": 8.826372451241147e-06, "loss": 0.06175588, "memory(GiB)": 15.03, "step": 7400, "train_speed(iter/s)": 1.47542 }, { "acc": 0.99360857, "epoch": 13.071491615180935, "grad_norm": 3.420325517654419, "learning_rate": 8.824491436168061e-06, "loss": 0.05025064, "memory(GiB)": 15.03, "step": 7405, "train_speed(iter/s)": 1.475447 }, { "acc": 0.99009972, "epoch": 13.080317740511916, "grad_norm": 6.147634983062744, "learning_rate": 8.82260911571613e-06, "loss": 0.05935779, "memory(GiB)": 15.03, "step": 7410, "train_speed(iter/s)": 1.475456 }, { "acc": 0.99098272, "epoch": 13.089143865842894, "grad_norm": 2.4828526973724365, "learning_rate": 8.820725490527907e-06, "loss": 0.06084652, "memory(GiB)": 15.03, "step": 7415, "train_speed(iter/s)": 1.475445 }, { "acc": 0.99312325, "epoch": 13.097969991173875, "grad_norm": 1.7198208570480347, "learning_rate": 8.818840561246403e-06, "loss": 0.04406767, "memory(GiB)": 15.03, "step": 7420, "train_speed(iter/s)": 1.475437 }, { "acc": 0.98599968, "epoch": 13.106796116504855, "grad_norm": 5.341996669769287, "learning_rate": 8.816954328515065e-06, "loss": 0.08221872, "memory(GiB)": 15.03, "step": 7425, "train_speed(iter/s)": 1.475429 }, { "acc": 0.98861647, "epoch": 13.115622241835833, "grad_norm": 3.0648932456970215, "learning_rate": 8.815066792977788e-06, "loss": 0.07216836, "memory(GiB)": 15.03, "step": 7430, "train_speed(iter/s)": 1.475491 }, { "acc": 0.98868303, "epoch": 13.124448367166814, "grad_norm": 14.435758590698242, "learning_rate": 8.81317795527891e-06, "loss": 0.08172193, "memory(GiB)": 15.03, "step": 7435, "train_speed(iter/s)": 1.47544 }, { "acc": 0.9909277, "epoch": 13.133274492497794, "grad_norm": 3.4962708950042725, "learning_rate": 8.81128781606322e-06, "loss": 0.05600438, "memory(GiB)": 15.03, "step": 7440, "train_speed(iter/s)": 1.475407 }, { "acc": 0.99154243, "epoch": 13.142100617828774, "grad_norm": 3.97339129447937, "learning_rate": 8.809396375975941e-06, "loss": 0.0632004, "memory(GiB)": 15.03, "step": 7445, "train_speed(iter/s)": 1.47547 }, { "acc": 0.99044867, "epoch": 13.150926743159753, "grad_norm": 4.081250190734863, "learning_rate": 8.807503635662748e-06, "loss": 0.06347892, "memory(GiB)": 15.03, "step": 7450, "train_speed(iter/s)": 1.475436 }, { "acc": 0.99187918, "epoch": 13.159752868490733, "grad_norm": 9.515066146850586, "learning_rate": 8.805609595769757e-06, "loss": 0.0552126, "memory(GiB)": 15.03, "step": 7455, "train_speed(iter/s)": 1.475501 }, { "acc": 0.98671665, "epoch": 13.168578993821713, "grad_norm": 4.716747760772705, "learning_rate": 8.803714256943528e-06, "loss": 0.0745636, "memory(GiB)": 15.03, "step": 7460, "train_speed(iter/s)": 1.475464 }, { "acc": 0.98726664, "epoch": 13.177405119152692, "grad_norm": 2.773482084274292, "learning_rate": 8.801817619831063e-06, "loss": 0.08675184, "memory(GiB)": 15.03, "step": 7465, "train_speed(iter/s)": 1.475507 }, { "acc": 0.99100847, "epoch": 13.186231244483672, "grad_norm": 3.4675559997558594, "learning_rate": 8.799919685079811e-06, "loss": 0.06265658, "memory(GiB)": 15.03, "step": 7470, "train_speed(iter/s)": 1.475442 }, { "acc": 0.9894022, "epoch": 13.195057369814652, "grad_norm": 4.844145774841309, "learning_rate": 8.798020453337659e-06, "loss": 0.05942775, "memory(GiB)": 15.03, "step": 7475, "train_speed(iter/s)": 1.475429 }, { "acc": 0.98824348, "epoch": 13.20388349514563, "grad_norm": 3.8868911266326904, "learning_rate": 8.796119925252942e-06, "loss": 0.08668798, "memory(GiB)": 15.03, "step": 7480, "train_speed(iter/s)": 1.475433 }, { "acc": 0.99220362, "epoch": 13.21270962047661, "grad_norm": 2.706524133682251, "learning_rate": 8.794218101474432e-06, "loss": 0.05806905, "memory(GiB)": 15.03, "step": 7485, "train_speed(iter/s)": 1.475432 }, { "acc": 0.99218493, "epoch": 13.221535745807591, "grad_norm": 4.457629680633545, "learning_rate": 8.792314982651347e-06, "loss": 0.06141957, "memory(GiB)": 15.03, "step": 7490, "train_speed(iter/s)": 1.475427 }, { "acc": 0.9899395, "epoch": 13.23036187113857, "grad_norm": 3.7662570476531982, "learning_rate": 8.790410569433347e-06, "loss": 0.06978829, "memory(GiB)": 15.03, "step": 7495, "train_speed(iter/s)": 1.475429 }, { "acc": 0.992276, "epoch": 13.23918799646955, "grad_norm": 3.881181001663208, "learning_rate": 8.788504862470533e-06, "loss": 0.05457958, "memory(GiB)": 15.03, "step": 7500, "train_speed(iter/s)": 1.475411 }, { "acc": 0.98756409, "epoch": 13.24801412180053, "grad_norm": 2.76155948638916, "learning_rate": 8.786597862413446e-06, "loss": 0.08615362, "memory(GiB)": 15.03, "step": 7505, "train_speed(iter/s)": 1.475427 }, { "acc": 0.98868637, "epoch": 13.256840247131509, "grad_norm": 2.9542946815490723, "learning_rate": 8.784689569913073e-06, "loss": 0.06519393, "memory(GiB)": 15.03, "step": 7510, "train_speed(iter/s)": 1.475499 }, { "acc": 0.99301834, "epoch": 13.265666372462489, "grad_norm": 5.117018222808838, "learning_rate": 8.782779985620835e-06, "loss": 0.05246252, "memory(GiB)": 15.03, "step": 7515, "train_speed(iter/s)": 1.47552 }, { "acc": 0.98944817, "epoch": 13.274492497793469, "grad_norm": 3.431818723678589, "learning_rate": 8.780869110188601e-06, "loss": 0.07834511, "memory(GiB)": 15.03, "step": 7520, "train_speed(iter/s)": 1.475521 }, { "acc": 0.98928413, "epoch": 13.283318623124448, "grad_norm": 2.3132388591766357, "learning_rate": 8.778956944268679e-06, "loss": 0.06566118, "memory(GiB)": 15.03, "step": 7525, "train_speed(iter/s)": 1.475516 }, { "acc": 0.99023857, "epoch": 13.292144748455428, "grad_norm": 6.551204681396484, "learning_rate": 8.777043488513816e-06, "loss": 0.06122286, "memory(GiB)": 15.03, "step": 7530, "train_speed(iter/s)": 1.475525 }, { "acc": 0.9871933, "epoch": 13.300970873786408, "grad_norm": 3.935122013092041, "learning_rate": 8.775128743577199e-06, "loss": 0.10644456, "memory(GiB)": 15.03, "step": 7535, "train_speed(iter/s)": 1.475529 }, { "acc": 0.99338493, "epoch": 13.309796999117388, "grad_norm": 2.998908042907715, "learning_rate": 8.773212710112454e-06, "loss": 0.04199523, "memory(GiB)": 15.03, "step": 7540, "train_speed(iter/s)": 1.475574 }, { "acc": 0.9901207, "epoch": 13.318623124448367, "grad_norm": 2.6156585216522217, "learning_rate": 8.77129538877365e-06, "loss": 0.06438704, "memory(GiB)": 15.03, "step": 7545, "train_speed(iter/s)": 1.475571 }, { "acc": 0.9897522, "epoch": 13.327449249779347, "grad_norm": 7.111834526062012, "learning_rate": 8.769376780215297e-06, "loss": 0.0622341, "memory(GiB)": 15.03, "step": 7550, "train_speed(iter/s)": 1.475603 }, { "acc": 0.99262943, "epoch": 13.336275375110327, "grad_norm": 3.593073606491089, "learning_rate": 8.76745688509234e-06, "loss": 0.04650334, "memory(GiB)": 15.03, "step": 7555, "train_speed(iter/s)": 1.475657 }, { "acc": 0.9900816, "epoch": 13.345101500441306, "grad_norm": 2.9077863693237305, "learning_rate": 8.765535704060166e-06, "loss": 0.05469521, "memory(GiB)": 15.03, "step": 7560, "train_speed(iter/s)": 1.475612 }, { "acc": 0.99230385, "epoch": 13.353927625772286, "grad_norm": 4.029382228851318, "learning_rate": 8.763613237774597e-06, "loss": 0.05832432, "memory(GiB)": 15.03, "step": 7565, "train_speed(iter/s)": 1.475657 }, { "acc": 0.9894803, "epoch": 13.362753751103266, "grad_norm": 2.8830175399780273, "learning_rate": 8.761689486891898e-06, "loss": 0.07610686, "memory(GiB)": 15.03, "step": 7570, "train_speed(iter/s)": 1.475684 }, { "acc": 0.98449574, "epoch": 13.371579876434245, "grad_norm": 3.2725937366485596, "learning_rate": 8.759764452068774e-06, "loss": 0.09854979, "memory(GiB)": 15.03, "step": 7575, "train_speed(iter/s)": 1.475675 }, { "acc": 0.99097881, "epoch": 13.380406001765225, "grad_norm": 4.255208969116211, "learning_rate": 8.757838133962362e-06, "loss": 0.06712781, "memory(GiB)": 15.03, "step": 7580, "train_speed(iter/s)": 1.475668 }, { "acc": 0.98559875, "epoch": 13.389232127096205, "grad_norm": 5.479088306427002, "learning_rate": 8.755910533230243e-06, "loss": 0.07362231, "memory(GiB)": 15.03, "step": 7585, "train_speed(iter/s)": 1.475661 }, { "acc": 0.99188004, "epoch": 13.398058252427184, "grad_norm": 3.962226390838623, "learning_rate": 8.75398165053043e-06, "loss": 0.0473319, "memory(GiB)": 15.03, "step": 7590, "train_speed(iter/s)": 1.475661 }, { "acc": 0.98989677, "epoch": 13.406884377758164, "grad_norm": 2.4462356567382812, "learning_rate": 8.752051486521383e-06, "loss": 0.07498425, "memory(GiB)": 15.03, "step": 7595, "train_speed(iter/s)": 1.47569 }, { "acc": 0.99010506, "epoch": 13.415710503089144, "grad_norm": 4.270981788635254, "learning_rate": 8.750120041861987e-06, "loss": 0.06638736, "memory(GiB)": 15.03, "step": 7600, "train_speed(iter/s)": 1.475743 }, { "acc": 0.99469481, "epoch": 13.424536628420123, "grad_norm": 3.2740890979766846, "learning_rate": 8.748187317211575e-06, "loss": 0.03416079, "memory(GiB)": 15.03, "step": 7605, "train_speed(iter/s)": 1.475713 }, { "acc": 0.99232187, "epoch": 13.433362753751103, "grad_norm": 2.9987497329711914, "learning_rate": 8.746253313229912e-06, "loss": 0.06424344, "memory(GiB)": 15.03, "step": 7610, "train_speed(iter/s)": 1.475694 }, { "acc": 0.98963833, "epoch": 13.442188879082083, "grad_norm": 4.515815258026123, "learning_rate": 8.744318030577198e-06, "loss": 0.06073806, "memory(GiB)": 15.03, "step": 7615, "train_speed(iter/s)": 1.475706 }, { "acc": 0.9883091, "epoch": 13.451015004413062, "grad_norm": 4.796741962432861, "learning_rate": 8.742381469914075e-06, "loss": 0.07349377, "memory(GiB)": 15.03, "step": 7620, "train_speed(iter/s)": 1.475704 }, { "acc": 0.98929329, "epoch": 13.459841129744042, "grad_norm": 4.276737689971924, "learning_rate": 8.740443631901615e-06, "loss": 0.06153609, "memory(GiB)": 15.03, "step": 7625, "train_speed(iter/s)": 1.475724 }, { "acc": 0.9935112, "epoch": 13.468667255075022, "grad_norm": 2.4449923038482666, "learning_rate": 8.738504517201328e-06, "loss": 0.04970628, "memory(GiB)": 15.03, "step": 7630, "train_speed(iter/s)": 1.475727 }, { "acc": 0.99098988, "epoch": 13.477493380406003, "grad_norm": 4.508188724517822, "learning_rate": 8.736564126475167e-06, "loss": 0.05783843, "memory(GiB)": 15.03, "step": 7635, "train_speed(iter/s)": 1.475704 }, { "acc": 0.99279404, "epoch": 13.486319505736981, "grad_norm": 3.2519724369049072, "learning_rate": 8.734622460385508e-06, "loss": 0.05813428, "memory(GiB)": 15.03, "step": 7640, "train_speed(iter/s)": 1.475668 }, { "acc": 0.99178257, "epoch": 13.495145631067961, "grad_norm": 2.5606460571289062, "learning_rate": 8.732679519595175e-06, "loss": 0.05423301, "memory(GiB)": 15.03, "step": 7645, "train_speed(iter/s)": 1.475745 }, { "acc": 0.98967438, "epoch": 13.503971756398942, "grad_norm": 4.3377485275268555, "learning_rate": 8.730735304767413e-06, "loss": 0.06809303, "memory(GiB)": 15.03, "step": 7650, "train_speed(iter/s)": 1.47573 }, { "acc": 0.98936176, "epoch": 13.51279788172992, "grad_norm": 4.015938758850098, "learning_rate": 8.728789816565918e-06, "loss": 0.07341932, "memory(GiB)": 15.03, "step": 7655, "train_speed(iter/s)": 1.475769 }, { "acc": 0.99190483, "epoch": 13.5216240070609, "grad_norm": 1.8641738891601562, "learning_rate": 8.726843055654808e-06, "loss": 0.05071182, "memory(GiB)": 15.03, "step": 7660, "train_speed(iter/s)": 1.475743 }, { "acc": 0.9928689, "epoch": 13.53045013239188, "grad_norm": 1.683948040008545, "learning_rate": 8.724895022698641e-06, "loss": 0.05620385, "memory(GiB)": 15.03, "step": 7665, "train_speed(iter/s)": 1.475746 }, { "acc": 0.99125757, "epoch": 13.53927625772286, "grad_norm": 4.949843883514404, "learning_rate": 8.722945718362409e-06, "loss": 0.05006669, "memory(GiB)": 15.03, "step": 7670, "train_speed(iter/s)": 1.475781 }, { "acc": 0.99079523, "epoch": 13.54810238305384, "grad_norm": 3.6619713306427, "learning_rate": 8.720995143311537e-06, "loss": 0.06343828, "memory(GiB)": 15.03, "step": 7675, "train_speed(iter/s)": 1.475733 }, { "acc": 0.98938904, "epoch": 13.55692850838482, "grad_norm": 4.694051265716553, "learning_rate": 8.719043298211883e-06, "loss": 0.07204378, "memory(GiB)": 15.03, "step": 7680, "train_speed(iter/s)": 1.475722 }, { "acc": 0.98845844, "epoch": 13.565754633715798, "grad_norm": 1.2728397846221924, "learning_rate": 8.717090183729741e-06, "loss": 0.08041061, "memory(GiB)": 15.03, "step": 7685, "train_speed(iter/s)": 1.475743 }, { "acc": 0.99099255, "epoch": 13.574580759046778, "grad_norm": 3.1157732009887695, "learning_rate": 8.715135800531836e-06, "loss": 0.07141951, "memory(GiB)": 15.03, "step": 7690, "train_speed(iter/s)": 1.475778 }, { "acc": 0.99184084, "epoch": 13.583406884377759, "grad_norm": 3.818615436553955, "learning_rate": 8.713180149285329e-06, "loss": 0.05513404, "memory(GiB)": 15.03, "step": 7695, "train_speed(iter/s)": 1.47584 }, { "acc": 0.99226665, "epoch": 13.592233009708737, "grad_norm": 5.049389839172363, "learning_rate": 8.71122323065781e-06, "loss": 0.05272347, "memory(GiB)": 15.03, "step": 7700, "train_speed(iter/s)": 1.475835 }, { "acc": 0.99326239, "epoch": 13.601059135039717, "grad_norm": 4.292169094085693, "learning_rate": 8.709265045317304e-06, "loss": 0.03771079, "memory(GiB)": 15.03, "step": 7705, "train_speed(iter/s)": 1.47589 }, { "acc": 0.99369631, "epoch": 13.609885260370698, "grad_norm": 3.285712480545044, "learning_rate": 8.707305593932266e-06, "loss": 0.05678936, "memory(GiB)": 15.03, "step": 7710, "train_speed(iter/s)": 1.475893 }, { "acc": 0.99092979, "epoch": 13.618711385701676, "grad_norm": 4.784041881561279, "learning_rate": 8.705344877171589e-06, "loss": 0.06398755, "memory(GiB)": 15.03, "step": 7715, "train_speed(iter/s)": 1.475871 }, { "acc": 0.98987522, "epoch": 13.627537511032656, "grad_norm": 5.22430944442749, "learning_rate": 8.703382895704593e-06, "loss": 0.07706596, "memory(GiB)": 15.03, "step": 7720, "train_speed(iter/s)": 1.475852 }, { "acc": 0.99085999, "epoch": 13.636363636363637, "grad_norm": 2.15803599357605, "learning_rate": 8.701419650201026e-06, "loss": 0.0607146, "memory(GiB)": 15.03, "step": 7725, "train_speed(iter/s)": 1.475823 }, { "acc": 0.99004869, "epoch": 13.645189761694617, "grad_norm": 2.746767520904541, "learning_rate": 8.699455141331082e-06, "loss": 0.06055127, "memory(GiB)": 15.03, "step": 7730, "train_speed(iter/s)": 1.475867 }, { "acc": 0.99313259, "epoch": 13.654015887025595, "grad_norm": 3.273923873901367, "learning_rate": 8.697489369765366e-06, "loss": 0.04338832, "memory(GiB)": 15.03, "step": 7735, "train_speed(iter/s)": 1.47587 }, { "acc": 0.99077339, "epoch": 13.662842012356576, "grad_norm": 4.22088623046875, "learning_rate": 8.695522336174932e-06, "loss": 0.06084998, "memory(GiB)": 15.03, "step": 7740, "train_speed(iter/s)": 1.475878 }, { "acc": 0.98898573, "epoch": 13.671668137687556, "grad_norm": 2.449394464492798, "learning_rate": 8.693554041231257e-06, "loss": 0.07841105, "memory(GiB)": 15.03, "step": 7745, "train_speed(iter/s)": 1.475882 }, { "acc": 0.99274864, "epoch": 13.680494263018534, "grad_norm": 4.6682963371276855, "learning_rate": 8.691584485606243e-06, "loss": 0.0472695, "memory(GiB)": 15.03, "step": 7750, "train_speed(iter/s)": 1.475878 }, { "acc": 0.99142017, "epoch": 13.689320388349515, "grad_norm": 4.907999038696289, "learning_rate": 8.689613669972234e-06, "loss": 0.05315635, "memory(GiB)": 15.03, "step": 7755, "train_speed(iter/s)": 1.475886 }, { "acc": 0.99303007, "epoch": 13.698146513680495, "grad_norm": 2.65755558013916, "learning_rate": 8.687641595001997e-06, "loss": 0.05442785, "memory(GiB)": 15.03, "step": 7760, "train_speed(iter/s)": 1.475878 }, { "acc": 0.9955121, "epoch": 13.706972639011473, "grad_norm": 2.134589910507202, "learning_rate": 8.685668261368733e-06, "loss": 0.03607119, "memory(GiB)": 15.03, "step": 7765, "train_speed(iter/s)": 1.475865 }, { "acc": 0.9912014, "epoch": 13.715798764342454, "grad_norm": 2.6294875144958496, "learning_rate": 8.683693669746065e-06, "loss": 0.05707619, "memory(GiB)": 15.03, "step": 7770, "train_speed(iter/s)": 1.475943 }, { "acc": 0.9886961, "epoch": 13.724624889673434, "grad_norm": 4.627396583557129, "learning_rate": 8.681717820808052e-06, "loss": 0.07659013, "memory(GiB)": 15.03, "step": 7775, "train_speed(iter/s)": 1.475919 }, { "acc": 0.9912467, "epoch": 13.733451015004412, "grad_norm": 2.7853243350982666, "learning_rate": 8.679740715229183e-06, "loss": 0.06544136, "memory(GiB)": 15.03, "step": 7780, "train_speed(iter/s)": 1.475898 }, { "acc": 0.99020023, "epoch": 13.742277140335393, "grad_norm": 6.451267242431641, "learning_rate": 8.677762353684373e-06, "loss": 0.06628929, "memory(GiB)": 15.03, "step": 7785, "train_speed(iter/s)": 1.475882 }, { "acc": 0.99128876, "epoch": 13.751103265666373, "grad_norm": 2.4582841396331787, "learning_rate": 8.675782736848965e-06, "loss": 0.0601158, "memory(GiB)": 15.03, "step": 7790, "train_speed(iter/s)": 1.475839 }, { "acc": 0.99097767, "epoch": 13.759929390997351, "grad_norm": 3.4148318767547607, "learning_rate": 8.673801865398735e-06, "loss": 0.07238144, "memory(GiB)": 15.03, "step": 7795, "train_speed(iter/s)": 1.475798 }, { "acc": 0.99131098, "epoch": 13.768755516328332, "grad_norm": 3.9566550254821777, "learning_rate": 8.67181974000988e-06, "loss": 0.05190112, "memory(GiB)": 15.03, "step": 7800, "train_speed(iter/s)": 1.475802 }, { "acc": 0.99299898, "epoch": 13.777581641659312, "grad_norm": 3.2106995582580566, "learning_rate": 8.66983636135903e-06, "loss": 0.05489918, "memory(GiB)": 15.03, "step": 7805, "train_speed(iter/s)": 1.475764 }, { "acc": 0.99017029, "epoch": 13.78640776699029, "grad_norm": 2.542320489883423, "learning_rate": 8.667851730123245e-06, "loss": 0.07860731, "memory(GiB)": 15.03, "step": 7810, "train_speed(iter/s)": 1.475795 }, { "acc": 0.99087048, "epoch": 13.79523389232127, "grad_norm": 3.6205027103424072, "learning_rate": 8.665865846980008e-06, "loss": 0.06235188, "memory(GiB)": 15.03, "step": 7815, "train_speed(iter/s)": 1.475765 }, { "acc": 0.99225044, "epoch": 13.804060017652251, "grad_norm": 2.262861967086792, "learning_rate": 8.663878712607231e-06, "loss": 0.06109743, "memory(GiB)": 15.03, "step": 7820, "train_speed(iter/s)": 1.47575 }, { "acc": 0.99365559, "epoch": 13.812886142983231, "grad_norm": 2.3306972980499268, "learning_rate": 8.661890327683253e-06, "loss": 0.04568786, "memory(GiB)": 15.03, "step": 7825, "train_speed(iter/s)": 1.475751 }, { "acc": 0.98936644, "epoch": 13.82171226831421, "grad_norm": 5.172802448272705, "learning_rate": 8.659900692886842e-06, "loss": 0.0763512, "memory(GiB)": 15.03, "step": 7830, "train_speed(iter/s)": 1.47578 }, { "acc": 0.99156885, "epoch": 13.83053839364519, "grad_norm": 3.79459285736084, "learning_rate": 8.657909808897186e-06, "loss": 0.06649145, "memory(GiB)": 15.03, "step": 7835, "train_speed(iter/s)": 1.4758 }, { "acc": 0.9909646, "epoch": 13.83936451897617, "grad_norm": 4.358303070068359, "learning_rate": 8.65591767639391e-06, "loss": 0.06062957, "memory(GiB)": 15.03, "step": 7840, "train_speed(iter/s)": 1.475725 }, { "acc": 0.98937454, "epoch": 13.848190644307149, "grad_norm": 3.4765520095825195, "learning_rate": 8.653924296057056e-06, "loss": 0.07803037, "memory(GiB)": 15.03, "step": 7845, "train_speed(iter/s)": 1.475623 }, { "acc": 0.99285355, "epoch": 13.857016769638129, "grad_norm": 3.0600032806396484, "learning_rate": 8.651929668567097e-06, "loss": 0.05441375, "memory(GiB)": 15.03, "step": 7850, "train_speed(iter/s)": 1.47565 }, { "acc": 0.9927927, "epoch": 13.86584289496911, "grad_norm": 1.266940712928772, "learning_rate": 8.649933794604925e-06, "loss": 0.05393355, "memory(GiB)": 15.03, "step": 7855, "train_speed(iter/s)": 1.475693 }, { "acc": 0.99283314, "epoch": 13.874669020300088, "grad_norm": 4.747350692749023, "learning_rate": 8.64793667485187e-06, "loss": 0.06088049, "memory(GiB)": 15.03, "step": 7860, "train_speed(iter/s)": 1.475698 }, { "acc": 0.99228096, "epoch": 13.883495145631068, "grad_norm": 3.685199499130249, "learning_rate": 8.645938309989675e-06, "loss": 0.05866764, "memory(GiB)": 15.03, "step": 7865, "train_speed(iter/s)": 1.475735 }, { "acc": 0.99075069, "epoch": 13.892321270962048, "grad_norm": 2.5586471557617188, "learning_rate": 8.643938700700515e-06, "loss": 0.06962577, "memory(GiB)": 15.03, "step": 7870, "train_speed(iter/s)": 1.475717 }, { "acc": 0.99172802, "epoch": 13.901147396293027, "grad_norm": 3.9305264949798584, "learning_rate": 8.641937847666988e-06, "loss": 0.05684061, "memory(GiB)": 15.03, "step": 7875, "train_speed(iter/s)": 1.47575 }, { "acc": 0.99280357, "epoch": 13.909973521624007, "grad_norm": 3.591352939605713, "learning_rate": 8.639935751572113e-06, "loss": 0.04950166, "memory(GiB)": 15.03, "step": 7880, "train_speed(iter/s)": 1.475772 }, { "acc": 0.99113169, "epoch": 13.918799646954987, "grad_norm": 3.4345505237579346, "learning_rate": 8.637932413099339e-06, "loss": 0.05855219, "memory(GiB)": 15.03, "step": 7885, "train_speed(iter/s)": 1.475773 }, { "acc": 0.99174109, "epoch": 13.927625772285966, "grad_norm": 3.0945372581481934, "learning_rate": 8.635927832932537e-06, "loss": 0.06247183, "memory(GiB)": 15.03, "step": 7890, "train_speed(iter/s)": 1.475803 }, { "acc": 0.99320927, "epoch": 13.936451897616946, "grad_norm": 3.0949323177337646, "learning_rate": 8.633922011756001e-06, "loss": 0.04905765, "memory(GiB)": 15.03, "step": 7895, "train_speed(iter/s)": 1.475883 }, { "acc": 0.99280586, "epoch": 13.945278022947926, "grad_norm": 1.8252031803131104, "learning_rate": 8.63191495025445e-06, "loss": 0.0492154, "memory(GiB)": 15.03, "step": 7900, "train_speed(iter/s)": 1.47587 }, { "acc": 0.99173069, "epoch": 13.954104148278905, "grad_norm": 3.217693328857422, "learning_rate": 8.629906649113023e-06, "loss": 0.06370486, "memory(GiB)": 15.03, "step": 7905, "train_speed(iter/s)": 1.475901 }, { "acc": 0.99386015, "epoch": 13.962930273609885, "grad_norm": 4.429325580596924, "learning_rate": 8.627897109017288e-06, "loss": 0.04784957, "memory(GiB)": 15.03, "step": 7910, "train_speed(iter/s)": 1.475891 }, { "acc": 0.99156046, "epoch": 13.971756398940865, "grad_norm": 5.129411697387695, "learning_rate": 8.625886330653231e-06, "loss": 0.06518086, "memory(GiB)": 15.03, "step": 7915, "train_speed(iter/s)": 1.475918 }, { "acc": 0.99118938, "epoch": 13.980582524271846, "grad_norm": 5.320976257324219, "learning_rate": 8.623874314707263e-06, "loss": 0.05981255, "memory(GiB)": 15.03, "step": 7920, "train_speed(iter/s)": 1.475922 }, { "acc": 0.99424229, "epoch": 13.989408649602824, "grad_norm": 4.13339376449585, "learning_rate": 8.621861061866217e-06, "loss": 0.04309626, "memory(GiB)": 15.03, "step": 7925, "train_speed(iter/s)": 1.475926 }, { "acc": 0.99171791, "epoch": 13.998234774933804, "grad_norm": 3.733078718185425, "learning_rate": 8.619846572817348e-06, "loss": 0.05157564, "memory(GiB)": 15.03, "step": 7930, "train_speed(iter/s)": 1.47594 }, { "acc": 0.99234467, "epoch": 14.007060900264785, "grad_norm": 2.6910178661346436, "learning_rate": 8.61783084824833e-06, "loss": 0.04849579, "memory(GiB)": 15.03, "step": 7935, "train_speed(iter/s)": 1.475894 }, { "acc": 0.99045649, "epoch": 14.015887025595763, "grad_norm": 2.9162588119506836, "learning_rate": 8.615813888847268e-06, "loss": 0.05482354, "memory(GiB)": 15.03, "step": 7940, "train_speed(iter/s)": 1.475919 }, { "acc": 0.99652395, "epoch": 14.024713150926743, "grad_norm": 4.010524749755859, "learning_rate": 8.613795695302677e-06, "loss": 0.02220239, "memory(GiB)": 15.03, "step": 7945, "train_speed(iter/s)": 1.475947 }, { "acc": 0.99195194, "epoch": 14.033539276257724, "grad_norm": 3.186551809310913, "learning_rate": 8.611776268303502e-06, "loss": 0.05677466, "memory(GiB)": 15.03, "step": 7950, "train_speed(iter/s)": 1.475961 }, { "acc": 0.99021416, "epoch": 14.042365401588702, "grad_norm": 2.8467884063720703, "learning_rate": 8.609755608539103e-06, "loss": 0.06874521, "memory(GiB)": 15.03, "step": 7955, "train_speed(iter/s)": 1.475904 }, { "acc": 0.99231529, "epoch": 14.051191526919682, "grad_norm": 3.058255434036255, "learning_rate": 8.607733716699265e-06, "loss": 0.05006171, "memory(GiB)": 15.03, "step": 7960, "train_speed(iter/s)": 1.475905 }, { "acc": 0.98997803, "epoch": 14.060017652250663, "grad_norm": 3.875189781188965, "learning_rate": 8.605710593474196e-06, "loss": 0.06988183, "memory(GiB)": 15.03, "step": 7965, "train_speed(iter/s)": 1.475887 }, { "acc": 0.99273005, "epoch": 14.068843777581641, "grad_norm": 2.499953508377075, "learning_rate": 8.603686239554514e-06, "loss": 0.04240447, "memory(GiB)": 15.03, "step": 7970, "train_speed(iter/s)": 1.475929 }, { "acc": 0.99156647, "epoch": 14.077669902912621, "grad_norm": 3.9108729362487793, "learning_rate": 8.601660655631267e-06, "loss": 0.05884684, "memory(GiB)": 15.03, "step": 7975, "train_speed(iter/s)": 1.475949 }, { "acc": 0.9876914, "epoch": 14.086496028243602, "grad_norm": 9.755010604858398, "learning_rate": 8.59963384239592e-06, "loss": 0.09084948, "memory(GiB)": 15.03, "step": 7980, "train_speed(iter/s)": 1.475911 }, { "acc": 0.99115009, "epoch": 14.09532215357458, "grad_norm": 3.504723310470581, "learning_rate": 8.597605800540355e-06, "loss": 0.05887205, "memory(GiB)": 15.03, "step": 7985, "train_speed(iter/s)": 1.475925 }, { "acc": 0.9928278, "epoch": 14.10414827890556, "grad_norm": 2.0201995372772217, "learning_rate": 8.595576530756876e-06, "loss": 0.05507187, "memory(GiB)": 15.03, "step": 7990, "train_speed(iter/s)": 1.475934 }, { "acc": 0.99054108, "epoch": 14.11297440423654, "grad_norm": 4.107114791870117, "learning_rate": 8.593546033738207e-06, "loss": 0.05927454, "memory(GiB)": 15.03, "step": 7995, "train_speed(iter/s)": 1.475936 }, { "acc": 0.99123383, "epoch": 14.121800529567519, "grad_norm": 3.6136229038238525, "learning_rate": 8.591514310177491e-06, "loss": 0.06986181, "memory(GiB)": 15.03, "step": 8000, "train_speed(iter/s)": 1.475971 }, { "acc": 0.99159021, "epoch": 14.1306266548985, "grad_norm": 2.3975398540496826, "learning_rate": 8.589481360768284e-06, "loss": 0.05061299, "memory(GiB)": 15.03, "step": 8005, "train_speed(iter/s)": 1.47597 }, { "acc": 0.98914652, "epoch": 14.13945278022948, "grad_norm": 2.344257116317749, "learning_rate": 8.58744718620457e-06, "loss": 0.07193484, "memory(GiB)": 15.03, "step": 8010, "train_speed(iter/s)": 1.47597 }, { "acc": 0.99143429, "epoch": 14.148278905560458, "grad_norm": 4.001764297485352, "learning_rate": 8.585411787180741e-06, "loss": 0.06682891, "memory(GiB)": 15.03, "step": 8015, "train_speed(iter/s)": 1.476013 }, { "acc": 0.99468803, "epoch": 14.157105030891438, "grad_norm": 1.6333472728729248, "learning_rate": 8.583375164391617e-06, "loss": 0.04001825, "memory(GiB)": 15.03, "step": 8020, "train_speed(iter/s)": 1.475987 }, { "acc": 0.9924695, "epoch": 14.165931156222419, "grad_norm": 1.46846604347229, "learning_rate": 8.581337318532425e-06, "loss": 0.0412921, "memory(GiB)": 15.03, "step": 8025, "train_speed(iter/s)": 1.475983 }, { "acc": 0.99383869, "epoch": 14.174757281553399, "grad_norm": 4.461544513702393, "learning_rate": 8.579298250298823e-06, "loss": 0.04911553, "memory(GiB)": 15.03, "step": 8030, "train_speed(iter/s)": 1.475974 }, { "acc": 0.98880987, "epoch": 14.183583406884377, "grad_norm": 2.9569952487945557, "learning_rate": 8.577257960386871e-06, "loss": 0.07333411, "memory(GiB)": 15.03, "step": 8035, "train_speed(iter/s)": 1.47594 }, { "acc": 0.99138832, "epoch": 14.192409532215358, "grad_norm": 2.5602102279663086, "learning_rate": 8.575216449493057e-06, "loss": 0.06764455, "memory(GiB)": 15.03, "step": 8040, "train_speed(iter/s)": 1.475878 }, { "acc": 0.99208622, "epoch": 14.201235657546338, "grad_norm": 4.907078266143799, "learning_rate": 8.573173718314282e-06, "loss": 0.07162688, "memory(GiB)": 15.03, "step": 8045, "train_speed(iter/s)": 1.475875 }, { "acc": 0.99077682, "epoch": 14.210061782877316, "grad_norm": 3.6450607776641846, "learning_rate": 8.571129767547868e-06, "loss": 0.06852701, "memory(GiB)": 15.03, "step": 8050, "train_speed(iter/s)": 1.475878 }, { "acc": 0.99068089, "epoch": 14.218887908208297, "grad_norm": 6.060046672821045, "learning_rate": 8.569084597891543e-06, "loss": 0.05966155, "memory(GiB)": 15.03, "step": 8055, "train_speed(iter/s)": 1.475848 }, { "acc": 0.98894005, "epoch": 14.227714033539277, "grad_norm": 3.8790104389190674, "learning_rate": 8.56703821004346e-06, "loss": 0.07093288, "memory(GiB)": 15.03, "step": 8060, "train_speed(iter/s)": 1.475798 }, { "acc": 0.99330616, "epoch": 14.236540158870255, "grad_norm": 2.2658729553222656, "learning_rate": 8.564990604702187e-06, "loss": 0.04629785, "memory(GiB)": 15.03, "step": 8065, "train_speed(iter/s)": 1.475802 }, { "acc": 0.99396, "epoch": 14.245366284201236, "grad_norm": 3.418266773223877, "learning_rate": 8.562941782566702e-06, "loss": 0.04129475, "memory(GiB)": 15.03, "step": 8070, "train_speed(iter/s)": 1.475786 }, { "acc": 0.99571581, "epoch": 14.254192409532216, "grad_norm": 1.5930122137069702, "learning_rate": 8.560891744336406e-06, "loss": 0.03987826, "memory(GiB)": 15.03, "step": 8075, "train_speed(iter/s)": 1.475779 }, { "acc": 0.9911272, "epoch": 14.263018534863194, "grad_norm": 3.5372915267944336, "learning_rate": 8.558840490711109e-06, "loss": 0.06942055, "memory(GiB)": 15.03, "step": 8080, "train_speed(iter/s)": 1.475792 }, { "acc": 0.99345818, "epoch": 14.271844660194175, "grad_norm": 4.9714741706848145, "learning_rate": 8.55678802239104e-06, "loss": 0.04390925, "memory(GiB)": 15.03, "step": 8085, "train_speed(iter/s)": 1.475791 }, { "acc": 0.99450397, "epoch": 14.280670785525155, "grad_norm": 4.367522239685059, "learning_rate": 8.55473434007684e-06, "loss": 0.04100883, "memory(GiB)": 15.03, "step": 8090, "train_speed(iter/s)": 1.475865 }, { "acc": 0.99039316, "epoch": 14.289496910856133, "grad_norm": 2.0774593353271484, "learning_rate": 8.552679444469564e-06, "loss": 0.07547561, "memory(GiB)": 15.03, "step": 8095, "train_speed(iter/s)": 1.475894 }, { "acc": 0.99162235, "epoch": 14.298323036187114, "grad_norm": 4.873344898223877, "learning_rate": 8.550623336270684e-06, "loss": 0.05509191, "memory(GiB)": 15.03, "step": 8100, "train_speed(iter/s)": 1.475922 }, { "acc": 0.98999748, "epoch": 14.307149161518094, "grad_norm": 5.22462797164917, "learning_rate": 8.548566016182087e-06, "loss": 0.08034573, "memory(GiB)": 15.03, "step": 8105, "train_speed(iter/s)": 1.475859 }, { "acc": 0.9898222, "epoch": 14.315975286849074, "grad_norm": 2.1093218326568604, "learning_rate": 8.546507484906065e-06, "loss": 0.07317927, "memory(GiB)": 15.03, "step": 8110, "train_speed(iter/s)": 1.475861 }, { "acc": 0.98998508, "epoch": 14.324801412180053, "grad_norm": 2.541908025741577, "learning_rate": 8.544447743145336e-06, "loss": 0.06530702, "memory(GiB)": 15.03, "step": 8115, "train_speed(iter/s)": 1.475881 }, { "acc": 0.99031067, "epoch": 14.333627537511033, "grad_norm": 2.884276866912842, "learning_rate": 8.54238679160302e-06, "loss": 0.08013096, "memory(GiB)": 15.03, "step": 8120, "train_speed(iter/s)": 1.475864 }, { "acc": 0.98879728, "epoch": 14.342453662842013, "grad_norm": 3.484849214553833, "learning_rate": 8.54032463098266e-06, "loss": 0.06365883, "memory(GiB)": 15.03, "step": 8125, "train_speed(iter/s)": 1.475899 }, { "acc": 0.99010925, "epoch": 14.351279788172992, "grad_norm": 4.826974868774414, "learning_rate": 8.5382612619882e-06, "loss": 0.07099137, "memory(GiB)": 15.03, "step": 8130, "train_speed(iter/s)": 1.475918 }, { "acc": 0.9945158, "epoch": 14.360105913503972, "grad_norm": 2.0569252967834473, "learning_rate": 8.536196685324006e-06, "loss": 0.04077922, "memory(GiB)": 15.03, "step": 8135, "train_speed(iter/s)": 1.47599 }, { "acc": 0.99133186, "epoch": 14.368932038834952, "grad_norm": 5.546054363250732, "learning_rate": 8.534130901694857e-06, "loss": 0.04761035, "memory(GiB)": 15.03, "step": 8140, "train_speed(iter/s)": 1.475956 }, { "acc": 0.99026871, "epoch": 14.37775816416593, "grad_norm": 2.4609029293060303, "learning_rate": 8.532063911805934e-06, "loss": 0.06234386, "memory(GiB)": 15.03, "step": 8145, "train_speed(iter/s)": 1.475945 }, { "acc": 0.9884716, "epoch": 14.386584289496911, "grad_norm": 3.4813055992126465, "learning_rate": 8.52999571636284e-06, "loss": 0.08764692, "memory(GiB)": 15.03, "step": 8150, "train_speed(iter/s)": 1.475964 }, { "acc": 0.99496174, "epoch": 14.395410414827891, "grad_norm": 3.451038122177124, "learning_rate": 8.527926316071586e-06, "loss": 0.03814399, "memory(GiB)": 15.03, "step": 8155, "train_speed(iter/s)": 1.475963 }, { "acc": 0.99174347, "epoch": 14.40423654015887, "grad_norm": 4.364584445953369, "learning_rate": 8.52585571163859e-06, "loss": 0.06791204, "memory(GiB)": 15.03, "step": 8160, "train_speed(iter/s)": 1.47596 }, { "acc": 0.99309692, "epoch": 14.41306266548985, "grad_norm": 3.126430034637451, "learning_rate": 8.52378390377069e-06, "loss": 0.05730971, "memory(GiB)": 15.03, "step": 8165, "train_speed(iter/s)": 1.476012 }, { "acc": 0.99063873, "epoch": 14.42188879082083, "grad_norm": 4.130179405212402, "learning_rate": 8.521710893175127e-06, "loss": 0.05871237, "memory(GiB)": 15.03, "step": 8170, "train_speed(iter/s)": 1.475979 }, { "acc": 0.99159784, "epoch": 14.430714916151809, "grad_norm": 5.49724817276001, "learning_rate": 8.519636680559554e-06, "loss": 0.05293725, "memory(GiB)": 15.03, "step": 8175, "train_speed(iter/s)": 1.475952 }, { "acc": 0.98956394, "epoch": 14.439541041482789, "grad_norm": 2.7892050743103027, "learning_rate": 8.51756126663204e-06, "loss": 0.05736456, "memory(GiB)": 15.03, "step": 8180, "train_speed(iter/s)": 1.475955 }, { "acc": 0.99073048, "epoch": 14.44836716681377, "grad_norm": 3.8520078659057617, "learning_rate": 8.515484652101055e-06, "loss": 0.06640058, "memory(GiB)": 15.03, "step": 8185, "train_speed(iter/s)": 1.47598 }, { "acc": 0.98799782, "epoch": 14.457193292144748, "grad_norm": 4.680355072021484, "learning_rate": 8.513406837675487e-06, "loss": 0.08052497, "memory(GiB)": 15.03, "step": 8190, "train_speed(iter/s)": 1.476002 }, { "acc": 0.99326744, "epoch": 14.466019417475728, "grad_norm": 2.713458776473999, "learning_rate": 8.511327824064627e-06, "loss": 0.03828588, "memory(GiB)": 15.03, "step": 8195, "train_speed(iter/s)": 1.47601 }, { "acc": 0.99076786, "epoch": 14.474845542806708, "grad_norm": 5.676911354064941, "learning_rate": 8.509247611978184e-06, "loss": 0.07543242, "memory(GiB)": 15.03, "step": 8200, "train_speed(iter/s)": 1.476063 }, { "acc": 0.99285345, "epoch": 14.483671668137688, "grad_norm": 2.4282138347625732, "learning_rate": 8.507166202126265e-06, "loss": 0.05872192, "memory(GiB)": 15.03, "step": 8205, "train_speed(iter/s)": 1.476086 }, { "acc": 0.99208002, "epoch": 14.492497793468667, "grad_norm": 2.2314531803131104, "learning_rate": 8.505083595219393e-06, "loss": 0.05569652, "memory(GiB)": 15.03, "step": 8210, "train_speed(iter/s)": 1.476048 }, { "acc": 0.9925415, "epoch": 14.501323918799647, "grad_norm": 3.816894292831421, "learning_rate": 8.502999791968501e-06, "loss": 0.04406803, "memory(GiB)": 15.03, "step": 8215, "train_speed(iter/s)": 1.476072 }, { "acc": 0.9942585, "epoch": 14.510150044130627, "grad_norm": 3.121980905532837, "learning_rate": 8.500914793084926e-06, "loss": 0.04356001, "memory(GiB)": 15.03, "step": 8220, "train_speed(iter/s)": 1.476068 }, { "acc": 0.99046783, "epoch": 14.518976169461606, "grad_norm": 2.751131057739258, "learning_rate": 8.498828599280412e-06, "loss": 0.0818033, "memory(GiB)": 15.03, "step": 8225, "train_speed(iter/s)": 1.476048 }, { "acc": 0.99175205, "epoch": 14.527802294792586, "grad_norm": 1.456152319908142, "learning_rate": 8.496741211267117e-06, "loss": 0.05787106, "memory(GiB)": 15.03, "step": 8230, "train_speed(iter/s)": 1.476081 }, { "acc": 0.99061232, "epoch": 14.536628420123566, "grad_norm": 3.727294445037842, "learning_rate": 8.494652629757603e-06, "loss": 0.06440579, "memory(GiB)": 15.03, "step": 8235, "train_speed(iter/s)": 1.47608 }, { "acc": 0.98920288, "epoch": 14.545454545454545, "grad_norm": 3.720560073852539, "learning_rate": 8.492562855464838e-06, "loss": 0.06248739, "memory(GiB)": 15.03, "step": 8240, "train_speed(iter/s)": 1.476085 }, { "acc": 0.99556112, "epoch": 14.554280670785525, "grad_norm": 3.559694528579712, "learning_rate": 8.4904718891022e-06, "loss": 0.02942696, "memory(GiB)": 15.03, "step": 8245, "train_speed(iter/s)": 1.476095 }, { "acc": 0.99536991, "epoch": 14.563106796116505, "grad_norm": 2.6375389099121094, "learning_rate": 8.488379731383474e-06, "loss": 0.04061737, "memory(GiB)": 15.03, "step": 8250, "train_speed(iter/s)": 1.476131 }, { "acc": 0.99341125, "epoch": 14.571932921447484, "grad_norm": 3.618091106414795, "learning_rate": 8.48628638302285e-06, "loss": 0.05044608, "memory(GiB)": 15.03, "step": 8255, "train_speed(iter/s)": 1.476133 }, { "acc": 0.99213095, "epoch": 14.580759046778464, "grad_norm": 2.4394783973693848, "learning_rate": 8.484191844734923e-06, "loss": 0.06174729, "memory(GiB)": 15.03, "step": 8260, "train_speed(iter/s)": 1.476114 }, { "acc": 0.99386749, "epoch": 14.589585172109444, "grad_norm": 2.9239702224731445, "learning_rate": 8.482096117234697e-06, "loss": 0.0430848, "memory(GiB)": 15.03, "step": 8265, "train_speed(iter/s)": 1.476104 }, { "acc": 0.990907, "epoch": 14.598411297440423, "grad_norm": 3.457209587097168, "learning_rate": 8.47999920123758e-06, "loss": 0.06047316, "memory(GiB)": 15.03, "step": 8270, "train_speed(iter/s)": 1.476116 }, { "acc": 0.99436378, "epoch": 14.607237422771403, "grad_norm": 3.097519874572754, "learning_rate": 8.47790109745939e-06, "loss": 0.04111937, "memory(GiB)": 15.03, "step": 8275, "train_speed(iter/s)": 1.476154 }, { "acc": 0.9953495, "epoch": 14.616063548102384, "grad_norm": 1.6535524129867554, "learning_rate": 8.475801806616347e-06, "loss": 0.03243524, "memory(GiB)": 15.03, "step": 8280, "train_speed(iter/s)": 1.476172 }, { "acc": 0.9927042, "epoch": 14.624889673433362, "grad_norm": 1.906010627746582, "learning_rate": 8.473701329425075e-06, "loss": 0.04775805, "memory(GiB)": 15.03, "step": 8285, "train_speed(iter/s)": 1.476166 }, { "acc": 0.99062176, "epoch": 14.633715798764342, "grad_norm": 5.196842670440674, "learning_rate": 8.471599666602603e-06, "loss": 0.04388377, "memory(GiB)": 15.03, "step": 8290, "train_speed(iter/s)": 1.476173 }, { "acc": 0.99024763, "epoch": 14.642541924095323, "grad_norm": 2.795555830001831, "learning_rate": 8.469496818866369e-06, "loss": 0.06944999, "memory(GiB)": 15.03, "step": 8295, "train_speed(iter/s)": 1.476203 }, { "acc": 0.9926424, "epoch": 14.651368049426303, "grad_norm": 4.374702453613281, "learning_rate": 8.467392786934212e-06, "loss": 0.05871496, "memory(GiB)": 15.03, "step": 8300, "train_speed(iter/s)": 1.476212 }, { "acc": 0.99130173, "epoch": 14.660194174757281, "grad_norm": 3.9223952293395996, "learning_rate": 8.465287571524376e-06, "loss": 0.06704448, "memory(GiB)": 15.03, "step": 8305, "train_speed(iter/s)": 1.476188 }, { "acc": 0.99255676, "epoch": 14.669020300088262, "grad_norm": 2.6708569526672363, "learning_rate": 8.463181173355506e-06, "loss": 0.05408727, "memory(GiB)": 15.03, "step": 8310, "train_speed(iter/s)": 1.476225 }, { "acc": 0.99274845, "epoch": 14.677846425419242, "grad_norm": 3.3133628368377686, "learning_rate": 8.461073593146658e-06, "loss": 0.0482885, "memory(GiB)": 15.03, "step": 8315, "train_speed(iter/s)": 1.476234 }, { "acc": 0.99188747, "epoch": 14.68667255075022, "grad_norm": 3.105553150177002, "learning_rate": 8.458964831617286e-06, "loss": 0.05891593, "memory(GiB)": 15.03, "step": 8320, "train_speed(iter/s)": 1.476247 }, { "acc": 0.99374819, "epoch": 14.6954986760812, "grad_norm": 3.8275797367095947, "learning_rate": 8.456854889487248e-06, "loss": 0.04462597, "memory(GiB)": 15.03, "step": 8325, "train_speed(iter/s)": 1.47625 }, { "acc": 0.99450474, "epoch": 14.70432480141218, "grad_norm": 2.7497029304504395, "learning_rate": 8.454743767476806e-06, "loss": 0.04882172, "memory(GiB)": 15.03, "step": 8330, "train_speed(iter/s)": 1.47624 }, { "acc": 0.99434118, "epoch": 14.71315092674316, "grad_norm": 0.6045403480529785, "learning_rate": 8.45263146630662e-06, "loss": 0.04554625, "memory(GiB)": 15.03, "step": 8335, "train_speed(iter/s)": 1.476221 }, { "acc": 0.99009056, "epoch": 14.72197705207414, "grad_norm": 2.6770589351654053, "learning_rate": 8.450517986697767e-06, "loss": 0.07171953, "memory(GiB)": 15.03, "step": 8340, "train_speed(iter/s)": 1.476283 }, { "acc": 0.99353027, "epoch": 14.73080317740512, "grad_norm": 2.425414800643921, "learning_rate": 8.448403329371703e-06, "loss": 0.0501803, "memory(GiB)": 15.03, "step": 8345, "train_speed(iter/s)": 1.476313 }, { "acc": 0.99445667, "epoch": 14.739629302736098, "grad_norm": 1.4052120447158813, "learning_rate": 8.446287495050309e-06, "loss": 0.03729502, "memory(GiB)": 15.03, "step": 8350, "train_speed(iter/s)": 1.476292 }, { "acc": 0.99185715, "epoch": 14.748455428067079, "grad_norm": 6.9082183837890625, "learning_rate": 8.444170484455855e-06, "loss": 0.06029296, "memory(GiB)": 15.03, "step": 8355, "train_speed(iter/s)": 1.47632 }, { "acc": 0.99185581, "epoch": 14.757281553398059, "grad_norm": 4.818746089935303, "learning_rate": 8.442052298311015e-06, "loss": 0.05884875, "memory(GiB)": 15.03, "step": 8360, "train_speed(iter/s)": 1.476343 }, { "acc": 0.98923187, "epoch": 14.766107678729037, "grad_norm": 8.529623031616211, "learning_rate": 8.439932937338866e-06, "loss": 0.07780768, "memory(GiB)": 15.03, "step": 8365, "train_speed(iter/s)": 1.476368 }, { "acc": 0.9899332, "epoch": 14.774933804060018, "grad_norm": 0.9519566297531128, "learning_rate": 8.43781240226288e-06, "loss": 0.07199445, "memory(GiB)": 15.03, "step": 8370, "train_speed(iter/s)": 1.476315 }, { "acc": 0.99397717, "epoch": 14.783759929390998, "grad_norm": 2.513927698135376, "learning_rate": 8.435690693806942e-06, "loss": 0.04652147, "memory(GiB)": 15.03, "step": 8375, "train_speed(iter/s)": 1.476305 }, { "acc": 0.9899662, "epoch": 14.792586054721976, "grad_norm": 3.66697096824646, "learning_rate": 8.433567812695324e-06, "loss": 0.08678654, "memory(GiB)": 15.03, "step": 8380, "train_speed(iter/s)": 1.476317 }, { "acc": 0.99001179, "epoch": 14.801412180052957, "grad_norm": 5.210486888885498, "learning_rate": 8.431443759652708e-06, "loss": 0.07259018, "memory(GiB)": 15.03, "step": 8385, "train_speed(iter/s)": 1.476265 }, { "acc": 0.9909462, "epoch": 14.810238305383937, "grad_norm": 4.381834983825684, "learning_rate": 8.429318535404171e-06, "loss": 0.05336325, "memory(GiB)": 15.03, "step": 8390, "train_speed(iter/s)": 1.476234 }, { "acc": 0.99513741, "epoch": 14.819064430714917, "grad_norm": 1.879662036895752, "learning_rate": 8.42719214067519e-06, "loss": 0.02833835, "memory(GiB)": 15.03, "step": 8395, "train_speed(iter/s)": 1.476262 }, { "acc": 0.99400425, "epoch": 14.827890556045896, "grad_norm": 3.848064661026001, "learning_rate": 8.425064576191647e-06, "loss": 0.04844109, "memory(GiB)": 15.03, "step": 8400, "train_speed(iter/s)": 1.476264 }, { "acc": 0.99307384, "epoch": 14.836716681376876, "grad_norm": 3.1419150829315186, "learning_rate": 8.422935842679814e-06, "loss": 0.03684476, "memory(GiB)": 15.03, "step": 8405, "train_speed(iter/s)": 1.476242 }, { "acc": 0.99368134, "epoch": 14.845542806707854, "grad_norm": 8.78354263305664, "learning_rate": 8.42080594086637e-06, "loss": 0.0561486, "memory(GiB)": 15.03, "step": 8410, "train_speed(iter/s)": 1.47627 }, { "acc": 0.99129515, "epoch": 14.854368932038835, "grad_norm": 3.8913016319274902, "learning_rate": 8.418674871478393e-06, "loss": 0.05715364, "memory(GiB)": 15.03, "step": 8415, "train_speed(iter/s)": 1.476315 }, { "acc": 0.98945637, "epoch": 14.863195057369815, "grad_norm": 8.0780611038208, "learning_rate": 8.416542635243351e-06, "loss": 0.06757483, "memory(GiB)": 15.03, "step": 8420, "train_speed(iter/s)": 1.476377 }, { "acc": 0.99347858, "epoch": 14.872021182700795, "grad_norm": 4.011573791503906, "learning_rate": 8.414409232889119e-06, "loss": 0.04754406, "memory(GiB)": 15.03, "step": 8425, "train_speed(iter/s)": 1.476396 }, { "acc": 0.99327803, "epoch": 14.880847308031774, "grad_norm": 1.7536057233810425, "learning_rate": 8.412274665143965e-06, "loss": 0.05082901, "memory(GiB)": 15.03, "step": 8430, "train_speed(iter/s)": 1.476425 }, { "acc": 0.99307899, "epoch": 14.889673433362754, "grad_norm": 2.4747843742370605, "learning_rate": 8.41013893273656e-06, "loss": 0.05000602, "memory(GiB)": 15.03, "step": 8435, "train_speed(iter/s)": 1.476424 }, { "acc": 0.98985949, "epoch": 14.898499558693734, "grad_norm": 3.4317619800567627, "learning_rate": 8.408002036395968e-06, "loss": 0.07650706, "memory(GiB)": 15.03, "step": 8440, "train_speed(iter/s)": 1.476404 }, { "acc": 0.99059267, "epoch": 14.907325684024713, "grad_norm": 4.094009876251221, "learning_rate": 8.405863976851648e-06, "loss": 0.07397392, "memory(GiB)": 15.03, "step": 8445, "train_speed(iter/s)": 1.476384 }, { "acc": 0.99501228, "epoch": 14.916151809355693, "grad_norm": 5.632967472076416, "learning_rate": 8.403724754833466e-06, "loss": 0.03735835, "memory(GiB)": 15.03, "step": 8450, "train_speed(iter/s)": 1.476438 }, { "acc": 0.99251242, "epoch": 14.924977934686673, "grad_norm": 1.4355206489562988, "learning_rate": 8.401584371071675e-06, "loss": 0.0569592, "memory(GiB)": 15.03, "step": 8455, "train_speed(iter/s)": 1.476446 }, { "acc": 0.99325676, "epoch": 14.933804060017652, "grad_norm": 4.010365962982178, "learning_rate": 8.399442826296931e-06, "loss": 0.04713833, "memory(GiB)": 15.03, "step": 8460, "train_speed(iter/s)": 1.476456 }, { "acc": 0.99239016, "epoch": 14.942630185348632, "grad_norm": 1.9691082239151, "learning_rate": 8.39730012124028e-06, "loss": 0.04189223, "memory(GiB)": 15.03, "step": 8465, "train_speed(iter/s)": 1.476494 }, { "acc": 0.9903389, "epoch": 14.951456310679612, "grad_norm": 2.9740259647369385, "learning_rate": 8.395156256633168e-06, "loss": 0.05167012, "memory(GiB)": 15.03, "step": 8470, "train_speed(iter/s)": 1.476565 }, { "acc": 0.9958765, "epoch": 14.96028243601059, "grad_norm": 1.3427637815475464, "learning_rate": 8.393011233207441e-06, "loss": 0.03436767, "memory(GiB)": 15.03, "step": 8475, "train_speed(iter/s)": 1.476594 }, { "acc": 0.9922224, "epoch": 14.96910856134157, "grad_norm": 4.21502685546875, "learning_rate": 8.390865051695331e-06, "loss": 0.04357839, "memory(GiB)": 15.03, "step": 8480, "train_speed(iter/s)": 1.476614 }, { "acc": 0.99306173, "epoch": 14.977934686672551, "grad_norm": 2.0272305011749268, "learning_rate": 8.388717712829472e-06, "loss": 0.06371667, "memory(GiB)": 15.03, "step": 8485, "train_speed(iter/s)": 1.476627 }, { "acc": 0.99215698, "epoch": 14.986760812003531, "grad_norm": 4.36797571182251, "learning_rate": 8.386569217342893e-06, "loss": 0.05421722, "memory(GiB)": 15.03, "step": 8490, "train_speed(iter/s)": 1.476689 }, { "acc": 0.99380007, "epoch": 14.99558693733451, "grad_norm": 5.020602703094482, "learning_rate": 8.384419565969015e-06, "loss": 0.04644648, "memory(GiB)": 15.03, "step": 8495, "train_speed(iter/s)": 1.476716 }, { "acc": 0.99123917, "epoch": 15.00441306266549, "grad_norm": 1.776963710784912, "learning_rate": 8.382268759441655e-06, "loss": 0.05265189, "memory(GiB)": 15.03, "step": 8500, "train_speed(iter/s)": 1.476642 }, { "acc": 0.9950285, "epoch": 15.01323918799647, "grad_norm": 1.559414267539978, "learning_rate": 8.380116798495022e-06, "loss": 0.0356554, "memory(GiB)": 15.03, "step": 8505, "train_speed(iter/s)": 1.476622 }, { "acc": 0.99331217, "epoch": 15.022065313327449, "grad_norm": 3.492704391479492, "learning_rate": 8.377963683863724e-06, "loss": 0.04245189, "memory(GiB)": 15.03, "step": 8510, "train_speed(iter/s)": 1.47664 }, { "acc": 0.99043417, "epoch": 15.03089143865843, "grad_norm": 3.592212200164795, "learning_rate": 8.37580941628276e-06, "loss": 0.05872016, "memory(GiB)": 15.03, "step": 8515, "train_speed(iter/s)": 1.476656 }, { "acc": 0.98803101, "epoch": 15.03971756398941, "grad_norm": 2.0624887943267822, "learning_rate": 8.37365399648752e-06, "loss": 0.1004621, "memory(GiB)": 15.03, "step": 8520, "train_speed(iter/s)": 1.476642 }, { "acc": 0.99335098, "epoch": 15.048543689320388, "grad_norm": 1.9309227466583252, "learning_rate": 8.371497425213795e-06, "loss": 0.04927807, "memory(GiB)": 15.03, "step": 8525, "train_speed(iter/s)": 1.476663 }, { "acc": 0.98981361, "epoch": 15.057369814651368, "grad_norm": 4.460616111755371, "learning_rate": 8.36933970319776e-06, "loss": 0.06547876, "memory(GiB)": 15.03, "step": 8530, "train_speed(iter/s)": 1.476626 }, { "acc": 0.99375668, "epoch": 15.066195939982348, "grad_norm": 1.3319615125656128, "learning_rate": 8.367180831175988e-06, "loss": 0.05114837, "memory(GiB)": 15.03, "step": 8535, "train_speed(iter/s)": 1.476635 }, { "acc": 0.99001389, "epoch": 15.075022065313327, "grad_norm": 4.480347633361816, "learning_rate": 8.365020809885443e-06, "loss": 0.07318594, "memory(GiB)": 15.03, "step": 8540, "train_speed(iter/s)": 1.476657 }, { "acc": 0.99090385, "epoch": 15.083848190644307, "grad_norm": 5.1440534591674805, "learning_rate": 8.362859640063484e-06, "loss": 0.07010166, "memory(GiB)": 15.03, "step": 8545, "train_speed(iter/s)": 1.47665 }, { "acc": 0.9925458, "epoch": 15.092674315975287, "grad_norm": 3.199151039123535, "learning_rate": 8.360697322447858e-06, "loss": 0.06378129, "memory(GiB)": 15.03, "step": 8550, "train_speed(iter/s)": 1.476723 }, { "acc": 0.9937211, "epoch": 15.101500441306266, "grad_norm": 2.071348190307617, "learning_rate": 8.358533857776705e-06, "loss": 0.0480057, "memory(GiB)": 15.03, "step": 8555, "train_speed(iter/s)": 1.476729 }, { "acc": 0.99021301, "epoch": 15.110326566637246, "grad_norm": 3.332272529602051, "learning_rate": 8.356369246788559e-06, "loss": 0.0684599, "memory(GiB)": 15.03, "step": 8560, "train_speed(iter/s)": 1.476697 }, { "acc": 0.99350491, "epoch": 15.119152691968226, "grad_norm": 3.0628457069396973, "learning_rate": 8.354203490222343e-06, "loss": 0.05088018, "memory(GiB)": 15.03, "step": 8565, "train_speed(iter/s)": 1.476672 }, { "acc": 0.9938343, "epoch": 15.127978817299205, "grad_norm": 4.439654350280762, "learning_rate": 8.352036588817372e-06, "loss": 0.03219316, "memory(GiB)": 15.03, "step": 8570, "train_speed(iter/s)": 1.476721 }, { "acc": 0.99369221, "epoch": 15.136804942630185, "grad_norm": 2.0388007164001465, "learning_rate": 8.349868543313348e-06, "loss": 0.05041912, "memory(GiB)": 15.03, "step": 8575, "train_speed(iter/s)": 1.476719 }, { "acc": 0.99335918, "epoch": 15.145631067961165, "grad_norm": 3.94844651222229, "learning_rate": 8.347699354450373e-06, "loss": 0.04419053, "memory(GiB)": 15.03, "step": 8580, "train_speed(iter/s)": 1.476719 }, { "acc": 0.99344196, "epoch": 15.154457193292144, "grad_norm": 5.1555304527282715, "learning_rate": 8.345529022968927e-06, "loss": 0.04936058, "memory(GiB)": 15.03, "step": 8585, "train_speed(iter/s)": 1.476722 }, { "acc": 0.99248552, "epoch": 15.163283318623124, "grad_norm": 3.1813013553619385, "learning_rate": 8.343357549609892e-06, "loss": 0.0575506, "memory(GiB)": 15.03, "step": 8590, "train_speed(iter/s)": 1.476732 }, { "acc": 0.99171104, "epoch": 15.172109443954104, "grad_norm": 5.189530372619629, "learning_rate": 8.341184935114532e-06, "loss": 0.04881449, "memory(GiB)": 15.03, "step": 8595, "train_speed(iter/s)": 1.476727 }, { "acc": 0.99385157, "epoch": 15.180935569285085, "grad_norm": 2.414520502090454, "learning_rate": 8.3390111802245e-06, "loss": 0.03963299, "memory(GiB)": 15.03, "step": 8600, "train_speed(iter/s)": 1.476697 }, { "acc": 0.99420071, "epoch": 15.189761694616063, "grad_norm": 2.6041557788848877, "learning_rate": 8.336836285681844e-06, "loss": 0.04304667, "memory(GiB)": 15.03, "step": 8605, "train_speed(iter/s)": 1.476688 }, { "acc": 0.99609709, "epoch": 15.198587819947043, "grad_norm": 2.0064985752105713, "learning_rate": 8.334660252229e-06, "loss": 0.03201409, "memory(GiB)": 15.03, "step": 8610, "train_speed(iter/s)": 1.47668 }, { "acc": 0.99549122, "epoch": 15.207413945278024, "grad_norm": 3.3300065994262695, "learning_rate": 8.332483080608785e-06, "loss": 0.04551703, "memory(GiB)": 15.03, "step": 8615, "train_speed(iter/s)": 1.476675 }, { "acc": 0.99201851, "epoch": 15.216240070609002, "grad_norm": 3.3372578620910645, "learning_rate": 8.330304771564415e-06, "loss": 0.06862786, "memory(GiB)": 15.03, "step": 8620, "train_speed(iter/s)": 1.4767 }, { "acc": 0.99358244, "epoch": 15.225066195939982, "grad_norm": 4.293848037719727, "learning_rate": 8.32812532583949e-06, "loss": 0.04104547, "memory(GiB)": 15.03, "step": 8625, "train_speed(iter/s)": 1.476629 }, { "acc": 0.994349, "epoch": 15.233892321270963, "grad_norm": 5.635739326477051, "learning_rate": 8.325944744177992e-06, "loss": 0.0436719, "memory(GiB)": 15.03, "step": 8630, "train_speed(iter/s)": 1.476642 }, { "acc": 0.99479294, "epoch": 15.242718446601941, "grad_norm": 2.5159358978271484, "learning_rate": 8.3237630273243e-06, "loss": 0.04494417, "memory(GiB)": 15.03, "step": 8635, "train_speed(iter/s)": 1.476644 }, { "acc": 0.98952293, "epoch": 15.251544571932921, "grad_norm": 3.9601786136627197, "learning_rate": 8.32158017602318e-06, "loss": 0.07516306, "memory(GiB)": 15.03, "step": 8640, "train_speed(iter/s)": 1.47665 }, { "acc": 0.99151077, "epoch": 15.260370697263902, "grad_norm": 7.093146324157715, "learning_rate": 8.319396191019778e-06, "loss": 0.05286225, "memory(GiB)": 15.03, "step": 8645, "train_speed(iter/s)": 1.476685 }, { "acc": 0.99123688, "epoch": 15.26919682259488, "grad_norm": 4.035953044891357, "learning_rate": 8.317211073059632e-06, "loss": 0.05209764, "memory(GiB)": 15.03, "step": 8650, "train_speed(iter/s)": 1.476699 }, { "acc": 0.99264336, "epoch": 15.27802294792586, "grad_norm": 3.3423736095428467, "learning_rate": 8.315024822888666e-06, "loss": 0.05650687, "memory(GiB)": 15.03, "step": 8655, "train_speed(iter/s)": 1.476762 }, { "acc": 0.99525576, "epoch": 15.28684907325684, "grad_norm": 2.6742377281188965, "learning_rate": 8.312837441253192e-06, "loss": 0.03632032, "memory(GiB)": 15.03, "step": 8660, "train_speed(iter/s)": 1.476763 }, { "acc": 0.9907546, "epoch": 15.29567519858782, "grad_norm": 1.9829462766647339, "learning_rate": 8.310648928899904e-06, "loss": 0.0668256, "memory(GiB)": 15.03, "step": 8665, "train_speed(iter/s)": 1.476795 }, { "acc": 0.99185448, "epoch": 15.3045013239188, "grad_norm": 4.663470268249512, "learning_rate": 8.308459286575886e-06, "loss": 0.05830699, "memory(GiB)": 15.03, "step": 8670, "train_speed(iter/s)": 1.47683 }, { "acc": 0.99373608, "epoch": 15.31332744924978, "grad_norm": 3.2100026607513428, "learning_rate": 8.306268515028608e-06, "loss": 0.04417355, "memory(GiB)": 15.03, "step": 8675, "train_speed(iter/s)": 1.476857 }, { "acc": 0.9901392, "epoch": 15.322153574580758, "grad_norm": 3.9460408687591553, "learning_rate": 8.304076615005922e-06, "loss": 0.05312408, "memory(GiB)": 15.03, "step": 8680, "train_speed(iter/s)": 1.476844 }, { "acc": 0.98701735, "epoch": 15.330979699911738, "grad_norm": 6.617732048034668, "learning_rate": 8.301883587256068e-06, "loss": 0.08042678, "memory(GiB)": 15.03, "step": 8685, "train_speed(iter/s)": 1.476828 }, { "acc": 0.99305363, "epoch": 15.339805825242719, "grad_norm": 1.9857419729232788, "learning_rate": 8.299689432527669e-06, "loss": 0.0435034, "memory(GiB)": 15.03, "step": 8690, "train_speed(iter/s)": 1.476814 }, { "acc": 0.99003124, "epoch": 15.348631950573699, "grad_norm": 1.9675695896148682, "learning_rate": 8.297494151569734e-06, "loss": 0.0666278, "memory(GiB)": 15.03, "step": 8695, "train_speed(iter/s)": 1.476777 }, { "acc": 0.99195805, "epoch": 15.357458075904677, "grad_norm": 2.7079946994781494, "learning_rate": 8.295297745131658e-06, "loss": 0.05774093, "memory(GiB)": 15.03, "step": 8700, "train_speed(iter/s)": 1.476746 }, { "acc": 0.98977737, "epoch": 15.366284201235658, "grad_norm": 8.006708145141602, "learning_rate": 8.293100213963216e-06, "loss": 0.08448494, "memory(GiB)": 15.03, "step": 8705, "train_speed(iter/s)": 1.476762 }, { "acc": 0.99135418, "epoch": 15.375110326566638, "grad_norm": 3.484165668487549, "learning_rate": 8.290901558814571e-06, "loss": 0.05282773, "memory(GiB)": 15.03, "step": 8710, "train_speed(iter/s)": 1.476771 }, { "acc": 0.98972778, "epoch": 15.383936451897616, "grad_norm": 4.698517322540283, "learning_rate": 8.288701780436266e-06, "loss": 0.06050807, "memory(GiB)": 15.03, "step": 8715, "train_speed(iter/s)": 1.476722 }, { "acc": 0.99476891, "epoch": 15.392762577228597, "grad_norm": 4.618071556091309, "learning_rate": 8.286500879579234e-06, "loss": 0.03741308, "memory(GiB)": 15.03, "step": 8720, "train_speed(iter/s)": 1.476721 }, { "acc": 0.99427767, "epoch": 15.401588702559577, "grad_norm": 3.285520076751709, "learning_rate": 8.28429885699478e-06, "loss": 0.04364554, "memory(GiB)": 15.03, "step": 8725, "train_speed(iter/s)": 1.476756 }, { "acc": 0.99288197, "epoch": 15.410414827890556, "grad_norm": 1.5150595903396606, "learning_rate": 8.282095713434606e-06, "loss": 0.05499805, "memory(GiB)": 15.03, "step": 8730, "train_speed(iter/s)": 1.47674 }, { "acc": 0.98931217, "epoch": 15.419240953221536, "grad_norm": 10.867820739746094, "learning_rate": 8.279891449650783e-06, "loss": 0.07769268, "memory(GiB)": 15.03, "step": 8735, "train_speed(iter/s)": 1.476793 }, { "acc": 0.99310188, "epoch": 15.428067078552516, "grad_norm": 1.3958430290222168, "learning_rate": 8.277686066395774e-06, "loss": 0.05495457, "memory(GiB)": 15.03, "step": 8740, "train_speed(iter/s)": 1.476809 }, { "acc": 0.99105663, "epoch": 15.436893203883495, "grad_norm": 5.281322002410889, "learning_rate": 8.27547956442242e-06, "loss": 0.05727117, "memory(GiB)": 15.03, "step": 8745, "train_speed(iter/s)": 1.476797 }, { "acc": 0.99229727, "epoch": 15.445719329214475, "grad_norm": 2.528623580932617, "learning_rate": 8.273271944483945e-06, "loss": 0.0532937, "memory(GiB)": 15.03, "step": 8750, "train_speed(iter/s)": 1.476805 }, { "acc": 0.99168043, "epoch": 15.454545454545455, "grad_norm": 1.5722365379333496, "learning_rate": 8.271063207333955e-06, "loss": 0.06536523, "memory(GiB)": 15.03, "step": 8755, "train_speed(iter/s)": 1.47679 }, { "acc": 0.98986034, "epoch": 15.463371579876434, "grad_norm": 4.520513534545898, "learning_rate": 8.268853353726433e-06, "loss": 0.05693505, "memory(GiB)": 15.03, "step": 8760, "train_speed(iter/s)": 1.476829 }, { "acc": 0.99098797, "epoch": 15.472197705207414, "grad_norm": 5.923330307006836, "learning_rate": 8.266642384415754e-06, "loss": 0.05391436, "memory(GiB)": 15.03, "step": 8765, "train_speed(iter/s)": 1.476836 }, { "acc": 0.99019604, "epoch": 15.481023830538394, "grad_norm": 2.5394341945648193, "learning_rate": 8.264430300156657e-06, "loss": 0.06554301, "memory(GiB)": 15.03, "step": 8770, "train_speed(iter/s)": 1.476861 }, { "acc": 0.9899601, "epoch": 15.489849955869373, "grad_norm": 2.4953854084014893, "learning_rate": 8.262217101704281e-06, "loss": 0.05308408, "memory(GiB)": 15.03, "step": 8775, "train_speed(iter/s)": 1.476877 }, { "acc": 0.99323683, "epoch": 15.498676081200353, "grad_norm": 2.175891160964966, "learning_rate": 8.26000278981413e-06, "loss": 0.04685598, "memory(GiB)": 15.03, "step": 8780, "train_speed(iter/s)": 1.47685 }, { "acc": 0.99011917, "epoch": 15.507502206531333, "grad_norm": 4.4674224853515625, "learning_rate": 8.257787365242094e-06, "loss": 0.06648251, "memory(GiB)": 15.03, "step": 8785, "train_speed(iter/s)": 1.476828 }, { "acc": 0.99163952, "epoch": 15.516328331862312, "grad_norm": 2.341017961502075, "learning_rate": 8.255570828744444e-06, "loss": 0.04922553, "memory(GiB)": 15.03, "step": 8790, "train_speed(iter/s)": 1.476789 }, { "acc": 0.99301233, "epoch": 15.525154457193292, "grad_norm": 2.274045944213867, "learning_rate": 8.25335318107783e-06, "loss": 0.04824341, "memory(GiB)": 15.03, "step": 8795, "train_speed(iter/s)": 1.476779 }, { "acc": 0.99519749, "epoch": 15.533980582524272, "grad_norm": 1.5993788242340088, "learning_rate": 8.251134422999278e-06, "loss": 0.0388022, "memory(GiB)": 15.03, "step": 8800, "train_speed(iter/s)": 1.476739 }, { "acc": 0.99096889, "epoch": 15.542806707855252, "grad_norm": 6.241018295288086, "learning_rate": 8.248914555266197e-06, "loss": 0.06532828, "memory(GiB)": 15.03, "step": 8805, "train_speed(iter/s)": 1.476707 }, { "acc": 0.99472618, "epoch": 15.55163283318623, "grad_norm": 2.8304786682128906, "learning_rate": 8.246693578636373e-06, "loss": 0.04167971, "memory(GiB)": 15.03, "step": 8810, "train_speed(iter/s)": 1.476707 }, { "acc": 0.99321012, "epoch": 15.560458958517211, "grad_norm": 4.79172420501709, "learning_rate": 8.244471493867972e-06, "loss": 0.05623093, "memory(GiB)": 15.03, "step": 8815, "train_speed(iter/s)": 1.476711 }, { "acc": 0.98914413, "epoch": 15.569285083848191, "grad_norm": 4.221975326538086, "learning_rate": 8.242248301719536e-06, "loss": 0.07453114, "memory(GiB)": 15.03, "step": 8820, "train_speed(iter/s)": 1.476746 }, { "acc": 0.9914135, "epoch": 15.57811120917917, "grad_norm": 4.9072699546813965, "learning_rate": 8.240024002949985e-06, "loss": 0.06188316, "memory(GiB)": 15.03, "step": 8825, "train_speed(iter/s)": 1.476785 }, { "acc": 0.98962784, "epoch": 15.58693733451015, "grad_norm": 9.73583698272705, "learning_rate": 8.237798598318621e-06, "loss": 0.06424344, "memory(GiB)": 15.03, "step": 8830, "train_speed(iter/s)": 1.476793 }, { "acc": 0.99200878, "epoch": 15.59576345984113, "grad_norm": 5.044193744659424, "learning_rate": 8.235572088585116e-06, "loss": 0.04614856, "memory(GiB)": 15.03, "step": 8835, "train_speed(iter/s)": 1.476804 }, { "acc": 0.99468822, "epoch": 15.604589585172109, "grad_norm": 2.2712700366973877, "learning_rate": 8.233344474509527e-06, "loss": 0.03909455, "memory(GiB)": 15.03, "step": 8840, "train_speed(iter/s)": 1.476771 }, { "acc": 0.99228716, "epoch": 15.613415710503089, "grad_norm": 3.9782955646514893, "learning_rate": 8.231115756852284e-06, "loss": 0.05773257, "memory(GiB)": 15.03, "step": 8845, "train_speed(iter/s)": 1.4768 }, { "acc": 0.99505234, "epoch": 15.62224183583407, "grad_norm": 4.060893535614014, "learning_rate": 8.228885936374195e-06, "loss": 0.0416992, "memory(GiB)": 15.03, "step": 8850, "train_speed(iter/s)": 1.476821 }, { "acc": 0.99143314, "epoch": 15.631067961165048, "grad_norm": 6.338589668273926, "learning_rate": 8.226655013836443e-06, "loss": 0.05844487, "memory(GiB)": 15.03, "step": 8855, "train_speed(iter/s)": 1.476869 }, { "acc": 0.99166975, "epoch": 15.639894086496028, "grad_norm": 3.6483685970306396, "learning_rate": 8.224422990000586e-06, "loss": 0.04718324, "memory(GiB)": 15.03, "step": 8860, "train_speed(iter/s)": 1.476902 }, { "acc": 0.99176235, "epoch": 15.648720211827008, "grad_norm": 2.210101366043091, "learning_rate": 8.222189865628565e-06, "loss": 0.06266001, "memory(GiB)": 15.03, "step": 8865, "train_speed(iter/s)": 1.476848 }, { "acc": 0.99247265, "epoch": 15.657546337157987, "grad_norm": 3.0780558586120605, "learning_rate": 8.219955641482689e-06, "loss": 0.05701616, "memory(GiB)": 15.03, "step": 8870, "train_speed(iter/s)": 1.476859 }, { "acc": 0.99653292, "epoch": 15.666372462488967, "grad_norm": 4.785597801208496, "learning_rate": 8.217720318325643e-06, "loss": 0.03878545, "memory(GiB)": 15.03, "step": 8875, "train_speed(iter/s)": 1.476871 }, { "acc": 0.99114294, "epoch": 15.675198587819947, "grad_norm": 5.979025363922119, "learning_rate": 8.21548389692049e-06, "loss": 0.06520559, "memory(GiB)": 15.03, "step": 8880, "train_speed(iter/s)": 1.476876 }, { "acc": 0.99091129, "epoch": 15.684024713150926, "grad_norm": 2.2888598442077637, "learning_rate": 8.213246378030671e-06, "loss": 0.06810732, "memory(GiB)": 15.03, "step": 8885, "train_speed(iter/s)": 1.476907 }, { "acc": 0.99189548, "epoch": 15.692850838481906, "grad_norm": 2.460893154144287, "learning_rate": 8.211007762419994e-06, "loss": 0.06473633, "memory(GiB)": 15.03, "step": 8890, "train_speed(iter/s)": 1.476941 }, { "acc": 0.99450388, "epoch": 15.701676963812886, "grad_norm": 4.21790885925293, "learning_rate": 8.208768050852649e-06, "loss": 0.02637189, "memory(GiB)": 15.03, "step": 8895, "train_speed(iter/s)": 1.476995 }, { "acc": 0.99345531, "epoch": 15.710503089143867, "grad_norm": 2.4901633262634277, "learning_rate": 8.20652724409319e-06, "loss": 0.0421537, "memory(GiB)": 15.03, "step": 8900, "train_speed(iter/s)": 1.477042 }, { "acc": 0.98994408, "epoch": 15.719329214474845, "grad_norm": 2.405536413192749, "learning_rate": 8.204285342906555e-06, "loss": 0.06720718, "memory(GiB)": 15.03, "step": 8905, "train_speed(iter/s)": 1.47709 }, { "acc": 0.99601974, "epoch": 15.728155339805825, "grad_norm": 3.614943027496338, "learning_rate": 8.202042348058054e-06, "loss": 0.03208722, "memory(GiB)": 15.03, "step": 8910, "train_speed(iter/s)": 1.477105 }, { "acc": 0.99446716, "epoch": 15.736981465136806, "grad_norm": 1.87542724609375, "learning_rate": 8.199798260313362e-06, "loss": 0.03796242, "memory(GiB)": 15.03, "step": 8915, "train_speed(iter/s)": 1.477107 }, { "acc": 0.99431229, "epoch": 15.745807590467784, "grad_norm": 1.2054013013839722, "learning_rate": 8.19755308043854e-06, "loss": 0.0509838, "memory(GiB)": 15.03, "step": 8920, "train_speed(iter/s)": 1.477132 }, { "acc": 0.99469337, "epoch": 15.754633715798764, "grad_norm": 1.2649059295654297, "learning_rate": 8.195306809200013e-06, "loss": 0.04548314, "memory(GiB)": 15.03, "step": 8925, "train_speed(iter/s)": 1.477098 }, { "acc": 0.99456987, "epoch": 15.763459841129745, "grad_norm": 3.4297029972076416, "learning_rate": 8.193059447364577e-06, "loss": 0.0400948, "memory(GiB)": 15.03, "step": 8930, "train_speed(iter/s)": 1.477138 }, { "acc": 0.98823586, "epoch": 15.772285966460723, "grad_norm": 4.066166877746582, "learning_rate": 8.190810995699407e-06, "loss": 0.06097564, "memory(GiB)": 15.03, "step": 8935, "train_speed(iter/s)": 1.477162 }, { "acc": 0.99098501, "epoch": 15.781112091791703, "grad_norm": 3.5037903785705566, "learning_rate": 8.188561454972048e-06, "loss": 0.06634072, "memory(GiB)": 15.03, "step": 8940, "train_speed(iter/s)": 1.477212 }, { "acc": 0.9951025, "epoch": 15.789938217122684, "grad_norm": 2.52020525932312, "learning_rate": 8.186310825950411e-06, "loss": 0.03996206, "memory(GiB)": 15.03, "step": 8945, "train_speed(iter/s)": 1.477218 }, { "acc": 0.99230328, "epoch": 15.798764342453662, "grad_norm": 1.5526906251907349, "learning_rate": 8.184059109402789e-06, "loss": 0.0557336, "memory(GiB)": 15.03, "step": 8950, "train_speed(iter/s)": 1.477195 }, { "acc": 0.99223824, "epoch": 15.807590467784642, "grad_norm": 3.3895177841186523, "learning_rate": 8.181806306097834e-06, "loss": 0.04765656, "memory(GiB)": 15.03, "step": 8955, "train_speed(iter/s)": 1.477195 }, { "acc": 0.99340572, "epoch": 15.816416593115623, "grad_norm": 0.9177940487861633, "learning_rate": 8.179552416804582e-06, "loss": 0.05287665, "memory(GiB)": 15.03, "step": 8960, "train_speed(iter/s)": 1.477206 }, { "acc": 0.99298487, "epoch": 15.825242718446601, "grad_norm": 2.4205448627471924, "learning_rate": 8.177297442292428e-06, "loss": 0.05407157, "memory(GiB)": 15.03, "step": 8965, "train_speed(iter/s)": 1.477196 }, { "acc": 0.99436855, "epoch": 15.834068843777581, "grad_norm": 4.33038854598999, "learning_rate": 8.175041383331146e-06, "loss": 0.04478621, "memory(GiB)": 15.03, "step": 8970, "train_speed(iter/s)": 1.477204 }, { "acc": 0.99433222, "epoch": 15.842894969108562, "grad_norm": 1.1240425109863281, "learning_rate": 8.172784240690873e-06, "loss": 0.04977025, "memory(GiB)": 15.03, "step": 8975, "train_speed(iter/s)": 1.477192 }, { "acc": 0.99524145, "epoch": 15.85172109443954, "grad_norm": 2.1912741661071777, "learning_rate": 8.170526015142125e-06, "loss": 0.03274682, "memory(GiB)": 15.03, "step": 8980, "train_speed(iter/s)": 1.477202 }, { "acc": 0.990942, "epoch": 15.86054721977052, "grad_norm": 2.1409623622894287, "learning_rate": 8.168266707455776e-06, "loss": 0.05124604, "memory(GiB)": 15.03, "step": 8985, "train_speed(iter/s)": 1.477212 }, { "acc": 0.99405127, "epoch": 15.8693733451015, "grad_norm": 1.5850777626037598, "learning_rate": 8.16600631840308e-06, "loss": 0.04893687, "memory(GiB)": 15.03, "step": 8990, "train_speed(iter/s)": 1.477127 }, { "acc": 0.99465265, "epoch": 15.878199470432481, "grad_norm": 3.14874005317688, "learning_rate": 8.163744848755656e-06, "loss": 0.04752258, "memory(GiB)": 15.03, "step": 8995, "train_speed(iter/s)": 1.47714 }, { "acc": 0.99555836, "epoch": 15.88702559576346, "grad_norm": 3.3733720779418945, "learning_rate": 8.16148229928549e-06, "loss": 0.02963206, "memory(GiB)": 15.03, "step": 9000, "train_speed(iter/s)": 1.477136 }, { "acc": 0.99273834, "epoch": 15.89585172109444, "grad_norm": 3.647601842880249, "learning_rate": 8.159218670764941e-06, "loss": 0.05541487, "memory(GiB)": 15.03, "step": 9005, "train_speed(iter/s)": 1.477185 }, { "acc": 0.99211044, "epoch": 15.90467784642542, "grad_norm": 1.2046706676483154, "learning_rate": 8.156953963966731e-06, "loss": 0.0546949, "memory(GiB)": 15.03, "step": 9010, "train_speed(iter/s)": 1.477204 }, { "acc": 0.99342537, "epoch": 15.913503971756398, "grad_norm": 4.416232109069824, "learning_rate": 8.154688179663952e-06, "loss": 0.04057829, "memory(GiB)": 15.03, "step": 9015, "train_speed(iter/s)": 1.477214 }, { "acc": 0.99351606, "epoch": 15.922330097087379, "grad_norm": 5.085486888885498, "learning_rate": 8.152421318630069e-06, "loss": 0.04443395, "memory(GiB)": 15.03, "step": 9020, "train_speed(iter/s)": 1.477229 }, { "acc": 0.99385986, "epoch": 15.931156222418359, "grad_norm": 5.660799980163574, "learning_rate": 8.150153381638904e-06, "loss": 0.04020559, "memory(GiB)": 15.03, "step": 9025, "train_speed(iter/s)": 1.477232 }, { "acc": 0.99366436, "epoch": 15.939982347749337, "grad_norm": 1.7970123291015625, "learning_rate": 8.147884369464658e-06, "loss": 0.04439015, "memory(GiB)": 15.03, "step": 9030, "train_speed(iter/s)": 1.477222 }, { "acc": 0.99212742, "epoch": 15.948808473080318, "grad_norm": 2.6760926246643066, "learning_rate": 8.145614282881891e-06, "loss": 0.05626625, "memory(GiB)": 15.03, "step": 9035, "train_speed(iter/s)": 1.4772 }, { "acc": 0.99448347, "epoch": 15.957634598411298, "grad_norm": 2.5004005432128906, "learning_rate": 8.143343122665534e-06, "loss": 0.04715314, "memory(GiB)": 15.03, "step": 9040, "train_speed(iter/s)": 1.477201 }, { "acc": 0.99287157, "epoch": 15.966460723742276, "grad_norm": 3.1553523540496826, "learning_rate": 8.141070889590881e-06, "loss": 0.06189075, "memory(GiB)": 15.03, "step": 9045, "train_speed(iter/s)": 1.477224 }, { "acc": 0.99355974, "epoch": 15.975286849073257, "grad_norm": 4.806286334991455, "learning_rate": 8.138797584433595e-06, "loss": 0.04885306, "memory(GiB)": 15.03, "step": 9050, "train_speed(iter/s)": 1.477206 }, { "acc": 0.99175386, "epoch": 15.984112974404237, "grad_norm": 5.285266876220703, "learning_rate": 8.136523207969703e-06, "loss": 0.05606771, "memory(GiB)": 15.03, "step": 9055, "train_speed(iter/s)": 1.477219 }, { "acc": 0.99240265, "epoch": 15.992939099735215, "grad_norm": 2.6201553344726562, "learning_rate": 8.134247760975602e-06, "loss": 0.05577416, "memory(GiB)": 15.03, "step": 9060, "train_speed(iter/s)": 1.477224 }, { "acc": 0.99435234, "epoch": 16.001765225066197, "grad_norm": 2.852654457092285, "learning_rate": 8.131971244228046e-06, "loss": 0.03971931, "memory(GiB)": 15.03, "step": 9065, "train_speed(iter/s)": 1.477129 }, { "acc": 0.99438782, "epoch": 16.010591350397174, "grad_norm": 4.7192559242248535, "learning_rate": 8.129693658504162e-06, "loss": 0.0388641, "memory(GiB)": 15.03, "step": 9070, "train_speed(iter/s)": 1.477159 }, { "acc": 0.9926363, "epoch": 16.019417475728154, "grad_norm": 5.440241813659668, "learning_rate": 8.127415004581439e-06, "loss": 0.05693331, "memory(GiB)": 15.03, "step": 9075, "train_speed(iter/s)": 1.477134 }, { "acc": 0.99245672, "epoch": 16.028243601059135, "grad_norm": 1.71284019947052, "learning_rate": 8.125135283237733e-06, "loss": 0.04871661, "memory(GiB)": 15.03, "step": 9080, "train_speed(iter/s)": 1.477132 }, { "acc": 0.99200621, "epoch": 16.037069726390115, "grad_norm": 2.7525341510772705, "learning_rate": 8.12285449525126e-06, "loss": 0.05375794, "memory(GiB)": 15.03, "step": 9085, "train_speed(iter/s)": 1.477131 }, { "acc": 0.99316254, "epoch": 16.045895851721095, "grad_norm": 4.1036248207092285, "learning_rate": 8.1205726414006e-06, "loss": 0.05613893, "memory(GiB)": 15.03, "step": 9090, "train_speed(iter/s)": 1.477139 }, { "acc": 0.9949605, "epoch": 16.054721977052075, "grad_norm": 3.2200140953063965, "learning_rate": 8.118289722464704e-06, "loss": 0.0381494, "memory(GiB)": 15.03, "step": 9095, "train_speed(iter/s)": 1.477139 }, { "acc": 0.99518986, "epoch": 16.063548102383052, "grad_norm": 1.9174070358276367, "learning_rate": 8.11600573922288e-06, "loss": 0.03927913, "memory(GiB)": 15.03, "step": 9100, "train_speed(iter/s)": 1.477142 }, { "acc": 0.99395332, "epoch": 16.072374227714032, "grad_norm": 4.18668794631958, "learning_rate": 8.1137206924548e-06, "loss": 0.04242072, "memory(GiB)": 15.03, "step": 9105, "train_speed(iter/s)": 1.477144 }, { "acc": 0.98868027, "epoch": 16.081200353045013, "grad_norm": 9.317502975463867, "learning_rate": 8.111434582940502e-06, "loss": 0.06369053, "memory(GiB)": 15.03, "step": 9110, "train_speed(iter/s)": 1.477121 }, { "acc": 0.99145098, "epoch": 16.090026478375993, "grad_norm": 4.802980422973633, "learning_rate": 8.109147411460383e-06, "loss": 0.05777647, "memory(GiB)": 15.03, "step": 9115, "train_speed(iter/s)": 1.477114 }, { "acc": 0.99172506, "epoch": 16.098852603706973, "grad_norm": 5.202415943145752, "learning_rate": 8.106859178795209e-06, "loss": 0.05374013, "memory(GiB)": 15.03, "step": 9120, "train_speed(iter/s)": 1.477122 }, { "acc": 0.98786125, "epoch": 16.107678729037954, "grad_norm": 2.9849960803985596, "learning_rate": 8.104569885726097e-06, "loss": 0.07429491, "memory(GiB)": 15.03, "step": 9125, "train_speed(iter/s)": 1.477159 }, { "acc": 0.99294624, "epoch": 16.116504854368934, "grad_norm": 2.2972943782806396, "learning_rate": 8.102279533034537e-06, "loss": 0.04975866, "memory(GiB)": 15.03, "step": 9130, "train_speed(iter/s)": 1.477181 }, { "acc": 0.98928747, "epoch": 16.12533097969991, "grad_norm": 3.1997461318969727, "learning_rate": 8.099988121502374e-06, "loss": 0.0767284, "memory(GiB)": 15.03, "step": 9135, "train_speed(iter/s)": 1.477225 }, { "acc": 0.9913929, "epoch": 16.13415710503089, "grad_norm": 4.491968631744385, "learning_rate": 8.097695651911822e-06, "loss": 0.06182586, "memory(GiB)": 15.03, "step": 9140, "train_speed(iter/s)": 1.477255 }, { "acc": 0.99137907, "epoch": 16.14298323036187, "grad_norm": 1.7118481397628784, "learning_rate": 8.095402125045445e-06, "loss": 0.05733851, "memory(GiB)": 15.03, "step": 9145, "train_speed(iter/s)": 1.477255 }, { "acc": 0.99225817, "epoch": 16.15180935569285, "grad_norm": 3.914426803588867, "learning_rate": 8.093107541686177e-06, "loss": 0.05288776, "memory(GiB)": 15.03, "step": 9150, "train_speed(iter/s)": 1.477272 }, { "acc": 0.99219837, "epoch": 16.16063548102383, "grad_norm": 4.789412021636963, "learning_rate": 8.090811902617308e-06, "loss": 0.05878752, "memory(GiB)": 15.03, "step": 9155, "train_speed(iter/s)": 1.4773 }, { "acc": 0.99384422, "epoch": 16.169461606354812, "grad_norm": 1.384486198425293, "learning_rate": 8.088515208622494e-06, "loss": 0.04285099, "memory(GiB)": 15.03, "step": 9160, "train_speed(iter/s)": 1.477311 }, { "acc": 0.99250488, "epoch": 16.17828773168579, "grad_norm": 4.779116153717041, "learning_rate": 8.086217460485742e-06, "loss": 0.05241752, "memory(GiB)": 15.03, "step": 9165, "train_speed(iter/s)": 1.477327 }, { "acc": 0.99370317, "epoch": 16.18711385701677, "grad_norm": 1.2218118906021118, "learning_rate": 8.083918658991428e-06, "loss": 0.04498355, "memory(GiB)": 15.03, "step": 9170, "train_speed(iter/s)": 1.477341 }, { "acc": 0.99435005, "epoch": 16.19593998234775, "grad_norm": 1.9477468729019165, "learning_rate": 8.081618804924283e-06, "loss": 0.04160159, "memory(GiB)": 15.03, "step": 9175, "train_speed(iter/s)": 1.477345 }, { "acc": 0.99728851, "epoch": 16.20476610767873, "grad_norm": 1.6398932933807373, "learning_rate": 8.079317899069394e-06, "loss": 0.03048164, "memory(GiB)": 15.03, "step": 9180, "train_speed(iter/s)": 1.477378 }, { "acc": 0.99332104, "epoch": 16.21359223300971, "grad_norm": 2.7855048179626465, "learning_rate": 8.077015942212215e-06, "loss": 0.04252446, "memory(GiB)": 15.03, "step": 9185, "train_speed(iter/s)": 1.477372 }, { "acc": 0.99538307, "epoch": 16.22241835834069, "grad_norm": 2.2978909015655518, "learning_rate": 8.074712935138553e-06, "loss": 0.0252023, "memory(GiB)": 15.03, "step": 9190, "train_speed(iter/s)": 1.477365 }, { "acc": 0.99039869, "epoch": 16.231244483671667, "grad_norm": 3.096233367919922, "learning_rate": 8.072408878634577e-06, "loss": 0.06091452, "memory(GiB)": 15.03, "step": 9195, "train_speed(iter/s)": 1.477365 }, { "acc": 0.99337435, "epoch": 16.240070609002647, "grad_norm": 3.286856174468994, "learning_rate": 8.07010377348681e-06, "loss": 0.04460409, "memory(GiB)": 15.03, "step": 9200, "train_speed(iter/s)": 1.477383 }, { "acc": 0.99398041, "epoch": 16.248896734333627, "grad_norm": 1.381220817565918, "learning_rate": 8.067797620482137e-06, "loss": 0.04661448, "memory(GiB)": 15.03, "step": 9205, "train_speed(iter/s)": 1.47736 }, { "acc": 0.9919178, "epoch": 16.257722859664607, "grad_norm": 1.6789833307266235, "learning_rate": 8.065490420407797e-06, "loss": 0.05222557, "memory(GiB)": 15.03, "step": 9210, "train_speed(iter/s)": 1.477325 }, { "acc": 0.99382687, "epoch": 16.266548984995588, "grad_norm": 4.027559280395508, "learning_rate": 8.063182174051393e-06, "loss": 0.04696824, "memory(GiB)": 15.03, "step": 9215, "train_speed(iter/s)": 1.477343 }, { "acc": 0.99412193, "epoch": 16.275375110326568, "grad_norm": 3.186871290206909, "learning_rate": 8.060872882200879e-06, "loss": 0.05193359, "memory(GiB)": 15.03, "step": 9220, "train_speed(iter/s)": 1.477372 }, { "acc": 0.99387236, "epoch": 16.284201235657548, "grad_norm": 1.7249460220336914, "learning_rate": 8.058562545644564e-06, "loss": 0.03624463, "memory(GiB)": 15.03, "step": 9225, "train_speed(iter/s)": 1.477418 }, { "acc": 0.99414177, "epoch": 16.293027360988525, "grad_norm": 2.144763708114624, "learning_rate": 8.05625116517112e-06, "loss": 0.04156213, "memory(GiB)": 15.03, "step": 9230, "train_speed(iter/s)": 1.477431 }, { "acc": 0.99452648, "epoch": 16.301853486319505, "grad_norm": 2.7731833457946777, "learning_rate": 8.053938741569575e-06, "loss": 0.04194317, "memory(GiB)": 15.03, "step": 9235, "train_speed(iter/s)": 1.477465 }, { "acc": 0.99359646, "epoch": 16.310679611650485, "grad_norm": 2.3401339054107666, "learning_rate": 8.051625275629306e-06, "loss": 0.03809549, "memory(GiB)": 15.03, "step": 9240, "train_speed(iter/s)": 1.477492 }, { "acc": 0.99184055, "epoch": 16.319505736981466, "grad_norm": 1.7692337036132812, "learning_rate": 8.049310768140053e-06, "loss": 0.04138916, "memory(GiB)": 15.03, "step": 9245, "train_speed(iter/s)": 1.477529 }, { "acc": 0.99382458, "epoch": 16.328331862312446, "grad_norm": 3.8921358585357666, "learning_rate": 8.046995219891909e-06, "loss": 0.04295041, "memory(GiB)": 15.03, "step": 9250, "train_speed(iter/s)": 1.477542 }, { "acc": 0.99600925, "epoch": 16.337157987643426, "grad_norm": 3.503941774368286, "learning_rate": 8.044678631675323e-06, "loss": 0.03125636, "memory(GiB)": 15.03, "step": 9255, "train_speed(iter/s)": 1.47755 }, { "acc": 0.99044304, "epoch": 16.345984112974403, "grad_norm": 5.209796905517578, "learning_rate": 8.042361004281097e-06, "loss": 0.0456482, "memory(GiB)": 15.03, "step": 9260, "train_speed(iter/s)": 1.477534 }, { "acc": 0.99428635, "epoch": 16.354810238305383, "grad_norm": 1.6119517087936401, "learning_rate": 8.04004233850039e-06, "loss": 0.03371397, "memory(GiB)": 15.03, "step": 9265, "train_speed(iter/s)": 1.477552 }, { "acc": 0.99414959, "epoch": 16.363636363636363, "grad_norm": 3.9373044967651367, "learning_rate": 8.037722635124714e-06, "loss": 0.04344511, "memory(GiB)": 15.03, "step": 9270, "train_speed(iter/s)": 1.477578 }, { "acc": 0.99523706, "epoch": 16.372462488967344, "grad_norm": 3.5455455780029297, "learning_rate": 8.035401894945934e-06, "loss": 0.03464227, "memory(GiB)": 15.03, "step": 9275, "train_speed(iter/s)": 1.477573 }, { "acc": 0.99627943, "epoch": 16.381288614298324, "grad_norm": 2.687523365020752, "learning_rate": 8.033080118756275e-06, "loss": 0.030492, "memory(GiB)": 15.03, "step": 9280, "train_speed(iter/s)": 1.477589 }, { "acc": 0.99146385, "epoch": 16.390114739629304, "grad_norm": 3.9764153957366943, "learning_rate": 8.030757307348309e-06, "loss": 0.06534961, "memory(GiB)": 15.03, "step": 9285, "train_speed(iter/s)": 1.477617 }, { "acc": 0.99276791, "epoch": 16.39894086496028, "grad_norm": 4.4117841720581055, "learning_rate": 8.028433461514962e-06, "loss": 0.04074755, "memory(GiB)": 15.03, "step": 9290, "train_speed(iter/s)": 1.477663 }, { "acc": 0.99095078, "epoch": 16.40776699029126, "grad_norm": 1.1686705350875854, "learning_rate": 8.026108582049518e-06, "loss": 0.06388958, "memory(GiB)": 15.03, "step": 9295, "train_speed(iter/s)": 1.477696 }, { "acc": 0.9907094, "epoch": 16.41659311562224, "grad_norm": 4.237314701080322, "learning_rate": 8.023782669745609e-06, "loss": 0.06904215, "memory(GiB)": 15.03, "step": 9300, "train_speed(iter/s)": 1.477699 }, { "acc": 0.99465237, "epoch": 16.42541924095322, "grad_norm": 4.631938934326172, "learning_rate": 8.021455725397223e-06, "loss": 0.03592393, "memory(GiB)": 15.03, "step": 9305, "train_speed(iter/s)": 1.477721 }, { "acc": 0.99596395, "epoch": 16.434245366284202, "grad_norm": 1.8657941818237305, "learning_rate": 8.019127749798696e-06, "loss": 0.02841585, "memory(GiB)": 15.03, "step": 9310, "train_speed(iter/s)": 1.477716 }, { "acc": 0.99572115, "epoch": 16.443071491615182, "grad_norm": 0.8921713829040527, "learning_rate": 8.016798743744717e-06, "loss": 0.02716255, "memory(GiB)": 15.03, "step": 9315, "train_speed(iter/s)": 1.477708 }, { "acc": 0.99586449, "epoch": 16.451897616946162, "grad_norm": 0.6019196510314941, "learning_rate": 8.014468708030333e-06, "loss": 0.03652521, "memory(GiB)": 15.03, "step": 9320, "train_speed(iter/s)": 1.477742 }, { "acc": 0.99663677, "epoch": 16.46072374227714, "grad_norm": 3.2737717628479004, "learning_rate": 8.012137643450937e-06, "loss": 0.02351617, "memory(GiB)": 15.03, "step": 9325, "train_speed(iter/s)": 1.477742 }, { "acc": 0.99140644, "epoch": 16.46954986760812, "grad_norm": 2.1572763919830322, "learning_rate": 8.009805550802274e-06, "loss": 0.05392322, "memory(GiB)": 15.03, "step": 9330, "train_speed(iter/s)": 1.477772 }, { "acc": 0.99325056, "epoch": 16.4783759929391, "grad_norm": 5.205357551574707, "learning_rate": 8.007472430880437e-06, "loss": 0.0418355, "memory(GiB)": 15.03, "step": 9335, "train_speed(iter/s)": 1.477799 }, { "acc": 0.99354458, "epoch": 16.48720211827008, "grad_norm": 2.7922346591949463, "learning_rate": 8.005138284481876e-06, "loss": 0.0536873, "memory(GiB)": 15.03, "step": 9340, "train_speed(iter/s)": 1.477797 }, { "acc": 0.99284887, "epoch": 16.49602824360106, "grad_norm": 1.1324809789657593, "learning_rate": 8.002803112403387e-06, "loss": 0.03856691, "memory(GiB)": 15.03, "step": 9345, "train_speed(iter/s)": 1.477759 }, { "acc": 0.99199524, "epoch": 16.50485436893204, "grad_norm": 3.446793794631958, "learning_rate": 8.000466915442116e-06, "loss": 0.05782454, "memory(GiB)": 15.03, "step": 9350, "train_speed(iter/s)": 1.477738 }, { "acc": 0.99416609, "epoch": 16.513680494263017, "grad_norm": 6.9130096435546875, "learning_rate": 7.998129694395562e-06, "loss": 0.04148179, "memory(GiB)": 15.03, "step": 9355, "train_speed(iter/s)": 1.477792 }, { "acc": 0.99410982, "epoch": 16.522506619593997, "grad_norm": 1.5173054933547974, "learning_rate": 7.99579145006157e-06, "loss": 0.04238374, "memory(GiB)": 15.03, "step": 9360, "train_speed(iter/s)": 1.4778 }, { "acc": 0.99478483, "epoch": 16.531332744924978, "grad_norm": 5.893192291259766, "learning_rate": 7.993452183238339e-06, "loss": 0.0330637, "memory(GiB)": 15.03, "step": 9365, "train_speed(iter/s)": 1.477789 }, { "acc": 0.9924222, "epoch": 16.540158870255958, "grad_norm": 4.501545429229736, "learning_rate": 7.991111894724411e-06, "loss": 0.04833452, "memory(GiB)": 15.03, "step": 9370, "train_speed(iter/s)": 1.477812 }, { "acc": 0.99400597, "epoch": 16.548984995586938, "grad_norm": 1.149450659751892, "learning_rate": 7.988770585318681e-06, "loss": 0.04923117, "memory(GiB)": 15.03, "step": 9375, "train_speed(iter/s)": 1.47784 }, { "acc": 0.99057789, "epoch": 16.55781112091792, "grad_norm": 2.824357509613037, "learning_rate": 7.986428255820391e-06, "loss": 0.05691738, "memory(GiB)": 15.03, "step": 9380, "train_speed(iter/s)": 1.477857 }, { "acc": 0.99160671, "epoch": 16.566637246248895, "grad_norm": 3.148881673812866, "learning_rate": 7.984084907029131e-06, "loss": 0.05535977, "memory(GiB)": 15.03, "step": 9385, "train_speed(iter/s)": 1.477853 }, { "acc": 0.99308662, "epoch": 16.575463371579875, "grad_norm": 1.649014949798584, "learning_rate": 7.981740539744842e-06, "loss": 0.03504281, "memory(GiB)": 15.03, "step": 9390, "train_speed(iter/s)": 1.477887 }, { "acc": 0.99084663, "epoch": 16.584289496910856, "grad_norm": 1.8348681926727295, "learning_rate": 7.979395154767808e-06, "loss": 0.0754923, "memory(GiB)": 15.03, "step": 9395, "train_speed(iter/s)": 1.477844 }, { "acc": 0.9930666, "epoch": 16.593115622241836, "grad_norm": 1.3391706943511963, "learning_rate": 7.977048752898662e-06, "loss": 0.05958557, "memory(GiB)": 15.03, "step": 9400, "train_speed(iter/s)": 1.477819 }, { "acc": 0.99225149, "epoch": 16.601941747572816, "grad_norm": 4.426831245422363, "learning_rate": 7.974701334938385e-06, "loss": 0.05444576, "memory(GiB)": 15.03, "step": 9405, "train_speed(iter/s)": 1.477859 }, { "acc": 0.99613819, "epoch": 16.610767872903796, "grad_norm": 2.686101198196411, "learning_rate": 7.972352901688308e-06, "loss": 0.03383715, "memory(GiB)": 15.03, "step": 9410, "train_speed(iter/s)": 1.4779 }, { "acc": 0.99433546, "epoch": 16.619593998234777, "grad_norm": 2.505579710006714, "learning_rate": 7.9700034539501e-06, "loss": 0.030904, "memory(GiB)": 15.03, "step": 9415, "train_speed(iter/s)": 1.477905 }, { "acc": 0.99121275, "epoch": 16.628420123565753, "grad_norm": 4.600115776062012, "learning_rate": 7.967652992525784e-06, "loss": 0.04695969, "memory(GiB)": 15.03, "step": 9420, "train_speed(iter/s)": 1.477945 }, { "acc": 0.99304924, "epoch": 16.637246248896734, "grad_norm": 2.8703079223632812, "learning_rate": 7.965301518217726e-06, "loss": 0.04411903, "memory(GiB)": 15.03, "step": 9425, "train_speed(iter/s)": 1.477949 }, { "acc": 0.99246693, "epoch": 16.646072374227714, "grad_norm": 1.9868232011795044, "learning_rate": 7.96294903182864e-06, "loss": 0.05510146, "memory(GiB)": 15.03, "step": 9430, "train_speed(iter/s)": 1.477926 }, { "acc": 0.99667816, "epoch": 16.654898499558694, "grad_norm": 1.3509125709533691, "learning_rate": 7.960595534161583e-06, "loss": 0.03045179, "memory(GiB)": 15.03, "step": 9435, "train_speed(iter/s)": 1.477956 }, { "acc": 0.9939785, "epoch": 16.663724624889674, "grad_norm": 4.584801197052002, "learning_rate": 7.958241026019954e-06, "loss": 0.0399389, "memory(GiB)": 15.03, "step": 9440, "train_speed(iter/s)": 1.477957 }, { "acc": 0.99278307, "epoch": 16.672550750220655, "grad_norm": 3.429248332977295, "learning_rate": 7.955885508207509e-06, "loss": 0.04873725, "memory(GiB)": 15.03, "step": 9445, "train_speed(iter/s)": 1.477987 }, { "acc": 0.99475937, "epoch": 16.68137687555163, "grad_norm": 3.6419126987457275, "learning_rate": 7.953528981528331e-06, "loss": 0.05107215, "memory(GiB)": 15.03, "step": 9450, "train_speed(iter/s)": 1.478018 }, { "acc": 0.99516945, "epoch": 16.69020300088261, "grad_norm": 3.8818278312683105, "learning_rate": 7.951171446786866e-06, "loss": 0.03636537, "memory(GiB)": 15.03, "step": 9455, "train_speed(iter/s)": 1.478 }, { "acc": 0.9932683, "epoch": 16.699029126213592, "grad_norm": 1.1849138736724854, "learning_rate": 7.948812904787887e-06, "loss": 0.0543902, "memory(GiB)": 15.03, "step": 9460, "train_speed(iter/s)": 1.478013 }, { "acc": 0.99589653, "epoch": 16.707855251544572, "grad_norm": 2.656524658203125, "learning_rate": 7.946453356336524e-06, "loss": 0.04390099, "memory(GiB)": 15.03, "step": 9465, "train_speed(iter/s)": 1.477965 }, { "acc": 0.99584312, "epoch": 16.716681376875552, "grad_norm": 2.5557546615600586, "learning_rate": 7.944092802238247e-06, "loss": 0.02702863, "memory(GiB)": 15.03, "step": 9470, "train_speed(iter/s)": 1.477966 }, { "acc": 0.99392576, "epoch": 16.725507502206533, "grad_norm": 2.2553975582122803, "learning_rate": 7.941731243298863e-06, "loss": 0.04579529, "memory(GiB)": 15.03, "step": 9475, "train_speed(iter/s)": 1.477967 }, { "acc": 0.99040813, "epoch": 16.73433362753751, "grad_norm": 5.720101356506348, "learning_rate": 7.939368680324528e-06, "loss": 0.06784065, "memory(GiB)": 15.03, "step": 9480, "train_speed(iter/s)": 1.47796 }, { "acc": 0.99500504, "epoch": 16.74315975286849, "grad_norm": 2.0538289546966553, "learning_rate": 7.937005114121742e-06, "loss": 0.03502366, "memory(GiB)": 15.03, "step": 9485, "train_speed(iter/s)": 1.477941 }, { "acc": 0.99541054, "epoch": 16.75198587819947, "grad_norm": 5.481133937835693, "learning_rate": 7.934640545497342e-06, "loss": 0.03882057, "memory(GiB)": 15.03, "step": 9490, "train_speed(iter/s)": 1.47793 }, { "acc": 0.99281235, "epoch": 16.76081200353045, "grad_norm": 3.514281988143921, "learning_rate": 7.932274975258515e-06, "loss": 0.05272213, "memory(GiB)": 15.03, "step": 9495, "train_speed(iter/s)": 1.477967 }, { "acc": 0.99440012, "epoch": 16.76963812886143, "grad_norm": 1.7955906391143799, "learning_rate": 7.929908404212778e-06, "loss": 0.03171651, "memory(GiB)": 15.03, "step": 9500, "train_speed(iter/s)": 1.477987 }, { "acc": 0.99066296, "epoch": 16.77846425419241, "grad_norm": 6.219470977783203, "learning_rate": 7.927540833168001e-06, "loss": 0.05579764, "memory(GiB)": 15.03, "step": 9505, "train_speed(iter/s)": 1.477951 }, { "acc": 0.99513474, "epoch": 16.78729037952339, "grad_norm": 1.4623222351074219, "learning_rate": 7.925172262932393e-06, "loss": 0.03083439, "memory(GiB)": 15.03, "step": 9510, "train_speed(iter/s)": 1.477934 }, { "acc": 0.9921978, "epoch": 16.796116504854368, "grad_norm": 4.174780368804932, "learning_rate": 7.922802694314498e-06, "loss": 0.05660611, "memory(GiB)": 15.03, "step": 9515, "train_speed(iter/s)": 1.477986 }, { "acc": 0.99268036, "epoch": 16.804942630185348, "grad_norm": 4.0133466720581055, "learning_rate": 7.920432128123209e-06, "loss": 0.0587055, "memory(GiB)": 15.03, "step": 9520, "train_speed(iter/s)": 1.478011 }, { "acc": 0.9950964, "epoch": 16.813768755516328, "grad_norm": 3.693925380706787, "learning_rate": 7.918060565167752e-06, "loss": 0.03409262, "memory(GiB)": 15.03, "step": 9525, "train_speed(iter/s)": 1.478034 }, { "acc": 0.99475422, "epoch": 16.82259488084731, "grad_norm": 3.861785650253296, "learning_rate": 7.915688006257698e-06, "loss": 0.04484619, "memory(GiB)": 15.03, "step": 9530, "train_speed(iter/s)": 1.478066 }, { "acc": 0.99300022, "epoch": 16.83142100617829, "grad_norm": 9.072541236877441, "learning_rate": 7.913314452202958e-06, "loss": 0.05662906, "memory(GiB)": 15.03, "step": 9535, "train_speed(iter/s)": 1.478048 }, { "acc": 0.99367428, "epoch": 16.84024713150927, "grad_norm": 3.186432123184204, "learning_rate": 7.91093990381378e-06, "loss": 0.05332999, "memory(GiB)": 15.03, "step": 9540, "train_speed(iter/s)": 1.478066 }, { "acc": 0.99370365, "epoch": 16.849073256840246, "grad_norm": 4.504738807678223, "learning_rate": 7.908564361900755e-06, "loss": 0.04506553, "memory(GiB)": 15.03, "step": 9545, "train_speed(iter/s)": 1.478106 }, { "acc": 0.99495649, "epoch": 16.857899382171226, "grad_norm": 2.4222919940948486, "learning_rate": 7.906187827274808e-06, "loss": 0.02859634, "memory(GiB)": 15.03, "step": 9550, "train_speed(iter/s)": 1.478125 }, { "acc": 0.99521465, "epoch": 16.866725507502206, "grad_norm": 3.4142086505889893, "learning_rate": 7.90381030074721e-06, "loss": 0.03455225, "memory(GiB)": 15.03, "step": 9555, "train_speed(iter/s)": 1.478138 }, { "acc": 0.99531622, "epoch": 16.875551632833186, "grad_norm": 2.8387956619262695, "learning_rate": 7.901431783129563e-06, "loss": 0.04165184, "memory(GiB)": 15.03, "step": 9560, "train_speed(iter/s)": 1.478181 }, { "acc": 0.99309912, "epoch": 16.884377758164167, "grad_norm": 4.14256477355957, "learning_rate": 7.899052275233813e-06, "loss": 0.04714821, "memory(GiB)": 15.03, "step": 9565, "train_speed(iter/s)": 1.478208 }, { "acc": 0.99624557, "epoch": 16.893203883495147, "grad_norm": 3.1877386569976807, "learning_rate": 7.896671777872239e-06, "loss": 0.03232239, "memory(GiB)": 15.03, "step": 9570, "train_speed(iter/s)": 1.478201 }, { "acc": 0.99496975, "epoch": 16.902030008826124, "grad_norm": 1.7577683925628662, "learning_rate": 7.894290291857466e-06, "loss": 0.03668192, "memory(GiB)": 15.03, "step": 9575, "train_speed(iter/s)": 1.478148 }, { "acc": 0.99404621, "epoch": 16.910856134157104, "grad_norm": 4.357499122619629, "learning_rate": 7.891907818002447e-06, "loss": 0.04149423, "memory(GiB)": 15.03, "step": 9580, "train_speed(iter/s)": 1.478157 }, { "acc": 0.99046898, "epoch": 16.919682259488084, "grad_norm": 3.0574777126312256, "learning_rate": 7.889524357120478e-06, "loss": 0.06725289, "memory(GiB)": 15.03, "step": 9585, "train_speed(iter/s)": 1.478158 }, { "acc": 0.99391899, "epoch": 16.928508384819065, "grad_norm": 2.74082088470459, "learning_rate": 7.88713991002519e-06, "loss": 0.04341449, "memory(GiB)": 15.03, "step": 9590, "train_speed(iter/s)": 1.478189 }, { "acc": 0.99220676, "epoch": 16.937334510150045, "grad_norm": 3.3223912715911865, "learning_rate": 7.88475447753055e-06, "loss": 0.05120761, "memory(GiB)": 15.03, "step": 9595, "train_speed(iter/s)": 1.478158 }, { "acc": 0.99341793, "epoch": 16.946160635481025, "grad_norm": 2.2374050617218018, "learning_rate": 7.882368060450866e-06, "loss": 0.0423427, "memory(GiB)": 15.03, "step": 9600, "train_speed(iter/s)": 1.478146 }, { "acc": 0.99257107, "epoch": 16.954986760812005, "grad_norm": 3.824381113052368, "learning_rate": 7.879980659600773e-06, "loss": 0.0437359, "memory(GiB)": 15.03, "step": 9605, "train_speed(iter/s)": 1.478163 }, { "acc": 0.99120874, "epoch": 16.963812886142982, "grad_norm": 2.570233106613159, "learning_rate": 7.877592275795251e-06, "loss": 0.05827805, "memory(GiB)": 15.03, "step": 9610, "train_speed(iter/s)": 1.478171 }, { "acc": 0.99177933, "epoch": 16.972639011473962, "grad_norm": 3.223113775253296, "learning_rate": 7.875202909849615e-06, "loss": 0.06252102, "memory(GiB)": 15.03, "step": 9615, "train_speed(iter/s)": 1.478167 }, { "acc": 0.99152832, "epoch": 16.981465136804943, "grad_norm": 3.46097993850708, "learning_rate": 7.872812562579505e-06, "loss": 0.05788125, "memory(GiB)": 15.03, "step": 9620, "train_speed(iter/s)": 1.47819 }, { "acc": 0.99724512, "epoch": 16.990291262135923, "grad_norm": 2.0727715492248535, "learning_rate": 7.870421234800908e-06, "loss": 0.02429129, "memory(GiB)": 15.03, "step": 9625, "train_speed(iter/s)": 1.478204 }, { "acc": 0.9944314, "epoch": 16.999117387466903, "grad_norm": 5.060495376586914, "learning_rate": 7.868028927330137e-06, "loss": 0.0483492, "memory(GiB)": 15.03, "step": 9630, "train_speed(iter/s)": 1.478188 }, { "acc": 0.99311028, "epoch": 17.007943512797883, "grad_norm": 3.8220903873443604, "learning_rate": 7.865635640983847e-06, "loss": 0.04224327, "memory(GiB)": 15.03, "step": 9635, "train_speed(iter/s)": 1.478137 }, { "acc": 0.99578953, "epoch": 17.01676963812886, "grad_norm": 1.0097200870513916, "learning_rate": 7.863241376579022e-06, "loss": 0.02999201, "memory(GiB)": 15.03, "step": 9640, "train_speed(iter/s)": 1.478129 }, { "acc": 0.99527607, "epoch": 17.02559576345984, "grad_norm": 5.066915988922119, "learning_rate": 7.860846134932982e-06, "loss": 0.03696901, "memory(GiB)": 15.03, "step": 9645, "train_speed(iter/s)": 1.478142 }, { "acc": 0.99382401, "epoch": 17.03442188879082, "grad_norm": 0.7646409273147583, "learning_rate": 7.858449916863382e-06, "loss": 0.03629427, "memory(GiB)": 15.03, "step": 9650, "train_speed(iter/s)": 1.47809 }, { "acc": 0.99326496, "epoch": 17.0432480141218, "grad_norm": 3.798686981201172, "learning_rate": 7.856052723188202e-06, "loss": 0.0527109, "memory(GiB)": 15.03, "step": 9655, "train_speed(iter/s)": 1.478112 }, { "acc": 0.99090309, "epoch": 17.05207413945278, "grad_norm": 3.068354606628418, "learning_rate": 7.853654554725767e-06, "loss": 0.05597183, "memory(GiB)": 15.03, "step": 9660, "train_speed(iter/s)": 1.478088 }, { "acc": 0.99356794, "epoch": 17.06090026478376, "grad_norm": 3.467702865600586, "learning_rate": 7.851255412294724e-06, "loss": 0.04120067, "memory(GiB)": 15.03, "step": 9665, "train_speed(iter/s)": 1.478053 }, { "acc": 0.99354124, "epoch": 17.069726390114738, "grad_norm": 1.0958819389343262, "learning_rate": 7.848855296714063e-06, "loss": 0.04634499, "memory(GiB)": 15.03, "step": 9670, "train_speed(iter/s)": 1.478023 }, { "acc": 0.99444313, "epoch": 17.07855251544572, "grad_norm": 1.974390983581543, "learning_rate": 7.846454208803099e-06, "loss": 0.04292429, "memory(GiB)": 15.03, "step": 9675, "train_speed(iter/s)": 1.478055 }, { "acc": 0.99370613, "epoch": 17.0873786407767, "grad_norm": 4.194187164306641, "learning_rate": 7.844052149381479e-06, "loss": 0.03741043, "memory(GiB)": 15.03, "step": 9680, "train_speed(iter/s)": 1.478051 }, { "acc": 0.99459343, "epoch": 17.09620476610768, "grad_norm": 3.0122950077056885, "learning_rate": 7.841649119269185e-06, "loss": 0.04444561, "memory(GiB)": 15.03, "step": 9685, "train_speed(iter/s)": 1.478077 }, { "acc": 0.99050665, "epoch": 17.10503089143866, "grad_norm": 3.076902151107788, "learning_rate": 7.839245119286528e-06, "loss": 0.0703499, "memory(GiB)": 15.03, "step": 9690, "train_speed(iter/s)": 1.478057 }, { "acc": 0.99590855, "epoch": 17.11385701676964, "grad_norm": 2.062615156173706, "learning_rate": 7.836840150254148e-06, "loss": 0.03238455, "memory(GiB)": 15.03, "step": 9695, "train_speed(iter/s)": 1.478052 }, { "acc": 0.99476175, "epoch": 17.12268314210062, "grad_norm": 3.240806818008423, "learning_rate": 7.834434212993023e-06, "loss": 0.03517009, "memory(GiB)": 15.03, "step": 9700, "train_speed(iter/s)": 1.478079 }, { "acc": 0.99408836, "epoch": 17.131509267431596, "grad_norm": 3.3569376468658447, "learning_rate": 7.832027308324454e-06, "loss": 0.04578951, "memory(GiB)": 15.03, "step": 9705, "train_speed(iter/s)": 1.478099 }, { "acc": 0.9927639, "epoch": 17.140335392762577, "grad_norm": 1.5600645542144775, "learning_rate": 7.829619437070079e-06, "loss": 0.04881061, "memory(GiB)": 15.03, "step": 9710, "train_speed(iter/s)": 1.478144 }, { "acc": 0.99245939, "epoch": 17.149161518093557, "grad_norm": 4.016834259033203, "learning_rate": 7.827210600051856e-06, "loss": 0.04947143, "memory(GiB)": 15.03, "step": 9715, "train_speed(iter/s)": 1.478132 }, { "acc": 0.99225769, "epoch": 17.157987643424537, "grad_norm": 3.7051291465759277, "learning_rate": 7.824800798092083e-06, "loss": 0.05187438, "memory(GiB)": 15.03, "step": 9720, "train_speed(iter/s)": 1.478153 }, { "acc": 0.9939621, "epoch": 17.166813768755517, "grad_norm": 2.4065308570861816, "learning_rate": 7.822390032013386e-06, "loss": 0.05698977, "memory(GiB)": 15.03, "step": 9725, "train_speed(iter/s)": 1.478219 }, { "acc": 0.99467793, "epoch": 17.175639894086498, "grad_norm": 3.1866238117218018, "learning_rate": 7.81997830263871e-06, "loss": 0.05152199, "memory(GiB)": 15.03, "step": 9730, "train_speed(iter/s)": 1.478209 }, { "acc": 0.99508171, "epoch": 17.184466019417474, "grad_norm": 3.9873836040496826, "learning_rate": 7.817565610791344e-06, "loss": 0.0478074, "memory(GiB)": 15.03, "step": 9735, "train_speed(iter/s)": 1.478248 }, { "acc": 0.99380836, "epoch": 17.193292144748455, "grad_norm": 2.9539403915405273, "learning_rate": 7.815151957294892e-06, "loss": 0.04947134, "memory(GiB)": 15.03, "step": 9740, "train_speed(iter/s)": 1.478237 }, { "acc": 0.99393082, "epoch": 17.202118270079435, "grad_norm": 1.638323187828064, "learning_rate": 7.812737342973295e-06, "loss": 0.05023437, "memory(GiB)": 15.03, "step": 9745, "train_speed(iter/s)": 1.478227 }, { "acc": 0.99530849, "epoch": 17.210944395410415, "grad_norm": 4.866275787353516, "learning_rate": 7.81032176865082e-06, "loss": 0.02911733, "memory(GiB)": 15.03, "step": 9750, "train_speed(iter/s)": 1.478246 }, { "acc": 0.99544926, "epoch": 17.219770520741395, "grad_norm": 2.0425500869750977, "learning_rate": 7.807905235152058e-06, "loss": 0.03327469, "memory(GiB)": 15.03, "step": 9755, "train_speed(iter/s)": 1.478255 }, { "acc": 0.9940218, "epoch": 17.228596646072376, "grad_norm": 2.4428799152374268, "learning_rate": 7.805487743301931e-06, "loss": 0.03533966, "memory(GiB)": 15.03, "step": 9760, "train_speed(iter/s)": 1.478272 }, { "acc": 0.99513922, "epoch": 17.237422771403352, "grad_norm": 2.0030171871185303, "learning_rate": 7.80306929392569e-06, "loss": 0.04070901, "memory(GiB)": 15.03, "step": 9765, "train_speed(iter/s)": 1.478289 }, { "acc": 0.99404707, "epoch": 17.246248896734333, "grad_norm": 6.270140171051025, "learning_rate": 7.800649887848907e-06, "loss": 0.04678546, "memory(GiB)": 15.03, "step": 9770, "train_speed(iter/s)": 1.478295 }, { "acc": 0.99190197, "epoch": 17.255075022065313, "grad_norm": 2.2881762981414795, "learning_rate": 7.798229525897483e-06, "loss": 0.06868464, "memory(GiB)": 15.03, "step": 9775, "train_speed(iter/s)": 1.478357 }, { "acc": 0.99213572, "epoch": 17.263901147396293, "grad_norm": 4.6103105545043945, "learning_rate": 7.79580820889765e-06, "loss": 0.05339604, "memory(GiB)": 15.03, "step": 9780, "train_speed(iter/s)": 1.478392 }, { "acc": 0.99600992, "epoch": 17.272727272727273, "grad_norm": 1.5684425830841064, "learning_rate": 7.793385937675962e-06, "loss": 0.03082765, "memory(GiB)": 15.03, "step": 9785, "train_speed(iter/s)": 1.478383 }, { "acc": 0.99557924, "epoch": 17.281553398058254, "grad_norm": 1.2775377035140991, "learning_rate": 7.790962713059294e-06, "loss": 0.03525525, "memory(GiB)": 15.03, "step": 9790, "train_speed(iter/s)": 1.478332 }, { "acc": 0.99428387, "epoch": 17.290379523389234, "grad_norm": 2.879889965057373, "learning_rate": 7.788538535874856e-06, "loss": 0.04505767, "memory(GiB)": 15.03, "step": 9795, "train_speed(iter/s)": 1.478344 }, { "acc": 0.99540501, "epoch": 17.29920564872021, "grad_norm": 3.025895118713379, "learning_rate": 7.786113406950178e-06, "loss": 0.03623917, "memory(GiB)": 15.03, "step": 9800, "train_speed(iter/s)": 1.478383 }, { "acc": 0.99203596, "epoch": 17.30803177405119, "grad_norm": 3.52785325050354, "learning_rate": 7.783687327113114e-06, "loss": 0.05955845, "memory(GiB)": 15.03, "step": 9805, "train_speed(iter/s)": 1.478386 }, { "acc": 0.9937706, "epoch": 17.31685789938217, "grad_norm": 2.468553066253662, "learning_rate": 7.781260297191843e-06, "loss": 0.04059466, "memory(GiB)": 15.03, "step": 9810, "train_speed(iter/s)": 1.47841 }, { "acc": 0.99325294, "epoch": 17.32568402471315, "grad_norm": 1.3375524282455444, "learning_rate": 7.778832318014871e-06, "loss": 0.0400872, "memory(GiB)": 15.03, "step": 9815, "train_speed(iter/s)": 1.478427 }, { "acc": 0.99541788, "epoch": 17.33451015004413, "grad_norm": 2.305830240249634, "learning_rate": 7.776403390411027e-06, "loss": 0.03852956, "memory(GiB)": 15.03, "step": 9820, "train_speed(iter/s)": 1.478428 }, { "acc": 0.99551859, "epoch": 17.343336275375112, "grad_norm": 3.092710494995117, "learning_rate": 7.773973515209464e-06, "loss": 0.02752784, "memory(GiB)": 15.03, "step": 9825, "train_speed(iter/s)": 1.478457 }, { "acc": 0.99425201, "epoch": 17.35216240070609, "grad_norm": 3.792820692062378, "learning_rate": 7.771542693239653e-06, "loss": 0.04608046, "memory(GiB)": 15.03, "step": 9830, "train_speed(iter/s)": 1.478415 }, { "acc": 0.99539413, "epoch": 17.36098852603707, "grad_norm": 1.7828278541564941, "learning_rate": 7.769110925331398e-06, "loss": 0.03611037, "memory(GiB)": 15.03, "step": 9835, "train_speed(iter/s)": 1.47838 }, { "acc": 0.9940073, "epoch": 17.36981465136805, "grad_norm": 3.6699178218841553, "learning_rate": 7.766678212314815e-06, "loss": 0.04330898, "memory(GiB)": 15.03, "step": 9840, "train_speed(iter/s)": 1.478383 }, { "acc": 0.99789314, "epoch": 17.37864077669903, "grad_norm": 2.9162964820861816, "learning_rate": 7.764244555020356e-06, "loss": 0.01775346, "memory(GiB)": 15.03, "step": 9845, "train_speed(iter/s)": 1.478371 }, { "acc": 0.99220047, "epoch": 17.38746690203001, "grad_norm": 2.589425802230835, "learning_rate": 7.761809954278783e-06, "loss": 0.05446519, "memory(GiB)": 15.03, "step": 9850, "train_speed(iter/s)": 1.478342 }, { "acc": 0.99463978, "epoch": 17.39629302736099, "grad_norm": 1.5630978345870972, "learning_rate": 7.759374410921185e-06, "loss": 0.0469648, "memory(GiB)": 15.03, "step": 9855, "train_speed(iter/s)": 1.478347 }, { "acc": 0.99238558, "epoch": 17.405119152691967, "grad_norm": 5.051239967346191, "learning_rate": 7.756937925778971e-06, "loss": 0.0543581, "memory(GiB)": 15.03, "step": 9860, "train_speed(iter/s)": 1.478357 }, { "acc": 0.99567108, "epoch": 17.413945278022947, "grad_norm": 1.042861819267273, "learning_rate": 7.754500499683875e-06, "loss": 0.02781825, "memory(GiB)": 15.03, "step": 9865, "train_speed(iter/s)": 1.478384 }, { "acc": 0.99605503, "epoch": 17.422771403353927, "grad_norm": 0.7020120024681091, "learning_rate": 7.752062133467953e-06, "loss": 0.0297663, "memory(GiB)": 15.03, "step": 9870, "train_speed(iter/s)": 1.478405 }, { "acc": 0.99550304, "epoch": 17.431597528684907, "grad_norm": 1.8260146379470825, "learning_rate": 7.749622827963574e-06, "loss": 0.03281555, "memory(GiB)": 15.03, "step": 9875, "train_speed(iter/s)": 1.478414 }, { "acc": 0.99549885, "epoch": 17.440423654015888, "grad_norm": 3.422137498855591, "learning_rate": 7.747182584003435e-06, "loss": 0.03595309, "memory(GiB)": 15.03, "step": 9880, "train_speed(iter/s)": 1.478411 }, { "acc": 0.99544086, "epoch": 17.449249779346868, "grad_norm": 4.450588226318359, "learning_rate": 7.744741402420553e-06, "loss": 0.04044379, "memory(GiB)": 15.03, "step": 9885, "train_speed(iter/s)": 1.478422 }, { "acc": 0.99351845, "epoch": 17.458075904677848, "grad_norm": 1.6657445430755615, "learning_rate": 7.74229928404826e-06, "loss": 0.04925648, "memory(GiB)": 15.03, "step": 9890, "train_speed(iter/s)": 1.47844 }, { "acc": 0.99416618, "epoch": 17.466902030008825, "grad_norm": 1.441227912902832, "learning_rate": 7.739856229720214e-06, "loss": 0.05025038, "memory(GiB)": 15.03, "step": 9895, "train_speed(iter/s)": 1.47846 }, { "acc": 0.99628267, "epoch": 17.475728155339805, "grad_norm": 1.467867374420166, "learning_rate": 7.737412240270387e-06, "loss": 0.01986405, "memory(GiB)": 15.03, "step": 9900, "train_speed(iter/s)": 1.478485 }, { "acc": 0.99408083, "epoch": 17.484554280670785, "grad_norm": 1.957005262374878, "learning_rate": 7.734967316533074e-06, "loss": 0.03856125, "memory(GiB)": 15.03, "step": 9905, "train_speed(iter/s)": 1.478472 }, { "acc": 0.99367809, "epoch": 17.493380406001766, "grad_norm": 3.8621137142181396, "learning_rate": 7.732521459342888e-06, "loss": 0.06251855, "memory(GiB)": 15.03, "step": 9910, "train_speed(iter/s)": 1.4785 }, { "acc": 0.99358912, "epoch": 17.502206531332746, "grad_norm": 2.052616834640503, "learning_rate": 7.730074669534758e-06, "loss": 0.04817207, "memory(GiB)": 15.03, "step": 9915, "train_speed(iter/s)": 1.478522 }, { "acc": 0.99396572, "epoch": 17.511032656663726, "grad_norm": 5.782065391540527, "learning_rate": 7.727626947943938e-06, "loss": 0.04457709, "memory(GiB)": 15.03, "step": 9920, "train_speed(iter/s)": 1.478499 }, { "acc": 0.9948576, "epoch": 17.519858781994703, "grad_norm": 2.2200982570648193, "learning_rate": 7.725178295405991e-06, "loss": 0.03645321, "memory(GiB)": 15.03, "step": 9925, "train_speed(iter/s)": 1.478486 }, { "acc": 0.9948308, "epoch": 17.528684907325683, "grad_norm": 5.274273872375488, "learning_rate": 7.722728712756807e-06, "loss": 0.02887793, "memory(GiB)": 15.03, "step": 9930, "train_speed(iter/s)": 1.478486 }, { "acc": 0.9962965, "epoch": 17.537511032656663, "grad_norm": 4.026265621185303, "learning_rate": 7.720278200832585e-06, "loss": 0.03112929, "memory(GiB)": 15.03, "step": 9935, "train_speed(iter/s)": 1.478478 }, { "acc": 0.99584446, "epoch": 17.546337157987644, "grad_norm": 4.593540668487549, "learning_rate": 7.71782676046985e-06, "loss": 0.03158109, "memory(GiB)": 15.03, "step": 9940, "train_speed(iter/s)": 1.478461 }, { "acc": 0.99423161, "epoch": 17.555163283318624, "grad_norm": 4.0273051261901855, "learning_rate": 7.715374392505433e-06, "loss": 0.0544739, "memory(GiB)": 15.03, "step": 9945, "train_speed(iter/s)": 1.478483 }, { "acc": 0.99599047, "epoch": 17.563989408649604, "grad_norm": 1.6942386627197266, "learning_rate": 7.712921097776494e-06, "loss": 0.03742793, "memory(GiB)": 15.03, "step": 9950, "train_speed(iter/s)": 1.478517 }, { "acc": 0.99657965, "epoch": 17.57281553398058, "grad_norm": 1.3992186784744263, "learning_rate": 7.7104668771205e-06, "loss": 0.03179007, "memory(GiB)": 15.03, "step": 9955, "train_speed(iter/s)": 1.478538 }, { "acc": 0.9952816, "epoch": 17.58164165931156, "grad_norm": 1.2683521509170532, "learning_rate": 7.708011731375236e-06, "loss": 0.03274857, "memory(GiB)": 15.03, "step": 9960, "train_speed(iter/s)": 1.478543 }, { "acc": 0.99427156, "epoch": 17.59046778464254, "grad_norm": 4.37578821182251, "learning_rate": 7.705555661378807e-06, "loss": 0.04048403, "memory(GiB)": 15.03, "step": 9965, "train_speed(iter/s)": 1.478542 }, { "acc": 0.99535961, "epoch": 17.59929390997352, "grad_norm": 3.0051193237304688, "learning_rate": 7.70309866796963e-06, "loss": 0.0467912, "memory(GiB)": 15.03, "step": 9970, "train_speed(iter/s)": 1.478559 }, { "acc": 0.9965147, "epoch": 17.608120035304502, "grad_norm": 2.729163408279419, "learning_rate": 7.700640751986436e-06, "loss": 0.0268898, "memory(GiB)": 15.03, "step": 9975, "train_speed(iter/s)": 1.47855 }, { "acc": 0.99838715, "epoch": 17.616946160635482, "grad_norm": 1.7705786228179932, "learning_rate": 7.698181914268278e-06, "loss": 0.02134103, "memory(GiB)": 15.03, "step": 9980, "train_speed(iter/s)": 1.478568 }, { "acc": 0.99499626, "epoch": 17.625772285966463, "grad_norm": 2.466395854949951, "learning_rate": 7.69572215565451e-06, "loss": 0.03990422, "memory(GiB)": 15.03, "step": 9985, "train_speed(iter/s)": 1.478546 }, { "acc": 0.99314079, "epoch": 17.63459841129744, "grad_norm": 5.555733680725098, "learning_rate": 7.693261476984818e-06, "loss": 0.04989852, "memory(GiB)": 15.03, "step": 9990, "train_speed(iter/s)": 1.478552 }, { "acc": 0.9938776, "epoch": 17.64342453662842, "grad_norm": 1.189917802810669, "learning_rate": 7.690799879099185e-06, "loss": 0.04334961, "memory(GiB)": 15.03, "step": 9995, "train_speed(iter/s)": 1.478577 }, { "acc": 0.99579792, "epoch": 17.6522506619594, "grad_norm": 2.7536933422088623, "learning_rate": 7.68833736283792e-06, "loss": 0.03024867, "memory(GiB)": 15.03, "step": 10000, "train_speed(iter/s)": 1.478571 }, { "epoch": 17.6522506619594, "eval_acc": 0.7835826519461924, "eval_loss": 1.4869691133499146, "eval_runtime": 29.9625, "eval_samples_per_second": 44.556, "eval_steps_per_second": 5.574, "step": 10000 }, { "acc": 0.99740467, "epoch": 17.66107678729038, "grad_norm": 1.5049043893814087, "learning_rate": 7.68587392904164e-06, "loss": 0.02404172, "memory(GiB)": 15.03, "step": 10005, "train_speed(iter/s)": 1.466397 }, { "acc": 0.99232149, "epoch": 17.66990291262136, "grad_norm": 5.482903480529785, "learning_rate": 7.683409578551277e-06, "loss": 0.05811324, "memory(GiB)": 15.03, "step": 10010, "train_speed(iter/s)": 1.466393 }, { "acc": 0.99437342, "epoch": 17.67872903795234, "grad_norm": 3.9501161575317383, "learning_rate": 7.680944312208076e-06, "loss": 0.0411664, "memory(GiB)": 15.03, "step": 10015, "train_speed(iter/s)": 1.466417 }, { "acc": 0.99573956, "epoch": 17.687555163283317, "grad_norm": 2.317035675048828, "learning_rate": 7.678478130853591e-06, "loss": 0.03188133, "memory(GiB)": 15.03, "step": 10020, "train_speed(iter/s)": 1.466425 }, { "acc": 0.9965724, "epoch": 17.696381288614297, "grad_norm": 2.2091453075408936, "learning_rate": 7.676011035329694e-06, "loss": 0.02581523, "memory(GiB)": 15.03, "step": 10025, "train_speed(iter/s)": 1.466425 }, { "acc": 0.99278069, "epoch": 17.705207413945278, "grad_norm": 4.233280658721924, "learning_rate": 7.673543026478565e-06, "loss": 0.07244796, "memory(GiB)": 15.03, "step": 10030, "train_speed(iter/s)": 1.466458 }, { "acc": 0.99593792, "epoch": 17.714033539276258, "grad_norm": 0.3629441559314728, "learning_rate": 7.671074105142698e-06, "loss": 0.0399328, "memory(GiB)": 15.03, "step": 10035, "train_speed(iter/s)": 1.466533 }, { "acc": 0.99594383, "epoch": 17.72285966460724, "grad_norm": 5.78774356842041, "learning_rate": 7.668604272164895e-06, "loss": 0.03257771, "memory(GiB)": 15.03, "step": 10040, "train_speed(iter/s)": 1.466565 }, { "acc": 0.99390078, "epoch": 17.73168578993822, "grad_norm": 2.638082265853882, "learning_rate": 7.666133528388276e-06, "loss": 0.04690317, "memory(GiB)": 15.03, "step": 10045, "train_speed(iter/s)": 1.466597 }, { "acc": 0.99493065, "epoch": 17.740511915269195, "grad_norm": 3.1249802112579346, "learning_rate": 7.663661874656266e-06, "loss": 0.02921008, "memory(GiB)": 15.03, "step": 10050, "train_speed(iter/s)": 1.466647 }, { "acc": 0.99255447, "epoch": 17.749338040600176, "grad_norm": 4.417450428009033, "learning_rate": 7.661189311812603e-06, "loss": 0.04916076, "memory(GiB)": 15.03, "step": 10055, "train_speed(iter/s)": 1.466704 }, { "acc": 0.99445381, "epoch": 17.758164165931156, "grad_norm": 3.4678070545196533, "learning_rate": 7.658715840701331e-06, "loss": 0.04341879, "memory(GiB)": 15.03, "step": 10060, "train_speed(iter/s)": 1.466713 }, { "acc": 0.99633474, "epoch": 17.766990291262136, "grad_norm": 2.6440513134002686, "learning_rate": 7.656241462166814e-06, "loss": 0.02901059, "memory(GiB)": 15.03, "step": 10065, "train_speed(iter/s)": 1.466713 }, { "acc": 0.99161901, "epoch": 17.775816416593116, "grad_norm": 3.370867967605591, "learning_rate": 7.653766177053715e-06, "loss": 0.04859219, "memory(GiB)": 15.03, "step": 10070, "train_speed(iter/s)": 1.466721 }, { "acc": 0.99333019, "epoch": 17.784642541924097, "grad_norm": 2.0927236080169678, "learning_rate": 7.65128998620701e-06, "loss": 0.04393213, "memory(GiB)": 15.03, "step": 10075, "train_speed(iter/s)": 1.466684 }, { "acc": 0.99421835, "epoch": 17.793468667255077, "grad_norm": 0.9678975343704224, "learning_rate": 7.648812890471988e-06, "loss": 0.04452407, "memory(GiB)": 15.03, "step": 10080, "train_speed(iter/s)": 1.466683 }, { "acc": 0.99250431, "epoch": 17.802294792586054, "grad_norm": 2.181732177734375, "learning_rate": 7.646334890694244e-06, "loss": 0.04561382, "memory(GiB)": 15.03, "step": 10085, "train_speed(iter/s)": 1.466698 }, { "acc": 0.99487648, "epoch": 17.811120917917034, "grad_norm": 12.806998252868652, "learning_rate": 7.643855987719684e-06, "loss": 0.04191253, "memory(GiB)": 15.03, "step": 10090, "train_speed(iter/s)": 1.466721 }, { "acc": 0.99565096, "epoch": 17.819947043248014, "grad_norm": 6.200354099273682, "learning_rate": 7.641376182394514e-06, "loss": 0.02978515, "memory(GiB)": 15.03, "step": 10095, "train_speed(iter/s)": 1.466723 }, { "acc": 0.99398041, "epoch": 17.828773168578994, "grad_norm": 4.286774635314941, "learning_rate": 7.638895475565256e-06, "loss": 0.03020864, "memory(GiB)": 15.03, "step": 10100, "train_speed(iter/s)": 1.466715 }, { "acc": 0.99430494, "epoch": 17.837599293909975, "grad_norm": 3.5080573558807373, "learning_rate": 7.636413868078742e-06, "loss": 0.03891816, "memory(GiB)": 15.03, "step": 10105, "train_speed(iter/s)": 1.466748 }, { "acc": 0.9956769, "epoch": 17.846425419240955, "grad_norm": 3.4001429080963135, "learning_rate": 7.633931360782103e-06, "loss": 0.023072, "memory(GiB)": 15.03, "step": 10110, "train_speed(iter/s)": 1.466803 }, { "acc": 0.99387369, "epoch": 17.85525154457193, "grad_norm": 2.974884510040283, "learning_rate": 7.631447954522783e-06, "loss": 0.03672757, "memory(GiB)": 15.03, "step": 10115, "train_speed(iter/s)": 1.466809 }, { "acc": 0.99452486, "epoch": 17.864077669902912, "grad_norm": 3.7273781299591064, "learning_rate": 7.628963650148531e-06, "loss": 0.03709808, "memory(GiB)": 15.03, "step": 10120, "train_speed(iter/s)": 1.466795 }, { "acc": 0.99203148, "epoch": 17.872903795233892, "grad_norm": 2.4205644130706787, "learning_rate": 7.626478448507403e-06, "loss": 0.04875247, "memory(GiB)": 15.03, "step": 10125, "train_speed(iter/s)": 1.466792 }, { "acc": 0.99258099, "epoch": 17.881729920564872, "grad_norm": 2.762462854385376, "learning_rate": 7.623992350447762e-06, "loss": 0.04784314, "memory(GiB)": 15.03, "step": 10130, "train_speed(iter/s)": 1.466819 }, { "acc": 0.99225006, "epoch": 17.890556045895853, "grad_norm": 3.2449891567230225, "learning_rate": 7.621505356818272e-06, "loss": 0.04671842, "memory(GiB)": 15.03, "step": 10135, "train_speed(iter/s)": 1.466807 }, { "acc": 0.99590502, "epoch": 17.899382171226833, "grad_norm": 1.4878824949264526, "learning_rate": 7.619017468467912e-06, "loss": 0.04433917, "memory(GiB)": 15.03, "step": 10140, "train_speed(iter/s)": 1.466809 }, { "acc": 0.99263697, "epoch": 17.90820829655781, "grad_norm": 3.129918098449707, "learning_rate": 7.61652868624596e-06, "loss": 0.03509561, "memory(GiB)": 15.03, "step": 10145, "train_speed(iter/s)": 1.466796 }, { "acc": 0.99490223, "epoch": 17.91703442188879, "grad_norm": 3.6546027660369873, "learning_rate": 7.614039011001998e-06, "loss": 0.04252486, "memory(GiB)": 15.03, "step": 10150, "train_speed(iter/s)": 1.466802 }, { "acc": 0.99435101, "epoch": 17.92586054721977, "grad_norm": 2.6008570194244385, "learning_rate": 7.6115484435859185e-06, "loss": 0.04082156, "memory(GiB)": 15.03, "step": 10155, "train_speed(iter/s)": 1.466825 }, { "acc": 0.99716301, "epoch": 17.93468667255075, "grad_norm": 2.648212432861328, "learning_rate": 7.609056984847913e-06, "loss": 0.02625949, "memory(GiB)": 15.03, "step": 10160, "train_speed(iter/s)": 1.466869 }, { "acc": 0.99502916, "epoch": 17.94351279788173, "grad_norm": 1.7892282009124756, "learning_rate": 7.6065646356384815e-06, "loss": 0.04424247, "memory(GiB)": 15.03, "step": 10165, "train_speed(iter/s)": 1.466892 }, { "acc": 0.99540501, "epoch": 17.95233892321271, "grad_norm": 3.4043874740600586, "learning_rate": 7.604071396808425e-06, "loss": 0.0314017, "memory(GiB)": 15.03, "step": 10170, "train_speed(iter/s)": 1.466915 }, { "acc": 0.9944315, "epoch": 17.96116504854369, "grad_norm": 1.4133124351501465, "learning_rate": 7.601577269208849e-06, "loss": 0.03991629, "memory(GiB)": 15.03, "step": 10175, "train_speed(iter/s)": 1.466927 }, { "acc": 0.99466333, "epoch": 17.969991173874668, "grad_norm": 2.9340054988861084, "learning_rate": 7.599082253691166e-06, "loss": 0.0428099, "memory(GiB)": 15.03, "step": 10180, "train_speed(iter/s)": 1.466926 }, { "acc": 0.99562588, "epoch": 17.978817299205648, "grad_norm": 2.9268722534179688, "learning_rate": 7.596586351107081e-06, "loss": 0.04710588, "memory(GiB)": 15.03, "step": 10185, "train_speed(iter/s)": 1.466951 }, { "acc": 0.99392071, "epoch": 17.98764342453663, "grad_norm": 3.3984947204589844, "learning_rate": 7.594089562308614e-06, "loss": 0.04086304, "memory(GiB)": 15.03, "step": 10190, "train_speed(iter/s)": 1.466946 }, { "acc": 0.995121, "epoch": 17.99646954986761, "grad_norm": 2.177217483520508, "learning_rate": 7.591591888148085e-06, "loss": 0.03509564, "memory(GiB)": 15.03, "step": 10195, "train_speed(iter/s)": 1.466961 }, { "acc": 0.995051, "epoch": 18.00529567519859, "grad_norm": 3.2771718502044678, "learning_rate": 7.5890933294781075e-06, "loss": 0.03902372, "memory(GiB)": 15.03, "step": 10200, "train_speed(iter/s)": 1.466907 }, { "acc": 0.99567585, "epoch": 18.01412180052957, "grad_norm": 0.8040958642959595, "learning_rate": 7.5865938871516095e-06, "loss": 0.03313239, "memory(GiB)": 15.03, "step": 10205, "train_speed(iter/s)": 1.466939 }, { "acc": 0.99361019, "epoch": 18.022947925860546, "grad_norm": 2.789924144744873, "learning_rate": 7.584093562021813e-06, "loss": 0.04210989, "memory(GiB)": 15.03, "step": 10210, "train_speed(iter/s)": 1.46694 }, { "acc": 0.99369125, "epoch": 18.031774051191526, "grad_norm": 3.0302867889404297, "learning_rate": 7.581592354942239e-06, "loss": 0.03999768, "memory(GiB)": 15.03, "step": 10215, "train_speed(iter/s)": 1.466985 }, { "acc": 0.99391222, "epoch": 18.040600176522506, "grad_norm": 0.9447545409202576, "learning_rate": 7.5790902667667165e-06, "loss": 0.04656133, "memory(GiB)": 15.03, "step": 10220, "train_speed(iter/s)": 1.467001 }, { "acc": 0.99040527, "epoch": 18.049426301853487, "grad_norm": 6.38631534576416, "learning_rate": 7.576587298349371e-06, "loss": 0.07236059, "memory(GiB)": 15.03, "step": 10225, "train_speed(iter/s)": 1.467028 }, { "acc": 0.99601536, "epoch": 18.058252427184467, "grad_norm": 1.472269058227539, "learning_rate": 7.574083450544631e-06, "loss": 0.03763237, "memory(GiB)": 15.03, "step": 10230, "train_speed(iter/s)": 1.467043 }, { "acc": 0.99634266, "epoch": 18.067078552515447, "grad_norm": 2.7504637241363525, "learning_rate": 7.571578724207224e-06, "loss": 0.02447524, "memory(GiB)": 15.03, "step": 10235, "train_speed(iter/s)": 1.46704 }, { "acc": 0.99321594, "epoch": 18.075904677846424, "grad_norm": 2.173748731613159, "learning_rate": 7.569073120192175e-06, "loss": 0.05163277, "memory(GiB)": 15.03, "step": 10240, "train_speed(iter/s)": 1.467019 }, { "acc": 0.99421644, "epoch": 18.084730803177404, "grad_norm": 1.3330950736999512, "learning_rate": 7.566566639354812e-06, "loss": 0.04639413, "memory(GiB)": 15.03, "step": 10245, "train_speed(iter/s)": 1.467036 }, { "acc": 0.99543762, "epoch": 18.093556928508384, "grad_norm": 2.87141752243042, "learning_rate": 7.564059282550762e-06, "loss": 0.02888824, "memory(GiB)": 15.03, "step": 10250, "train_speed(iter/s)": 1.467053 }, { "acc": 0.99319849, "epoch": 18.102383053839365, "grad_norm": 2.5134356021881104, "learning_rate": 7.561551050635948e-06, "loss": 0.04523617, "memory(GiB)": 15.03, "step": 10255, "train_speed(iter/s)": 1.467065 }, { "acc": 0.99013309, "epoch": 18.111209179170345, "grad_norm": 4.766937255859375, "learning_rate": 7.559041944466595e-06, "loss": 0.06854216, "memory(GiB)": 15.03, "step": 10260, "train_speed(iter/s)": 1.467092 }, { "acc": 0.99400063, "epoch": 18.120035304501325, "grad_norm": 1.1949303150177002, "learning_rate": 7.556531964899229e-06, "loss": 0.04052277, "memory(GiB)": 15.03, "step": 10265, "train_speed(iter/s)": 1.467106 }, { "acc": 0.9955595, "epoch": 18.128861429832302, "grad_norm": 2.0645060539245605, "learning_rate": 7.554021112790664e-06, "loss": 0.03888389, "memory(GiB)": 15.03, "step": 10270, "train_speed(iter/s)": 1.467069 }, { "acc": 0.9949192, "epoch": 18.137687555163282, "grad_norm": 2.476905107498169, "learning_rate": 7.551509388998022e-06, "loss": 0.03007277, "memory(GiB)": 15.03, "step": 10275, "train_speed(iter/s)": 1.467077 }, { "acc": 0.99474125, "epoch": 18.146513680494262, "grad_norm": 1.4110442399978638, "learning_rate": 7.548996794378719e-06, "loss": 0.03628931, "memory(GiB)": 15.03, "step": 10280, "train_speed(iter/s)": 1.46706 }, { "acc": 0.99523544, "epoch": 18.155339805825243, "grad_norm": 4.216243267059326, "learning_rate": 7.546483329790469e-06, "loss": 0.03328291, "memory(GiB)": 15.03, "step": 10285, "train_speed(iter/s)": 1.467033 }, { "acc": 0.99291582, "epoch": 18.164165931156223, "grad_norm": 1.1969436407089233, "learning_rate": 7.543968996091276e-06, "loss": 0.05348536, "memory(GiB)": 15.03, "step": 10290, "train_speed(iter/s)": 1.467051 }, { "acc": 0.99495869, "epoch": 18.172992056487203, "grad_norm": 2.04093337059021, "learning_rate": 7.541453794139455e-06, "loss": 0.03742071, "memory(GiB)": 15.03, "step": 10295, "train_speed(iter/s)": 1.467018 }, { "acc": 0.99693375, "epoch": 18.181818181818183, "grad_norm": 2.262143135070801, "learning_rate": 7.538937724793607e-06, "loss": 0.02369385, "memory(GiB)": 15.03, "step": 10300, "train_speed(iter/s)": 1.467017 }, { "acc": 0.99695692, "epoch": 18.19064430714916, "grad_norm": 1.4326120615005493, "learning_rate": 7.536420788912627e-06, "loss": 0.02332468, "memory(GiB)": 15.03, "step": 10305, "train_speed(iter/s)": 1.467042 }, { "acc": 0.99731836, "epoch": 18.19947043248014, "grad_norm": 1.1645814180374146, "learning_rate": 7.533902987355716e-06, "loss": 0.02667435, "memory(GiB)": 15.03, "step": 10310, "train_speed(iter/s)": 1.467036 }, { "acc": 0.99265165, "epoch": 18.20829655781112, "grad_norm": 2.475341796875, "learning_rate": 7.531384320982361e-06, "loss": 0.04979344, "memory(GiB)": 15.03, "step": 10315, "train_speed(iter/s)": 1.467057 }, { "acc": 0.99641438, "epoch": 18.2171226831421, "grad_norm": 1.3961032629013062, "learning_rate": 7.528864790652348e-06, "loss": 0.0248911, "memory(GiB)": 15.03, "step": 10320, "train_speed(iter/s)": 1.467039 }, { "acc": 0.99522381, "epoch": 18.22594880847308, "grad_norm": 3.339519739151001, "learning_rate": 7.526344397225761e-06, "loss": 0.03121329, "memory(GiB)": 15.03, "step": 10325, "train_speed(iter/s)": 1.467074 }, { "acc": 0.99378014, "epoch": 18.23477493380406, "grad_norm": 3.781367540359497, "learning_rate": 7.52382314156297e-06, "loss": 0.04672502, "memory(GiB)": 15.03, "step": 10330, "train_speed(iter/s)": 1.467082 }, { "acc": 0.99525719, "epoch": 18.243601059135038, "grad_norm": 2.548447608947754, "learning_rate": 7.5213010245246495e-06, "loss": 0.03873033, "memory(GiB)": 15.03, "step": 10335, "train_speed(iter/s)": 1.467071 }, { "acc": 0.99468613, "epoch": 18.25242718446602, "grad_norm": 1.9319528341293335, "learning_rate": 7.518778046971762e-06, "loss": 0.0433689, "memory(GiB)": 15.03, "step": 10340, "train_speed(iter/s)": 1.467035 }, { "acc": 0.99676991, "epoch": 18.261253309797, "grad_norm": 2.8823227882385254, "learning_rate": 7.516254209765563e-06, "loss": 0.02964991, "memory(GiB)": 15.03, "step": 10345, "train_speed(iter/s)": 1.467079 }, { "acc": 0.99430399, "epoch": 18.27007943512798, "grad_norm": 2.357220411300659, "learning_rate": 7.513729513767607e-06, "loss": 0.0305687, "memory(GiB)": 15.03, "step": 10350, "train_speed(iter/s)": 1.467119 }, { "acc": 0.99687719, "epoch": 18.27890556045896, "grad_norm": 4.715262413024902, "learning_rate": 7.511203959839736e-06, "loss": 0.02537756, "memory(GiB)": 15.03, "step": 10355, "train_speed(iter/s)": 1.467155 }, { "acc": 0.99589071, "epoch": 18.28773168578994, "grad_norm": 3.202244997024536, "learning_rate": 7.508677548844088e-06, "loss": 0.04457169, "memory(GiB)": 15.03, "step": 10360, "train_speed(iter/s)": 1.467155 }, { "acc": 0.99524288, "epoch": 18.296557811120916, "grad_norm": 3.7213199138641357, "learning_rate": 7.506150281643091e-06, "loss": 0.02317304, "memory(GiB)": 15.03, "step": 10365, "train_speed(iter/s)": 1.467173 }, { "acc": 0.99177313, "epoch": 18.305383936451896, "grad_norm": 4.018159866333008, "learning_rate": 7.503622159099467e-06, "loss": 0.05468427, "memory(GiB)": 15.03, "step": 10370, "train_speed(iter/s)": 1.467187 }, { "acc": 0.99435425, "epoch": 18.314210061782877, "grad_norm": 1.7193354368209839, "learning_rate": 7.501093182076232e-06, "loss": 0.04521595, "memory(GiB)": 15.03, "step": 10375, "train_speed(iter/s)": 1.467208 }, { "acc": 0.99632416, "epoch": 18.323036187113857, "grad_norm": 2.220630645751953, "learning_rate": 7.4985633514366915e-06, "loss": 0.03403509, "memory(GiB)": 15.03, "step": 10380, "train_speed(iter/s)": 1.46716 }, { "acc": 0.99365101, "epoch": 18.331862312444837, "grad_norm": 4.286769866943359, "learning_rate": 7.49603266804444e-06, "loss": 0.03444353, "memory(GiB)": 15.03, "step": 10385, "train_speed(iter/s)": 1.46711 }, { "acc": 0.99465237, "epoch": 18.340688437775817, "grad_norm": 2.937446355819702, "learning_rate": 7.493501132763367e-06, "loss": 0.03146525, "memory(GiB)": 15.03, "step": 10390, "train_speed(iter/s)": 1.46716 }, { "acc": 0.99400406, "epoch": 18.349514563106798, "grad_norm": 2.7685256004333496, "learning_rate": 7.490968746457652e-06, "loss": 0.04558892, "memory(GiB)": 15.03, "step": 10395, "train_speed(iter/s)": 1.467174 }, { "acc": 0.99481373, "epoch": 18.358340688437774, "grad_norm": 2.3499014377593994, "learning_rate": 7.488435509991763e-06, "loss": 0.03854008, "memory(GiB)": 15.03, "step": 10400, "train_speed(iter/s)": 1.467189 }, { "acc": 0.99504528, "epoch": 18.367166813768755, "grad_norm": 1.1618775129318237, "learning_rate": 7.48590142423046e-06, "loss": 0.03567562, "memory(GiB)": 15.03, "step": 10405, "train_speed(iter/s)": 1.467165 }, { "acc": 0.99377995, "epoch": 18.375992939099735, "grad_norm": 3.6673824787139893, "learning_rate": 7.483366490038791e-06, "loss": 0.04789905, "memory(GiB)": 15.03, "step": 10410, "train_speed(iter/s)": 1.467158 }, { "acc": 0.99543419, "epoch": 18.384819064430715, "grad_norm": 1.2376039028167725, "learning_rate": 7.4808307082821005e-06, "loss": 0.02942109, "memory(GiB)": 15.03, "step": 10415, "train_speed(iter/s)": 1.467146 }, { "acc": 0.99411259, "epoch": 18.393645189761695, "grad_norm": 4.03737735748291, "learning_rate": 7.4782940798260116e-06, "loss": 0.03818934, "memory(GiB)": 15.03, "step": 10420, "train_speed(iter/s)": 1.467144 }, { "acc": 0.99318352, "epoch": 18.402471315092676, "grad_norm": 3.34446120262146, "learning_rate": 7.475756605536445e-06, "loss": 0.04556619, "memory(GiB)": 15.03, "step": 10425, "train_speed(iter/s)": 1.467177 }, { "acc": 0.99340858, "epoch": 18.411297440423652, "grad_norm": 3.5381062030792236, "learning_rate": 7.473218286279604e-06, "loss": 0.04372908, "memory(GiB)": 15.03, "step": 10430, "train_speed(iter/s)": 1.467176 }, { "acc": 0.994664, "epoch": 18.420123565754633, "grad_norm": 2.9087462425231934, "learning_rate": 7.470679122921987e-06, "loss": 0.03306186, "memory(GiB)": 15.03, "step": 10435, "train_speed(iter/s)": 1.467172 }, { "acc": 0.99824696, "epoch": 18.428949691085613, "grad_norm": 1.779293179512024, "learning_rate": 7.468139116330372e-06, "loss": 0.01927549, "memory(GiB)": 15.03, "step": 10440, "train_speed(iter/s)": 1.467148 }, { "acc": 0.99360828, "epoch": 18.437775816416593, "grad_norm": 2.235201597213745, "learning_rate": 7.465598267371834e-06, "loss": 0.05014745, "memory(GiB)": 15.03, "step": 10445, "train_speed(iter/s)": 1.467152 }, { "acc": 0.99401188, "epoch": 18.446601941747574, "grad_norm": 0.8639301657676697, "learning_rate": 7.46305657691373e-06, "loss": 0.03350316, "memory(GiB)": 15.03, "step": 10450, "train_speed(iter/s)": 1.467132 }, { "acc": 0.99513683, "epoch": 18.455428067078554, "grad_norm": 1.75994074344635, "learning_rate": 7.460514045823703e-06, "loss": 0.03282738, "memory(GiB)": 15.03, "step": 10455, "train_speed(iter/s)": 1.46715 }, { "acc": 0.99562855, "epoch": 18.46425419240953, "grad_norm": 3.1535375118255615, "learning_rate": 7.457970674969689e-06, "loss": 0.03616733, "memory(GiB)": 15.03, "step": 10460, "train_speed(iter/s)": 1.467117 }, { "acc": 0.99561386, "epoch": 18.47308031774051, "grad_norm": 2.127722978591919, "learning_rate": 7.455426465219906e-06, "loss": 0.03747438, "memory(GiB)": 15.03, "step": 10465, "train_speed(iter/s)": 1.467134 }, { "acc": 0.99530582, "epoch": 18.48190644307149, "grad_norm": 2.1746981143951416, "learning_rate": 7.4528814174428585e-06, "loss": 0.0382607, "memory(GiB)": 15.03, "step": 10470, "train_speed(iter/s)": 1.467162 }, { "acc": 0.99558449, "epoch": 18.49073256840247, "grad_norm": 1.371174693107605, "learning_rate": 7.450335532507339e-06, "loss": 0.03995432, "memory(GiB)": 15.03, "step": 10475, "train_speed(iter/s)": 1.467147 }, { "acc": 0.9950079, "epoch": 18.49955869373345, "grad_norm": 3.362412929534912, "learning_rate": 7.447788811282425e-06, "loss": 0.02788143, "memory(GiB)": 15.03, "step": 10480, "train_speed(iter/s)": 1.467166 }, { "acc": 0.99642982, "epoch": 18.508384819064432, "grad_norm": 2.306835412979126, "learning_rate": 7.445241254637479e-06, "loss": 0.02848786, "memory(GiB)": 15.03, "step": 10485, "train_speed(iter/s)": 1.467156 }, { "acc": 0.99731941, "epoch": 18.517210944395412, "grad_norm": 0.8975778222084045, "learning_rate": 7.4426928634421495e-06, "loss": 0.0277067, "memory(GiB)": 15.03, "step": 10490, "train_speed(iter/s)": 1.467098 }, { "acc": 0.99457798, "epoch": 18.52603706972639, "grad_norm": 2.491084337234497, "learning_rate": 7.440143638566367e-06, "loss": 0.04174317, "memory(GiB)": 15.03, "step": 10495, "train_speed(iter/s)": 1.467095 }, { "acc": 0.99581566, "epoch": 18.53486319505737, "grad_norm": 2.910156488418579, "learning_rate": 7.437593580880352e-06, "loss": 0.03075714, "memory(GiB)": 15.03, "step": 10500, "train_speed(iter/s)": 1.467083 }, { "acc": 0.99408598, "epoch": 18.54368932038835, "grad_norm": 4.178725242614746, "learning_rate": 7.435042691254606e-06, "loss": 0.0361092, "memory(GiB)": 15.03, "step": 10505, "train_speed(iter/s)": 1.4671 }, { "acc": 0.99570732, "epoch": 18.55251544571933, "grad_norm": 3.6524136066436768, "learning_rate": 7.432490970559911e-06, "loss": 0.03118411, "memory(GiB)": 15.03, "step": 10510, "train_speed(iter/s)": 1.467107 }, { "acc": 0.99366665, "epoch": 18.56134157105031, "grad_norm": 5.952239513397217, "learning_rate": 7.429938419667342e-06, "loss": 0.04356334, "memory(GiB)": 15.03, "step": 10515, "train_speed(iter/s)": 1.467121 }, { "acc": 0.99534178, "epoch": 18.57016769638129, "grad_norm": 1.6702674627304077, "learning_rate": 7.427385039448248e-06, "loss": 0.0398398, "memory(GiB)": 15.03, "step": 10520, "train_speed(iter/s)": 1.467146 }, { "acc": 0.99453058, "epoch": 18.578993821712267, "grad_norm": 4.231917381286621, "learning_rate": 7.424830830774268e-06, "loss": 0.04136465, "memory(GiB)": 15.03, "step": 10525, "train_speed(iter/s)": 1.467148 }, { "acc": 0.99281187, "epoch": 18.587819947043247, "grad_norm": 4.548876762390137, "learning_rate": 7.422275794517317e-06, "loss": 0.049172, "memory(GiB)": 15.03, "step": 10530, "train_speed(iter/s)": 1.467159 }, { "acc": 0.99853172, "epoch": 18.596646072374227, "grad_norm": 1.018575668334961, "learning_rate": 7.4197199315496e-06, "loss": 0.01610657, "memory(GiB)": 15.03, "step": 10535, "train_speed(iter/s)": 1.467164 }, { "acc": 0.99544373, "epoch": 18.605472197705208, "grad_norm": 4.717028617858887, "learning_rate": 7.417163242743598e-06, "loss": 0.03876404, "memory(GiB)": 15.03, "step": 10540, "train_speed(iter/s)": 1.467202 }, { "acc": 0.99599571, "epoch": 18.614298323036188, "grad_norm": 4.054540634155273, "learning_rate": 7.414605728972078e-06, "loss": 0.02925652, "memory(GiB)": 15.03, "step": 10545, "train_speed(iter/s)": 1.467214 }, { "acc": 0.99388628, "epoch": 18.623124448367168, "grad_norm": 4.056271553039551, "learning_rate": 7.412047391108086e-06, "loss": 0.05085248, "memory(GiB)": 15.03, "step": 10550, "train_speed(iter/s)": 1.467231 }, { "acc": 0.99335327, "epoch": 18.63195057369815, "grad_norm": 1.3387870788574219, "learning_rate": 7.409488230024949e-06, "loss": 0.04933652, "memory(GiB)": 15.03, "step": 10555, "train_speed(iter/s)": 1.467265 }, { "acc": 0.99510078, "epoch": 18.640776699029125, "grad_norm": 1.713810920715332, "learning_rate": 7.40692824659628e-06, "loss": 0.02552889, "memory(GiB)": 15.03, "step": 10560, "train_speed(iter/s)": 1.467272 }, { "acc": 0.99405403, "epoch": 18.649602824360105, "grad_norm": 3.5346474647521973, "learning_rate": 7.404367441695967e-06, "loss": 0.04058222, "memory(GiB)": 15.03, "step": 10565, "train_speed(iter/s)": 1.46727 }, { "acc": 0.99506922, "epoch": 18.658428949691086, "grad_norm": 1.1699702739715576, "learning_rate": 7.40180581619818e-06, "loss": 0.04928575, "memory(GiB)": 15.03, "step": 10570, "train_speed(iter/s)": 1.467293 }, { "acc": 0.99521313, "epoch": 18.667255075022066, "grad_norm": 1.9453082084655762, "learning_rate": 7.399243370977372e-06, "loss": 0.03345677, "memory(GiB)": 15.03, "step": 10575, "train_speed(iter/s)": 1.467332 }, { "acc": 0.99621115, "epoch": 18.676081200353046, "grad_norm": 2.6477348804473877, "learning_rate": 7.396680106908269e-06, "loss": 0.03394611, "memory(GiB)": 15.03, "step": 10580, "train_speed(iter/s)": 1.467335 }, { "acc": 0.9940279, "epoch": 18.684907325684026, "grad_norm": 1.255216360092163, "learning_rate": 7.394116024865884e-06, "loss": 0.03707152, "memory(GiB)": 15.03, "step": 10585, "train_speed(iter/s)": 1.467345 }, { "acc": 0.99684725, "epoch": 18.693733451015003, "grad_norm": 3.093287229537964, "learning_rate": 7.391551125725507e-06, "loss": 0.03515161, "memory(GiB)": 15.03, "step": 10590, "train_speed(iter/s)": 1.467347 }, { "acc": 0.99466496, "epoch": 18.702559576345983, "grad_norm": 3.6533730030059814, "learning_rate": 7.3889854103627056e-06, "loss": 0.02410012, "memory(GiB)": 15.03, "step": 10595, "train_speed(iter/s)": 1.467348 }, { "acc": 0.99441538, "epoch": 18.711385701676964, "grad_norm": 1.992003083229065, "learning_rate": 7.386418879653325e-06, "loss": 0.03446819, "memory(GiB)": 15.03, "step": 10600, "train_speed(iter/s)": 1.467378 }, { "acc": 0.99538727, "epoch": 18.720211827007944, "grad_norm": 3.4683218002319336, "learning_rate": 7.383851534473492e-06, "loss": 0.02743043, "memory(GiB)": 15.03, "step": 10605, "train_speed(iter/s)": 1.467371 }, { "acc": 0.99383459, "epoch": 18.729037952338924, "grad_norm": 3.5902442932128906, "learning_rate": 7.381283375699608e-06, "loss": 0.04721849, "memory(GiB)": 15.03, "step": 10610, "train_speed(iter/s)": 1.467378 }, { "acc": 0.99579029, "epoch": 18.737864077669904, "grad_norm": 1.290844440460205, "learning_rate": 7.378714404208357e-06, "loss": 0.03731512, "memory(GiB)": 15.03, "step": 10615, "train_speed(iter/s)": 1.467396 }, { "acc": 0.99556885, "epoch": 18.74669020300088, "grad_norm": 2.788574695587158, "learning_rate": 7.376144620876694e-06, "loss": 0.02662655, "memory(GiB)": 15.03, "step": 10620, "train_speed(iter/s)": 1.467399 }, { "acc": 0.99538727, "epoch": 18.75551632833186, "grad_norm": 2.085345506668091, "learning_rate": 7.3735740265818545e-06, "loss": 0.03352069, "memory(GiB)": 15.03, "step": 10625, "train_speed(iter/s)": 1.467413 }, { "acc": 0.99752979, "epoch": 18.76434245366284, "grad_norm": 4.585538387298584, "learning_rate": 7.371002622201353e-06, "loss": 0.01517713, "memory(GiB)": 15.03, "step": 10630, "train_speed(iter/s)": 1.467402 }, { "acc": 0.99498138, "epoch": 18.773168578993822, "grad_norm": 2.115297555923462, "learning_rate": 7.368430408612979e-06, "loss": 0.04396999, "memory(GiB)": 15.03, "step": 10635, "train_speed(iter/s)": 1.467408 }, { "acc": 0.991996, "epoch": 18.781994704324802, "grad_norm": 1.5553427934646606, "learning_rate": 7.3658573866947935e-06, "loss": 0.05116656, "memory(GiB)": 15.03, "step": 10640, "train_speed(iter/s)": 1.467388 }, { "acc": 0.99615107, "epoch": 18.790820829655782, "grad_norm": 1.5465439558029175, "learning_rate": 7.363283557325139e-06, "loss": 0.03175544, "memory(GiB)": 15.03, "step": 10645, "train_speed(iter/s)": 1.467398 }, { "acc": 0.99500656, "epoch": 18.799646954986763, "grad_norm": 1.886567234992981, "learning_rate": 7.3607089213826354e-06, "loss": 0.03611193, "memory(GiB)": 15.03, "step": 10650, "train_speed(iter/s)": 1.467439 }, { "acc": 0.99331074, "epoch": 18.80847308031774, "grad_norm": 2.538506269454956, "learning_rate": 7.3581334797461716e-06, "loss": 0.05150344, "memory(GiB)": 15.03, "step": 10655, "train_speed(iter/s)": 1.467483 }, { "acc": 0.9964489, "epoch": 18.81729920564872, "grad_norm": 3.850626230239868, "learning_rate": 7.3555572332949145e-06, "loss": 0.02917534, "memory(GiB)": 15.03, "step": 10660, "train_speed(iter/s)": 1.467497 }, { "acc": 0.99434958, "epoch": 18.8261253309797, "grad_norm": 2.3743114471435547, "learning_rate": 7.352980182908306e-06, "loss": 0.03739051, "memory(GiB)": 15.03, "step": 10665, "train_speed(iter/s)": 1.467476 }, { "acc": 0.99302082, "epoch": 18.83495145631068, "grad_norm": 2.8646974563598633, "learning_rate": 7.350402329466063e-06, "loss": 0.04427825, "memory(GiB)": 15.03, "step": 10670, "train_speed(iter/s)": 1.467489 }, { "acc": 0.99374447, "epoch": 18.84377758164166, "grad_norm": 2.4636521339416504, "learning_rate": 7.3478236738481764e-06, "loss": 0.04016417, "memory(GiB)": 15.03, "step": 10675, "train_speed(iter/s)": 1.467514 }, { "acc": 0.99264708, "epoch": 18.85260370697264, "grad_norm": 5.217719078063965, "learning_rate": 7.345244216934909e-06, "loss": 0.04429552, "memory(GiB)": 15.03, "step": 10680, "train_speed(iter/s)": 1.467515 }, { "acc": 0.99647942, "epoch": 18.861429832303617, "grad_norm": 0.6372243762016296, "learning_rate": 7.342663959606799e-06, "loss": 0.03412834, "memory(GiB)": 15.03, "step": 10685, "train_speed(iter/s)": 1.467498 }, { "acc": 0.995576, "epoch": 18.870255957634598, "grad_norm": 1.8631318807601929, "learning_rate": 7.340082902744658e-06, "loss": 0.03293747, "memory(GiB)": 15.03, "step": 10690, "train_speed(iter/s)": 1.467484 }, { "acc": 0.99523773, "epoch": 18.879082082965578, "grad_norm": 3.2314345836639404, "learning_rate": 7.3375010472295685e-06, "loss": 0.04369133, "memory(GiB)": 15.03, "step": 10695, "train_speed(iter/s)": 1.467504 }, { "acc": 0.99415836, "epoch": 18.887908208296558, "grad_norm": 3.273977518081665, "learning_rate": 7.334918393942888e-06, "loss": 0.03848065, "memory(GiB)": 15.03, "step": 10700, "train_speed(iter/s)": 1.467497 }, { "acc": 0.99557228, "epoch": 18.89673433362754, "grad_norm": 3.2116150856018066, "learning_rate": 7.332334943766244e-06, "loss": 0.03275875, "memory(GiB)": 15.03, "step": 10705, "train_speed(iter/s)": 1.467518 }, { "acc": 0.9957365, "epoch": 18.90556045895852, "grad_norm": 1.3480300903320312, "learning_rate": 7.329750697581538e-06, "loss": 0.03527932, "memory(GiB)": 15.03, "step": 10710, "train_speed(iter/s)": 1.467523 }, { "acc": 0.99310846, "epoch": 18.914386584289495, "grad_norm": 2.8697993755340576, "learning_rate": 7.327165656270944e-06, "loss": 0.05071274, "memory(GiB)": 15.03, "step": 10715, "train_speed(iter/s)": 1.467556 }, { "acc": 0.99379339, "epoch": 18.923212709620476, "grad_norm": 11.033556938171387, "learning_rate": 7.324579820716903e-06, "loss": 0.04387584, "memory(GiB)": 15.03, "step": 10720, "train_speed(iter/s)": 1.467586 }, { "acc": 0.99548578, "epoch": 18.932038834951456, "grad_norm": 4.459423542022705, "learning_rate": 7.3219931918021325e-06, "loss": 0.02735809, "memory(GiB)": 15.03, "step": 10725, "train_speed(iter/s)": 1.467581 }, { "acc": 0.99240389, "epoch": 18.940864960282436, "grad_norm": 3.479215145111084, "learning_rate": 7.319405770409617e-06, "loss": 0.05003611, "memory(GiB)": 15.03, "step": 10730, "train_speed(iter/s)": 1.467602 }, { "acc": 0.99598389, "epoch": 18.949691085613416, "grad_norm": 2.3896729946136475, "learning_rate": 7.316817557422615e-06, "loss": 0.02716452, "memory(GiB)": 15.03, "step": 10735, "train_speed(iter/s)": 1.467643 }, { "acc": 0.99426117, "epoch": 18.958517210944397, "grad_norm": 2.3748745918273926, "learning_rate": 7.314228553724652e-06, "loss": 0.03746675, "memory(GiB)": 15.03, "step": 10740, "train_speed(iter/s)": 1.467681 }, { "acc": 0.99465752, "epoch": 18.967343336275377, "grad_norm": 3.4426088333129883, "learning_rate": 7.311638760199523e-06, "loss": 0.03873794, "memory(GiB)": 15.03, "step": 10745, "train_speed(iter/s)": 1.467686 }, { "acc": 0.99650087, "epoch": 18.976169461606354, "grad_norm": 0.47353076934814453, "learning_rate": 7.309048177731297e-06, "loss": 0.02429324, "memory(GiB)": 15.03, "step": 10750, "train_speed(iter/s)": 1.467708 }, { "acc": 0.99524736, "epoch": 18.984995586937334, "grad_norm": 2.3355331420898438, "learning_rate": 7.3064568072043085e-06, "loss": 0.02962433, "memory(GiB)": 15.03, "step": 10755, "train_speed(iter/s)": 1.467695 }, { "acc": 0.99530106, "epoch": 18.993821712268314, "grad_norm": 2.087158203125, "learning_rate": 7.303864649503163e-06, "loss": 0.02913635, "memory(GiB)": 15.03, "step": 10760, "train_speed(iter/s)": 1.467706 }, { "acc": 0.99304905, "epoch": 19.002647837599294, "grad_norm": 1.9450145959854126, "learning_rate": 7.30127170551273e-06, "loss": 0.0394668, "memory(GiB)": 15.03, "step": 10765, "train_speed(iter/s)": 1.467625 }, { "acc": 0.99391747, "epoch": 19.011473962930275, "grad_norm": 2.262213706970215, "learning_rate": 7.298677976118153e-06, "loss": 0.04161475, "memory(GiB)": 15.03, "step": 10770, "train_speed(iter/s)": 1.467609 }, { "acc": 0.99503403, "epoch": 19.020300088261255, "grad_norm": 2.4803788661956787, "learning_rate": 7.296083462204846e-06, "loss": 0.03455219, "memory(GiB)": 15.03, "step": 10775, "train_speed(iter/s)": 1.467617 }, { "acc": 0.99536514, "epoch": 19.02912621359223, "grad_norm": 4.6899261474609375, "learning_rate": 7.293488164658482e-06, "loss": 0.03569758, "memory(GiB)": 15.03, "step": 10780, "train_speed(iter/s)": 1.467576 }, { "acc": 0.99441013, "epoch": 19.037952338923212, "grad_norm": 3.9374284744262695, "learning_rate": 7.2908920843650064e-06, "loss": 0.03880895, "memory(GiB)": 15.03, "step": 10785, "train_speed(iter/s)": 1.467608 }, { "acc": 0.99404221, "epoch": 19.046778464254192, "grad_norm": 1.7066391706466675, "learning_rate": 7.288295222210634e-06, "loss": 0.03856907, "memory(GiB)": 15.03, "step": 10790, "train_speed(iter/s)": 1.467604 }, { "acc": 0.99499435, "epoch": 19.055604589585172, "grad_norm": 3.8181586265563965, "learning_rate": 7.2856975790818405e-06, "loss": 0.03510127, "memory(GiB)": 15.03, "step": 10795, "train_speed(iter/s)": 1.467623 }, { "acc": 0.99455547, "epoch": 19.064430714916153, "grad_norm": 2.086409568786621, "learning_rate": 7.2830991558653765e-06, "loss": 0.0423415, "memory(GiB)": 15.03, "step": 10800, "train_speed(iter/s)": 1.467634 }, { "acc": 0.99608488, "epoch": 19.073256840247133, "grad_norm": 1.4715282917022705, "learning_rate": 7.280499953448248e-06, "loss": 0.03117564, "memory(GiB)": 15.03, "step": 10805, "train_speed(iter/s)": 1.467653 }, { "acc": 0.99412518, "epoch": 19.08208296557811, "grad_norm": 5.574045658111572, "learning_rate": 7.277899972717738e-06, "loss": 0.04187601, "memory(GiB)": 15.03, "step": 10810, "train_speed(iter/s)": 1.467686 }, { "acc": 0.9933897, "epoch": 19.09090909090909, "grad_norm": 4.753693103790283, "learning_rate": 7.27529921456139e-06, "loss": 0.05849199, "memory(GiB)": 15.03, "step": 10815, "train_speed(iter/s)": 1.467701 }, { "acc": 0.99801426, "epoch": 19.09973521624007, "grad_norm": 0.6694067120552063, "learning_rate": 7.27269767986701e-06, "loss": 0.01964336, "memory(GiB)": 15.03, "step": 10820, "train_speed(iter/s)": 1.467683 }, { "acc": 0.99357471, "epoch": 19.10856134157105, "grad_norm": 1.7670185565948486, "learning_rate": 7.270095369522676e-06, "loss": 0.03921113, "memory(GiB)": 15.03, "step": 10825, "train_speed(iter/s)": 1.46769 }, { "acc": 0.99679489, "epoch": 19.11738746690203, "grad_norm": 0.9229050278663635, "learning_rate": 7.267492284416723e-06, "loss": 0.02768255, "memory(GiB)": 15.03, "step": 10830, "train_speed(iter/s)": 1.467717 }, { "acc": 0.99568691, "epoch": 19.12621359223301, "grad_norm": 6.347385406494141, "learning_rate": 7.264888425437756e-06, "loss": 0.03251109, "memory(GiB)": 15.03, "step": 10835, "train_speed(iter/s)": 1.46776 }, { "acc": 0.99480705, "epoch": 19.135039717563988, "grad_norm": 4.670385837554932, "learning_rate": 7.262283793474646e-06, "loss": 0.03692262, "memory(GiB)": 15.03, "step": 10840, "train_speed(iter/s)": 1.467782 }, { "acc": 0.99835768, "epoch": 19.143865842894968, "grad_norm": 1.0823428630828857, "learning_rate": 7.25967838941652e-06, "loss": 0.01729255, "memory(GiB)": 15.03, "step": 10845, "train_speed(iter/s)": 1.467784 }, { "acc": 0.99669342, "epoch": 19.152691968225948, "grad_norm": 1.9684052467346191, "learning_rate": 7.257072214152776e-06, "loss": 0.02767957, "memory(GiB)": 15.03, "step": 10850, "train_speed(iter/s)": 1.467788 }, { "acc": 0.99414425, "epoch": 19.16151809355693, "grad_norm": 3.192390203475952, "learning_rate": 7.25446526857307e-06, "loss": 0.04095367, "memory(GiB)": 15.03, "step": 10855, "train_speed(iter/s)": 1.467806 }, { "acc": 0.9948123, "epoch": 19.17034421888791, "grad_norm": 2.604891300201416, "learning_rate": 7.251857553567326e-06, "loss": 0.0363847, "memory(GiB)": 15.03, "step": 10860, "train_speed(iter/s)": 1.467829 }, { "acc": 0.99625454, "epoch": 19.17917034421889, "grad_norm": 1.9547412395477295, "learning_rate": 7.249249070025728e-06, "loss": 0.02475607, "memory(GiB)": 15.03, "step": 10865, "train_speed(iter/s)": 1.467821 }, { "acc": 0.9946949, "epoch": 19.18799646954987, "grad_norm": 2.949550151824951, "learning_rate": 7.246639818838719e-06, "loss": 0.02622627, "memory(GiB)": 15.03, "step": 10870, "train_speed(iter/s)": 1.467805 }, { "acc": 0.99375391, "epoch": 19.196822594880846, "grad_norm": 4.892343044281006, "learning_rate": 7.244029800897011e-06, "loss": 0.04245609, "memory(GiB)": 15.03, "step": 10875, "train_speed(iter/s)": 1.46783 }, { "acc": 0.99659233, "epoch": 19.205648720211826, "grad_norm": 2.948734760284424, "learning_rate": 7.241419017091572e-06, "loss": 0.03058265, "memory(GiB)": 15.03, "step": 10880, "train_speed(iter/s)": 1.467835 }, { "acc": 0.99268332, "epoch": 19.214474845542806, "grad_norm": 2.4911906719207764, "learning_rate": 7.238807468313636e-06, "loss": 0.05277511, "memory(GiB)": 15.03, "step": 10885, "train_speed(iter/s)": 1.467867 }, { "acc": 0.99622927, "epoch": 19.223300970873787, "grad_norm": 2.08198881149292, "learning_rate": 7.236195155454691e-06, "loss": 0.0251657, "memory(GiB)": 15.03, "step": 10890, "train_speed(iter/s)": 1.467845 }, { "acc": 0.99615974, "epoch": 19.232127096204767, "grad_norm": 2.272552251815796, "learning_rate": 7.233582079406495e-06, "loss": 0.03470866, "memory(GiB)": 15.03, "step": 10895, "train_speed(iter/s)": 1.467853 }, { "acc": 0.99405832, "epoch": 19.240953221535747, "grad_norm": 3.1820342540740967, "learning_rate": 7.230968241061062e-06, "loss": 0.03793371, "memory(GiB)": 15.03, "step": 10900, "train_speed(iter/s)": 1.46788 }, { "acc": 0.9936758, "epoch": 19.249779346866724, "grad_norm": 2.3926382064819336, "learning_rate": 7.228353641310663e-06, "loss": 0.04519944, "memory(GiB)": 15.03, "step": 10905, "train_speed(iter/s)": 1.467907 }, { "acc": 0.99623013, "epoch": 19.258605472197704, "grad_norm": 1.8664648532867432, "learning_rate": 7.225738281047835e-06, "loss": 0.03418368, "memory(GiB)": 15.03, "step": 10910, "train_speed(iter/s)": 1.467933 }, { "acc": 0.99755821, "epoch": 19.267431597528685, "grad_norm": 1.999851942062378, "learning_rate": 7.22312216116537e-06, "loss": 0.01712024, "memory(GiB)": 15.03, "step": 10915, "train_speed(iter/s)": 1.467947 }, { "acc": 0.99687176, "epoch": 19.276257722859665, "grad_norm": 2.270970344543457, "learning_rate": 7.220505282556323e-06, "loss": 0.02998117, "memory(GiB)": 15.03, "step": 10920, "train_speed(iter/s)": 1.467986 }, { "acc": 0.99378414, "epoch": 19.285083848190645, "grad_norm": 2.4827682971954346, "learning_rate": 7.217887646114006e-06, "loss": 0.04486508, "memory(GiB)": 15.03, "step": 10925, "train_speed(iter/s)": 1.468022 }, { "acc": 0.99651909, "epoch": 19.293909973521625, "grad_norm": 2.0483808517456055, "learning_rate": 7.215269252731988e-06, "loss": 0.02996536, "memory(GiB)": 15.03, "step": 10930, "train_speed(iter/s)": 1.468036 }, { "acc": 0.99690971, "epoch": 19.302736098852602, "grad_norm": 2.413815498352051, "learning_rate": 7.212650103304099e-06, "loss": 0.03561395, "memory(GiB)": 15.03, "step": 10935, "train_speed(iter/s)": 1.468058 }, { "acc": 0.99548626, "epoch": 19.311562224183582, "grad_norm": 1.4509217739105225, "learning_rate": 7.210030198724427e-06, "loss": 0.03171998, "memory(GiB)": 15.03, "step": 10940, "train_speed(iter/s)": 1.468052 }, { "acc": 0.99803219, "epoch": 19.320388349514563, "grad_norm": 0.7805094122886658, "learning_rate": 7.207409539887318e-06, "loss": 0.02620303, "memory(GiB)": 15.03, "step": 10945, "train_speed(iter/s)": 1.468059 }, { "acc": 0.99561939, "epoch": 19.329214474845543, "grad_norm": 2.8705122470855713, "learning_rate": 7.204788127687371e-06, "loss": 0.03580691, "memory(GiB)": 15.03, "step": 10950, "train_speed(iter/s)": 1.468044 }, { "acc": 0.99502602, "epoch": 19.338040600176523, "grad_norm": 4.257328987121582, "learning_rate": 7.202165963019449e-06, "loss": 0.03754888, "memory(GiB)": 15.03, "step": 10955, "train_speed(iter/s)": 1.468069 }, { "acc": 0.9966918, "epoch": 19.346866725507503, "grad_norm": 2.325326919555664, "learning_rate": 7.199543046778669e-06, "loss": 0.02917108, "memory(GiB)": 15.03, "step": 10960, "train_speed(iter/s)": 1.468054 }, { "acc": 0.99405346, "epoch": 19.355692850838484, "grad_norm": 4.204026699066162, "learning_rate": 7.196919379860402e-06, "loss": 0.03183196, "memory(GiB)": 15.03, "step": 10965, "train_speed(iter/s)": 1.468044 }, { "acc": 0.99375486, "epoch": 19.36451897616946, "grad_norm": 1.617804765701294, "learning_rate": 7.1942949631602785e-06, "loss": 0.03911158, "memory(GiB)": 15.03, "step": 10970, "train_speed(iter/s)": 1.468025 }, { "acc": 0.99700823, "epoch": 19.37334510150044, "grad_norm": 2.3844871520996094, "learning_rate": 7.191669797574183e-06, "loss": 0.03660068, "memory(GiB)": 15.03, "step": 10975, "train_speed(iter/s)": 1.468031 }, { "acc": 0.99406462, "epoch": 19.38217122683142, "grad_norm": 2.341581106185913, "learning_rate": 7.189043883998259e-06, "loss": 0.0383655, "memory(GiB)": 15.03, "step": 10980, "train_speed(iter/s)": 1.468072 }, { "acc": 0.99716549, "epoch": 19.3909973521624, "grad_norm": 2.7627158164978027, "learning_rate": 7.1864172233289e-06, "loss": 0.02428824, "memory(GiB)": 15.03, "step": 10985, "train_speed(iter/s)": 1.468069 }, { "acc": 0.9953392, "epoch": 19.39982347749338, "grad_norm": 2.2740631103515625, "learning_rate": 7.183789816462759e-06, "loss": 0.0480412, "memory(GiB)": 15.03, "step": 10990, "train_speed(iter/s)": 1.46809 }, { "acc": 0.99742861, "epoch": 19.40864960282436, "grad_norm": 2.1291842460632324, "learning_rate": 7.181161664296741e-06, "loss": 0.01687106, "memory(GiB)": 15.03, "step": 10995, "train_speed(iter/s)": 1.468058 }, { "acc": 0.99431372, "epoch": 19.41747572815534, "grad_norm": 2.393174886703491, "learning_rate": 7.178532767728007e-06, "loss": 0.04169644, "memory(GiB)": 15.03, "step": 11000, "train_speed(iter/s)": 1.46806 }, { "acc": 0.99599476, "epoch": 19.42630185348632, "grad_norm": 0.85368812084198, "learning_rate": 7.1759031276539715e-06, "loss": 0.03650552, "memory(GiB)": 15.03, "step": 11005, "train_speed(iter/s)": 1.468108 }, { "acc": 0.99608173, "epoch": 19.4351279788173, "grad_norm": 1.1927298307418823, "learning_rate": 7.173272744972305e-06, "loss": 0.04052993, "memory(GiB)": 15.03, "step": 11010, "train_speed(iter/s)": 1.468063 }, { "acc": 0.997896, "epoch": 19.44395410414828, "grad_norm": 3.1490139961242676, "learning_rate": 7.170641620580927e-06, "loss": 0.0267843, "memory(GiB)": 15.03, "step": 11015, "train_speed(iter/s)": 1.468065 }, { "acc": 0.99446211, "epoch": 19.45278022947926, "grad_norm": 1.0020325183868408, "learning_rate": 7.168009755378013e-06, "loss": 0.04007209, "memory(GiB)": 15.03, "step": 11020, "train_speed(iter/s)": 1.468072 }, { "acc": 0.99525042, "epoch": 19.46160635481024, "grad_norm": 3.663456678390503, "learning_rate": 7.165377150261993e-06, "loss": 0.0400946, "memory(GiB)": 15.03, "step": 11025, "train_speed(iter/s)": 1.46808 }, { "acc": 0.99534302, "epoch": 19.470432480141216, "grad_norm": 1.5526318550109863, "learning_rate": 7.162743806131545e-06, "loss": 0.03993999, "memory(GiB)": 15.03, "step": 11030, "train_speed(iter/s)": 1.468073 }, { "acc": 0.99755611, "epoch": 19.479258605472197, "grad_norm": 1.7588688135147095, "learning_rate": 7.160109723885602e-06, "loss": 0.02071346, "memory(GiB)": 15.03, "step": 11035, "train_speed(iter/s)": 1.468089 }, { "acc": 0.99632616, "epoch": 19.488084730803177, "grad_norm": 2.1531319618225098, "learning_rate": 7.157474904423351e-06, "loss": 0.0223113, "memory(GiB)": 15.03, "step": 11040, "train_speed(iter/s)": 1.46808 }, { "acc": 0.99696236, "epoch": 19.496910856134157, "grad_norm": 2.0429131984710693, "learning_rate": 7.154839348644228e-06, "loss": 0.02561548, "memory(GiB)": 15.03, "step": 11045, "train_speed(iter/s)": 1.468117 }, { "acc": 0.99666004, "epoch": 19.505736981465137, "grad_norm": 0.9401593804359436, "learning_rate": 7.152203057447921e-06, "loss": 0.02855185, "memory(GiB)": 15.03, "step": 11050, "train_speed(iter/s)": 1.468145 }, { "acc": 0.99493256, "epoch": 19.514563106796118, "grad_norm": 3.294703960418701, "learning_rate": 7.149566031734366e-06, "loss": 0.04653559, "memory(GiB)": 15.03, "step": 11055, "train_speed(iter/s)": 1.468133 }, { "acc": 0.99630156, "epoch": 19.523389232127098, "grad_norm": 1.984369158744812, "learning_rate": 7.146928272403756e-06, "loss": 0.02637404, "memory(GiB)": 15.03, "step": 11060, "train_speed(iter/s)": 1.468135 }, { "acc": 0.99459858, "epoch": 19.532215357458075, "grad_norm": 2.1346423625946045, "learning_rate": 7.144289780356529e-06, "loss": 0.03658881, "memory(GiB)": 15.03, "step": 11065, "train_speed(iter/s)": 1.468138 }, { "acc": 0.9942399, "epoch": 19.541041482789055, "grad_norm": 4.539979457855225, "learning_rate": 7.1416505564933784e-06, "loss": 0.03980021, "memory(GiB)": 15.03, "step": 11070, "train_speed(iter/s)": 1.468148 }, { "acc": 0.99666138, "epoch": 19.549867608120035, "grad_norm": 3.2334940433502197, "learning_rate": 7.13901060171524e-06, "loss": 0.03226629, "memory(GiB)": 15.03, "step": 11075, "train_speed(iter/s)": 1.468177 }, { "acc": 0.99527493, "epoch": 19.558693733451015, "grad_norm": 3.20743465423584, "learning_rate": 7.136369916923307e-06, "loss": 0.03364353, "memory(GiB)": 15.03, "step": 11080, "train_speed(iter/s)": 1.468173 }, { "acc": 0.99390278, "epoch": 19.567519858781996, "grad_norm": 2.9292185306549072, "learning_rate": 7.1337285030190165e-06, "loss": 0.04123576, "memory(GiB)": 15.03, "step": 11085, "train_speed(iter/s)": 1.468158 }, { "acc": 0.99586611, "epoch": 19.576345984112976, "grad_norm": 2.841324806213379, "learning_rate": 7.131086360904055e-06, "loss": 0.03436969, "memory(GiB)": 15.03, "step": 11090, "train_speed(iter/s)": 1.468183 }, { "acc": 0.99846687, "epoch": 19.585172109443953, "grad_norm": 3.241793394088745, "learning_rate": 7.128443491480361e-06, "loss": 0.0151427, "memory(GiB)": 15.03, "step": 11095, "train_speed(iter/s)": 1.468206 }, { "acc": 0.99427881, "epoch": 19.593998234774933, "grad_norm": 1.2945324182510376, "learning_rate": 7.125799895650119e-06, "loss": 0.0353843, "memory(GiB)": 15.03, "step": 11100, "train_speed(iter/s)": 1.468245 }, { "acc": 0.99558678, "epoch": 19.602824360105913, "grad_norm": 2.394338369369507, "learning_rate": 7.123155574315758e-06, "loss": 0.0312039, "memory(GiB)": 15.03, "step": 11105, "train_speed(iter/s)": 1.46826 }, { "acc": 0.99789371, "epoch": 19.611650485436893, "grad_norm": 2.5023577213287354, "learning_rate": 7.120510528379961e-06, "loss": 0.02143299, "memory(GiB)": 15.03, "step": 11110, "train_speed(iter/s)": 1.468306 }, { "acc": 0.99394903, "epoch": 19.620476610767874, "grad_norm": 2.4616453647613525, "learning_rate": 7.117864758745657e-06, "loss": 0.03355768, "memory(GiB)": 15.03, "step": 11115, "train_speed(iter/s)": 1.468333 }, { "acc": 0.99515038, "epoch": 19.629302736098854, "grad_norm": 2.3207640647888184, "learning_rate": 7.115218266316019e-06, "loss": 0.0349163, "memory(GiB)": 15.03, "step": 11120, "train_speed(iter/s)": 1.468359 }, { "acc": 0.99438725, "epoch": 19.63812886142983, "grad_norm": 2.404448986053467, "learning_rate": 7.112571051994465e-06, "loss": 0.04362395, "memory(GiB)": 15.03, "step": 11125, "train_speed(iter/s)": 1.468351 }, { "acc": 0.99685307, "epoch": 19.64695498676081, "grad_norm": 2.3514931201934814, "learning_rate": 7.109923116684667e-06, "loss": 0.02630526, "memory(GiB)": 15.03, "step": 11130, "train_speed(iter/s)": 1.468373 }, { "acc": 0.99625225, "epoch": 19.65578111209179, "grad_norm": 2.1144216060638428, "learning_rate": 7.1072744612905375e-06, "loss": 0.02962613, "memory(GiB)": 15.03, "step": 11135, "train_speed(iter/s)": 1.46838 }, { "acc": 0.99384155, "epoch": 19.66460723742277, "grad_norm": 5.430032253265381, "learning_rate": 7.104625086716237e-06, "loss": 0.04052506, "memory(GiB)": 15.03, "step": 11140, "train_speed(iter/s)": 1.468357 }, { "acc": 0.99484339, "epoch": 19.67343336275375, "grad_norm": 1.6834561824798584, "learning_rate": 7.1019749938661684e-06, "loss": 0.02852775, "memory(GiB)": 15.03, "step": 11145, "train_speed(iter/s)": 1.468344 }, { "acc": 0.99268131, "epoch": 19.682259488084732, "grad_norm": 2.4080328941345215, "learning_rate": 7.099324183644985e-06, "loss": 0.04416629, "memory(GiB)": 15.03, "step": 11150, "train_speed(iter/s)": 1.468337 }, { "acc": 0.99419432, "epoch": 19.691085613415712, "grad_norm": 1.477057695388794, "learning_rate": 7.096672656957579e-06, "loss": 0.04306775, "memory(GiB)": 15.03, "step": 11155, "train_speed(iter/s)": 1.468341 }, { "acc": 0.99742374, "epoch": 19.69991173874669, "grad_norm": 0.9261921644210815, "learning_rate": 7.094020414709093e-06, "loss": 0.01891157, "memory(GiB)": 15.03, "step": 11160, "train_speed(iter/s)": 1.468329 }, { "acc": 0.99485855, "epoch": 19.70873786407767, "grad_norm": 2.725114345550537, "learning_rate": 7.091367457804907e-06, "loss": 0.03999579, "memory(GiB)": 15.03, "step": 11165, "train_speed(iter/s)": 1.468304 }, { "acc": 0.99481916, "epoch": 19.71756398940865, "grad_norm": 2.305769443511963, "learning_rate": 7.088713787150655e-06, "loss": 0.04093505, "memory(GiB)": 15.03, "step": 11170, "train_speed(iter/s)": 1.468293 }, { "acc": 0.99480362, "epoch": 19.72639011473963, "grad_norm": 3.115745782852173, "learning_rate": 7.0860594036522035e-06, "loss": 0.03437588, "memory(GiB)": 15.03, "step": 11175, "train_speed(iter/s)": 1.468317 }, { "acc": 0.99541168, "epoch": 19.73521624007061, "grad_norm": 2.6249492168426514, "learning_rate": 7.0834043082156704e-06, "loss": 0.0428428, "memory(GiB)": 15.03, "step": 11180, "train_speed(iter/s)": 1.468318 }, { "acc": 0.995473, "epoch": 19.74404236540159, "grad_norm": 2.47493577003479, "learning_rate": 7.080748501747412e-06, "loss": 0.0302498, "memory(GiB)": 15.03, "step": 11185, "train_speed(iter/s)": 1.468303 }, { "acc": 0.99666882, "epoch": 19.752868490732567, "grad_norm": 2.635413408279419, "learning_rate": 7.078091985154029e-06, "loss": 0.02780136, "memory(GiB)": 15.03, "step": 11190, "train_speed(iter/s)": 1.468314 }, { "acc": 0.99820499, "epoch": 19.761694616063547, "grad_norm": 1.1189647912979126, "learning_rate": 7.075434759342367e-06, "loss": 0.0211196, "memory(GiB)": 15.03, "step": 11195, "train_speed(iter/s)": 1.468302 }, { "acc": 0.99699802, "epoch": 19.770520741394527, "grad_norm": 3.3009417057037354, "learning_rate": 7.07277682521951e-06, "loss": 0.02283935, "memory(GiB)": 15.03, "step": 11200, "train_speed(iter/s)": 1.468328 }, { "acc": 0.99601402, "epoch": 19.779346866725508, "grad_norm": 8.052562713623047, "learning_rate": 7.070118183692784e-06, "loss": 0.03550126, "memory(GiB)": 15.03, "step": 11205, "train_speed(iter/s)": 1.468339 }, { "acc": 0.99617519, "epoch": 19.788172992056488, "grad_norm": 0.32499030232429504, "learning_rate": 7.067458835669758e-06, "loss": 0.02759706, "memory(GiB)": 15.03, "step": 11210, "train_speed(iter/s)": 1.468358 }, { "acc": 0.9952858, "epoch": 19.796999117387468, "grad_norm": 2.1811575889587402, "learning_rate": 7.064798782058242e-06, "loss": 0.03751369, "memory(GiB)": 15.03, "step": 11215, "train_speed(iter/s)": 1.468395 }, { "acc": 0.99602013, "epoch": 19.805825242718445, "grad_norm": 2.3726305961608887, "learning_rate": 7.062138023766289e-06, "loss": 0.03963426, "memory(GiB)": 15.03, "step": 11220, "train_speed(iter/s)": 1.468381 }, { "acc": 0.9919241, "epoch": 19.814651368049425, "grad_norm": 1.2942469120025635, "learning_rate": 7.059476561702184e-06, "loss": 0.05754732, "memory(GiB)": 15.03, "step": 11225, "train_speed(iter/s)": 1.468421 }, { "acc": 0.99204521, "epoch": 19.823477493380405, "grad_norm": 2.516651153564453, "learning_rate": 7.056814396774465e-06, "loss": 0.06023258, "memory(GiB)": 15.03, "step": 11230, "train_speed(iter/s)": 1.468424 }, { "acc": 0.99616394, "epoch": 19.832303618711386, "grad_norm": 2.442690849304199, "learning_rate": 7.0541515298919e-06, "loss": 0.02614451, "memory(GiB)": 15.03, "step": 11235, "train_speed(iter/s)": 1.468421 }, { "acc": 0.99563313, "epoch": 19.841129744042366, "grad_norm": 1.1985772848129272, "learning_rate": 7.0514879619635e-06, "loss": 0.02989587, "memory(GiB)": 15.03, "step": 11240, "train_speed(iter/s)": 1.468415 }, { "acc": 0.99714518, "epoch": 19.849955869373346, "grad_norm": 0.8854919672012329, "learning_rate": 7.048823693898518e-06, "loss": 0.02250883, "memory(GiB)": 15.03, "step": 11245, "train_speed(iter/s)": 1.468466 }, { "acc": 0.99779263, "epoch": 19.858781994704326, "grad_norm": 3.6044888496398926, "learning_rate": 7.046158726606438e-06, "loss": 0.02668288, "memory(GiB)": 15.03, "step": 11250, "train_speed(iter/s)": 1.468497 }, { "acc": 0.99724312, "epoch": 19.867608120035303, "grad_norm": 1.4807323217391968, "learning_rate": 7.043493060996992e-06, "loss": 0.01715653, "memory(GiB)": 15.03, "step": 11255, "train_speed(iter/s)": 1.468488 }, { "acc": 0.9941988, "epoch": 19.876434245366283, "grad_norm": 1.7808079719543457, "learning_rate": 7.040826697980145e-06, "loss": 0.03544561, "memory(GiB)": 15.03, "step": 11260, "train_speed(iter/s)": 1.468492 }, { "acc": 0.99660501, "epoch": 19.885260370697264, "grad_norm": 1.296295404434204, "learning_rate": 7.038159638466103e-06, "loss": 0.04123459, "memory(GiB)": 15.03, "step": 11265, "train_speed(iter/s)": 1.468518 }, { "acc": 0.99579105, "epoch": 19.894086496028244, "grad_norm": 6.913495063781738, "learning_rate": 7.035491883365306e-06, "loss": 0.03334861, "memory(GiB)": 15.03, "step": 11270, "train_speed(iter/s)": 1.468534 }, { "acc": 0.9963316, "epoch": 19.902912621359224, "grad_norm": 2.5810697078704834, "learning_rate": 7.032823433588435e-06, "loss": 0.03747104, "memory(GiB)": 15.03, "step": 11275, "train_speed(iter/s)": 1.468573 }, { "acc": 0.99589901, "epoch": 19.911738746690204, "grad_norm": 1.9479440450668335, "learning_rate": 7.030154290046406e-06, "loss": 0.03271069, "memory(GiB)": 15.03, "step": 11280, "train_speed(iter/s)": 1.468576 }, { "acc": 0.99681244, "epoch": 19.92056487202118, "grad_norm": 3.104379653930664, "learning_rate": 7.0274844536503725e-06, "loss": 0.02020519, "memory(GiB)": 15.03, "step": 11285, "train_speed(iter/s)": 1.468613 }, { "acc": 0.99429102, "epoch": 19.92939099735216, "grad_norm": 0.6972684264183044, "learning_rate": 7.024813925311726e-06, "loss": 0.04893436, "memory(GiB)": 15.03, "step": 11290, "train_speed(iter/s)": 1.468679 }, { "acc": 0.99570999, "epoch": 19.93821712268314, "grad_norm": 0.5460999011993408, "learning_rate": 7.022142705942092e-06, "loss": 0.02719445, "memory(GiB)": 15.03, "step": 11295, "train_speed(iter/s)": 1.468682 }, { "acc": 0.99502201, "epoch": 19.947043248014122, "grad_norm": 1.7947614192962646, "learning_rate": 7.019470796453332e-06, "loss": 0.03299732, "memory(GiB)": 15.03, "step": 11300, "train_speed(iter/s)": 1.468664 }, { "acc": 0.99375305, "epoch": 19.955869373345102, "grad_norm": 12.335611343383789, "learning_rate": 7.016798197757545e-06, "loss": 0.04215955, "memory(GiB)": 15.03, "step": 11305, "train_speed(iter/s)": 1.468678 }, { "acc": 0.99384575, "epoch": 19.964695498676083, "grad_norm": 2.7483785152435303, "learning_rate": 7.0141249107670614e-06, "loss": 0.05568932, "memory(GiB)": 15.03, "step": 11310, "train_speed(iter/s)": 1.468704 }, { "acc": 0.99414215, "epoch": 19.97352162400706, "grad_norm": 2.33005428314209, "learning_rate": 7.0114509363944535e-06, "loss": 0.04052532, "memory(GiB)": 15.03, "step": 11315, "train_speed(iter/s)": 1.46871 }, { "acc": 0.99589968, "epoch": 19.98234774933804, "grad_norm": 1.8585147857666016, "learning_rate": 7.008776275552522e-06, "loss": 0.03028606, "memory(GiB)": 15.03, "step": 11320, "train_speed(iter/s)": 1.46873 }, { "acc": 0.99424152, "epoch": 19.99117387466902, "grad_norm": 2.7432055473327637, "learning_rate": 7.0061009291543015e-06, "loss": 0.04631053, "memory(GiB)": 15.03, "step": 11325, "train_speed(iter/s)": 1.468733 }, { "acc": 0.99528074, "epoch": 20.0, "grad_norm": 3.2807164192199707, "learning_rate": 7.003424898113066e-06, "loss": 0.03751671, "memory(GiB)": 15.03, "step": 11330, "train_speed(iter/s)": 1.468686 }, { "acc": 0.99672079, "epoch": 20.00882612533098, "grad_norm": 2.2897887229919434, "learning_rate": 7.00074818334232e-06, "loss": 0.02844768, "memory(GiB)": 15.03, "step": 11335, "train_speed(iter/s)": 1.468625 }, { "acc": 0.99547262, "epoch": 20.01765225066196, "grad_norm": 2.8390159606933594, "learning_rate": 6.998070785755801e-06, "loss": 0.02535103, "memory(GiB)": 15.03, "step": 11340, "train_speed(iter/s)": 1.468634 }, { "acc": 0.99586563, "epoch": 20.02647837599294, "grad_norm": 5.099536895751953, "learning_rate": 6.99539270626748e-06, "loss": 0.02873666, "memory(GiB)": 15.03, "step": 11345, "train_speed(iter/s)": 1.468646 }, { "acc": 0.99708157, "epoch": 20.035304501323917, "grad_norm": 3.303095579147339, "learning_rate": 6.992713945791561e-06, "loss": 0.02559377, "memory(GiB)": 15.03, "step": 11350, "train_speed(iter/s)": 1.468643 }, { "acc": 0.99774513, "epoch": 20.044130626654898, "grad_norm": 1.6195378303527832, "learning_rate": 6.99003450524248e-06, "loss": 0.01462144, "memory(GiB)": 15.03, "step": 11355, "train_speed(iter/s)": 1.468683 }, { "acc": 0.99739437, "epoch": 20.052956751985878, "grad_norm": 2.140660047531128, "learning_rate": 6.987354385534907e-06, "loss": 0.02646928, "memory(GiB)": 15.03, "step": 11360, "train_speed(iter/s)": 1.468712 }, { "acc": 0.99679947, "epoch": 20.06178287731686, "grad_norm": 3.718231201171875, "learning_rate": 6.9846735875837415e-06, "loss": 0.03526824, "memory(GiB)": 15.03, "step": 11365, "train_speed(iter/s)": 1.468708 }, { "acc": 0.99346123, "epoch": 20.07060900264784, "grad_norm": 4.256365776062012, "learning_rate": 6.981992112304118e-06, "loss": 0.04034773, "memory(GiB)": 15.03, "step": 11370, "train_speed(iter/s)": 1.468687 }, { "acc": 0.99267778, "epoch": 20.07943512797882, "grad_norm": 3.4269423484802246, "learning_rate": 6.979309960611394e-06, "loss": 0.05105261, "memory(GiB)": 15.03, "step": 11375, "train_speed(iter/s)": 1.468706 }, { "acc": 0.99731693, "epoch": 20.088261253309796, "grad_norm": 0.6813912987709045, "learning_rate": 6.97662713342117e-06, "loss": 0.01672795, "memory(GiB)": 15.03, "step": 11380, "train_speed(iter/s)": 1.468704 }, { "acc": 0.99665384, "epoch": 20.097087378640776, "grad_norm": 1.131717562675476, "learning_rate": 6.973943631649266e-06, "loss": 0.02891812, "memory(GiB)": 15.03, "step": 11385, "train_speed(iter/s)": 1.468688 }, { "acc": 0.9953825, "epoch": 20.105913503971756, "grad_norm": 3.4349350929260254, "learning_rate": 6.971259456211739e-06, "loss": 0.03010671, "memory(GiB)": 15.03, "step": 11390, "train_speed(iter/s)": 1.468693 }, { "acc": 0.99550238, "epoch": 20.114739629302736, "grad_norm": 3.54219913482666, "learning_rate": 6.968574608024875e-06, "loss": 0.02731645, "memory(GiB)": 15.03, "step": 11395, "train_speed(iter/s)": 1.468671 }, { "acc": 0.9941, "epoch": 20.123565754633717, "grad_norm": 2.6283931732177734, "learning_rate": 6.965889088005186e-06, "loss": 0.0360226, "memory(GiB)": 15.03, "step": 11400, "train_speed(iter/s)": 1.46867 }, { "acc": 0.99650574, "epoch": 20.132391879964697, "grad_norm": 1.3665896654129028, "learning_rate": 6.96320289706942e-06, "loss": 0.02430113, "memory(GiB)": 15.03, "step": 11405, "train_speed(iter/s)": 1.468708 }, { "acc": 0.9941123, "epoch": 20.141218005295674, "grad_norm": 1.8152916431427002, "learning_rate": 6.960516036134546e-06, "loss": 0.03601837, "memory(GiB)": 15.03, "step": 11410, "train_speed(iter/s)": 1.468689 }, { "acc": 0.99525986, "epoch": 20.150044130626654, "grad_norm": 1.8733443021774292, "learning_rate": 6.957828506117767e-06, "loss": 0.03524791, "memory(GiB)": 15.03, "step": 11415, "train_speed(iter/s)": 1.468682 }, { "acc": 0.99801464, "epoch": 20.158870255957634, "grad_norm": 1.9801520109176636, "learning_rate": 6.955140307936513e-06, "loss": 0.02017961, "memory(GiB)": 15.03, "step": 11420, "train_speed(iter/s)": 1.46867 }, { "acc": 0.996875, "epoch": 20.167696381288614, "grad_norm": 1.8554610013961792, "learning_rate": 6.952451442508442e-06, "loss": 0.0206806, "memory(GiB)": 15.03, "step": 11425, "train_speed(iter/s)": 1.468674 }, { "acc": 0.99598217, "epoch": 20.176522506619595, "grad_norm": 1.8216514587402344, "learning_rate": 6.949761910751443e-06, "loss": 0.02755051, "memory(GiB)": 15.03, "step": 11430, "train_speed(iter/s)": 1.468644 }, { "acc": 0.99687719, "epoch": 20.185348631950575, "grad_norm": 1.0684312582015991, "learning_rate": 6.947071713583623e-06, "loss": 0.0271515, "memory(GiB)": 15.03, "step": 11435, "train_speed(iter/s)": 1.468676 }, { "acc": 0.99669514, "epoch": 20.194174757281555, "grad_norm": 2.0526528358459473, "learning_rate": 6.944380851923327e-06, "loss": 0.02386459, "memory(GiB)": 15.03, "step": 11440, "train_speed(iter/s)": 1.468667 }, { "acc": 0.99540424, "epoch": 20.203000882612532, "grad_norm": 2.8723068237304688, "learning_rate": 6.941689326689125e-06, "loss": 0.04297107, "memory(GiB)": 15.03, "step": 11445, "train_speed(iter/s)": 1.468664 }, { "acc": 0.99805632, "epoch": 20.211827007943512, "grad_norm": 0.6782394051551819, "learning_rate": 6.9389971387998045e-06, "loss": 0.01729486, "memory(GiB)": 15.03, "step": 11450, "train_speed(iter/s)": 1.468651 }, { "acc": 0.99590406, "epoch": 20.220653133274492, "grad_norm": 4.669719696044922, "learning_rate": 6.9363042891743894e-06, "loss": 0.03167217, "memory(GiB)": 15.03, "step": 11455, "train_speed(iter/s)": 1.468652 }, { "acc": 0.99504929, "epoch": 20.229479258605473, "grad_norm": 2.5177853107452393, "learning_rate": 6.933610778732125e-06, "loss": 0.04419363, "memory(GiB)": 15.03, "step": 11460, "train_speed(iter/s)": 1.468658 }, { "acc": 0.99567184, "epoch": 20.238305383936453, "grad_norm": 2.9084784984588623, "learning_rate": 6.930916608392485e-06, "loss": 0.03834355, "memory(GiB)": 15.03, "step": 11465, "train_speed(iter/s)": 1.468679 }, { "acc": 0.99695959, "epoch": 20.247131509267433, "grad_norm": 1.3703031539916992, "learning_rate": 6.928221779075163e-06, "loss": 0.02379176, "memory(GiB)": 15.03, "step": 11470, "train_speed(iter/s)": 1.468676 }, { "acc": 0.99651375, "epoch": 20.25595763459841, "grad_norm": 3.626739501953125, "learning_rate": 6.925526291700081e-06, "loss": 0.03362564, "memory(GiB)": 15.03, "step": 11475, "train_speed(iter/s)": 1.468657 }, { "acc": 0.99639578, "epoch": 20.26478375992939, "grad_norm": 3.5015017986297607, "learning_rate": 6.922830147187387e-06, "loss": 0.02375913, "memory(GiB)": 15.03, "step": 11480, "train_speed(iter/s)": 1.468658 }, { "acc": 0.99798546, "epoch": 20.27360988526037, "grad_norm": 2.4422733783721924, "learning_rate": 6.9201333464574525e-06, "loss": 0.019798, "memory(GiB)": 15.03, "step": 11485, "train_speed(iter/s)": 1.46868 }, { "acc": 0.99689846, "epoch": 20.28243601059135, "grad_norm": 1.365190863609314, "learning_rate": 6.91743589043087e-06, "loss": 0.03342267, "memory(GiB)": 15.03, "step": 11490, "train_speed(iter/s)": 1.468697 }, { "acc": 0.99499474, "epoch": 20.29126213592233, "grad_norm": 4.2589826583862305, "learning_rate": 6.91473778002846e-06, "loss": 0.03429844, "memory(GiB)": 15.03, "step": 11495, "train_speed(iter/s)": 1.468716 }, { "acc": 0.99737091, "epoch": 20.30008826125331, "grad_norm": 1.1090666055679321, "learning_rate": 6.912039016171263e-06, "loss": 0.02993096, "memory(GiB)": 15.03, "step": 11500, "train_speed(iter/s)": 1.468723 }, { "acc": 0.99627209, "epoch": 20.308914386584288, "grad_norm": 3.159846067428589, "learning_rate": 6.909339599780545e-06, "loss": 0.02278384, "memory(GiB)": 15.03, "step": 11505, "train_speed(iter/s)": 1.468721 }, { "acc": 0.99491901, "epoch": 20.317740511915268, "grad_norm": 5.608421802520752, "learning_rate": 6.9066395317777935e-06, "loss": 0.03000953, "memory(GiB)": 15.03, "step": 11510, "train_speed(iter/s)": 1.468717 }, { "acc": 0.9949543, "epoch": 20.32656663724625, "grad_norm": 2.300891637802124, "learning_rate": 6.903938813084717e-06, "loss": 0.03353817, "memory(GiB)": 15.03, "step": 11515, "train_speed(iter/s)": 1.468732 }, { "acc": 0.9935482, "epoch": 20.33539276257723, "grad_norm": 4.690170764923096, "learning_rate": 6.901237444623251e-06, "loss": 0.04100528, "memory(GiB)": 15.03, "step": 11520, "train_speed(iter/s)": 1.468762 }, { "acc": 0.99669666, "epoch": 20.34421888790821, "grad_norm": 1.8368704319000244, "learning_rate": 6.898535427315546e-06, "loss": 0.02409161, "memory(GiB)": 15.03, "step": 11525, "train_speed(iter/s)": 1.468793 }, { "acc": 0.99359112, "epoch": 20.35304501323919, "grad_norm": 2.226534128189087, "learning_rate": 6.895832762083982e-06, "loss": 0.0541327, "memory(GiB)": 15.03, "step": 11530, "train_speed(iter/s)": 1.468785 }, { "acc": 0.99408779, "epoch": 20.36187113857017, "grad_norm": 3.1183969974517822, "learning_rate": 6.89312944985115e-06, "loss": 0.05608799, "memory(GiB)": 15.03, "step": 11535, "train_speed(iter/s)": 1.468832 }, { "acc": 0.99369392, "epoch": 20.370697263901146, "grad_norm": 3.6980717182159424, "learning_rate": 6.89042549153987e-06, "loss": 0.04531907, "memory(GiB)": 15.03, "step": 11540, "train_speed(iter/s)": 1.468858 }, { "acc": 0.99709167, "epoch": 20.379523389232126, "grad_norm": 2.482335329055786, "learning_rate": 6.887720888073184e-06, "loss": 0.02988101, "memory(GiB)": 15.03, "step": 11545, "train_speed(iter/s)": 1.468864 }, { "acc": 0.99691162, "epoch": 20.388349514563107, "grad_norm": 0.9966170191764832, "learning_rate": 6.885015640374344e-06, "loss": 0.02585528, "memory(GiB)": 15.03, "step": 11550, "train_speed(iter/s)": 1.468883 }, { "acc": 0.99638882, "epoch": 20.397175639894087, "grad_norm": 2.2512903213500977, "learning_rate": 6.882309749366834e-06, "loss": 0.03520979, "memory(GiB)": 15.03, "step": 11555, "train_speed(iter/s)": 1.468926 }, { "acc": 0.99674273, "epoch": 20.406001765225067, "grad_norm": 2.144690990447998, "learning_rate": 6.879603215974347e-06, "loss": 0.02868847, "memory(GiB)": 15.03, "step": 11560, "train_speed(iter/s)": 1.468959 }, { "acc": 0.99758091, "epoch": 20.414827890556047, "grad_norm": 1.4328352212905884, "learning_rate": 6.876896041120803e-06, "loss": 0.02432945, "memory(GiB)": 15.03, "step": 11565, "train_speed(iter/s)": 1.468972 }, { "acc": 0.99607277, "epoch": 20.423654015887024, "grad_norm": 1.373733639717102, "learning_rate": 6.874188225730338e-06, "loss": 0.02836418, "memory(GiB)": 15.03, "step": 11570, "train_speed(iter/s)": 1.46897 }, { "acc": 0.99668226, "epoch": 20.432480141218004, "grad_norm": 1.8620604276657104, "learning_rate": 6.871479770727308e-06, "loss": 0.02346237, "memory(GiB)": 15.03, "step": 11575, "train_speed(iter/s)": 1.468998 }, { "acc": 0.99700031, "epoch": 20.441306266548985, "grad_norm": 2.0306851863861084, "learning_rate": 6.868770677036283e-06, "loss": 0.02679109, "memory(GiB)": 15.03, "step": 11580, "train_speed(iter/s)": 1.468959 }, { "acc": 0.99562101, "epoch": 20.450132391879965, "grad_norm": 1.3890929222106934, "learning_rate": 6.866060945582056e-06, "loss": 0.0286984, "memory(GiB)": 15.03, "step": 11585, "train_speed(iter/s)": 1.468972 }, { "acc": 0.99470072, "epoch": 20.458958517210945, "grad_norm": 1.3300564289093018, "learning_rate": 6.8633505772896344e-06, "loss": 0.02993804, "memory(GiB)": 15.03, "step": 11590, "train_speed(iter/s)": 1.468972 }, { "acc": 0.99556713, "epoch": 20.467784642541925, "grad_norm": 1.7347384691238403, "learning_rate": 6.860639573084249e-06, "loss": 0.02999914, "memory(GiB)": 15.03, "step": 11595, "train_speed(iter/s)": 1.468986 }, { "acc": 0.99770679, "epoch": 20.476610767872902, "grad_norm": 1.7616405487060547, "learning_rate": 6.857927933891338e-06, "loss": 0.02087686, "memory(GiB)": 15.03, "step": 11600, "train_speed(iter/s)": 1.469001 }, { "acc": 0.99799767, "epoch": 20.485436893203882, "grad_norm": 1.437574028968811, "learning_rate": 6.855215660636563e-06, "loss": 0.01771725, "memory(GiB)": 15.03, "step": 11605, "train_speed(iter/s)": 1.469001 }, { "acc": 0.99594784, "epoch": 20.494263018534863, "grad_norm": 2.518404006958008, "learning_rate": 6.852502754245802e-06, "loss": 0.03124419, "memory(GiB)": 15.03, "step": 11610, "train_speed(iter/s)": 1.46903 }, { "acc": 0.9966013, "epoch": 20.503089143865843, "grad_norm": 1.1932785511016846, "learning_rate": 6.849789215645147e-06, "loss": 0.02774968, "memory(GiB)": 15.03, "step": 11615, "train_speed(iter/s)": 1.469042 }, { "acc": 0.99570637, "epoch": 20.511915269196823, "grad_norm": 1.392101526260376, "learning_rate": 6.847075045760906e-06, "loss": 0.03833887, "memory(GiB)": 15.03, "step": 11620, "train_speed(iter/s)": 1.469044 }, { "acc": 0.997052, "epoch": 20.520741394527803, "grad_norm": 0.9691382646560669, "learning_rate": 6.8443602455196045e-06, "loss": 0.04173261, "memory(GiB)": 15.03, "step": 11625, "train_speed(iter/s)": 1.469076 }, { "acc": 0.99297752, "epoch": 20.529567519858784, "grad_norm": 0.49479594826698303, "learning_rate": 6.841644815847981e-06, "loss": 0.04035722, "memory(GiB)": 15.03, "step": 11630, "train_speed(iter/s)": 1.46907 }, { "acc": 0.99598112, "epoch": 20.53839364518976, "grad_norm": 4.257979393005371, "learning_rate": 6.838928757672987e-06, "loss": 0.03700193, "memory(GiB)": 15.03, "step": 11635, "train_speed(iter/s)": 1.469098 }, { "acc": 0.99663906, "epoch": 20.54721977052074, "grad_norm": 0.6309407949447632, "learning_rate": 6.836212071921795e-06, "loss": 0.01609545, "memory(GiB)": 15.03, "step": 11640, "train_speed(iter/s)": 1.469068 }, { "acc": 0.99603577, "epoch": 20.55604589585172, "grad_norm": 1.8267923593521118, "learning_rate": 6.833494759521786e-06, "loss": 0.03420743, "memory(GiB)": 15.03, "step": 11645, "train_speed(iter/s)": 1.469067 }, { "acc": 0.9977211, "epoch": 20.5648720211827, "grad_norm": 3.3462960720062256, "learning_rate": 6.8307768214005555e-06, "loss": 0.01995946, "memory(GiB)": 15.03, "step": 11650, "train_speed(iter/s)": 1.469052 }, { "acc": 0.99728432, "epoch": 20.57369814651368, "grad_norm": 1.4648287296295166, "learning_rate": 6.828058258485918e-06, "loss": 0.01732111, "memory(GiB)": 15.03, "step": 11655, "train_speed(iter/s)": 1.469049 }, { "acc": 0.99526443, "epoch": 20.58252427184466, "grad_norm": 3.8418636322021484, "learning_rate": 6.825339071705892e-06, "loss": 0.04106879, "memory(GiB)": 15.03, "step": 11660, "train_speed(iter/s)": 1.469041 }, { "acc": 0.99783173, "epoch": 20.59135039717564, "grad_norm": 0.7641792893409729, "learning_rate": 6.822619261988717e-06, "loss": 0.01767133, "memory(GiB)": 15.03, "step": 11665, "train_speed(iter/s)": 1.46906 }, { "acc": 0.9918951, "epoch": 20.60017652250662, "grad_norm": 3.284404754638672, "learning_rate": 6.8198988302628435e-06, "loss": 0.06784381, "memory(GiB)": 15.03, "step": 11670, "train_speed(iter/s)": 1.469079 }, { "acc": 0.99730473, "epoch": 20.6090026478376, "grad_norm": 3.529428005218506, "learning_rate": 6.81717777745693e-06, "loss": 0.02792052, "memory(GiB)": 15.03, "step": 11675, "train_speed(iter/s)": 1.46909 }, { "acc": 0.99426479, "epoch": 20.61782877316858, "grad_norm": 0.6780421137809753, "learning_rate": 6.8144561044998525e-06, "loss": 0.04460815, "memory(GiB)": 15.03, "step": 11680, "train_speed(iter/s)": 1.469127 }, { "acc": 0.99710646, "epoch": 20.62665489849956, "grad_norm": 2.0737578868865967, "learning_rate": 6.811733812320697e-06, "loss": 0.02029042, "memory(GiB)": 15.03, "step": 11685, "train_speed(iter/s)": 1.469151 }, { "acc": 0.99600601, "epoch": 20.63548102383054, "grad_norm": 1.9989289045333862, "learning_rate": 6.809010901848756e-06, "loss": 0.02687613, "memory(GiB)": 15.03, "step": 11690, "train_speed(iter/s)": 1.469166 }, { "acc": 0.9957901, "epoch": 20.644307149161516, "grad_norm": 6.160000324249268, "learning_rate": 6.80628737401354e-06, "loss": 0.03375019, "memory(GiB)": 15.03, "step": 11695, "train_speed(iter/s)": 1.469195 }, { "acc": 0.99701157, "epoch": 20.653133274492497, "grad_norm": 1.7818267345428467, "learning_rate": 6.803563229744769e-06, "loss": 0.03192264, "memory(GiB)": 15.03, "step": 11700, "train_speed(iter/s)": 1.46919 }, { "acc": 0.99701061, "epoch": 20.661959399823477, "grad_norm": 3.1370503902435303, "learning_rate": 6.800838469972371e-06, "loss": 0.02632526, "memory(GiB)": 15.03, "step": 11705, "train_speed(iter/s)": 1.469209 }, { "acc": 0.99511929, "epoch": 20.670785525154457, "grad_norm": 1.4908334016799927, "learning_rate": 6.798113095626481e-06, "loss": 0.02894391, "memory(GiB)": 15.03, "step": 11710, "train_speed(iter/s)": 1.469222 }, { "acc": 0.99443197, "epoch": 20.679611650485437, "grad_norm": 4.057671070098877, "learning_rate": 6.795387107637454e-06, "loss": 0.05060911, "memory(GiB)": 15.03, "step": 11715, "train_speed(iter/s)": 1.469227 }, { "acc": 0.99691429, "epoch": 20.688437775816418, "grad_norm": 4.685373783111572, "learning_rate": 6.792660506935844e-06, "loss": 0.01696385, "memory(GiB)": 15.03, "step": 11720, "train_speed(iter/s)": 1.469255 }, { "acc": 0.99683924, "epoch": 20.697263901147398, "grad_norm": 5.088984966278076, "learning_rate": 6.789933294452421e-06, "loss": 0.02410519, "memory(GiB)": 15.03, "step": 11725, "train_speed(iter/s)": 1.469265 }, { "acc": 0.99727802, "epoch": 20.706090026478375, "grad_norm": 1.1669700145721436, "learning_rate": 6.787205471118161e-06, "loss": 0.02793313, "memory(GiB)": 15.03, "step": 11730, "train_speed(iter/s)": 1.469301 }, { "acc": 0.99842396, "epoch": 20.714916151809355, "grad_norm": 0.5507938861846924, "learning_rate": 6.784477037864248e-06, "loss": 0.02010109, "memory(GiB)": 15.03, "step": 11735, "train_speed(iter/s)": 1.469303 }, { "acc": 0.9952035, "epoch": 20.723742277140335, "grad_norm": 3.9843082427978516, "learning_rate": 6.781747995622076e-06, "loss": 0.02516325, "memory(GiB)": 15.03, "step": 11740, "train_speed(iter/s)": 1.469299 }, { "acc": 0.99646339, "epoch": 20.732568402471315, "grad_norm": 0.38103997707366943, "learning_rate": 6.779018345323243e-06, "loss": 0.02812132, "memory(GiB)": 15.03, "step": 11745, "train_speed(iter/s)": 1.469317 }, { "acc": 0.99530125, "epoch": 20.741394527802296, "grad_norm": 2.660245895385742, "learning_rate": 6.776288087899562e-06, "loss": 0.03623455, "memory(GiB)": 15.03, "step": 11750, "train_speed(iter/s)": 1.4693 }, { "acc": 0.9960371, "epoch": 20.750220653133276, "grad_norm": 2.908781051635742, "learning_rate": 6.773557224283045e-06, "loss": 0.03095592, "memory(GiB)": 15.03, "step": 11755, "train_speed(iter/s)": 1.46929 }, { "acc": 0.99592361, "epoch": 20.759046778464253, "grad_norm": 3.9631896018981934, "learning_rate": 6.770825755405918e-06, "loss": 0.02979634, "memory(GiB)": 15.03, "step": 11760, "train_speed(iter/s)": 1.469305 }, { "acc": 0.99707575, "epoch": 20.767872903795233, "grad_norm": 3.074695110321045, "learning_rate": 6.768093682200607e-06, "loss": 0.02365619, "memory(GiB)": 15.03, "step": 11765, "train_speed(iter/s)": 1.469287 }, { "acc": 0.99572287, "epoch": 20.776699029126213, "grad_norm": 1.6589990854263306, "learning_rate": 6.76536100559975e-06, "loss": 0.02784116, "memory(GiB)": 15.03, "step": 11770, "train_speed(iter/s)": 1.46929 }, { "acc": 0.99637823, "epoch": 20.785525154457194, "grad_norm": 2.3900651931762695, "learning_rate": 6.7626277265361865e-06, "loss": 0.02651716, "memory(GiB)": 15.03, "step": 11775, "train_speed(iter/s)": 1.469291 }, { "acc": 0.99662247, "epoch": 20.794351279788174, "grad_norm": 4.289814472198486, "learning_rate": 6.759893845942966e-06, "loss": 0.0272873, "memory(GiB)": 15.03, "step": 11780, "train_speed(iter/s)": 1.469335 }, { "acc": 0.99584713, "epoch": 20.803177405119154, "grad_norm": 2.644883155822754, "learning_rate": 6.75715936475334e-06, "loss": 0.02772457, "memory(GiB)": 15.03, "step": 11785, "train_speed(iter/s)": 1.469361 }, { "acc": 0.99635296, "epoch": 20.81200353045013, "grad_norm": 1.7745535373687744, "learning_rate": 6.7544242839007655e-06, "loss": 0.03528707, "memory(GiB)": 15.03, "step": 11790, "train_speed(iter/s)": 1.469351 }, { "acc": 0.99654942, "epoch": 20.82082965578111, "grad_norm": 2.4728307723999023, "learning_rate": 6.751688604318907e-06, "loss": 0.0426509, "memory(GiB)": 15.03, "step": 11795, "train_speed(iter/s)": 1.469353 }, { "acc": 0.99830456, "epoch": 20.82965578111209, "grad_norm": 1.6975289583206177, "learning_rate": 6.748952326941628e-06, "loss": 0.03239129, "memory(GiB)": 15.03, "step": 11800, "train_speed(iter/s)": 1.469391 }, { "acc": 0.99699831, "epoch": 20.83848190644307, "grad_norm": 3.1539804935455322, "learning_rate": 6.746215452703002e-06, "loss": 0.02566279, "memory(GiB)": 15.03, "step": 11805, "train_speed(iter/s)": 1.469394 }, { "acc": 0.99729843, "epoch": 20.847308031774052, "grad_norm": 1.8182134628295898, "learning_rate": 6.743477982537302e-06, "loss": 0.01901203, "memory(GiB)": 15.03, "step": 11810, "train_speed(iter/s)": 1.469408 }, { "acc": 0.99718456, "epoch": 20.856134157105032, "grad_norm": 2.1594107151031494, "learning_rate": 6.740739917379005e-06, "loss": 0.02424044, "memory(GiB)": 15.03, "step": 11815, "train_speed(iter/s)": 1.46943 }, { "acc": 0.9953619, "epoch": 20.864960282436012, "grad_norm": 1.6751805543899536, "learning_rate": 6.738001258162795e-06, "loss": 0.03592256, "memory(GiB)": 15.03, "step": 11820, "train_speed(iter/s)": 1.469452 }, { "acc": 0.99800949, "epoch": 20.87378640776699, "grad_norm": 1.6874191761016846, "learning_rate": 6.7352620058235556e-06, "loss": 0.01596551, "memory(GiB)": 15.03, "step": 11825, "train_speed(iter/s)": 1.469474 }, { "acc": 0.9945612, "epoch": 20.88261253309797, "grad_norm": 2.456399917602539, "learning_rate": 6.73252216129637e-06, "loss": 0.04395757, "memory(GiB)": 15.03, "step": 11830, "train_speed(iter/s)": 1.469494 }, { "acc": 0.99707947, "epoch": 20.89143865842895, "grad_norm": 3.087447166442871, "learning_rate": 6.7297817255165275e-06, "loss": 0.01898207, "memory(GiB)": 15.03, "step": 11835, "train_speed(iter/s)": 1.469504 }, { "acc": 0.99523497, "epoch": 20.90026478375993, "grad_norm": 1.2652016878128052, "learning_rate": 6.727040699419522e-06, "loss": 0.03784457, "memory(GiB)": 15.03, "step": 11840, "train_speed(iter/s)": 1.469544 }, { "acc": 0.99543209, "epoch": 20.90909090909091, "grad_norm": 1.991388201713562, "learning_rate": 6.72429908394104e-06, "loss": 0.0383831, "memory(GiB)": 15.03, "step": 11845, "train_speed(iter/s)": 1.469531 }, { "acc": 0.99763031, "epoch": 20.91791703442189, "grad_norm": 1.871523380279541, "learning_rate": 6.721556880016981e-06, "loss": 0.01620269, "memory(GiB)": 15.03, "step": 11850, "train_speed(iter/s)": 1.469531 }, { "acc": 0.99754009, "epoch": 20.926743159752867, "grad_norm": 1.9969696998596191, "learning_rate": 6.718814088583432e-06, "loss": 0.02180164, "memory(GiB)": 15.03, "step": 11855, "train_speed(iter/s)": 1.46953 }, { "acc": 0.99759054, "epoch": 20.935569285083847, "grad_norm": 1.7103426456451416, "learning_rate": 6.716070710576692e-06, "loss": 0.0228885, "memory(GiB)": 15.03, "step": 11860, "train_speed(iter/s)": 1.469556 }, { "acc": 0.9970871, "epoch": 20.944395410414828, "grad_norm": 2.210845947265625, "learning_rate": 6.713326746933253e-06, "loss": 0.01438115, "memory(GiB)": 15.03, "step": 11865, "train_speed(iter/s)": 1.469573 }, { "acc": 0.99451675, "epoch": 20.953221535745808, "grad_norm": 1.8248034715652466, "learning_rate": 6.7105821985898125e-06, "loss": 0.03563535, "memory(GiB)": 15.03, "step": 11870, "train_speed(iter/s)": 1.469581 }, { "acc": 0.9958643, "epoch": 20.962047661076788, "grad_norm": 3.112424612045288, "learning_rate": 6.70783706648326e-06, "loss": 0.02734413, "memory(GiB)": 15.03, "step": 11875, "train_speed(iter/s)": 1.469591 }, { "acc": 0.99503794, "epoch": 20.97087378640777, "grad_norm": 1.1112366914749146, "learning_rate": 6.705091351550692e-06, "loss": 0.0358901, "memory(GiB)": 15.03, "step": 11880, "train_speed(iter/s)": 1.469635 }, { "acc": 0.99646006, "epoch": 20.979699911738745, "grad_norm": 1.8977891206741333, "learning_rate": 6.702345054729404e-06, "loss": 0.0280488, "memory(GiB)": 15.03, "step": 11885, "train_speed(iter/s)": 1.469659 }, { "acc": 0.99080048, "epoch": 20.988526037069725, "grad_norm": 2.2494289875030518, "learning_rate": 6.699598176956882e-06, "loss": 0.05448233, "memory(GiB)": 15.03, "step": 11890, "train_speed(iter/s)": 1.469666 }, { "acc": 0.99707899, "epoch": 20.997352162400706, "grad_norm": 1.1353039741516113, "learning_rate": 6.6968507191708184e-06, "loss": 0.02903688, "memory(GiB)": 15.03, "step": 11895, "train_speed(iter/s)": 1.46966 }, { "acc": 0.99716854, "epoch": 21.006178287731686, "grad_norm": 4.935488700866699, "learning_rate": 6.694102682309099e-06, "loss": 0.02203473, "memory(GiB)": 15.03, "step": 11900, "train_speed(iter/s)": 1.469577 }, { "acc": 0.9978158, "epoch": 21.015004413062666, "grad_norm": 1.8264049291610718, "learning_rate": 6.6913540673098094e-06, "loss": 0.02377527, "memory(GiB)": 15.03, "step": 11905, "train_speed(iter/s)": 1.469601 }, { "acc": 0.99587355, "epoch": 21.023830538393646, "grad_norm": 2.491323709487915, "learning_rate": 6.68860487511123e-06, "loss": 0.02690804, "memory(GiB)": 15.03, "step": 11910, "train_speed(iter/s)": 1.469586 }, { "acc": 0.99479914, "epoch": 21.032656663724627, "grad_norm": 3.3668413162231445, "learning_rate": 6.685855106651844e-06, "loss": 0.03430663, "memory(GiB)": 15.03, "step": 11915, "train_speed(iter/s)": 1.469599 }, { "acc": 0.99628048, "epoch": 21.041482789055603, "grad_norm": 0.6758538484573364, "learning_rate": 6.683104762870326e-06, "loss": 0.02702948, "memory(GiB)": 15.03, "step": 11920, "train_speed(iter/s)": 1.469578 }, { "acc": 0.99599915, "epoch": 21.050308914386584, "grad_norm": 0.7934470176696777, "learning_rate": 6.680353844705548e-06, "loss": 0.02527221, "memory(GiB)": 15.03, "step": 11925, "train_speed(iter/s)": 1.469589 }, { "acc": 0.99395247, "epoch": 21.059135039717564, "grad_norm": 1.5018011331558228, "learning_rate": 6.677602353096579e-06, "loss": 0.03635088, "memory(GiB)": 15.03, "step": 11930, "train_speed(iter/s)": 1.469591 }, { "acc": 0.99938698, "epoch": 21.067961165048544, "grad_norm": 0.6373846530914307, "learning_rate": 6.674850288982684e-06, "loss": 0.01337044, "memory(GiB)": 15.03, "step": 11935, "train_speed(iter/s)": 1.469584 }, { "acc": 0.99658566, "epoch": 21.076787290379524, "grad_norm": 0.5049974322319031, "learning_rate": 6.67209765330332e-06, "loss": 0.02489654, "memory(GiB)": 15.03, "step": 11940, "train_speed(iter/s)": 1.469589 }, { "acc": 0.99618187, "epoch": 21.085613415710505, "grad_norm": 2.5880343914031982, "learning_rate": 6.6693444469981465e-06, "loss": 0.02630383, "memory(GiB)": 15.03, "step": 11945, "train_speed(iter/s)": 1.469599 }, { "acc": 0.99718361, "epoch": 21.09443954104148, "grad_norm": 2.1471877098083496, "learning_rate": 6.666590671007008e-06, "loss": 0.02118483, "memory(GiB)": 15.03, "step": 11950, "train_speed(iter/s)": 1.469647 }, { "acc": 0.99714031, "epoch": 21.10326566637246, "grad_norm": 1.0876911878585815, "learning_rate": 6.663836326269955e-06, "loss": 0.02380691, "memory(GiB)": 15.03, "step": 11955, "train_speed(iter/s)": 1.469645 }, { "acc": 0.99597168, "epoch": 21.112091791703442, "grad_norm": 4.622867584228516, "learning_rate": 6.661081413727218e-06, "loss": 0.02657543, "memory(GiB)": 15.03, "step": 11960, "train_speed(iter/s)": 1.469647 }, { "acc": 0.99369621, "epoch": 21.120917917034422, "grad_norm": 2.1674022674560547, "learning_rate": 6.658325934319236e-06, "loss": 0.03969138, "memory(GiB)": 15.03, "step": 11965, "train_speed(iter/s)": 1.469635 }, { "acc": 0.99739723, "epoch": 21.129744042365402, "grad_norm": 1.8469010591506958, "learning_rate": 6.655569888986633e-06, "loss": 0.01838326, "memory(GiB)": 15.03, "step": 11970, "train_speed(iter/s)": 1.469679 }, { "acc": 0.99759121, "epoch": 21.138570167696383, "grad_norm": 0.4702383279800415, "learning_rate": 6.652813278670225e-06, "loss": 0.02403338, "memory(GiB)": 15.03, "step": 11975, "train_speed(iter/s)": 1.469669 }, { "acc": 0.99592342, "epoch": 21.14739629302736, "grad_norm": 3.6091861724853516, "learning_rate": 6.650056104311026e-06, "loss": 0.0342767, "memory(GiB)": 15.03, "step": 11980, "train_speed(iter/s)": 1.469663 }, { "acc": 0.99669743, "epoch": 21.15622241835834, "grad_norm": 3.5297915935516357, "learning_rate": 6.647298366850241e-06, "loss": 0.03032171, "memory(GiB)": 15.03, "step": 11985, "train_speed(iter/s)": 1.469653 }, { "acc": 0.9976613, "epoch": 21.16504854368932, "grad_norm": 2.161700487136841, "learning_rate": 6.644540067229265e-06, "loss": 0.02021085, "memory(GiB)": 15.03, "step": 11990, "train_speed(iter/s)": 1.46963 }, { "acc": 0.99847422, "epoch": 21.1738746690203, "grad_norm": 1.1302474737167358, "learning_rate": 6.6417812063896854e-06, "loss": 0.01588413, "memory(GiB)": 15.03, "step": 11995, "train_speed(iter/s)": 1.469644 }, { "acc": 0.99749222, "epoch": 21.18270079435128, "grad_norm": 2.943385601043701, "learning_rate": 6.639021785273287e-06, "loss": 0.01925432, "memory(GiB)": 15.03, "step": 12000, "train_speed(iter/s)": 1.469659 }, { "acc": 0.99532776, "epoch": 21.19152691968226, "grad_norm": 3.1802737712860107, "learning_rate": 6.636261804822034e-06, "loss": 0.04267368, "memory(GiB)": 15.03, "step": 12005, "train_speed(iter/s)": 1.469632 }, { "acc": 0.99695559, "epoch": 21.20035304501324, "grad_norm": 2.611677885055542, "learning_rate": 6.633501265978094e-06, "loss": 0.02314058, "memory(GiB)": 15.03, "step": 12010, "train_speed(iter/s)": 1.469656 }, { "acc": 0.99655533, "epoch": 21.209179170344218, "grad_norm": 1.581214427947998, "learning_rate": 6.630740169683818e-06, "loss": 0.02349881, "memory(GiB)": 15.03, "step": 12015, "train_speed(iter/s)": 1.46966 }, { "acc": 0.99490223, "epoch": 21.218005295675198, "grad_norm": 0.9434047341346741, "learning_rate": 6.62797851688175e-06, "loss": 0.04626391, "memory(GiB)": 15.03, "step": 12020, "train_speed(iter/s)": 1.469676 }, { "acc": 0.99510202, "epoch": 21.226831421006178, "grad_norm": 4.894164562225342, "learning_rate": 6.625216308514622e-06, "loss": 0.03052229, "memory(GiB)": 15.03, "step": 12025, "train_speed(iter/s)": 1.469714 }, { "acc": 0.99721546, "epoch": 21.23565754633716, "grad_norm": 1.738499641418457, "learning_rate": 6.622453545525356e-06, "loss": 0.02209505, "memory(GiB)": 15.03, "step": 12030, "train_speed(iter/s)": 1.469727 }, { "acc": 0.9956501, "epoch": 21.24448367166814, "grad_norm": 1.1205945014953613, "learning_rate": 6.619690228857065e-06, "loss": 0.03763472, "memory(GiB)": 15.03, "step": 12035, "train_speed(iter/s)": 1.469713 }, { "acc": 0.99517155, "epoch": 21.25330979699912, "grad_norm": 5.202886581420898, "learning_rate": 6.616926359453052e-06, "loss": 0.04439218, "memory(GiB)": 15.03, "step": 12040, "train_speed(iter/s)": 1.469706 }, { "acc": 0.99631634, "epoch": 21.262135922330096, "grad_norm": 3.1302967071533203, "learning_rate": 6.614161938256801e-06, "loss": 0.03506793, "memory(GiB)": 15.03, "step": 12045, "train_speed(iter/s)": 1.469712 }, { "acc": 0.99649525, "epoch": 21.270962047661076, "grad_norm": 2.3465116024017334, "learning_rate": 6.611396966211997e-06, "loss": 0.02774972, "memory(GiB)": 15.03, "step": 12050, "train_speed(iter/s)": 1.469732 }, { "acc": 0.99517117, "epoch": 21.279788172992056, "grad_norm": 3.102801561355591, "learning_rate": 6.608631444262502e-06, "loss": 0.03145716, "memory(GiB)": 15.03, "step": 12055, "train_speed(iter/s)": 1.469721 }, { "acc": 0.99431763, "epoch": 21.288614298323036, "grad_norm": 1.9454317092895508, "learning_rate": 6.605865373352373e-06, "loss": 0.0388845, "memory(GiB)": 15.03, "step": 12060, "train_speed(iter/s)": 1.46975 }, { "acc": 0.99736328, "epoch": 21.297440423654017, "grad_norm": 2.873465061187744, "learning_rate": 6.603098754425845e-06, "loss": 0.03169807, "memory(GiB)": 15.03, "step": 12065, "train_speed(iter/s)": 1.46979 }, { "acc": 0.9957962, "epoch": 21.306266548984997, "grad_norm": 3.3915693759918213, "learning_rate": 6.6003315884273535e-06, "loss": 0.03005992, "memory(GiB)": 15.03, "step": 12070, "train_speed(iter/s)": 1.469801 }, { "acc": 0.99786148, "epoch": 21.315092674315974, "grad_norm": 2.9509599208831787, "learning_rate": 6.597563876301509e-06, "loss": 0.02271064, "memory(GiB)": 15.03, "step": 12075, "train_speed(iter/s)": 1.469816 }, { "acc": 0.99764986, "epoch": 21.323918799646954, "grad_norm": 0.44787609577178955, "learning_rate": 6.594795618993115e-06, "loss": 0.02073657, "memory(GiB)": 15.03, "step": 12080, "train_speed(iter/s)": 1.469839 }, { "acc": 0.99610529, "epoch": 21.332744924977934, "grad_norm": 0.5118236541748047, "learning_rate": 6.592026817447157e-06, "loss": 0.02801876, "memory(GiB)": 15.03, "step": 12085, "train_speed(iter/s)": 1.469843 }, { "acc": 0.99654102, "epoch": 21.341571050308914, "grad_norm": 2.234431028366089, "learning_rate": 6.589257472608812e-06, "loss": 0.03174267, "memory(GiB)": 15.03, "step": 12090, "train_speed(iter/s)": 1.469846 }, { "acc": 0.99623117, "epoch": 21.350397175639895, "grad_norm": 9.533373832702637, "learning_rate": 6.586487585423437e-06, "loss": 0.02845207, "memory(GiB)": 15.03, "step": 12095, "train_speed(iter/s)": 1.469904 }, { "acc": 0.99781342, "epoch": 21.359223300970875, "grad_norm": 2.7854256629943848, "learning_rate": 6.583717156836576e-06, "loss": 0.01911304, "memory(GiB)": 15.03, "step": 12100, "train_speed(iter/s)": 1.469934 }, { "acc": 0.99689322, "epoch": 21.368049426301855, "grad_norm": 1.3485133647918701, "learning_rate": 6.580946187793956e-06, "loss": 0.03447926, "memory(GiB)": 15.03, "step": 12105, "train_speed(iter/s)": 1.469922 }, { "acc": 0.99755096, "epoch": 21.376875551632832, "grad_norm": 0.8587046265602112, "learning_rate": 6.5781746792414935e-06, "loss": 0.02632097, "memory(GiB)": 15.03, "step": 12110, "train_speed(iter/s)": 1.46995 }, { "acc": 0.99761925, "epoch": 21.385701676963812, "grad_norm": 0.9185575246810913, "learning_rate": 6.575402632125284e-06, "loss": 0.02324752, "memory(GiB)": 15.03, "step": 12115, "train_speed(iter/s)": 1.469957 }, { "acc": 0.99792242, "epoch": 21.394527802294792, "grad_norm": 2.002448320388794, "learning_rate": 6.5726300473916085e-06, "loss": 0.02577079, "memory(GiB)": 15.03, "step": 12120, "train_speed(iter/s)": 1.469935 }, { "acc": 0.99563618, "epoch": 21.403353927625773, "grad_norm": 1.6840862035751343, "learning_rate": 6.569856925986934e-06, "loss": 0.02659895, "memory(GiB)": 15.03, "step": 12125, "train_speed(iter/s)": 1.469939 }, { "acc": 0.99351444, "epoch": 21.412180052956753, "grad_norm": 5.516316890716553, "learning_rate": 6.567083268857906e-06, "loss": 0.04408172, "memory(GiB)": 15.03, "step": 12130, "train_speed(iter/s)": 1.46989 }, { "acc": 0.99736404, "epoch": 21.421006178287733, "grad_norm": 1.3672001361846924, "learning_rate": 6.564309076951358e-06, "loss": 0.01886975, "memory(GiB)": 15.03, "step": 12135, "train_speed(iter/s)": 1.469848 }, { "acc": 0.99714174, "epoch": 21.42983230361871, "grad_norm": 2.4592437744140625, "learning_rate": 6.5615343512143014e-06, "loss": 0.02792178, "memory(GiB)": 15.03, "step": 12140, "train_speed(iter/s)": 1.469822 }, { "acc": 0.99778156, "epoch": 21.43865842894969, "grad_norm": 2.1994690895080566, "learning_rate": 6.558759092593932e-06, "loss": 0.02274104, "memory(GiB)": 15.03, "step": 12145, "train_speed(iter/s)": 1.469838 }, { "acc": 0.99572639, "epoch": 21.44748455428067, "grad_norm": 1.7375452518463135, "learning_rate": 6.555983302037629e-06, "loss": 0.0465883, "memory(GiB)": 15.03, "step": 12150, "train_speed(iter/s)": 1.469811 }, { "acc": 0.99582691, "epoch": 21.45631067961165, "grad_norm": 3.9422874450683594, "learning_rate": 6.55320698049295e-06, "loss": 0.03398085, "memory(GiB)": 15.03, "step": 12155, "train_speed(iter/s)": 1.469806 }, { "acc": 0.99696522, "epoch": 21.46513680494263, "grad_norm": 2.5316097736358643, "learning_rate": 6.550430128907636e-06, "loss": 0.02479848, "memory(GiB)": 15.03, "step": 12160, "train_speed(iter/s)": 1.469808 }, { "acc": 0.99528475, "epoch": 21.47396293027361, "grad_norm": 4.651650428771973, "learning_rate": 6.547652748229608e-06, "loss": 0.04180525, "memory(GiB)": 15.03, "step": 12165, "train_speed(iter/s)": 1.469806 }, { "acc": 0.99727345, "epoch": 21.482789055604588, "grad_norm": 3.478546380996704, "learning_rate": 6.544874839406967e-06, "loss": 0.01847852, "memory(GiB)": 15.03, "step": 12170, "train_speed(iter/s)": 1.469862 }, { "acc": 0.99639359, "epoch": 21.491615180935568, "grad_norm": 1.3012635707855225, "learning_rate": 6.542096403388e-06, "loss": 0.0291914, "memory(GiB)": 15.03, "step": 12175, "train_speed(iter/s)": 1.469865 }, { "acc": 0.99569597, "epoch": 21.50044130626655, "grad_norm": 2.6433963775634766, "learning_rate": 6.539317441121166e-06, "loss": 0.0336272, "memory(GiB)": 15.03, "step": 12180, "train_speed(iter/s)": 1.469887 }, { "acc": 0.99648056, "epoch": 21.50926743159753, "grad_norm": 2.29276180267334, "learning_rate": 6.5365379535551055e-06, "loss": 0.03466611, "memory(GiB)": 15.03, "step": 12185, "train_speed(iter/s)": 1.469897 }, { "acc": 0.99621582, "epoch": 21.51809355692851, "grad_norm": 1.0352094173431396, "learning_rate": 6.533757941638642e-06, "loss": 0.0334052, "memory(GiB)": 15.03, "step": 12190, "train_speed(iter/s)": 1.469899 }, { "acc": 0.99346361, "epoch": 21.52691968225949, "grad_norm": 1.7321921586990356, "learning_rate": 6.530977406320774e-06, "loss": 0.04463977, "memory(GiB)": 15.03, "step": 12195, "train_speed(iter/s)": 1.469914 }, { "acc": 0.99618492, "epoch": 21.535745807590466, "grad_norm": 0.5074664354324341, "learning_rate": 6.528196348550682e-06, "loss": 0.02173223, "memory(GiB)": 15.03, "step": 12200, "train_speed(iter/s)": 1.469916 }, { "acc": 0.99629002, "epoch": 21.544571932921446, "grad_norm": 1.638535499572754, "learning_rate": 6.52541476927772e-06, "loss": 0.0218591, "memory(GiB)": 15.03, "step": 12205, "train_speed(iter/s)": 1.469908 }, { "acc": 0.99915295, "epoch": 21.553398058252426, "grad_norm": 0.4073748290538788, "learning_rate": 6.522632669451431e-06, "loss": 0.01355221, "memory(GiB)": 15.03, "step": 12210, "train_speed(iter/s)": 1.469939 }, { "acc": 0.99766607, "epoch": 21.562224183583407, "grad_norm": 0.9364656805992126, "learning_rate": 6.519850050021518e-06, "loss": 0.02244304, "memory(GiB)": 15.03, "step": 12215, "train_speed(iter/s)": 1.469982 }, { "acc": 0.99502373, "epoch": 21.571050308914387, "grad_norm": 6.552870750427246, "learning_rate": 6.51706691193788e-06, "loss": 0.0314945, "memory(GiB)": 15.03, "step": 12220, "train_speed(iter/s)": 1.469983 }, { "acc": 0.99668255, "epoch": 21.579876434245367, "grad_norm": 3.931213855743408, "learning_rate": 6.514283256150579e-06, "loss": 0.02125878, "memory(GiB)": 15.03, "step": 12225, "train_speed(iter/s)": 1.470012 }, { "acc": 0.99752512, "epoch": 21.588702559576348, "grad_norm": 0.7141975164413452, "learning_rate": 6.51149908360986e-06, "loss": 0.02040848, "memory(GiB)": 15.03, "step": 12230, "train_speed(iter/s)": 1.470014 }, { "acc": 0.99553432, "epoch": 21.597528684907324, "grad_norm": 0.8063466548919678, "learning_rate": 6.508714395266146e-06, "loss": 0.0287303, "memory(GiB)": 15.03, "step": 12235, "train_speed(iter/s)": 1.470019 }, { "acc": 0.99608955, "epoch": 21.606354810238305, "grad_norm": 3.678741931915283, "learning_rate": 6.50592919207003e-06, "loss": 0.02700407, "memory(GiB)": 15.03, "step": 12240, "train_speed(iter/s)": 1.47003 }, { "acc": 0.99577141, "epoch": 21.615180935569285, "grad_norm": 2.0793259143829346, "learning_rate": 6.503143474972286e-06, "loss": 0.03648351, "memory(GiB)": 15.03, "step": 12245, "train_speed(iter/s)": 1.470057 }, { "acc": 0.9985177, "epoch": 21.624007060900265, "grad_norm": 2.31373929977417, "learning_rate": 6.500357244923863e-06, "loss": 0.01794302, "memory(GiB)": 15.03, "step": 12250, "train_speed(iter/s)": 1.470078 }, { "acc": 0.99579439, "epoch": 21.632833186231245, "grad_norm": 1.0123404264450073, "learning_rate": 6.497570502875883e-06, "loss": 0.02947093, "memory(GiB)": 15.03, "step": 12255, "train_speed(iter/s)": 1.470052 }, { "acc": 0.9946003, "epoch": 21.641659311562226, "grad_norm": 2.4923901557922363, "learning_rate": 6.494783249779644e-06, "loss": 0.03333504, "memory(GiB)": 15.03, "step": 12260, "train_speed(iter/s)": 1.470083 }, { "acc": 0.99431849, "epoch": 21.650485436893202, "grad_norm": 1.5751255750656128, "learning_rate": 6.491995486586615e-06, "loss": 0.05634852, "memory(GiB)": 15.03, "step": 12265, "train_speed(iter/s)": 1.470089 }, { "acc": 0.99509602, "epoch": 21.659311562224183, "grad_norm": 2.123697519302368, "learning_rate": 6.489207214248447e-06, "loss": 0.03828749, "memory(GiB)": 15.03, "step": 12270, "train_speed(iter/s)": 1.470079 }, { "acc": 0.99642897, "epoch": 21.668137687555163, "grad_norm": 0.6698795557022095, "learning_rate": 6.486418433716958e-06, "loss": 0.02728045, "memory(GiB)": 15.03, "step": 12275, "train_speed(iter/s)": 1.470101 }, { "acc": 0.9946846, "epoch": 21.676963812886143, "grad_norm": 2.228682518005371, "learning_rate": 6.483629145944142e-06, "loss": 0.04082271, "memory(GiB)": 15.03, "step": 12280, "train_speed(iter/s)": 1.470104 }, { "acc": 0.99580383, "epoch": 21.685789938217123, "grad_norm": 2.8437626361846924, "learning_rate": 6.4808393518821645e-06, "loss": 0.03329247, "memory(GiB)": 15.03, "step": 12285, "train_speed(iter/s)": 1.470103 }, { "acc": 0.99759102, "epoch": 21.694616063548104, "grad_norm": 0.9066205024719238, "learning_rate": 6.478049052483366e-06, "loss": 0.02215498, "memory(GiB)": 15.03, "step": 12290, "train_speed(iter/s)": 1.470124 }, { "acc": 0.99617224, "epoch": 21.70344218887908, "grad_norm": 2.1108481884002686, "learning_rate": 6.4752582487002555e-06, "loss": 0.03361204, "memory(GiB)": 15.03, "step": 12295, "train_speed(iter/s)": 1.470161 }, { "acc": 0.99680328, "epoch": 21.71226831421006, "grad_norm": 1.0020421743392944, "learning_rate": 6.472466941485522e-06, "loss": 0.0399758, "memory(GiB)": 15.03, "step": 12300, "train_speed(iter/s)": 1.470179 }, { "acc": 0.99787178, "epoch": 21.72109443954104, "grad_norm": 0.5664821863174438, "learning_rate": 6.4696751317920205e-06, "loss": 0.01695136, "memory(GiB)": 15.03, "step": 12305, "train_speed(iter/s)": 1.470164 }, { "acc": 0.99571838, "epoch": 21.72992056487202, "grad_norm": 2.2995386123657227, "learning_rate": 6.466882820572775e-06, "loss": 0.0336317, "memory(GiB)": 15.03, "step": 12310, "train_speed(iter/s)": 1.470168 }, { "acc": 0.99726868, "epoch": 21.738746690203, "grad_norm": 1.7595527172088623, "learning_rate": 6.464090008780989e-06, "loss": 0.02427807, "memory(GiB)": 15.03, "step": 12315, "train_speed(iter/s)": 1.470186 }, { "acc": 0.99820633, "epoch": 21.74757281553398, "grad_norm": 2.357659339904785, "learning_rate": 6.4612966973700285e-06, "loss": 0.01461508, "memory(GiB)": 15.03, "step": 12320, "train_speed(iter/s)": 1.470172 }, { "acc": 0.99308815, "epoch": 21.756398940864962, "grad_norm": 1.4471853971481323, "learning_rate": 6.458502887293438e-06, "loss": 0.04763771, "memory(GiB)": 15.03, "step": 12325, "train_speed(iter/s)": 1.470198 }, { "acc": 0.99572077, "epoch": 21.76522506619594, "grad_norm": 5.033434867858887, "learning_rate": 6.45570857950492e-06, "loss": 0.03285257, "memory(GiB)": 15.03, "step": 12330, "train_speed(iter/s)": 1.470195 }, { "acc": 0.99643297, "epoch": 21.77405119152692, "grad_norm": 0.834710955619812, "learning_rate": 6.452913774958365e-06, "loss": 0.02700616, "memory(GiB)": 15.03, "step": 12335, "train_speed(iter/s)": 1.470182 }, { "acc": 0.99599266, "epoch": 21.7828773168579, "grad_norm": 3.0165657997131348, "learning_rate": 6.450118474607815e-06, "loss": 0.02975655, "memory(GiB)": 15.03, "step": 12340, "train_speed(iter/s)": 1.470198 }, { "acc": 0.99309826, "epoch": 21.79170344218888, "grad_norm": 3.8383262157440186, "learning_rate": 6.447322679407494e-06, "loss": 0.04436788, "memory(GiB)": 15.03, "step": 12345, "train_speed(iter/s)": 1.470199 }, { "acc": 0.99861259, "epoch": 21.80052956751986, "grad_norm": 0.7108463048934937, "learning_rate": 6.444526390311786e-06, "loss": 0.01545538, "memory(GiB)": 15.03, "step": 12350, "train_speed(iter/s)": 1.470197 }, { "acc": 0.99676409, "epoch": 21.80935569285084, "grad_norm": 3.9989757537841797, "learning_rate": 6.441729608275252e-06, "loss": 0.02603663, "memory(GiB)": 15.03, "step": 12355, "train_speed(iter/s)": 1.470178 }, { "acc": 0.99804211, "epoch": 21.818181818181817, "grad_norm": 0.2673841118812561, "learning_rate": 6.438932334252614e-06, "loss": 0.01625914, "memory(GiB)": 15.03, "step": 12360, "train_speed(iter/s)": 1.470143 }, { "acc": 0.99743843, "epoch": 21.827007943512797, "grad_norm": 0.8972805738449097, "learning_rate": 6.436134569198766e-06, "loss": 0.01813404, "memory(GiB)": 15.03, "step": 12365, "train_speed(iter/s)": 1.470104 }, { "acc": 0.99849491, "epoch": 21.835834068843777, "grad_norm": 1.9717392921447754, "learning_rate": 6.433336314068767e-06, "loss": 0.01686722, "memory(GiB)": 15.03, "step": 12370, "train_speed(iter/s)": 1.470129 }, { "acc": 0.99534168, "epoch": 21.844660194174757, "grad_norm": 4.209080696105957, "learning_rate": 6.430537569817848e-06, "loss": 0.03990111, "memory(GiB)": 15.03, "step": 12375, "train_speed(iter/s)": 1.470131 }, { "acc": 0.9977951, "epoch": 21.853486319505738, "grad_norm": 6.2644734382629395, "learning_rate": 6.4277383374014e-06, "loss": 0.02411359, "memory(GiB)": 15.03, "step": 12380, "train_speed(iter/s)": 1.470163 }, { "acc": 0.99578981, "epoch": 21.862312444836718, "grad_norm": 0.3404533565044403, "learning_rate": 6.424938617774989e-06, "loss": 0.02112116, "memory(GiB)": 15.03, "step": 12385, "train_speed(iter/s)": 1.470147 }, { "acc": 0.99785337, "epoch": 21.871138570167695, "grad_norm": 0.8347015976905823, "learning_rate": 6.422138411894339e-06, "loss": 0.02146329, "memory(GiB)": 15.03, "step": 12390, "train_speed(iter/s)": 1.470165 }, { "acc": 0.99731731, "epoch": 21.879964695498675, "grad_norm": 1.323114037513733, "learning_rate": 6.4193377207153466e-06, "loss": 0.02242535, "memory(GiB)": 15.03, "step": 12395, "train_speed(iter/s)": 1.470143 }, { "acc": 0.99689693, "epoch": 21.888790820829655, "grad_norm": 1.8163048028945923, "learning_rate": 6.4165365451940665e-06, "loss": 0.03854343, "memory(GiB)": 15.03, "step": 12400, "train_speed(iter/s)": 1.470177 }, { "acc": 0.99649506, "epoch": 21.897616946160635, "grad_norm": 2.518533945083618, "learning_rate": 6.413734886286728e-06, "loss": 0.02717898, "memory(GiB)": 15.03, "step": 12405, "train_speed(iter/s)": 1.470164 }, { "acc": 0.99730473, "epoch": 21.906443071491616, "grad_norm": 0.7375039458274841, "learning_rate": 6.410932744949718e-06, "loss": 0.02835179, "memory(GiB)": 15.03, "step": 12410, "train_speed(iter/s)": 1.47016 }, { "acc": 0.99737806, "epoch": 21.915269196822596, "grad_norm": 3.1238718032836914, "learning_rate": 6.408130122139594e-06, "loss": 0.02262637, "memory(GiB)": 15.03, "step": 12415, "train_speed(iter/s)": 1.47016 }, { "acc": 0.99708405, "epoch": 21.924095322153576, "grad_norm": 3.2089340686798096, "learning_rate": 6.405327018813072e-06, "loss": 0.01896079, "memory(GiB)": 15.03, "step": 12420, "train_speed(iter/s)": 1.470167 }, { "acc": 0.9970665, "epoch": 21.932921447484553, "grad_norm": 1.6808823347091675, "learning_rate": 6.402523435927035e-06, "loss": 0.0144999, "memory(GiB)": 15.03, "step": 12425, "train_speed(iter/s)": 1.470146 }, { "acc": 0.9982543, "epoch": 21.941747572815533, "grad_norm": 0.7888918519020081, "learning_rate": 6.399719374438534e-06, "loss": 0.01799254, "memory(GiB)": 15.03, "step": 12430, "train_speed(iter/s)": 1.470153 }, { "acc": 0.99499035, "epoch": 21.950573698146513, "grad_norm": 1.3922125101089478, "learning_rate": 6.396914835304773e-06, "loss": 0.03787658, "memory(GiB)": 15.03, "step": 12435, "train_speed(iter/s)": 1.470135 }, { "acc": 0.99750271, "epoch": 21.959399823477494, "grad_norm": 0.7653646469116211, "learning_rate": 6.394109819483128e-06, "loss": 0.02133389, "memory(GiB)": 15.03, "step": 12440, "train_speed(iter/s)": 1.470137 }, { "acc": 0.99586668, "epoch": 21.968225948808474, "grad_norm": 2.8259332180023193, "learning_rate": 6.3913043279311326e-06, "loss": 0.02917753, "memory(GiB)": 15.03, "step": 12445, "train_speed(iter/s)": 1.470153 }, { "acc": 0.99521103, "epoch": 21.977052074139454, "grad_norm": 1.9689159393310547, "learning_rate": 6.388498361606488e-06, "loss": 0.04038506, "memory(GiB)": 15.03, "step": 12450, "train_speed(iter/s)": 1.470153 }, { "acc": 0.99823103, "epoch": 21.98587819947043, "grad_norm": 0.6455918550491333, "learning_rate": 6.385691921467051e-06, "loss": 0.01247414, "memory(GiB)": 15.03, "step": 12455, "train_speed(iter/s)": 1.47017 }, { "acc": 0.99610682, "epoch": 21.99470432480141, "grad_norm": 1.7752376794815063, "learning_rate": 6.382885008470847e-06, "loss": 0.02679209, "memory(GiB)": 15.03, "step": 12460, "train_speed(iter/s)": 1.470192 }, { "acc": 0.99684505, "epoch": 22.00353045013239, "grad_norm": 2.8056650161743164, "learning_rate": 6.380077623576058e-06, "loss": 0.02395639, "memory(GiB)": 15.03, "step": 12465, "train_speed(iter/s)": 1.470116 }, { "acc": 0.99659634, "epoch": 22.01235657546337, "grad_norm": 2.9710044860839844, "learning_rate": 6.377269767741028e-06, "loss": 0.02238261, "memory(GiB)": 15.03, "step": 12470, "train_speed(iter/s)": 1.470111 }, { "acc": 0.9976553, "epoch": 22.021182700794352, "grad_norm": 2.599707841873169, "learning_rate": 6.37446144192426e-06, "loss": 0.02024159, "memory(GiB)": 15.03, "step": 12475, "train_speed(iter/s)": 1.470129 }, { "acc": 0.99499083, "epoch": 22.030008826125332, "grad_norm": 1.7119064331054688, "learning_rate": 6.371652647084424e-06, "loss": 0.03483887, "memory(GiB)": 15.03, "step": 12480, "train_speed(iter/s)": 1.470128 }, { "acc": 0.99573126, "epoch": 22.038834951456312, "grad_norm": 4.901206016540527, "learning_rate": 6.368843384180343e-06, "loss": 0.03157653, "memory(GiB)": 15.03, "step": 12485, "train_speed(iter/s)": 1.470153 }, { "acc": 0.99619904, "epoch": 22.04766107678729, "grad_norm": 2.9656033515930176, "learning_rate": 6.366033654171003e-06, "loss": 0.02880386, "memory(GiB)": 15.03, "step": 12490, "train_speed(iter/s)": 1.470209 }, { "acc": 0.99488964, "epoch": 22.05648720211827, "grad_norm": 2.7418105602264404, "learning_rate": 6.3632234580155495e-06, "loss": 0.02868888, "memory(GiB)": 15.03, "step": 12495, "train_speed(iter/s)": 1.470226 }, { "acc": 0.99766407, "epoch": 22.06531332744925, "grad_norm": 2.402677297592163, "learning_rate": 6.3604127966732865e-06, "loss": 0.01865329, "memory(GiB)": 15.03, "step": 12500, "train_speed(iter/s)": 1.470229 }, { "acc": 0.99885616, "epoch": 22.07413945278023, "grad_norm": 1.1286042928695679, "learning_rate": 6.357601671103675e-06, "loss": 0.01343663, "memory(GiB)": 15.03, "step": 12505, "train_speed(iter/s)": 1.470249 }, { "acc": 0.99593658, "epoch": 22.08296557811121, "grad_norm": 1.5665249824523926, "learning_rate": 6.354790082266339e-06, "loss": 0.02946776, "memory(GiB)": 15.03, "step": 12510, "train_speed(iter/s)": 1.47025 }, { "acc": 0.99690228, "epoch": 22.09179170344219, "grad_norm": 1.9932502508163452, "learning_rate": 6.351978031121056e-06, "loss": 0.02379041, "memory(GiB)": 15.03, "step": 12515, "train_speed(iter/s)": 1.470267 }, { "acc": 0.99590683, "epoch": 22.100617828773167, "grad_norm": 4.905023574829102, "learning_rate": 6.349165518627765e-06, "loss": 0.02537633, "memory(GiB)": 15.03, "step": 12520, "train_speed(iter/s)": 1.470246 }, { "acc": 0.99761295, "epoch": 22.109443954104147, "grad_norm": 0.6413697600364685, "learning_rate": 6.346352545746559e-06, "loss": 0.01986172, "memory(GiB)": 15.03, "step": 12525, "train_speed(iter/s)": 1.470234 }, { "acc": 0.99769421, "epoch": 22.118270079435128, "grad_norm": 0.5271923542022705, "learning_rate": 6.343539113437691e-06, "loss": 0.02889602, "memory(GiB)": 15.03, "step": 12530, "train_speed(iter/s)": 1.470241 }, { "acc": 0.99800282, "epoch": 22.127096204766108, "grad_norm": 1.5917447805404663, "learning_rate": 6.340725222661572e-06, "loss": 0.01892549, "memory(GiB)": 15.03, "step": 12535, "train_speed(iter/s)": 1.470231 }, { "acc": 0.99689226, "epoch": 22.135922330097088, "grad_norm": 0.8039721846580505, "learning_rate": 6.337910874378761e-06, "loss": 0.02799923, "memory(GiB)": 15.03, "step": 12540, "train_speed(iter/s)": 1.470243 }, { "acc": 0.99730072, "epoch": 22.14474845542807, "grad_norm": 1.5794439315795898, "learning_rate": 6.3350960695499875e-06, "loss": 0.01892457, "memory(GiB)": 15.03, "step": 12545, "train_speed(iter/s)": 1.470273 }, { "acc": 0.99604483, "epoch": 22.153574580759045, "grad_norm": 1.6034001111984253, "learning_rate": 6.332280809136123e-06, "loss": 0.02440517, "memory(GiB)": 15.03, "step": 12550, "train_speed(iter/s)": 1.470279 }, { "acc": 0.99809036, "epoch": 22.162400706090025, "grad_norm": 1.4283536672592163, "learning_rate": 6.329465094098202e-06, "loss": 0.02077788, "memory(GiB)": 15.03, "step": 12555, "train_speed(iter/s)": 1.470268 }, { "acc": 0.99623318, "epoch": 22.171226831421006, "grad_norm": 1.2670496702194214, "learning_rate": 6.326648925397414e-06, "loss": 0.02937182, "memory(GiB)": 15.03, "step": 12560, "train_speed(iter/s)": 1.470271 }, { "acc": 0.9986043, "epoch": 22.180052956751986, "grad_norm": 0.9494180083274841, "learning_rate": 6.323832303995101e-06, "loss": 0.01714506, "memory(GiB)": 15.03, "step": 12565, "train_speed(iter/s)": 1.470262 }, { "acc": 0.99747849, "epoch": 22.188879082082966, "grad_norm": 4.890135765075684, "learning_rate": 6.321015230852759e-06, "loss": 0.01837468, "memory(GiB)": 15.03, "step": 12570, "train_speed(iter/s)": 1.470268 }, { "acc": 0.99686365, "epoch": 22.197705207413946, "grad_norm": 1.7979481220245361, "learning_rate": 6.318197706932042e-06, "loss": 0.03598395, "memory(GiB)": 15.03, "step": 12575, "train_speed(iter/s)": 1.470297 }, { "acc": 0.99814072, "epoch": 22.206531332744923, "grad_norm": 0.5792882442474365, "learning_rate": 6.315379733194752e-06, "loss": 0.02637296, "memory(GiB)": 15.03, "step": 12580, "train_speed(iter/s)": 1.470282 }, { "acc": 0.99417992, "epoch": 22.215357458075903, "grad_norm": 3.3636560440063477, "learning_rate": 6.312561310602852e-06, "loss": 0.04668866, "memory(GiB)": 15.03, "step": 12585, "train_speed(iter/s)": 1.470305 }, { "acc": 0.99574137, "epoch": 22.224183583406884, "grad_norm": 3.4154012203216553, "learning_rate": 6.309742440118453e-06, "loss": 0.02888146, "memory(GiB)": 15.03, "step": 12590, "train_speed(iter/s)": 1.470324 }, { "acc": 0.99719372, "epoch": 22.233009708737864, "grad_norm": 1.0094627141952515, "learning_rate": 6.3069231227038195e-06, "loss": 0.02002556, "memory(GiB)": 15.03, "step": 12595, "train_speed(iter/s)": 1.470354 }, { "acc": 0.99591722, "epoch": 22.241835834068844, "grad_norm": 4.034322738647461, "learning_rate": 6.304103359321369e-06, "loss": 0.04370266, "memory(GiB)": 15.03, "step": 12600, "train_speed(iter/s)": 1.470344 }, { "acc": 0.99758625, "epoch": 22.250661959399824, "grad_norm": 1.6882574558258057, "learning_rate": 6.301283150933674e-06, "loss": 0.01917587, "memory(GiB)": 15.03, "step": 12605, "train_speed(iter/s)": 1.470344 }, { "acc": 0.99614973, "epoch": 22.259488084730805, "grad_norm": 1.8819304704666138, "learning_rate": 6.298462498503453e-06, "loss": 0.03981767, "memory(GiB)": 15.03, "step": 12610, "train_speed(iter/s)": 1.47033 }, { "acc": 0.99370708, "epoch": 22.26831421006178, "grad_norm": 3.423875331878662, "learning_rate": 6.295641402993581e-06, "loss": 0.04333763, "memory(GiB)": 15.03, "step": 12615, "train_speed(iter/s)": 1.47034 }, { "acc": 0.99745016, "epoch": 22.27714033539276, "grad_norm": 2.1203935146331787, "learning_rate": 6.292819865367083e-06, "loss": 0.02212946, "memory(GiB)": 15.03, "step": 12620, "train_speed(iter/s)": 1.470339 }, { "acc": 0.99545507, "epoch": 22.285966460723742, "grad_norm": 3.1099677085876465, "learning_rate": 6.289997886587136e-06, "loss": 0.03112293, "memory(GiB)": 15.03, "step": 12625, "train_speed(iter/s)": 1.470364 }, { "acc": 0.99602652, "epoch": 22.294792586054722, "grad_norm": 5.396105766296387, "learning_rate": 6.287175467617064e-06, "loss": 0.02926163, "memory(GiB)": 15.03, "step": 12630, "train_speed(iter/s)": 1.47037 }, { "acc": 0.99771805, "epoch": 22.303618711385703, "grad_norm": 2.1559431552886963, "learning_rate": 6.284352609420344e-06, "loss": 0.02113792, "memory(GiB)": 15.03, "step": 12635, "train_speed(iter/s)": 1.470397 }, { "acc": 0.9957552, "epoch": 22.312444836716683, "grad_norm": 3.575411081314087, "learning_rate": 6.281529312960603e-06, "loss": 0.03357881, "memory(GiB)": 15.03, "step": 12640, "train_speed(iter/s)": 1.470393 }, { "acc": 0.99567308, "epoch": 22.32127096204766, "grad_norm": 2.330353260040283, "learning_rate": 6.278705579201615e-06, "loss": 0.0374521, "memory(GiB)": 15.03, "step": 12645, "train_speed(iter/s)": 1.47038 }, { "acc": 0.9964201, "epoch": 22.33009708737864, "grad_norm": 0.2843952476978302, "learning_rate": 6.275881409107311e-06, "loss": 0.02730432, "memory(GiB)": 15.03, "step": 12650, "train_speed(iter/s)": 1.470374 }, { "acc": 0.99707918, "epoch": 22.33892321270962, "grad_norm": 1.650241732597351, "learning_rate": 6.273056803641759e-06, "loss": 0.03703732, "memory(GiB)": 15.03, "step": 12655, "train_speed(iter/s)": 1.470385 }, { "acc": 0.99741344, "epoch": 22.3477493380406, "grad_norm": 1.988069772720337, "learning_rate": 6.270231763769187e-06, "loss": 0.02265075, "memory(GiB)": 15.03, "step": 12660, "train_speed(iter/s)": 1.470395 }, { "acc": 0.99350777, "epoch": 22.35657546337158, "grad_norm": 1.8476186990737915, "learning_rate": 6.267406290453959e-06, "loss": 0.03706524, "memory(GiB)": 15.03, "step": 12665, "train_speed(iter/s)": 1.47036 }, { "acc": 0.99772606, "epoch": 22.36540158870256, "grad_norm": 2.1996214389801025, "learning_rate": 6.264580384660604e-06, "loss": 0.01592406, "memory(GiB)": 15.03, "step": 12670, "train_speed(iter/s)": 1.470331 }, { "acc": 0.99866467, "epoch": 22.374227714033537, "grad_norm": 1.7303237915039062, "learning_rate": 6.261754047353782e-06, "loss": 0.01595102, "memory(GiB)": 15.03, "step": 12675, "train_speed(iter/s)": 1.470351 }, { "acc": 0.99692755, "epoch": 22.383053839364518, "grad_norm": 1.2271376848220825, "learning_rate": 6.258927279498308e-06, "loss": 0.02221404, "memory(GiB)": 15.03, "step": 12680, "train_speed(iter/s)": 1.470353 }, { "acc": 0.99685631, "epoch": 22.391879964695498, "grad_norm": 1.8517460823059082, "learning_rate": 6.256100082059144e-06, "loss": 0.03235414, "memory(GiB)": 15.03, "step": 12685, "train_speed(iter/s)": 1.470359 }, { "acc": 0.99614124, "epoch": 22.40070609002648, "grad_norm": 2.763782024383545, "learning_rate": 6.253272456001399e-06, "loss": 0.02972264, "memory(GiB)": 15.03, "step": 12690, "train_speed(iter/s)": 1.470365 }, { "acc": 0.99728603, "epoch": 22.40953221535746, "grad_norm": 1.7509350776672363, "learning_rate": 6.250444402290326e-06, "loss": 0.02088723, "memory(GiB)": 15.03, "step": 12695, "train_speed(iter/s)": 1.470387 }, { "acc": 0.997404, "epoch": 22.41835834068844, "grad_norm": 0.9123959541320801, "learning_rate": 6.247615921891323e-06, "loss": 0.01588886, "memory(GiB)": 15.03, "step": 12700, "train_speed(iter/s)": 1.470411 }, { "acc": 0.99619598, "epoch": 22.42718446601942, "grad_norm": 1.756128191947937, "learning_rate": 6.244787015769938e-06, "loss": 0.01994447, "memory(GiB)": 15.03, "step": 12705, "train_speed(iter/s)": 1.470419 }, { "acc": 0.9975153, "epoch": 22.436010591350396, "grad_norm": 0.5723978877067566, "learning_rate": 6.241957684891862e-06, "loss": 0.02327892, "memory(GiB)": 15.03, "step": 12710, "train_speed(iter/s)": 1.470436 }, { "acc": 0.99789562, "epoch": 22.444836716681376, "grad_norm": 3.43097186088562, "learning_rate": 6.23912793022293e-06, "loss": 0.01511289, "memory(GiB)": 15.03, "step": 12715, "train_speed(iter/s)": 1.470412 }, { "acc": 0.99673882, "epoch": 22.453662842012356, "grad_norm": 3.888967990875244, "learning_rate": 6.236297752729124e-06, "loss": 0.03100557, "memory(GiB)": 15.03, "step": 12720, "train_speed(iter/s)": 1.470381 }, { "acc": 0.99735689, "epoch": 22.462488967343337, "grad_norm": 0.49361151456832886, "learning_rate": 6.2334671533765674e-06, "loss": 0.01693401, "memory(GiB)": 15.03, "step": 12725, "train_speed(iter/s)": 1.470373 }, { "acc": 0.99635201, "epoch": 22.471315092674317, "grad_norm": 1.8590173721313477, "learning_rate": 6.230636133131531e-06, "loss": 0.02885556, "memory(GiB)": 15.03, "step": 12730, "train_speed(iter/s)": 1.470362 }, { "acc": 0.99496841, "epoch": 22.480141218005297, "grad_norm": 0.6887074708938599, "learning_rate": 6.2278046929604265e-06, "loss": 0.03339312, "memory(GiB)": 15.03, "step": 12735, "train_speed(iter/s)": 1.470343 }, { "acc": 0.99661407, "epoch": 22.488967343336274, "grad_norm": 0.3743496239185333, "learning_rate": 6.22497283382981e-06, "loss": 0.02770496, "memory(GiB)": 15.03, "step": 12740, "train_speed(iter/s)": 1.470357 }, { "acc": 0.99912157, "epoch": 22.497793468667254, "grad_norm": 0.1707017868757248, "learning_rate": 6.22214055670638e-06, "loss": 0.01124835, "memory(GiB)": 15.03, "step": 12745, "train_speed(iter/s)": 1.470391 }, { "acc": 0.99757175, "epoch": 22.506619593998234, "grad_norm": 2.2430429458618164, "learning_rate": 6.219307862556978e-06, "loss": 0.01977832, "memory(GiB)": 15.03, "step": 12750, "train_speed(iter/s)": 1.470412 }, { "acc": 0.99742908, "epoch": 22.515445719329215, "grad_norm": 0.962376058101654, "learning_rate": 6.21647475234859e-06, "loss": 0.02439048, "memory(GiB)": 15.03, "step": 12755, "train_speed(iter/s)": 1.470444 }, { "acc": 0.99609146, "epoch": 22.524271844660195, "grad_norm": 1.4986881017684937, "learning_rate": 6.2136412270483425e-06, "loss": 0.0252511, "memory(GiB)": 15.03, "step": 12760, "train_speed(iter/s)": 1.470448 }, { "acc": 0.99727211, "epoch": 22.533097969991175, "grad_norm": 2.7684943675994873, "learning_rate": 6.2108072876235016e-06, "loss": 0.01913182, "memory(GiB)": 15.03, "step": 12765, "train_speed(iter/s)": 1.470449 }, { "acc": 0.99793215, "epoch": 22.541924095322152, "grad_norm": 1.9896128177642822, "learning_rate": 6.207972935041476e-06, "loss": 0.02074496, "memory(GiB)": 15.03, "step": 12770, "train_speed(iter/s)": 1.470468 }, { "acc": 0.99690971, "epoch": 22.550750220653132, "grad_norm": 0.28714585304260254, "learning_rate": 6.205138170269818e-06, "loss": 0.02580048, "memory(GiB)": 15.03, "step": 12775, "train_speed(iter/s)": 1.47047 }, { "acc": 0.99684181, "epoch": 22.559576345984112, "grad_norm": 0.6994149088859558, "learning_rate": 6.202302994276216e-06, "loss": 0.02686896, "memory(GiB)": 15.03, "step": 12780, "train_speed(iter/s)": 1.470445 }, { "acc": 0.99821587, "epoch": 22.568402471315093, "grad_norm": 1.5986921787261963, "learning_rate": 6.199467408028505e-06, "loss": 0.01723484, "memory(GiB)": 15.03, "step": 12785, "train_speed(iter/s)": 1.470442 }, { "acc": 0.99723101, "epoch": 22.577228596646073, "grad_norm": 0.8169687390327454, "learning_rate": 6.196631412494652e-06, "loss": 0.01559137, "memory(GiB)": 15.03, "step": 12790, "train_speed(iter/s)": 1.470444 }, { "acc": 0.99719648, "epoch": 22.586054721977053, "grad_norm": 0.8048145771026611, "learning_rate": 6.193795008642772e-06, "loss": 0.02610955, "memory(GiB)": 15.03, "step": 12795, "train_speed(iter/s)": 1.470431 }, { "acc": 0.99777536, "epoch": 22.594880847308033, "grad_norm": 1.2443817853927612, "learning_rate": 6.190958197441114e-06, "loss": 0.01606249, "memory(GiB)": 15.03, "step": 12800, "train_speed(iter/s)": 1.470447 }, { "acc": 0.9959404, "epoch": 22.60370697263901, "grad_norm": 0.59725022315979, "learning_rate": 6.1881209798580665e-06, "loss": 0.03121257, "memory(GiB)": 15.03, "step": 12805, "train_speed(iter/s)": 1.470446 }, { "acc": 0.99876156, "epoch": 22.61253309796999, "grad_norm": 0.8158498406410217, "learning_rate": 6.1852833568621585e-06, "loss": 0.0139249, "memory(GiB)": 15.03, "step": 12810, "train_speed(iter/s)": 1.470466 }, { "acc": 0.9975873, "epoch": 22.62135922330097, "grad_norm": 0.9181464314460754, "learning_rate": 6.182445329422058e-06, "loss": 0.01463479, "memory(GiB)": 15.03, "step": 12815, "train_speed(iter/s)": 1.47047 }, { "acc": 0.99735298, "epoch": 22.63018534863195, "grad_norm": 0.785559356212616, "learning_rate": 6.1796068985065675e-06, "loss": 0.02153365, "memory(GiB)": 15.03, "step": 12820, "train_speed(iter/s)": 1.470447 }, { "acc": 0.99862394, "epoch": 22.63901147396293, "grad_norm": 1.630408525466919, "learning_rate": 6.176768065084632e-06, "loss": 0.01571158, "memory(GiB)": 15.03, "step": 12825, "train_speed(iter/s)": 1.470451 }, { "acc": 0.99366207, "epoch": 22.64783759929391, "grad_norm": 1.5857888460159302, "learning_rate": 6.173928830125326e-06, "loss": 0.03972637, "memory(GiB)": 15.03, "step": 12830, "train_speed(iter/s)": 1.470452 }, { "acc": 0.99854927, "epoch": 22.656663724624888, "grad_norm": 0.6215782761573792, "learning_rate": 6.1710891945978745e-06, "loss": 0.0163265, "memory(GiB)": 15.03, "step": 12835, "train_speed(iter/s)": 1.470463 }, { "acc": 0.99801712, "epoch": 22.66548984995587, "grad_norm": 1.5793266296386719, "learning_rate": 6.168249159471625e-06, "loss": 0.01466192, "memory(GiB)": 15.03, "step": 12840, "train_speed(iter/s)": 1.470466 }, { "acc": 0.99818869, "epoch": 22.67431597528685, "grad_norm": 0.9899569749832153, "learning_rate": 6.165408725716071e-06, "loss": 0.01756188, "memory(GiB)": 15.03, "step": 12845, "train_speed(iter/s)": 1.470472 }, { "acc": 0.99888191, "epoch": 22.68314210061783, "grad_norm": 0.6540178656578064, "learning_rate": 6.162567894300835e-06, "loss": 0.0120918, "memory(GiB)": 15.03, "step": 12850, "train_speed(iter/s)": 1.470479 }, { "acc": 0.99530087, "epoch": 22.69196822594881, "grad_norm": 2.818143606185913, "learning_rate": 6.159726666195682e-06, "loss": 0.02649296, "memory(GiB)": 15.03, "step": 12855, "train_speed(iter/s)": 1.470469 }, { "acc": 0.99508686, "epoch": 22.70079435127979, "grad_norm": 5.030887603759766, "learning_rate": 6.156885042370511e-06, "loss": 0.02730213, "memory(GiB)": 15.03, "step": 12860, "train_speed(iter/s)": 1.47048 }, { "acc": 0.99752111, "epoch": 22.709620476610766, "grad_norm": 0.8983172178268433, "learning_rate": 6.1540430237953484e-06, "loss": 0.02144203, "memory(GiB)": 15.03, "step": 12865, "train_speed(iter/s)": 1.470483 }, { "acc": 0.99431238, "epoch": 22.718446601941746, "grad_norm": 2.037872076034546, "learning_rate": 6.151200611440368e-06, "loss": 0.04143063, "memory(GiB)": 15.03, "step": 12870, "train_speed(iter/s)": 1.47046 }, { "acc": 0.99426403, "epoch": 22.727272727272727, "grad_norm": 1.0353963375091553, "learning_rate": 6.148357806275865e-06, "loss": 0.03560434, "memory(GiB)": 15.03, "step": 12875, "train_speed(iter/s)": 1.470466 }, { "acc": 0.99575243, "epoch": 22.736098852603707, "grad_norm": 4.315107345581055, "learning_rate": 6.14551460927228e-06, "loss": 0.02056675, "memory(GiB)": 15.03, "step": 12880, "train_speed(iter/s)": 1.470472 }, { "acc": 0.9947156, "epoch": 22.744924977934687, "grad_norm": 2.0490541458129883, "learning_rate": 6.14267102140018e-06, "loss": 0.04589814, "memory(GiB)": 15.03, "step": 12885, "train_speed(iter/s)": 1.470436 }, { "acc": 0.99767323, "epoch": 22.753751103265667, "grad_norm": 2.422245740890503, "learning_rate": 6.13982704363027e-06, "loss": 0.01457931, "memory(GiB)": 15.03, "step": 12890, "train_speed(iter/s)": 1.470427 }, { "acc": 0.99655838, "epoch": 22.762577228596648, "grad_norm": 0.44218045473098755, "learning_rate": 6.136982676933383e-06, "loss": 0.02852376, "memory(GiB)": 15.03, "step": 12895, "train_speed(iter/s)": 1.470447 }, { "acc": 0.9974267, "epoch": 22.771403353927624, "grad_norm": 2.960052251815796, "learning_rate": 6.134137922280492e-06, "loss": 0.0183727, "memory(GiB)": 15.03, "step": 12900, "train_speed(iter/s)": 1.47049 }, { "acc": 0.9974514, "epoch": 22.780229479258605, "grad_norm": 1.1070667505264282, "learning_rate": 6.131292780642694e-06, "loss": 0.02328701, "memory(GiB)": 15.03, "step": 12905, "train_speed(iter/s)": 1.470468 }, { "acc": 0.99664879, "epoch": 22.789055604589585, "grad_norm": 1.2468020915985107, "learning_rate": 6.128447252991223e-06, "loss": 0.03176151, "memory(GiB)": 15.03, "step": 12910, "train_speed(iter/s)": 1.470467 }, { "acc": 0.99898539, "epoch": 22.797881729920565, "grad_norm": 0.9682605862617493, "learning_rate": 6.125601340297444e-06, "loss": 0.01277784, "memory(GiB)": 15.03, "step": 12915, "train_speed(iter/s)": 1.470455 }, { "acc": 0.99807072, "epoch": 22.806707855251545, "grad_norm": 0.8189550042152405, "learning_rate": 6.122755043532856e-06, "loss": 0.01607734, "memory(GiB)": 15.03, "step": 12920, "train_speed(iter/s)": 1.470466 }, { "acc": 0.99770098, "epoch": 22.815533980582526, "grad_norm": 1.3951807022094727, "learning_rate": 6.1199083636690845e-06, "loss": 0.018735, "memory(GiB)": 15.03, "step": 12925, "train_speed(iter/s)": 1.470477 }, { "acc": 0.99722834, "epoch": 22.824360105913502, "grad_norm": 1.2934178113937378, "learning_rate": 6.117061301677889e-06, "loss": 0.02014235, "memory(GiB)": 15.03, "step": 12930, "train_speed(iter/s)": 1.470489 }, { "acc": 0.9992074, "epoch": 22.833186231244483, "grad_norm": 0.8152055740356445, "learning_rate": 6.114213858531157e-06, "loss": 0.01423897, "memory(GiB)": 15.03, "step": 12935, "train_speed(iter/s)": 1.47048 }, { "acc": 0.99611702, "epoch": 22.842012356575463, "grad_norm": 1.2725011110305786, "learning_rate": 6.111366035200909e-06, "loss": 0.03706469, "memory(GiB)": 15.03, "step": 12940, "train_speed(iter/s)": 1.470473 }, { "acc": 0.99659567, "epoch": 22.850838481906443, "grad_norm": 1.1654893159866333, "learning_rate": 6.108517832659291e-06, "loss": 0.02804915, "memory(GiB)": 15.03, "step": 12945, "train_speed(iter/s)": 1.470463 }, { "acc": 0.9980814, "epoch": 22.859664607237423, "grad_norm": 1.254271388053894, "learning_rate": 6.105669251878584e-06, "loss": 0.0251339, "memory(GiB)": 15.03, "step": 12950, "train_speed(iter/s)": 1.470492 }, { "acc": 0.99836168, "epoch": 22.868490732568404, "grad_norm": 0.6981532573699951, "learning_rate": 6.1028202938311935e-06, "loss": 0.01562855, "memory(GiB)": 15.03, "step": 12955, "train_speed(iter/s)": 1.470508 }, { "acc": 0.99729176, "epoch": 22.87731685789938, "grad_norm": 1.332467794418335, "learning_rate": 6.0999709594896565e-06, "loss": 0.02646288, "memory(GiB)": 15.03, "step": 12960, "train_speed(iter/s)": 1.470495 }, { "acc": 0.9981061, "epoch": 22.88614298323036, "grad_norm": 0.4970782697200775, "learning_rate": 6.0971212498266355e-06, "loss": 0.02049838, "memory(GiB)": 15.03, "step": 12965, "train_speed(iter/s)": 1.470493 }, { "acc": 0.99666214, "epoch": 22.89496910856134, "grad_norm": 0.5638509392738342, "learning_rate": 6.094271165814926e-06, "loss": 0.02993975, "memory(GiB)": 15.03, "step": 12970, "train_speed(iter/s)": 1.470499 }, { "acc": 0.99780874, "epoch": 22.90379523389232, "grad_norm": 4.01145601272583, "learning_rate": 6.091420708427449e-06, "loss": 0.02405227, "memory(GiB)": 15.03, "step": 12975, "train_speed(iter/s)": 1.470497 }, { "acc": 0.99581432, "epoch": 22.9126213592233, "grad_norm": 1.1627881526947021, "learning_rate": 6.088569878637248e-06, "loss": 0.03269275, "memory(GiB)": 15.03, "step": 12980, "train_speed(iter/s)": 1.470509 }, { "acc": 0.99845152, "epoch": 22.92144748455428, "grad_norm": 0.5463666319847107, "learning_rate": 6.085718677417503e-06, "loss": 0.01403073, "memory(GiB)": 15.03, "step": 12985, "train_speed(iter/s)": 1.470505 }, { "acc": 0.99542484, "epoch": 22.930273609885262, "grad_norm": 0.4539801776409149, "learning_rate": 6.082867105741512e-06, "loss": 0.03501869, "memory(GiB)": 15.03, "step": 12990, "train_speed(iter/s)": 1.470483 }, { "acc": 0.9986949, "epoch": 22.93909973521624, "grad_norm": 0.6708534955978394, "learning_rate": 6.080015164582707e-06, "loss": 0.01418324, "memory(GiB)": 15.03, "step": 12995, "train_speed(iter/s)": 1.47049 }, { "acc": 0.99583282, "epoch": 22.94792586054722, "grad_norm": 0.20305988192558289, "learning_rate": 6.0771628549146365e-06, "loss": 0.02442575, "memory(GiB)": 15.03, "step": 13000, "train_speed(iter/s)": 1.470485 }, { "acc": 0.9982132, "epoch": 22.9567519858782, "grad_norm": 0.6565145254135132, "learning_rate": 6.074310177710987e-06, "loss": 0.01930987, "memory(GiB)": 15.03, "step": 13005, "train_speed(iter/s)": 1.470498 }, { "acc": 0.99820271, "epoch": 22.96557811120918, "grad_norm": 0.6776201128959656, "learning_rate": 6.071457133945561e-06, "loss": 0.01149797, "memory(GiB)": 15.03, "step": 13010, "train_speed(iter/s)": 1.470467 }, { "acc": 0.99423847, "epoch": 22.97440423654016, "grad_norm": 1.3998459577560425, "learning_rate": 6.068603724592289e-06, "loss": 0.03437026, "memory(GiB)": 15.03, "step": 13015, "train_speed(iter/s)": 1.470477 }, { "acc": 0.99689875, "epoch": 22.98323036187114, "grad_norm": 1.0394313335418701, "learning_rate": 6.065749950625228e-06, "loss": 0.01717562, "memory(GiB)": 15.03, "step": 13020, "train_speed(iter/s)": 1.470509 }, { "acc": 0.99744005, "epoch": 22.992056487202117, "grad_norm": 2.3007211685180664, "learning_rate": 6.062895813018559e-06, "loss": 0.02217046, "memory(GiB)": 15.03, "step": 13025, "train_speed(iter/s)": 1.470501 }, { "acc": 0.99840899, "epoch": 23.000882612533097, "grad_norm": 1.8582420349121094, "learning_rate": 6.060041312746582e-06, "loss": 0.02255174, "memory(GiB)": 15.03, "step": 13030, "train_speed(iter/s)": 1.470423 }, { "acc": 0.99631805, "epoch": 23.009708737864077, "grad_norm": 2.2744245529174805, "learning_rate": 6.05718645078373e-06, "loss": 0.02107538, "memory(GiB)": 15.03, "step": 13035, "train_speed(iter/s)": 1.470444 }, { "acc": 0.99408092, "epoch": 23.018534863195057, "grad_norm": 4.121513366699219, "learning_rate": 6.054331228104548e-06, "loss": 0.03118683, "memory(GiB)": 15.03, "step": 13040, "train_speed(iter/s)": 1.470449 }, { "acc": 0.99709873, "epoch": 23.027360988526038, "grad_norm": 0.920204222202301, "learning_rate": 6.051475645683719e-06, "loss": 0.03254378, "memory(GiB)": 15.03, "step": 13045, "train_speed(iter/s)": 1.470439 }, { "acc": 0.99832344, "epoch": 23.036187113857018, "grad_norm": 1.3542948961257935, "learning_rate": 6.048619704496034e-06, "loss": 0.02076808, "memory(GiB)": 15.03, "step": 13050, "train_speed(iter/s)": 1.470443 }, { "acc": 0.99697094, "epoch": 23.045013239187995, "grad_norm": 1.184268832206726, "learning_rate": 6.045763405516415e-06, "loss": 0.02172913, "memory(GiB)": 15.03, "step": 13055, "train_speed(iter/s)": 1.470433 }, { "acc": 0.99697056, "epoch": 23.053839364518975, "grad_norm": 1.3625152111053467, "learning_rate": 6.042906749719904e-06, "loss": 0.0362326, "memory(GiB)": 15.03, "step": 13060, "train_speed(iter/s)": 1.470455 }, { "acc": 0.99873085, "epoch": 23.062665489849955, "grad_norm": 0.6216133236885071, "learning_rate": 6.040049738081666e-06, "loss": 0.01160176, "memory(GiB)": 15.03, "step": 13065, "train_speed(iter/s)": 1.470471 }, { "acc": 0.99613981, "epoch": 23.071491615180935, "grad_norm": 1.2348164319992065, "learning_rate": 6.0371923715769835e-06, "loss": 0.02243493, "memory(GiB)": 15.03, "step": 13070, "train_speed(iter/s)": 1.470489 }, { "acc": 0.99674158, "epoch": 23.080317740511916, "grad_norm": 0.36516815423965454, "learning_rate": 6.034334651181264e-06, "loss": 0.02129772, "memory(GiB)": 15.03, "step": 13075, "train_speed(iter/s)": 1.470503 }, { "acc": 0.99842625, "epoch": 23.089143865842896, "grad_norm": 3.0777881145477295, "learning_rate": 6.031476577870036e-06, "loss": 0.01172808, "memory(GiB)": 15.03, "step": 13080, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99752922, "epoch": 23.097969991173876, "grad_norm": 1.4006682634353638, "learning_rate": 6.028618152618944e-06, "loss": 0.02253465, "memory(GiB)": 15.03, "step": 13085, "train_speed(iter/s)": 1.470535 }, { "acc": 0.99726906, "epoch": 23.106796116504853, "grad_norm": 0.4996936023235321, "learning_rate": 6.02575937640376e-06, "loss": 0.0121887, "memory(GiB)": 15.03, "step": 13090, "train_speed(iter/s)": 1.470525 }, { "acc": 0.99523344, "epoch": 23.115622241835833, "grad_norm": 2.097654104232788, "learning_rate": 6.022900250200371e-06, "loss": 0.03379517, "memory(GiB)": 15.03, "step": 13095, "train_speed(iter/s)": 1.47054 }, { "acc": 0.99755478, "epoch": 23.124448367166814, "grad_norm": 3.1342687606811523, "learning_rate": 6.020040774984782e-06, "loss": 0.0271405, "memory(GiB)": 15.03, "step": 13100, "train_speed(iter/s)": 1.470535 }, { "acc": 0.99807291, "epoch": 23.133274492497794, "grad_norm": 0.7710711359977722, "learning_rate": 6.01718095173312e-06, "loss": 0.01655747, "memory(GiB)": 15.03, "step": 13105, "train_speed(iter/s)": 1.470547 }, { "acc": 0.99794521, "epoch": 23.142100617828774, "grad_norm": 2.731689691543579, "learning_rate": 6.014320781421631e-06, "loss": 0.02038446, "memory(GiB)": 15.03, "step": 13110, "train_speed(iter/s)": 1.470539 }, { "acc": 0.99692459, "epoch": 23.150926743159754, "grad_norm": 0.22890695929527283, "learning_rate": 6.0114602650266774e-06, "loss": 0.02134089, "memory(GiB)": 15.03, "step": 13115, "train_speed(iter/s)": 1.470513 }, { "acc": 0.99627295, "epoch": 23.15975286849073, "grad_norm": 2.176356554031372, "learning_rate": 6.008599403524742e-06, "loss": 0.01993459, "memory(GiB)": 15.03, "step": 13120, "train_speed(iter/s)": 1.470528 }, { "acc": 0.99739008, "epoch": 23.16857899382171, "grad_norm": 0.9405291676521301, "learning_rate": 6.0057381978924215e-06, "loss": 0.02718505, "memory(GiB)": 15.03, "step": 13125, "train_speed(iter/s)": 1.470549 }, { "acc": 0.99875984, "epoch": 23.17740511915269, "grad_norm": 1.0928623676300049, "learning_rate": 6.002876649106437e-06, "loss": 0.0138992, "memory(GiB)": 15.03, "step": 13130, "train_speed(iter/s)": 1.470541 }, { "acc": 0.9959013, "epoch": 23.186231244483672, "grad_norm": 2.025855779647827, "learning_rate": 6.00001475814362e-06, "loss": 0.02567489, "memory(GiB)": 15.03, "step": 13135, "train_speed(iter/s)": 1.470551 }, { "acc": 0.99725647, "epoch": 23.195057369814652, "grad_norm": 0.8461995720863342, "learning_rate": 5.997152525980921e-06, "loss": 0.01742659, "memory(GiB)": 15.03, "step": 13140, "train_speed(iter/s)": 1.470545 }, { "acc": 0.99719419, "epoch": 23.203883495145632, "grad_norm": 4.155110836029053, "learning_rate": 5.994289953595409e-06, "loss": 0.02392255, "memory(GiB)": 15.03, "step": 13145, "train_speed(iter/s)": 1.470548 }, { "acc": 0.99627123, "epoch": 23.21270962047661, "grad_norm": 1.1337772607803345, "learning_rate": 5.9914270419642675e-06, "loss": 0.02919633, "memory(GiB)": 15.03, "step": 13150, "train_speed(iter/s)": 1.470539 }, { "acc": 0.9984684, "epoch": 23.22153574580759, "grad_norm": 0.6224440932273865, "learning_rate": 5.988563792064794e-06, "loss": 0.01199251, "memory(GiB)": 15.03, "step": 13155, "train_speed(iter/s)": 1.470538 }, { "acc": 0.99805975, "epoch": 23.23036187113857, "grad_norm": 1.42714262008667, "learning_rate": 5.985700204874406e-06, "loss": 0.01932443, "memory(GiB)": 15.03, "step": 13160, "train_speed(iter/s)": 1.470555 }, { "acc": 0.9966713, "epoch": 23.23918799646955, "grad_norm": 2.6796586513519287, "learning_rate": 5.98283628137063e-06, "loss": 0.02910973, "memory(GiB)": 15.03, "step": 13165, "train_speed(iter/s)": 1.470564 }, { "acc": 0.99722137, "epoch": 23.24801412180053, "grad_norm": 0.969180166721344, "learning_rate": 5.979972022531114e-06, "loss": 0.02186131, "memory(GiB)": 15.03, "step": 13170, "train_speed(iter/s)": 1.470573 }, { "acc": 0.99656944, "epoch": 23.25684024713151, "grad_norm": 3.5606167316436768, "learning_rate": 5.977107429333616e-06, "loss": 0.02732706, "memory(GiB)": 15.03, "step": 13175, "train_speed(iter/s)": 1.470575 }, { "acc": 0.99750004, "epoch": 23.26566637246249, "grad_norm": 1.1628185510635376, "learning_rate": 5.974242502756009e-06, "loss": 0.02193569, "memory(GiB)": 15.03, "step": 13180, "train_speed(iter/s)": 1.470619 }, { "acc": 0.99744987, "epoch": 23.274492497793467, "grad_norm": 0.4743487536907196, "learning_rate": 5.971377243776281e-06, "loss": 0.01686539, "memory(GiB)": 15.03, "step": 13185, "train_speed(iter/s)": 1.470613 }, { "acc": 0.9968111, "epoch": 23.283318623124448, "grad_norm": 3.1848015785217285, "learning_rate": 5.9685116533725305e-06, "loss": 0.02504972, "memory(GiB)": 15.03, "step": 13190, "train_speed(iter/s)": 1.47062 }, { "acc": 0.99684992, "epoch": 23.292144748455428, "grad_norm": 1.5304704904556274, "learning_rate": 5.9656457325229755e-06, "loss": 0.01617028, "memory(GiB)": 15.03, "step": 13195, "train_speed(iter/s)": 1.470625 }, { "acc": 0.99697628, "epoch": 23.300970873786408, "grad_norm": 0.8445071578025818, "learning_rate": 5.962779482205937e-06, "loss": 0.02284186, "memory(GiB)": 15.03, "step": 13200, "train_speed(iter/s)": 1.470666 }, { "acc": 0.99705563, "epoch": 23.30979699911739, "grad_norm": 3.776515245437622, "learning_rate": 5.959912903399861e-06, "loss": 0.01970674, "memory(GiB)": 15.03, "step": 13205, "train_speed(iter/s)": 1.470646 }, { "acc": 0.99716616, "epoch": 23.31862312444837, "grad_norm": 2.3151676654815674, "learning_rate": 5.957045997083289e-06, "loss": 0.02144653, "memory(GiB)": 15.03, "step": 13210, "train_speed(iter/s)": 1.470653 }, { "acc": 0.99936562, "epoch": 23.327449249779345, "grad_norm": 0.1464461237192154, "learning_rate": 5.954178764234893e-06, "loss": 0.00696777, "memory(GiB)": 15.03, "step": 13215, "train_speed(iter/s)": 1.470653 }, { "acc": 0.99737177, "epoch": 23.336275375110326, "grad_norm": 4.066980838775635, "learning_rate": 5.951311205833446e-06, "loss": 0.02573349, "memory(GiB)": 15.03, "step": 13220, "train_speed(iter/s)": 1.470666 }, { "acc": 0.99742775, "epoch": 23.345101500441306, "grad_norm": 1.5114117860794067, "learning_rate": 5.94844332285783e-06, "loss": 0.01848612, "memory(GiB)": 15.03, "step": 13225, "train_speed(iter/s)": 1.47069 }, { "acc": 0.99801474, "epoch": 23.353927625772286, "grad_norm": 0.9446728825569153, "learning_rate": 5.945575116287042e-06, "loss": 0.02005865, "memory(GiB)": 15.03, "step": 13230, "train_speed(iter/s)": 1.470708 }, { "acc": 0.9975008, "epoch": 23.362753751103266, "grad_norm": 1.8606078624725342, "learning_rate": 5.942706587100191e-06, "loss": 0.01798357, "memory(GiB)": 15.03, "step": 13235, "train_speed(iter/s)": 1.470719 }, { "acc": 0.99937325, "epoch": 23.371579876434247, "grad_norm": 0.3015839755535126, "learning_rate": 5.939837736276493e-06, "loss": 0.00822951, "memory(GiB)": 15.03, "step": 13240, "train_speed(iter/s)": 1.470757 }, { "acc": 0.99739742, "epoch": 23.380406001765223, "grad_norm": 1.0218855142593384, "learning_rate": 5.936968564795275e-06, "loss": 0.02370558, "memory(GiB)": 15.03, "step": 13245, "train_speed(iter/s)": 1.470754 }, { "acc": 0.99782286, "epoch": 23.389232127096204, "grad_norm": 1.6713379621505737, "learning_rate": 5.93409907363597e-06, "loss": 0.01720603, "memory(GiB)": 15.03, "step": 13250, "train_speed(iter/s)": 1.470758 }, { "acc": 0.99751644, "epoch": 23.398058252427184, "grad_norm": 0.6374874711036682, "learning_rate": 5.93122926377813e-06, "loss": 0.01950376, "memory(GiB)": 15.03, "step": 13255, "train_speed(iter/s)": 1.470766 }, { "acc": 0.99842625, "epoch": 23.406884377758164, "grad_norm": 1.0685521364212036, "learning_rate": 5.928359136201404e-06, "loss": 0.01895942, "memory(GiB)": 15.03, "step": 13260, "train_speed(iter/s)": 1.470795 }, { "acc": 0.9970335, "epoch": 23.415710503089144, "grad_norm": 0.9921093583106995, "learning_rate": 5.925488691885556e-06, "loss": 0.01824553, "memory(GiB)": 15.03, "step": 13265, "train_speed(iter/s)": 1.470791 }, { "acc": 0.998104, "epoch": 23.424536628420125, "grad_norm": 1.0353668928146362, "learning_rate": 5.922617931810457e-06, "loss": 0.0140865, "memory(GiB)": 15.03, "step": 13270, "train_speed(iter/s)": 1.470814 }, { "acc": 0.99746876, "epoch": 23.433362753751105, "grad_norm": 0.5549824833869934, "learning_rate": 5.919746856956086e-06, "loss": 0.0146515, "memory(GiB)": 15.03, "step": 13275, "train_speed(iter/s)": 1.470817 }, { "acc": 0.99737377, "epoch": 23.44218887908208, "grad_norm": 1.6382901668548584, "learning_rate": 5.9168754683025276e-06, "loss": 0.01540985, "memory(GiB)": 15.03, "step": 13280, "train_speed(iter/s)": 1.470838 }, { "acc": 0.99592381, "epoch": 23.451015004413062, "grad_norm": 0.9470586180686951, "learning_rate": 5.914003766829975e-06, "loss": 0.03192806, "memory(GiB)": 15.03, "step": 13285, "train_speed(iter/s)": 1.470864 }, { "acc": 0.99827137, "epoch": 23.459841129744042, "grad_norm": 1.1842150688171387, "learning_rate": 5.911131753518728e-06, "loss": 0.01284296, "memory(GiB)": 15.03, "step": 13290, "train_speed(iter/s)": 1.470862 }, { "acc": 0.99593992, "epoch": 23.468667255075022, "grad_norm": 0.6886034607887268, "learning_rate": 5.9082594293491935e-06, "loss": 0.03739878, "memory(GiB)": 15.03, "step": 13295, "train_speed(iter/s)": 1.470851 }, { "acc": 0.99710217, "epoch": 23.477493380406003, "grad_norm": 1.3639081716537476, "learning_rate": 5.905386795301886e-06, "loss": 0.01846866, "memory(GiB)": 15.03, "step": 13300, "train_speed(iter/s)": 1.470872 }, { "acc": 0.99629955, "epoch": 23.486319505736983, "grad_norm": 2.906522750854492, "learning_rate": 5.90251385235742e-06, "loss": 0.01702679, "memory(GiB)": 15.03, "step": 13305, "train_speed(iter/s)": 1.470895 }, { "acc": 0.99848385, "epoch": 23.49514563106796, "grad_norm": 0.38601425290107727, "learning_rate": 5.899640601496523e-06, "loss": 0.01471058, "memory(GiB)": 15.03, "step": 13310, "train_speed(iter/s)": 1.470888 }, { "acc": 0.99649029, "epoch": 23.50397175639894, "grad_norm": 1.9937058687210083, "learning_rate": 5.896767043700019e-06, "loss": 0.02344516, "memory(GiB)": 15.03, "step": 13315, "train_speed(iter/s)": 1.470901 }, { "acc": 0.99741402, "epoch": 23.51279788172992, "grad_norm": 2.8622353076934814, "learning_rate": 5.8938931799488475e-06, "loss": 0.01996391, "memory(GiB)": 15.03, "step": 13320, "train_speed(iter/s)": 1.470915 }, { "acc": 0.99899445, "epoch": 23.5216240070609, "grad_norm": 3.4896914958953857, "learning_rate": 5.891019011224041e-06, "loss": 0.01316199, "memory(GiB)": 15.03, "step": 13325, "train_speed(iter/s)": 1.47093 }, { "acc": 0.9959239, "epoch": 23.53045013239188, "grad_norm": 1.7337766885757446, "learning_rate": 5.888144538506745e-06, "loss": 0.02923751, "memory(GiB)": 15.03, "step": 13330, "train_speed(iter/s)": 1.470965 }, { "acc": 0.99734735, "epoch": 23.53927625772286, "grad_norm": 0.31403252482414246, "learning_rate": 5.885269762778202e-06, "loss": 0.01981043, "memory(GiB)": 15.03, "step": 13335, "train_speed(iter/s)": 1.470972 }, { "acc": 0.99794159, "epoch": 23.548102383053838, "grad_norm": 1.3737961053848267, "learning_rate": 5.882394685019766e-06, "loss": 0.01115277, "memory(GiB)": 15.03, "step": 13340, "train_speed(iter/s)": 1.470969 }, { "acc": 0.99840078, "epoch": 23.556928508384818, "grad_norm": 5.749560356140137, "learning_rate": 5.879519306212887e-06, "loss": 0.02709166, "memory(GiB)": 15.03, "step": 13345, "train_speed(iter/s)": 1.470965 }, { "acc": 0.99676456, "epoch": 23.565754633715798, "grad_norm": 0.5570851564407349, "learning_rate": 5.87664362733912e-06, "loss": 0.02104712, "memory(GiB)": 15.03, "step": 13350, "train_speed(iter/s)": 1.47096 }, { "acc": 0.9965127, "epoch": 23.57458075904678, "grad_norm": 2.1819026470184326, "learning_rate": 5.873767649380123e-06, "loss": 0.02555194, "memory(GiB)": 15.03, "step": 13355, "train_speed(iter/s)": 1.470967 }, { "acc": 0.99694853, "epoch": 23.58340688437776, "grad_norm": 0.5635316967964172, "learning_rate": 5.870891373317657e-06, "loss": 0.02283358, "memory(GiB)": 15.03, "step": 13360, "train_speed(iter/s)": 1.470964 }, { "acc": 0.99811468, "epoch": 23.59223300970874, "grad_norm": 2.196848154067993, "learning_rate": 5.868014800133579e-06, "loss": 0.0218087, "memory(GiB)": 15.03, "step": 13365, "train_speed(iter/s)": 1.470958 }, { "acc": 0.99518337, "epoch": 23.60105913503972, "grad_norm": 4.457225322723389, "learning_rate": 5.865137930809858e-06, "loss": 0.03804574, "memory(GiB)": 15.03, "step": 13370, "train_speed(iter/s)": 1.470926 }, { "acc": 0.99609423, "epoch": 23.609885260370696, "grad_norm": 2.803023338317871, "learning_rate": 5.862260766328551e-06, "loss": 0.0182413, "memory(GiB)": 15.03, "step": 13375, "train_speed(iter/s)": 1.470924 }, { "acc": 0.99732857, "epoch": 23.618711385701676, "grad_norm": 0.7452579736709595, "learning_rate": 5.859383307671831e-06, "loss": 0.02682301, "memory(GiB)": 15.03, "step": 13380, "train_speed(iter/s)": 1.470947 }, { "acc": 0.99831114, "epoch": 23.627537511032656, "grad_norm": 0.6964741349220276, "learning_rate": 5.856505555821957e-06, "loss": 0.01346668, "memory(GiB)": 15.03, "step": 13385, "train_speed(iter/s)": 1.470988 }, { "acc": 0.99608917, "epoch": 23.636363636363637, "grad_norm": 0.8305291533470154, "learning_rate": 5.853627511761298e-06, "loss": 0.03009967, "memory(GiB)": 15.03, "step": 13390, "train_speed(iter/s)": 1.471003 }, { "acc": 0.99700003, "epoch": 23.645189761694617, "grad_norm": 0.3768455982208252, "learning_rate": 5.850749176472316e-06, "loss": 0.02293167, "memory(GiB)": 15.03, "step": 13395, "train_speed(iter/s)": 1.471012 }, { "acc": 0.99780283, "epoch": 23.654015887025597, "grad_norm": 1.983216404914856, "learning_rate": 5.8478705509375745e-06, "loss": 0.01650913, "memory(GiB)": 15.03, "step": 13400, "train_speed(iter/s)": 1.471008 }, { "acc": 0.99780006, "epoch": 23.662842012356574, "grad_norm": 1.6353708505630493, "learning_rate": 5.844991636139743e-06, "loss": 0.01777499, "memory(GiB)": 15.03, "step": 13405, "train_speed(iter/s)": 1.471016 }, { "acc": 0.99790649, "epoch": 23.671668137687554, "grad_norm": 4.271903038024902, "learning_rate": 5.842112433061578e-06, "loss": 0.02033992, "memory(GiB)": 15.03, "step": 13410, "train_speed(iter/s)": 1.471031 }, { "acc": 0.99720554, "epoch": 23.680494263018534, "grad_norm": 4.6778717041015625, "learning_rate": 5.839232942685944e-06, "loss": 0.02302446, "memory(GiB)": 15.03, "step": 13415, "train_speed(iter/s)": 1.471026 }, { "acc": 0.99886427, "epoch": 23.689320388349515, "grad_norm": 0.36062178015708923, "learning_rate": 5.836353165995795e-06, "loss": 0.01386079, "memory(GiB)": 15.03, "step": 13420, "train_speed(iter/s)": 1.471042 }, { "acc": 0.99682655, "epoch": 23.698146513680495, "grad_norm": 1.3782460689544678, "learning_rate": 5.8334731039741934e-06, "loss": 0.03143162, "memory(GiB)": 15.03, "step": 13425, "train_speed(iter/s)": 1.471058 }, { "acc": 0.99763727, "epoch": 23.706972639011475, "grad_norm": 1.9350101947784424, "learning_rate": 5.830592757604289e-06, "loss": 0.01491998, "memory(GiB)": 15.03, "step": 13430, "train_speed(iter/s)": 1.47105 }, { "acc": 0.99844685, "epoch": 23.715798764342452, "grad_norm": 0.7193171977996826, "learning_rate": 5.827712127869334e-06, "loss": 0.01821649, "memory(GiB)": 15.03, "step": 13435, "train_speed(iter/s)": 1.471027 }, { "acc": 0.99735355, "epoch": 23.724624889673432, "grad_norm": 2.0692927837371826, "learning_rate": 5.824831215752674e-06, "loss": 0.0155641, "memory(GiB)": 15.03, "step": 13440, "train_speed(iter/s)": 1.47105 }, { "acc": 0.99746952, "epoch": 23.733451015004412, "grad_norm": 2.1818654537200928, "learning_rate": 5.821950022237759e-06, "loss": 0.02542954, "memory(GiB)": 15.03, "step": 13445, "train_speed(iter/s)": 1.471059 }, { "acc": 0.99732018, "epoch": 23.742277140335393, "grad_norm": 1.4170558452606201, "learning_rate": 5.819068548308122e-06, "loss": 0.0214025, "memory(GiB)": 15.03, "step": 13450, "train_speed(iter/s)": 1.471058 }, { "acc": 0.99709082, "epoch": 23.751103265666373, "grad_norm": 0.5562149286270142, "learning_rate": 5.8161867949474034e-06, "loss": 0.0244683, "memory(GiB)": 15.03, "step": 13455, "train_speed(iter/s)": 1.471085 }, { "acc": 0.99650011, "epoch": 23.759929390997353, "grad_norm": 1.609352707862854, "learning_rate": 5.813304763139331e-06, "loss": 0.03921862, "memory(GiB)": 15.03, "step": 13460, "train_speed(iter/s)": 1.471067 }, { "acc": 0.99612064, "epoch": 23.768755516328333, "grad_norm": 4.166249752044678, "learning_rate": 5.8104224538677365e-06, "loss": 0.01796705, "memory(GiB)": 15.03, "step": 13465, "train_speed(iter/s)": 1.471063 }, { "acc": 0.99836273, "epoch": 23.77758164165931, "grad_norm": 3.120647430419922, "learning_rate": 5.807539868116535e-06, "loss": 0.018174, "memory(GiB)": 15.03, "step": 13470, "train_speed(iter/s)": 1.47107 }, { "acc": 0.99831505, "epoch": 23.78640776699029, "grad_norm": 4.6818952560424805, "learning_rate": 5.804657006869748e-06, "loss": 0.02216614, "memory(GiB)": 15.03, "step": 13475, "train_speed(iter/s)": 1.471087 }, { "acc": 0.9989397, "epoch": 23.79523389232127, "grad_norm": 1.571094274520874, "learning_rate": 5.801773871111477e-06, "loss": 0.0123838, "memory(GiB)": 15.03, "step": 13480, "train_speed(iter/s)": 1.471093 }, { "acc": 0.99797058, "epoch": 23.80406001765225, "grad_norm": 0.3743875324726105, "learning_rate": 5.798890461825932e-06, "loss": 0.01770028, "memory(GiB)": 15.03, "step": 13485, "train_speed(iter/s)": 1.471111 }, { "acc": 0.99730768, "epoch": 23.81288614298323, "grad_norm": 1.362762689590454, "learning_rate": 5.796006779997406e-06, "loss": 0.02309884, "memory(GiB)": 15.03, "step": 13490, "train_speed(iter/s)": 1.471097 }, { "acc": 0.99527321, "epoch": 23.82171226831421, "grad_norm": 1.0151227712631226, "learning_rate": 5.7931228266102915e-06, "loss": 0.03336454, "memory(GiB)": 15.03, "step": 13495, "train_speed(iter/s)": 1.471098 }, { "acc": 0.99792271, "epoch": 23.830538393645188, "grad_norm": 0.43731746077537537, "learning_rate": 5.790238602649065e-06, "loss": 0.02119467, "memory(GiB)": 15.03, "step": 13500, "train_speed(iter/s)": 1.471117 }, { "acc": 0.9938942, "epoch": 23.83936451897617, "grad_norm": 6.364181041717529, "learning_rate": 5.78735410909831e-06, "loss": 0.04151541, "memory(GiB)": 15.03, "step": 13505, "train_speed(iter/s)": 1.4711 }, { "acc": 0.99758205, "epoch": 23.84819064430715, "grad_norm": 1.678815484046936, "learning_rate": 5.784469346942686e-06, "loss": 0.02190136, "memory(GiB)": 15.03, "step": 13510, "train_speed(iter/s)": 1.471143 }, { "acc": 0.99633942, "epoch": 23.85701676963813, "grad_norm": 2.136540651321411, "learning_rate": 5.781584317166954e-06, "loss": 0.02371281, "memory(GiB)": 15.03, "step": 13515, "train_speed(iter/s)": 1.471148 }, { "acc": 0.99792786, "epoch": 23.86584289496911, "grad_norm": 3.0578479766845703, "learning_rate": 5.778699020755962e-06, "loss": 0.01617784, "memory(GiB)": 15.03, "step": 13520, "train_speed(iter/s)": 1.471167 }, { "acc": 0.99347115, "epoch": 23.87466902030009, "grad_norm": 0.9537317752838135, "learning_rate": 5.7758134586946536e-06, "loss": 0.03897419, "memory(GiB)": 15.03, "step": 13525, "train_speed(iter/s)": 1.471161 }, { "acc": 0.99844341, "epoch": 23.883495145631066, "grad_norm": 2.1123297214508057, "learning_rate": 5.772927631968061e-06, "loss": 0.01774777, "memory(GiB)": 15.03, "step": 13530, "train_speed(iter/s)": 1.471192 }, { "acc": 0.99732704, "epoch": 23.892321270962046, "grad_norm": 5.963242530822754, "learning_rate": 5.770041541561299e-06, "loss": 0.01897541, "memory(GiB)": 15.03, "step": 13535, "train_speed(iter/s)": 1.471198 }, { "acc": 0.99785099, "epoch": 23.901147396293027, "grad_norm": 2.094081163406372, "learning_rate": 5.767155188459587e-06, "loss": 0.01186255, "memory(GiB)": 15.03, "step": 13540, "train_speed(iter/s)": 1.471218 }, { "acc": 0.99732294, "epoch": 23.909973521624007, "grad_norm": 0.6398476958274841, "learning_rate": 5.764268573648221e-06, "loss": 0.02872542, "memory(GiB)": 15.03, "step": 13545, "train_speed(iter/s)": 1.471248 }, { "acc": 0.99598675, "epoch": 23.918799646954987, "grad_norm": 0.5996166467666626, "learning_rate": 5.761381698112596e-06, "loss": 0.0246061, "memory(GiB)": 15.03, "step": 13550, "train_speed(iter/s)": 1.471212 }, { "acc": 0.99735012, "epoch": 23.927625772285968, "grad_norm": 1.6407822370529175, "learning_rate": 5.758494562838189e-06, "loss": 0.02801011, "memory(GiB)": 15.03, "step": 13555, "train_speed(iter/s)": 1.47124 }, { "acc": 0.99531507, "epoch": 23.936451897616948, "grad_norm": 0.5079622268676758, "learning_rate": 5.75560716881057e-06, "loss": 0.03110148, "memory(GiB)": 15.03, "step": 13560, "train_speed(iter/s)": 1.47125 }, { "acc": 0.99793472, "epoch": 23.945278022947925, "grad_norm": 0.1921490579843521, "learning_rate": 5.752719517015395e-06, "loss": 0.01705626, "memory(GiB)": 15.03, "step": 13565, "train_speed(iter/s)": 1.471274 }, { "acc": 0.9981451, "epoch": 23.954104148278905, "grad_norm": 0.5113014578819275, "learning_rate": 5.74983160843841e-06, "loss": 0.01147989, "memory(GiB)": 15.03, "step": 13570, "train_speed(iter/s)": 1.471254 }, { "acc": 0.99848642, "epoch": 23.962930273609885, "grad_norm": 0.45539137721061707, "learning_rate": 5.746943444065446e-06, "loss": 0.01983534, "memory(GiB)": 15.03, "step": 13575, "train_speed(iter/s)": 1.471261 }, { "acc": 0.99641438, "epoch": 23.971756398940865, "grad_norm": 1.6584136486053467, "learning_rate": 5.744055024882422e-06, "loss": 0.02718867, "memory(GiB)": 15.03, "step": 13580, "train_speed(iter/s)": 1.471227 }, { "acc": 0.99986706, "epoch": 23.980582524271846, "grad_norm": 0.31739342212677, "learning_rate": 5.741166351875346e-06, "loss": 0.00482958, "memory(GiB)": 15.03, "step": 13585, "train_speed(iter/s)": 1.471216 }, { "acc": 0.9989687, "epoch": 23.989408649602826, "grad_norm": 0.38409706950187683, "learning_rate": 5.738277426030312e-06, "loss": 0.01084839, "memory(GiB)": 15.03, "step": 13590, "train_speed(iter/s)": 1.471203 }, { "acc": 0.99811621, "epoch": 23.998234774933803, "grad_norm": 1.0845887660980225, "learning_rate": 5.735388248333499e-06, "loss": 0.01665234, "memory(GiB)": 15.03, "step": 13595, "train_speed(iter/s)": 1.471199 }, { "acc": 0.99723015, "epoch": 24.007060900264783, "grad_norm": 0.9488145709037781, "learning_rate": 5.7324988197711725e-06, "loss": 0.02254588, "memory(GiB)": 15.03, "step": 13600, "train_speed(iter/s)": 1.471149 }, { "acc": 0.99546824, "epoch": 24.015887025595763, "grad_norm": 6.031540870666504, "learning_rate": 5.729609141329685e-06, "loss": 0.03047324, "memory(GiB)": 15.03, "step": 13605, "train_speed(iter/s)": 1.47117 }, { "acc": 0.99702721, "epoch": 24.024713150926743, "grad_norm": 0.9762485027313232, "learning_rate": 5.726719213995472e-06, "loss": 0.02245257, "memory(GiB)": 15.03, "step": 13610, "train_speed(iter/s)": 1.471158 }, { "acc": 0.9972929, "epoch": 24.033539276257724, "grad_norm": 3.832228899002075, "learning_rate": 5.7238290387550535e-06, "loss": 0.02480346, "memory(GiB)": 15.03, "step": 13615, "train_speed(iter/s)": 1.471179 }, { "acc": 0.99872341, "epoch": 24.042365401588704, "grad_norm": 2.463217258453369, "learning_rate": 5.7209386165950365e-06, "loss": 0.01805708, "memory(GiB)": 15.03, "step": 13620, "train_speed(iter/s)": 1.471173 }, { "acc": 0.99775171, "epoch": 24.05119152691968, "grad_norm": 1.0539658069610596, "learning_rate": 5.7180479485021135e-06, "loss": 0.01499096, "memory(GiB)": 15.03, "step": 13625, "train_speed(iter/s)": 1.471168 }, { "acc": 0.99860935, "epoch": 24.06001765225066, "grad_norm": 1.6767628192901611, "learning_rate": 5.715157035463058e-06, "loss": 0.01322573, "memory(GiB)": 15.03, "step": 13630, "train_speed(iter/s)": 1.471179 }, { "acc": 0.99733963, "epoch": 24.06884377758164, "grad_norm": 1.8283252716064453, "learning_rate": 5.712265878464729e-06, "loss": 0.01516207, "memory(GiB)": 15.03, "step": 13635, "train_speed(iter/s)": 1.471171 }, { "acc": 0.99772692, "epoch": 24.07766990291262, "grad_norm": 0.837952733039856, "learning_rate": 5.709374478494065e-06, "loss": 0.01537332, "memory(GiB)": 15.03, "step": 13640, "train_speed(iter/s)": 1.471139 }, { "acc": 0.9988204, "epoch": 24.0864960282436, "grad_norm": 1.5096508264541626, "learning_rate": 5.706482836538093e-06, "loss": 0.00892322, "memory(GiB)": 15.03, "step": 13645, "train_speed(iter/s)": 1.471178 }, { "acc": 0.99583492, "epoch": 24.095322153574582, "grad_norm": 1.7539783716201782, "learning_rate": 5.703590953583917e-06, "loss": 0.0325048, "memory(GiB)": 15.03, "step": 13650, "train_speed(iter/s)": 1.4712 }, { "acc": 0.9979351, "epoch": 24.104148278905562, "grad_norm": 2.664210081100464, "learning_rate": 5.70069883061873e-06, "loss": 0.01742772, "memory(GiB)": 15.03, "step": 13655, "train_speed(iter/s)": 1.471196 }, { "acc": 0.99877739, "epoch": 24.11297440423654, "grad_norm": 0.5489650368690491, "learning_rate": 5.6978064686297996e-06, "loss": 0.00992528, "memory(GiB)": 15.03, "step": 13660, "train_speed(iter/s)": 1.471219 }, { "acc": 0.99708719, "epoch": 24.12180052956752, "grad_norm": 1.7563832998275757, "learning_rate": 5.694913868604481e-06, "loss": 0.0185407, "memory(GiB)": 15.03, "step": 13665, "train_speed(iter/s)": 1.471223 }, { "acc": 0.99701099, "epoch": 24.1306266548985, "grad_norm": 1.8985768556594849, "learning_rate": 5.692021031530204e-06, "loss": 0.0235485, "memory(GiB)": 15.03, "step": 13670, "train_speed(iter/s)": 1.471224 }, { "acc": 0.99864998, "epoch": 24.13945278022948, "grad_norm": 0.615971028804779, "learning_rate": 5.689127958394489e-06, "loss": 0.02690974, "memory(GiB)": 15.03, "step": 13675, "train_speed(iter/s)": 1.471224 }, { "acc": 0.99635897, "epoch": 24.14827890556046, "grad_norm": 3.3631012439727783, "learning_rate": 5.6862346501849285e-06, "loss": 0.03392952, "memory(GiB)": 15.03, "step": 13680, "train_speed(iter/s)": 1.471205 }, { "acc": 0.99753103, "epoch": 24.15710503089144, "grad_norm": 1.7586177587509155, "learning_rate": 5.683341107889199e-06, "loss": 0.02446875, "memory(GiB)": 15.03, "step": 13685, "train_speed(iter/s)": 1.47122 }, { "acc": 0.99577131, "epoch": 24.165931156222417, "grad_norm": 2.6182682514190674, "learning_rate": 5.6804473324950535e-06, "loss": 0.03150969, "memory(GiB)": 15.03, "step": 13690, "train_speed(iter/s)": 1.471229 }, { "acc": 0.99928055, "epoch": 24.174757281553397, "grad_norm": 1.828903317451477, "learning_rate": 5.6775533249903324e-06, "loss": 0.01569703, "memory(GiB)": 15.03, "step": 13695, "train_speed(iter/s)": 1.471246 }, { "acc": 0.99566488, "epoch": 24.183583406884377, "grad_norm": 1.710451364517212, "learning_rate": 5.674659086362944e-06, "loss": 0.02988607, "memory(GiB)": 15.03, "step": 13700, "train_speed(iter/s)": 1.471259 }, { "acc": 0.99757872, "epoch": 24.192409532215358, "grad_norm": 2.369725227355957, "learning_rate": 5.671764617600886e-06, "loss": 0.02319327, "memory(GiB)": 15.03, "step": 13705, "train_speed(iter/s)": 1.47127 }, { "acc": 0.99843321, "epoch": 24.201235657546338, "grad_norm": 1.6271551847457886, "learning_rate": 5.668869919692228e-06, "loss": 0.01880288, "memory(GiB)": 15.03, "step": 13710, "train_speed(iter/s)": 1.471293 }, { "acc": 0.995961, "epoch": 24.210061782877318, "grad_norm": 2.2153515815734863, "learning_rate": 5.665974993625122e-06, "loss": 0.02911958, "memory(GiB)": 15.03, "step": 13715, "train_speed(iter/s)": 1.471291 }, { "acc": 0.99495344, "epoch": 24.218887908208295, "grad_norm": 3.445772171020508, "learning_rate": 5.663079840387795e-06, "loss": 0.04001721, "memory(GiB)": 15.03, "step": 13720, "train_speed(iter/s)": 1.471318 }, { "acc": 0.99765997, "epoch": 24.227714033539275, "grad_norm": 0.4857047200202942, "learning_rate": 5.66018446096855e-06, "loss": 0.01954571, "memory(GiB)": 15.03, "step": 13725, "train_speed(iter/s)": 1.4713 }, { "acc": 0.99824247, "epoch": 24.236540158870255, "grad_norm": 3.197537899017334, "learning_rate": 5.6572888563557744e-06, "loss": 0.02563879, "memory(GiB)": 15.03, "step": 13730, "train_speed(iter/s)": 1.471303 }, { "acc": 0.99609747, "epoch": 24.245366284201236, "grad_norm": 0.4709370732307434, "learning_rate": 5.654393027537926e-06, "loss": 0.03576151, "memory(GiB)": 15.03, "step": 13735, "train_speed(iter/s)": 1.471334 }, { "acc": 0.99627705, "epoch": 24.254192409532216, "grad_norm": 1.3927879333496094, "learning_rate": 5.65149697550354e-06, "loss": 0.02803939, "memory(GiB)": 15.03, "step": 13740, "train_speed(iter/s)": 1.471341 }, { "acc": 0.99727898, "epoch": 24.263018534863196, "grad_norm": 2.5717124938964844, "learning_rate": 5.648600701241228e-06, "loss": 0.01554808, "memory(GiB)": 15.03, "step": 13745, "train_speed(iter/s)": 1.471335 }, { "acc": 0.99677982, "epoch": 24.271844660194176, "grad_norm": 2.846895217895508, "learning_rate": 5.645704205739681e-06, "loss": 0.02404243, "memory(GiB)": 15.03, "step": 13750, "train_speed(iter/s)": 1.471345 }, { "acc": 0.99632645, "epoch": 24.280670785525153, "grad_norm": 2.097385883331299, "learning_rate": 5.642807489987658e-06, "loss": 0.01836198, "memory(GiB)": 15.03, "step": 13755, "train_speed(iter/s)": 1.471343 }, { "acc": 0.99671879, "epoch": 24.289496910856133, "grad_norm": 3.1556620597839355, "learning_rate": 5.6399105549740014e-06, "loss": 0.01858097, "memory(GiB)": 15.03, "step": 13760, "train_speed(iter/s)": 1.471335 }, { "acc": 0.99732018, "epoch": 24.298323036187114, "grad_norm": 0.5170601606369019, "learning_rate": 5.637013401687626e-06, "loss": 0.02148868, "memory(GiB)": 15.03, "step": 13765, "train_speed(iter/s)": 1.471319 }, { "acc": 0.99776669, "epoch": 24.307149161518094, "grad_norm": 3.0246293544769287, "learning_rate": 5.634116031117519e-06, "loss": 0.01464859, "memory(GiB)": 15.03, "step": 13770, "train_speed(iter/s)": 1.471332 }, { "acc": 0.99687471, "epoch": 24.315975286849074, "grad_norm": 2.518937826156616, "learning_rate": 5.631218444252742e-06, "loss": 0.02725354, "memory(GiB)": 15.03, "step": 13775, "train_speed(iter/s)": 1.471319 }, { "acc": 0.996805, "epoch": 24.324801412180054, "grad_norm": 1.0963259935379028, "learning_rate": 5.628320642082431e-06, "loss": 0.02060469, "memory(GiB)": 15.03, "step": 13780, "train_speed(iter/s)": 1.471349 }, { "acc": 0.99864044, "epoch": 24.33362753751103, "grad_norm": 4.633490562438965, "learning_rate": 5.625422625595796e-06, "loss": 0.01327218, "memory(GiB)": 15.03, "step": 13785, "train_speed(iter/s)": 1.471325 }, { "acc": 0.9982316, "epoch": 24.34245366284201, "grad_norm": 1.1351215839385986, "learning_rate": 5.622524395782122e-06, "loss": 0.0173247, "memory(GiB)": 15.03, "step": 13790, "train_speed(iter/s)": 1.471339 }, { "acc": 0.99802208, "epoch": 24.35127978817299, "grad_norm": 1.1147340536117554, "learning_rate": 5.61962595363076e-06, "loss": 0.01519823, "memory(GiB)": 15.03, "step": 13795, "train_speed(iter/s)": 1.47135 }, { "acc": 0.99615793, "epoch": 24.360105913503972, "grad_norm": 0.9809723496437073, "learning_rate": 5.616727300131144e-06, "loss": 0.0324602, "memory(GiB)": 15.03, "step": 13800, "train_speed(iter/s)": 1.471356 }, { "acc": 0.99813004, "epoch": 24.368932038834952, "grad_norm": 1.994120478630066, "learning_rate": 5.61382843627277e-06, "loss": 0.02410319, "memory(GiB)": 15.03, "step": 13805, "train_speed(iter/s)": 1.471344 }, { "acc": 0.99729452, "epoch": 24.377758164165932, "grad_norm": 0.8581472039222717, "learning_rate": 5.610929363045212e-06, "loss": 0.01995728, "memory(GiB)": 15.03, "step": 13810, "train_speed(iter/s)": 1.471358 }, { "acc": 0.99820938, "epoch": 24.38658428949691, "grad_norm": 1.3336862325668335, "learning_rate": 5.608030081438115e-06, "loss": 0.01630322, "memory(GiB)": 15.03, "step": 13815, "train_speed(iter/s)": 1.471363 }, { "acc": 0.99711037, "epoch": 24.39541041482789, "grad_norm": 4.520698547363281, "learning_rate": 5.6051305924411914e-06, "loss": 0.02322698, "memory(GiB)": 15.03, "step": 13820, "train_speed(iter/s)": 1.471348 }, { "acc": 0.99660339, "epoch": 24.40423654015887, "grad_norm": 2.891089677810669, "learning_rate": 5.602230897044227e-06, "loss": 0.02331476, "memory(GiB)": 15.03, "step": 13825, "train_speed(iter/s)": 1.471312 }, { "acc": 0.99624557, "epoch": 24.41306266548985, "grad_norm": 1.5519311428070068, "learning_rate": 5.599330996237079e-06, "loss": 0.02525531, "memory(GiB)": 15.03, "step": 13830, "train_speed(iter/s)": 1.471331 }, { "acc": 0.99809723, "epoch": 24.42188879082083, "grad_norm": 0.4588584005832672, "learning_rate": 5.596430891009672e-06, "loss": 0.0176219, "memory(GiB)": 15.03, "step": 13835, "train_speed(iter/s)": 1.471339 }, { "acc": 0.9981925, "epoch": 24.43071491615181, "grad_norm": 2.72015643119812, "learning_rate": 5.593530582352003e-06, "loss": 0.01341856, "memory(GiB)": 15.03, "step": 13840, "train_speed(iter/s)": 1.471351 }, { "acc": 0.99836903, "epoch": 24.43954104148279, "grad_norm": 2.139726161956787, "learning_rate": 5.590630071254139e-06, "loss": 0.01410037, "memory(GiB)": 15.03, "step": 13845, "train_speed(iter/s)": 1.471342 }, { "acc": 0.99525023, "epoch": 24.448367166813767, "grad_norm": 0.7148779630661011, "learning_rate": 5.587729358706212e-06, "loss": 0.03261512, "memory(GiB)": 15.03, "step": 13850, "train_speed(iter/s)": 1.471333 }, { "acc": 0.99709759, "epoch": 24.457193292144748, "grad_norm": 0.6023057699203491, "learning_rate": 5.584828445698428e-06, "loss": 0.02229231, "memory(GiB)": 15.03, "step": 13855, "train_speed(iter/s)": 1.47134 }, { "acc": 0.99878635, "epoch": 24.466019417475728, "grad_norm": 1.7920933961868286, "learning_rate": 5.581927333221054e-06, "loss": 0.0234032, "memory(GiB)": 15.03, "step": 13860, "train_speed(iter/s)": 1.471331 }, { "acc": 0.99689846, "epoch": 24.474845542806708, "grad_norm": 2.272243022918701, "learning_rate": 5.579026022264435e-06, "loss": 0.02972943, "memory(GiB)": 15.03, "step": 13865, "train_speed(iter/s)": 1.471325 }, { "acc": 0.99926777, "epoch": 24.48367166813769, "grad_norm": 0.9016860723495483, "learning_rate": 5.576124513818974e-06, "loss": 0.01091073, "memory(GiB)": 15.03, "step": 13870, "train_speed(iter/s)": 1.471319 }, { "acc": 0.99690943, "epoch": 24.49249779346867, "grad_norm": 4.475885391235352, "learning_rate": 5.57322280887515e-06, "loss": 0.02805096, "memory(GiB)": 15.03, "step": 13875, "train_speed(iter/s)": 1.471325 }, { "acc": 0.99825907, "epoch": 24.501323918799645, "grad_norm": 0.8146122694015503, "learning_rate": 5.570320908423501e-06, "loss": 0.01232319, "memory(GiB)": 15.03, "step": 13880, "train_speed(iter/s)": 1.471339 }, { "acc": 0.99802685, "epoch": 24.510150044130626, "grad_norm": 0.4744330644607544, "learning_rate": 5.5674188134546405e-06, "loss": 0.01457145, "memory(GiB)": 15.03, "step": 13885, "train_speed(iter/s)": 1.471358 }, { "acc": 0.99911041, "epoch": 24.518976169461606, "grad_norm": 0.40822166204452515, "learning_rate": 5.5645165249592395e-06, "loss": 0.01070663, "memory(GiB)": 15.03, "step": 13890, "train_speed(iter/s)": 1.471387 }, { "acc": 0.99725294, "epoch": 24.527802294792586, "grad_norm": 3.001638650894165, "learning_rate": 5.561614043928042e-06, "loss": 0.02013087, "memory(GiB)": 15.03, "step": 13895, "train_speed(iter/s)": 1.47139 }, { "acc": 0.99845676, "epoch": 24.536628420123566, "grad_norm": 2.0516610145568848, "learning_rate": 5.558711371351853e-06, "loss": 0.01057066, "memory(GiB)": 15.03, "step": 13900, "train_speed(iter/s)": 1.471388 }, { "acc": 0.99902468, "epoch": 24.545454545454547, "grad_norm": 1.9031459093093872, "learning_rate": 5.555808508221546e-06, "loss": 0.01790973, "memory(GiB)": 15.03, "step": 13905, "train_speed(iter/s)": 1.471381 }, { "acc": 0.99779425, "epoch": 24.554280670785523, "grad_norm": 1.6115778684616089, "learning_rate": 5.552905455528058e-06, "loss": 0.02057219, "memory(GiB)": 15.03, "step": 13910, "train_speed(iter/s)": 1.471384 }, { "acc": 0.99871206, "epoch": 24.563106796116504, "grad_norm": 0.4361395239830017, "learning_rate": 5.550002214262391e-06, "loss": 0.01228246, "memory(GiB)": 15.03, "step": 13915, "train_speed(iter/s)": 1.471406 }, { "acc": 0.99848766, "epoch": 24.571932921447484, "grad_norm": 0.48584529757499695, "learning_rate": 5.54709878541561e-06, "loss": 0.01257609, "memory(GiB)": 15.03, "step": 13920, "train_speed(iter/s)": 1.471421 }, { "acc": 0.99885931, "epoch": 24.580759046778464, "grad_norm": 0.6574710607528687, "learning_rate": 5.54419516997885e-06, "loss": 0.01280586, "memory(GiB)": 15.03, "step": 13925, "train_speed(iter/s)": 1.471417 }, { "acc": 0.99800186, "epoch": 24.589585172109444, "grad_norm": 1.1539421081542969, "learning_rate": 5.5412913689433014e-06, "loss": 0.01753523, "memory(GiB)": 15.03, "step": 13930, "train_speed(iter/s)": 1.471425 }, { "acc": 0.99645767, "epoch": 24.598411297440425, "grad_norm": 3.0356245040893555, "learning_rate": 5.538387383300223e-06, "loss": 0.02911798, "memory(GiB)": 15.03, "step": 13935, "train_speed(iter/s)": 1.471398 }, { "acc": 0.99763165, "epoch": 24.607237422771405, "grad_norm": 1.6382282972335815, "learning_rate": 5.535483214040935e-06, "loss": 0.02398096, "memory(GiB)": 15.03, "step": 13940, "train_speed(iter/s)": 1.471401 }, { "acc": 0.99628124, "epoch": 24.61606354810238, "grad_norm": 1.1124902963638306, "learning_rate": 5.53257886215682e-06, "loss": 0.02909988, "memory(GiB)": 15.03, "step": 13945, "train_speed(iter/s)": 1.471405 }, { "acc": 0.9972827, "epoch": 24.624889673433362, "grad_norm": 2.7802326679229736, "learning_rate": 5.529674328639325e-06, "loss": 0.02120908, "memory(GiB)": 15.03, "step": 13950, "train_speed(iter/s)": 1.47141 }, { "acc": 0.99595404, "epoch": 24.633715798764342, "grad_norm": 1.1772323846817017, "learning_rate": 5.526769614479955e-06, "loss": 0.02868412, "memory(GiB)": 15.03, "step": 13955, "train_speed(iter/s)": 1.471418 }, { "acc": 0.99735432, "epoch": 24.642541924095323, "grad_norm": 2.710235357284546, "learning_rate": 5.523864720670283e-06, "loss": 0.01965033, "memory(GiB)": 15.03, "step": 13960, "train_speed(iter/s)": 1.471414 }, { "acc": 0.99835491, "epoch": 24.651368049426303, "grad_norm": 2.1126463413238525, "learning_rate": 5.520959648201939e-06, "loss": 0.01285971, "memory(GiB)": 15.03, "step": 13965, "train_speed(iter/s)": 1.471427 }, { "acc": 0.99788103, "epoch": 24.660194174757283, "grad_norm": 2.730522632598877, "learning_rate": 5.518054398066614e-06, "loss": 0.02065564, "memory(GiB)": 15.03, "step": 13970, "train_speed(iter/s)": 1.471406 }, { "acc": 0.99684467, "epoch": 24.66902030008826, "grad_norm": 1.1974889039993286, "learning_rate": 5.515148971256057e-06, "loss": 0.02484282, "memory(GiB)": 15.03, "step": 13975, "train_speed(iter/s)": 1.471418 }, { "acc": 0.99851637, "epoch": 24.67784642541924, "grad_norm": 2.3431081771850586, "learning_rate": 5.512243368762086e-06, "loss": 0.01345049, "memory(GiB)": 15.03, "step": 13980, "train_speed(iter/s)": 1.471394 }, { "acc": 0.99735928, "epoch": 24.68667255075022, "grad_norm": 0.5531355142593384, "learning_rate": 5.509337591576569e-06, "loss": 0.01712982, "memory(GiB)": 15.03, "step": 13985, "train_speed(iter/s)": 1.471399 }, { "acc": 0.99777527, "epoch": 24.6954986760812, "grad_norm": 0.6000140905380249, "learning_rate": 5.5064316406914405e-06, "loss": 0.02110432, "memory(GiB)": 15.03, "step": 13990, "train_speed(iter/s)": 1.471379 }, { "acc": 0.99818649, "epoch": 24.70432480141218, "grad_norm": 3.0117063522338867, "learning_rate": 5.503525517098692e-06, "loss": 0.01971256, "memory(GiB)": 15.03, "step": 13995, "train_speed(iter/s)": 1.471394 }, { "acc": 0.99829798, "epoch": 24.71315092674316, "grad_norm": 0.9229688048362732, "learning_rate": 5.500619221790373e-06, "loss": 0.01246526, "memory(GiB)": 15.03, "step": 14000, "train_speed(iter/s)": 1.47139 }, { "acc": 0.99656677, "epoch": 24.721977052074138, "grad_norm": 2.3996095657348633, "learning_rate": 5.4977127557585905e-06, "loss": 0.0273682, "memory(GiB)": 15.03, "step": 14005, "train_speed(iter/s)": 1.471426 }, { "acc": 0.99715157, "epoch": 24.730803177405118, "grad_norm": 3.0865514278411865, "learning_rate": 5.494806119995517e-06, "loss": 0.02813854, "memory(GiB)": 15.03, "step": 14010, "train_speed(iter/s)": 1.471433 }, { "acc": 0.99543905, "epoch": 24.7396293027361, "grad_norm": 2.311464309692383, "learning_rate": 5.491899315493374e-06, "loss": 0.02392764, "memory(GiB)": 15.03, "step": 14015, "train_speed(iter/s)": 1.47145 }, { "acc": 0.9981678, "epoch": 24.74845542806708, "grad_norm": 1.2740874290466309, "learning_rate": 5.488992343244446e-06, "loss": 0.0144029, "memory(GiB)": 15.03, "step": 14020, "train_speed(iter/s)": 1.471463 }, { "acc": 0.99837332, "epoch": 24.75728155339806, "grad_norm": 2.965015411376953, "learning_rate": 5.486085204241071e-06, "loss": 0.01911298, "memory(GiB)": 15.03, "step": 14025, "train_speed(iter/s)": 1.471458 }, { "acc": 0.99703732, "epoch": 24.76610767872904, "grad_norm": 2.2271347045898438, "learning_rate": 5.483177899475649e-06, "loss": 0.02186829, "memory(GiB)": 15.03, "step": 14030, "train_speed(iter/s)": 1.471474 }, { "acc": 0.99834785, "epoch": 24.77493380406002, "grad_norm": 0.7055427432060242, "learning_rate": 5.480270429940631e-06, "loss": 0.01821934, "memory(GiB)": 15.03, "step": 14035, "train_speed(iter/s)": 1.47148 }, { "acc": 0.99691315, "epoch": 24.783759929390996, "grad_norm": 1.2686394453048706, "learning_rate": 5.4773627966285285e-06, "loss": 0.0266508, "memory(GiB)": 15.03, "step": 14040, "train_speed(iter/s)": 1.47151 }, { "acc": 0.99908562, "epoch": 24.792586054721976, "grad_norm": 1.282729148864746, "learning_rate": 5.474455000531904e-06, "loss": 0.01397633, "memory(GiB)": 15.03, "step": 14045, "train_speed(iter/s)": 1.471493 }, { "acc": 0.99775505, "epoch": 24.801412180052957, "grad_norm": 1.7797919511795044, "learning_rate": 5.471547042643386e-06, "loss": 0.01712509, "memory(GiB)": 15.03, "step": 14050, "train_speed(iter/s)": 1.471463 }, { "acc": 0.99766712, "epoch": 24.810238305383937, "grad_norm": 2.466735601425171, "learning_rate": 5.468638923955642e-06, "loss": 0.01487389, "memory(GiB)": 15.03, "step": 14055, "train_speed(iter/s)": 1.471472 }, { "acc": 0.99832191, "epoch": 24.819064430714917, "grad_norm": 1.3117305040359497, "learning_rate": 5.465730645461411e-06, "loss": 0.01110036, "memory(GiB)": 15.03, "step": 14060, "train_speed(iter/s)": 1.471465 }, { "acc": 0.99922161, "epoch": 24.827890556045897, "grad_norm": 0.8822212219238281, "learning_rate": 5.462822208153475e-06, "loss": 0.00872637, "memory(GiB)": 15.03, "step": 14065, "train_speed(iter/s)": 1.471461 }, { "acc": 0.9969079, "epoch": 24.836716681376874, "grad_norm": 1.3354051113128662, "learning_rate": 5.459913613024677e-06, "loss": 0.02233219, "memory(GiB)": 15.03, "step": 14070, "train_speed(iter/s)": 1.471454 }, { "acc": 0.99758759, "epoch": 24.845542806707854, "grad_norm": 5.017195224761963, "learning_rate": 5.4570048610679085e-06, "loss": 0.0221256, "memory(GiB)": 15.03, "step": 14075, "train_speed(iter/s)": 1.471461 }, { "acc": 0.99649181, "epoch": 24.854368932038835, "grad_norm": 2.4406609535217285, "learning_rate": 5.454095953276115e-06, "loss": 0.0203294, "memory(GiB)": 15.03, "step": 14080, "train_speed(iter/s)": 1.471463 }, { "acc": 0.99574051, "epoch": 24.863195057369815, "grad_norm": 7.395174503326416, "learning_rate": 5.4511868906423035e-06, "loss": 0.03168566, "memory(GiB)": 15.03, "step": 14085, "train_speed(iter/s)": 1.471477 }, { "acc": 0.99837933, "epoch": 24.872021182700795, "grad_norm": 2.2937514781951904, "learning_rate": 5.4482776741595235e-06, "loss": 0.01169435, "memory(GiB)": 15.03, "step": 14090, "train_speed(iter/s)": 1.471474 }, { "acc": 0.99695263, "epoch": 24.880847308031775, "grad_norm": 1.7976258993148804, "learning_rate": 5.445368304820884e-06, "loss": 0.02085861, "memory(GiB)": 15.03, "step": 14095, "train_speed(iter/s)": 1.471444 }, { "acc": 0.99714842, "epoch": 24.889673433362752, "grad_norm": 0.43934381008148193, "learning_rate": 5.442458783619541e-06, "loss": 0.02126363, "memory(GiB)": 15.03, "step": 14100, "train_speed(iter/s)": 1.471465 }, { "acc": 0.99705315, "epoch": 24.898499558693732, "grad_norm": 3.9030439853668213, "learning_rate": 5.439549111548704e-06, "loss": 0.02566875, "memory(GiB)": 15.03, "step": 14105, "train_speed(iter/s)": 1.471487 }, { "acc": 0.99809027, "epoch": 24.907325684024713, "grad_norm": 0.7073786854743958, "learning_rate": 5.436639289601638e-06, "loss": 0.01960604, "memory(GiB)": 15.03, "step": 14110, "train_speed(iter/s)": 1.471524 }, { "acc": 0.99679136, "epoch": 24.916151809355693, "grad_norm": 2.389294147491455, "learning_rate": 5.433729318771652e-06, "loss": 0.01796095, "memory(GiB)": 15.03, "step": 14115, "train_speed(iter/s)": 1.47153 }, { "acc": 0.99854107, "epoch": 24.924977934686673, "grad_norm": 0.39258185029029846, "learning_rate": 5.430819200052113e-06, "loss": 0.01229413, "memory(GiB)": 15.03, "step": 14120, "train_speed(iter/s)": 1.471523 }, { "acc": 0.99642773, "epoch": 24.933804060017653, "grad_norm": 2.9047842025756836, "learning_rate": 5.427908934436434e-06, "loss": 0.02503887, "memory(GiB)": 15.03, "step": 14125, "train_speed(iter/s)": 1.471515 }, { "acc": 0.99826641, "epoch": 24.942630185348634, "grad_norm": 2.593736410140991, "learning_rate": 5.424998522918078e-06, "loss": 0.01485046, "memory(GiB)": 15.03, "step": 14130, "train_speed(iter/s)": 1.47151 }, { "acc": 0.99758749, "epoch": 24.95145631067961, "grad_norm": 0.9931526184082031, "learning_rate": 5.422087966490563e-06, "loss": 0.03167161, "memory(GiB)": 15.03, "step": 14135, "train_speed(iter/s)": 1.471526 }, { "acc": 0.99856176, "epoch": 24.96028243601059, "grad_norm": 0.2537786066532135, "learning_rate": 5.419177266147448e-06, "loss": 0.01313398, "memory(GiB)": 15.03, "step": 14140, "train_speed(iter/s)": 1.471536 }, { "acc": 0.99852829, "epoch": 24.96910856134157, "grad_norm": 1.2475006580352783, "learning_rate": 5.416266422882349e-06, "loss": 0.01411046, "memory(GiB)": 15.03, "step": 14145, "train_speed(iter/s)": 1.471564 }, { "acc": 0.9990345, "epoch": 24.97793468667255, "grad_norm": 1.9767017364501953, "learning_rate": 5.413355437688926e-06, "loss": 0.01659938, "memory(GiB)": 15.03, "step": 14150, "train_speed(iter/s)": 1.471567 }, { "acc": 0.99421015, "epoch": 24.98676081200353, "grad_norm": 0.9474026560783386, "learning_rate": 5.410444311560891e-06, "loss": 0.0347034, "memory(GiB)": 15.03, "step": 14155, "train_speed(iter/s)": 1.471554 }, { "acc": 0.99876842, "epoch": 24.99558693733451, "grad_norm": 2.1073319911956787, "learning_rate": 5.407533045491997e-06, "loss": 0.01448031, "memory(GiB)": 15.03, "step": 14160, "train_speed(iter/s)": 1.471576 }, { "acc": 0.99680328, "epoch": 25.00441306266549, "grad_norm": 3.2899086475372314, "learning_rate": 5.404621640476055e-06, "loss": 0.01835653, "memory(GiB)": 15.03, "step": 14165, "train_speed(iter/s)": 1.471512 }, { "acc": 0.99762287, "epoch": 25.01323918799647, "grad_norm": 2.4144198894500732, "learning_rate": 5.401710097506918e-06, "loss": 0.01641395, "memory(GiB)": 15.03, "step": 14170, "train_speed(iter/s)": 1.471539 }, { "acc": 0.99764633, "epoch": 25.02206531332745, "grad_norm": 3.481618642807007, "learning_rate": 5.398798417578482e-06, "loss": 0.0178875, "memory(GiB)": 15.03, "step": 14175, "train_speed(iter/s)": 1.471529 }, { "acc": 0.99775047, "epoch": 25.03089143865843, "grad_norm": 0.7903532385826111, "learning_rate": 5.395886601684701e-06, "loss": 0.01613553, "memory(GiB)": 15.03, "step": 14180, "train_speed(iter/s)": 1.471552 }, { "acc": 0.99829435, "epoch": 25.03971756398941, "grad_norm": 0.8889632225036621, "learning_rate": 5.392974650819562e-06, "loss": 0.01641349, "memory(GiB)": 15.03, "step": 14185, "train_speed(iter/s)": 1.471562 }, { "acc": 0.99706364, "epoch": 25.04854368932039, "grad_norm": 1.5685935020446777, "learning_rate": 5.39006256597711e-06, "loss": 0.02782735, "memory(GiB)": 15.03, "step": 14190, "train_speed(iter/s)": 1.471588 }, { "acc": 0.99729061, "epoch": 25.057369814651366, "grad_norm": 0.9548899531364441, "learning_rate": 5.387150348151426e-06, "loss": 0.02766398, "memory(GiB)": 15.03, "step": 14195, "train_speed(iter/s)": 1.471606 }, { "acc": 0.99807167, "epoch": 25.066195939982347, "grad_norm": 0.9058611989021301, "learning_rate": 5.384237998336643e-06, "loss": 0.0178849, "memory(GiB)": 15.03, "step": 14200, "train_speed(iter/s)": 1.471618 }, { "acc": 0.99725685, "epoch": 25.075022065313327, "grad_norm": 0.38039731979370117, "learning_rate": 5.381325517526936e-06, "loss": 0.02003345, "memory(GiB)": 15.03, "step": 14205, "train_speed(iter/s)": 1.47164 }, { "acc": 0.99838467, "epoch": 25.083848190644307, "grad_norm": 1.20939302444458, "learning_rate": 5.378412906716527e-06, "loss": 0.01808212, "memory(GiB)": 15.03, "step": 14210, "train_speed(iter/s)": 1.471656 }, { "acc": 0.9986496, "epoch": 25.092674315975287, "grad_norm": 2.918826103210449, "learning_rate": 5.375500166899679e-06, "loss": 0.01180819, "memory(GiB)": 15.03, "step": 14215, "train_speed(iter/s)": 1.471669 }, { "acc": 0.99726553, "epoch": 25.101500441306268, "grad_norm": 0.24166615307331085, "learning_rate": 5.372587299070701e-06, "loss": 0.0147981, "memory(GiB)": 15.03, "step": 14220, "train_speed(iter/s)": 1.47166 }, { "acc": 0.99868965, "epoch": 25.110326566637248, "grad_norm": 0.25432872772216797, "learning_rate": 5.3696743042239474e-06, "loss": 0.01372889, "memory(GiB)": 15.03, "step": 14225, "train_speed(iter/s)": 1.47169 }, { "acc": 0.99714775, "epoch": 25.119152691968225, "grad_norm": 1.5458481311798096, "learning_rate": 5.366761183353814e-06, "loss": 0.02479046, "memory(GiB)": 15.03, "step": 14230, "train_speed(iter/s)": 1.471688 }, { "acc": 0.99908295, "epoch": 25.127978817299205, "grad_norm": 3.8161580562591553, "learning_rate": 5.363847937454738e-06, "loss": 0.01526807, "memory(GiB)": 15.03, "step": 14235, "train_speed(iter/s)": 1.471683 }, { "acc": 0.99909439, "epoch": 25.136804942630185, "grad_norm": 2.275057315826416, "learning_rate": 5.360934567521204e-06, "loss": 0.00826162, "memory(GiB)": 15.03, "step": 14240, "train_speed(iter/s)": 1.471679 }, { "acc": 0.99777498, "epoch": 25.145631067961165, "grad_norm": 0.1950756311416626, "learning_rate": 5.358021074547732e-06, "loss": 0.01786174, "memory(GiB)": 15.03, "step": 14245, "train_speed(iter/s)": 1.471675 }, { "acc": 0.99612074, "epoch": 25.154457193292146, "grad_norm": 0.9992485642433167, "learning_rate": 5.355107459528893e-06, "loss": 0.03555511, "memory(GiB)": 15.03, "step": 14250, "train_speed(iter/s)": 1.471702 }, { "acc": 0.99895134, "epoch": 25.163283318623126, "grad_norm": 0.22445401549339294, "learning_rate": 5.352193723459288e-06, "loss": 0.00801513, "memory(GiB)": 15.03, "step": 14255, "train_speed(iter/s)": 1.471741 }, { "acc": 0.99970036, "epoch": 25.172109443954103, "grad_norm": 0.04073309898376465, "learning_rate": 5.349279867333573e-06, "loss": 0.00430888, "memory(GiB)": 15.03, "step": 14260, "train_speed(iter/s)": 1.47173 }, { "acc": 0.99864006, "epoch": 25.180935569285083, "grad_norm": 1.0233328342437744, "learning_rate": 5.346365892146436e-06, "loss": 0.0124907, "memory(GiB)": 15.03, "step": 14265, "train_speed(iter/s)": 1.471728 }, { "acc": 0.99786263, "epoch": 25.189761694616063, "grad_norm": 1.2442868947982788, "learning_rate": 5.343451798892604e-06, "loss": 0.01725879, "memory(GiB)": 15.03, "step": 14270, "train_speed(iter/s)": 1.471729 }, { "acc": 0.99972219, "epoch": 25.198587819947043, "grad_norm": 1.3414630889892578, "learning_rate": 5.3405375885668534e-06, "loss": 0.00771747, "memory(GiB)": 15.03, "step": 14275, "train_speed(iter/s)": 1.471745 }, { "acc": 0.99681778, "epoch": 25.207413945278024, "grad_norm": 1.5958000421524048, "learning_rate": 5.337623262163991e-06, "loss": 0.02133864, "memory(GiB)": 15.03, "step": 14280, "train_speed(iter/s)": 1.471736 }, { "acc": 0.99753504, "epoch": 25.216240070609004, "grad_norm": 1.8927786350250244, "learning_rate": 5.33470882067887e-06, "loss": 0.0166521, "memory(GiB)": 15.03, "step": 14285, "train_speed(iter/s)": 1.471717 }, { "acc": 0.99809875, "epoch": 25.22506619593998, "grad_norm": 0.4505767822265625, "learning_rate": 5.331794265106377e-06, "loss": 0.02639543, "memory(GiB)": 15.03, "step": 14290, "train_speed(iter/s)": 1.471703 }, { "acc": 0.99781437, "epoch": 25.23389232127096, "grad_norm": 0.810458779335022, "learning_rate": 5.328879596441446e-06, "loss": 0.02188745, "memory(GiB)": 15.03, "step": 14295, "train_speed(iter/s)": 1.471698 }, { "acc": 0.99780846, "epoch": 25.24271844660194, "grad_norm": 3.159101724624634, "learning_rate": 5.32596481567904e-06, "loss": 0.0242294, "memory(GiB)": 15.03, "step": 14300, "train_speed(iter/s)": 1.471677 }, { "acc": 0.99789858, "epoch": 25.25154457193292, "grad_norm": 1.3463515043258667, "learning_rate": 5.323049923814166e-06, "loss": 0.02235167, "memory(GiB)": 15.03, "step": 14305, "train_speed(iter/s)": 1.471697 }, { "acc": 0.99787035, "epoch": 25.2603706972639, "grad_norm": 0.07808446139097214, "learning_rate": 5.320134921841869e-06, "loss": 0.01918539, "memory(GiB)": 15.03, "step": 14310, "train_speed(iter/s)": 1.471702 }, { "acc": 0.9976903, "epoch": 25.269196822594882, "grad_norm": 2.6626546382904053, "learning_rate": 5.317219810757227e-06, "loss": 0.01905459, "memory(GiB)": 15.03, "step": 14315, "train_speed(iter/s)": 1.471694 }, { "acc": 0.99714336, "epoch": 25.278022947925862, "grad_norm": 3.9942753314971924, "learning_rate": 5.31430459155536e-06, "loss": 0.02555231, "memory(GiB)": 15.03, "step": 14320, "train_speed(iter/s)": 1.471683 }, { "acc": 0.99800739, "epoch": 25.28684907325684, "grad_norm": 1.6611123085021973, "learning_rate": 5.311389265231425e-06, "loss": 0.025895, "memory(GiB)": 15.03, "step": 14325, "train_speed(iter/s)": 1.471721 }, { "acc": 0.99927597, "epoch": 25.29567519858782, "grad_norm": 0.3263859152793884, "learning_rate": 5.308473832780612e-06, "loss": 0.01270479, "memory(GiB)": 15.03, "step": 14330, "train_speed(iter/s)": 1.471736 }, { "acc": 0.99692726, "epoch": 25.3045013239188, "grad_norm": 1.9855265617370605, "learning_rate": 5.305558295198151e-06, "loss": 0.02507469, "memory(GiB)": 15.03, "step": 14335, "train_speed(iter/s)": 1.471743 }, { "acc": 0.9983963, "epoch": 25.31332744924978, "grad_norm": 0.46601274609565735, "learning_rate": 5.302642653479301e-06, "loss": 0.01435768, "memory(GiB)": 15.03, "step": 14340, "train_speed(iter/s)": 1.471749 }, { "acc": 0.99581985, "epoch": 25.32215357458076, "grad_norm": 4.136277675628662, "learning_rate": 5.299726908619368e-06, "loss": 0.03102254, "memory(GiB)": 15.03, "step": 14345, "train_speed(iter/s)": 1.47176 }, { "acc": 0.99862881, "epoch": 25.33097969991174, "grad_norm": 0.46630388498306274, "learning_rate": 5.296811061613682e-06, "loss": 0.01451876, "memory(GiB)": 15.03, "step": 14350, "train_speed(iter/s)": 1.471787 }, { "acc": 0.99804039, "epoch": 25.339805825242717, "grad_norm": 0.9496313333511353, "learning_rate": 5.293895113457618e-06, "loss": 0.01790734, "memory(GiB)": 15.03, "step": 14355, "train_speed(iter/s)": 1.471797 }, { "acc": 0.9964365, "epoch": 25.348631950573697, "grad_norm": 1.6573795080184937, "learning_rate": 5.290979065146573e-06, "loss": 0.02714452, "memory(GiB)": 15.03, "step": 14360, "train_speed(iter/s)": 1.471829 }, { "acc": 0.99698582, "epoch": 25.357458075904677, "grad_norm": 0.6892523765563965, "learning_rate": 5.288062917675993e-06, "loss": 0.02083543, "memory(GiB)": 15.03, "step": 14365, "train_speed(iter/s)": 1.471846 }, { "acc": 0.99802084, "epoch": 25.366284201235658, "grad_norm": 0.8922975063323975, "learning_rate": 5.285146672041343e-06, "loss": 0.01841722, "memory(GiB)": 15.03, "step": 14370, "train_speed(iter/s)": 1.471845 }, { "acc": 0.99774485, "epoch": 25.375110326566638, "grad_norm": 0.9110752940177917, "learning_rate": 5.282230329238134e-06, "loss": 0.03157754, "memory(GiB)": 15.03, "step": 14375, "train_speed(iter/s)": 1.471848 }, { "acc": 0.99852638, "epoch": 25.38393645189762, "grad_norm": 2.5236709117889404, "learning_rate": 5.2793138902619015e-06, "loss": 0.00975816, "memory(GiB)": 15.03, "step": 14380, "train_speed(iter/s)": 1.471843 }, { "acc": 0.9991641, "epoch": 25.392762577228595, "grad_norm": 0.4211662709712982, "learning_rate": 5.27639735610822e-06, "loss": 0.00543728, "memory(GiB)": 15.03, "step": 14385, "train_speed(iter/s)": 1.471828 }, { "acc": 0.99678402, "epoch": 25.401588702559575, "grad_norm": 4.835851192474365, "learning_rate": 5.273480727772692e-06, "loss": 0.02313175, "memory(GiB)": 15.03, "step": 14390, "train_speed(iter/s)": 1.4718 }, { "acc": 0.99936867, "epoch": 25.410414827890556, "grad_norm": 0.3009636104106903, "learning_rate": 5.270564006250953e-06, "loss": 0.01454066, "memory(GiB)": 15.03, "step": 14395, "train_speed(iter/s)": 1.471806 }, { "acc": 0.99882736, "epoch": 25.419240953221536, "grad_norm": 0.41395696997642517, "learning_rate": 5.267647192538675e-06, "loss": 0.01209155, "memory(GiB)": 15.03, "step": 14400, "train_speed(iter/s)": 1.471819 }, { "acc": 0.99924889, "epoch": 25.428067078552516, "grad_norm": 0.9688644409179688, "learning_rate": 5.264730287631555e-06, "loss": 0.01117618, "memory(GiB)": 15.03, "step": 14405, "train_speed(iter/s)": 1.471813 }, { "acc": 0.99856567, "epoch": 25.436893203883496, "grad_norm": 0.08765949308872223, "learning_rate": 5.261813292525323e-06, "loss": 0.01145089, "memory(GiB)": 15.03, "step": 14410, "train_speed(iter/s)": 1.471815 }, { "acc": 0.99824123, "epoch": 25.445719329214477, "grad_norm": 0.8993898630142212, "learning_rate": 5.258896208215739e-06, "loss": 0.01738659, "memory(GiB)": 15.03, "step": 14415, "train_speed(iter/s)": 1.471816 }, { "acc": 0.9983429, "epoch": 25.454545454545453, "grad_norm": 0.7427573800086975, "learning_rate": 5.255979035698602e-06, "loss": 0.01363214, "memory(GiB)": 15.03, "step": 14420, "train_speed(iter/s)": 1.471812 }, { "acc": 0.99869413, "epoch": 25.463371579876434, "grad_norm": 0.10261514037847519, "learning_rate": 5.253061775969725e-06, "loss": 0.01384239, "memory(GiB)": 15.03, "step": 14425, "train_speed(iter/s)": 1.471823 }, { "acc": 0.998874, "epoch": 25.472197705207414, "grad_norm": 0.4070579409599304, "learning_rate": 5.250144430024968e-06, "loss": 0.0075264, "memory(GiB)": 15.03, "step": 14430, "train_speed(iter/s)": 1.471826 }, { "acc": 0.99782705, "epoch": 25.481023830538394, "grad_norm": 1.6234432458877563, "learning_rate": 5.247226998860209e-06, "loss": 0.0179346, "memory(GiB)": 15.03, "step": 14435, "train_speed(iter/s)": 1.471832 }, { "acc": 0.99859123, "epoch": 25.489849955869374, "grad_norm": 1.8680437803268433, "learning_rate": 5.244309483471358e-06, "loss": 0.01610436, "memory(GiB)": 15.03, "step": 14440, "train_speed(iter/s)": 1.471803 }, { "acc": 0.99679985, "epoch": 25.498676081200355, "grad_norm": 0.4807489514350891, "learning_rate": 5.241391884854352e-06, "loss": 0.02151855, "memory(GiB)": 15.03, "step": 14445, "train_speed(iter/s)": 1.471815 }, { "acc": 0.99904842, "epoch": 25.50750220653133, "grad_norm": 0.9180478453636169, "learning_rate": 5.238474204005164e-06, "loss": 0.01435358, "memory(GiB)": 15.03, "step": 14450, "train_speed(iter/s)": 1.471819 }, { "acc": 0.99753132, "epoch": 25.51632833186231, "grad_norm": 0.326351135969162, "learning_rate": 5.235556441919785e-06, "loss": 0.01936254, "memory(GiB)": 15.03, "step": 14455, "train_speed(iter/s)": 1.471817 }, { "acc": 0.99842358, "epoch": 25.525154457193292, "grad_norm": 0.40114378929138184, "learning_rate": 5.2326385995942415e-06, "loss": 0.02137786, "memory(GiB)": 15.03, "step": 14460, "train_speed(iter/s)": 1.471826 }, { "acc": 0.99659119, "epoch": 25.533980582524272, "grad_norm": 2.2306220531463623, "learning_rate": 5.22972067802458e-06, "loss": 0.01975627, "memory(GiB)": 15.03, "step": 14465, "train_speed(iter/s)": 1.471819 }, { "acc": 0.99726028, "epoch": 25.542806707855252, "grad_norm": 0.673640787601471, "learning_rate": 5.226802678206884e-06, "loss": 0.01878638, "memory(GiB)": 15.03, "step": 14470, "train_speed(iter/s)": 1.471809 }, { "acc": 0.99708824, "epoch": 25.551632833186233, "grad_norm": 1.3653488159179688, "learning_rate": 5.223884601137254e-06, "loss": 0.01904495, "memory(GiB)": 15.03, "step": 14475, "train_speed(iter/s)": 1.471812 }, { "acc": 0.99792309, "epoch": 25.56045895851721, "grad_norm": 1.9171497821807861, "learning_rate": 5.220966447811823e-06, "loss": 0.01844565, "memory(GiB)": 15.03, "step": 14480, "train_speed(iter/s)": 1.47182 }, { "acc": 0.99811459, "epoch": 25.56928508384819, "grad_norm": 1.1129990816116333, "learning_rate": 5.218048219226747e-06, "loss": 0.01043348, "memory(GiB)": 15.03, "step": 14485, "train_speed(iter/s)": 1.471833 }, { "acc": 0.99754686, "epoch": 25.57811120917917, "grad_norm": 2.4348554611206055, "learning_rate": 5.215129916378208e-06, "loss": 0.02192967, "memory(GiB)": 15.03, "step": 14490, "train_speed(iter/s)": 1.471841 }, { "acc": 0.9980711, "epoch": 25.58693733451015, "grad_norm": 0.48093172907829285, "learning_rate": 5.212211540262415e-06, "loss": 0.01989989, "memory(GiB)": 15.03, "step": 14495, "train_speed(iter/s)": 1.471857 }, { "acc": 0.9982192, "epoch": 25.59576345984113, "grad_norm": 2.0216352939605713, "learning_rate": 5.2092930918756e-06, "loss": 0.0182971, "memory(GiB)": 15.03, "step": 14500, "train_speed(iter/s)": 1.471869 }, { "acc": 0.99733887, "epoch": 25.60458958517211, "grad_norm": 0.9366548657417297, "learning_rate": 5.206374572214023e-06, "loss": 0.01972558, "memory(GiB)": 15.03, "step": 14505, "train_speed(iter/s)": 1.471869 }, { "acc": 0.9992857, "epoch": 25.61341571050309, "grad_norm": 1.376905083656311, "learning_rate": 5.203455982273962e-06, "loss": 0.01502272, "memory(GiB)": 15.03, "step": 14510, "train_speed(iter/s)": 1.471878 }, { "acc": 0.99903469, "epoch": 25.622241835834068, "grad_norm": 1.7611631155014038, "learning_rate": 5.200537323051729e-06, "loss": 0.01185557, "memory(GiB)": 15.03, "step": 14515, "train_speed(iter/s)": 1.471875 }, { "acc": 0.99811163, "epoch": 25.631067961165048, "grad_norm": 4.616240978240967, "learning_rate": 5.197618595543647e-06, "loss": 0.02081265, "memory(GiB)": 15.03, "step": 14520, "train_speed(iter/s)": 1.471868 }, { "acc": 0.99788837, "epoch": 25.639894086496028, "grad_norm": 2.0206921100616455, "learning_rate": 5.194699800746072e-06, "loss": 0.02190639, "memory(GiB)": 15.03, "step": 14525, "train_speed(iter/s)": 1.471852 }, { "acc": 0.99578047, "epoch": 25.64872021182701, "grad_norm": 0.3374409079551697, "learning_rate": 5.191780939655382e-06, "loss": 0.02640417, "memory(GiB)": 15.03, "step": 14530, "train_speed(iter/s)": 1.471845 }, { "acc": 0.99802895, "epoch": 25.65754633715799, "grad_norm": 0.36572036147117615, "learning_rate": 5.188862013267972e-06, "loss": 0.01182254, "memory(GiB)": 15.03, "step": 14535, "train_speed(iter/s)": 1.471845 }, { "acc": 0.99749851, "epoch": 25.66637246248897, "grad_norm": 0.7104806303977966, "learning_rate": 5.185943022580264e-06, "loss": 0.0240979, "memory(GiB)": 15.03, "step": 14540, "train_speed(iter/s)": 1.471881 }, { "acc": 0.99913244, "epoch": 25.675198587819946, "grad_norm": 0.8649794459342957, "learning_rate": 5.183023968588702e-06, "loss": 0.00793866, "memory(GiB)": 15.03, "step": 14545, "train_speed(iter/s)": 1.471866 }, { "acc": 0.99705238, "epoch": 25.684024713150926, "grad_norm": 5.699592590332031, "learning_rate": 5.180104852289749e-06, "loss": 0.02042513, "memory(GiB)": 15.03, "step": 14550, "train_speed(iter/s)": 1.47188 }, { "acc": 0.99735527, "epoch": 25.692850838481906, "grad_norm": 0.3010511100292206, "learning_rate": 5.177185674679892e-06, "loss": 0.02793197, "memory(GiB)": 15.03, "step": 14555, "train_speed(iter/s)": 1.471885 }, { "acc": 0.99940729, "epoch": 25.701676963812886, "grad_norm": 0.4989984631538391, "learning_rate": 5.174266436755635e-06, "loss": 0.00798907, "memory(GiB)": 15.03, "step": 14560, "train_speed(iter/s)": 1.471875 }, { "acc": 0.99897432, "epoch": 25.710503089143867, "grad_norm": 1.921071171760559, "learning_rate": 5.1713471395135094e-06, "loss": 0.01292372, "memory(GiB)": 15.03, "step": 14565, "train_speed(iter/s)": 1.471892 }, { "acc": 0.99701567, "epoch": 25.719329214474847, "grad_norm": 2.275639295578003, "learning_rate": 5.168427783950058e-06, "loss": 0.01722991, "memory(GiB)": 15.03, "step": 14570, "train_speed(iter/s)": 1.471906 }, { "acc": 0.99844875, "epoch": 25.728155339805824, "grad_norm": 0.9076905250549316, "learning_rate": 5.165508371061853e-06, "loss": 0.01682808, "memory(GiB)": 15.03, "step": 14575, "train_speed(iter/s)": 1.471919 }, { "acc": 0.99899273, "epoch": 25.736981465136804, "grad_norm": 0.8057174682617188, "learning_rate": 5.162588901845476e-06, "loss": 0.01727598, "memory(GiB)": 15.03, "step": 14580, "train_speed(iter/s)": 1.471933 }, { "acc": 0.99855995, "epoch": 25.745807590467784, "grad_norm": 0.45821917057037354, "learning_rate": 5.159669377297537e-06, "loss": 0.00872893, "memory(GiB)": 15.03, "step": 14585, "train_speed(iter/s)": 1.471928 }, { "acc": 0.99858589, "epoch": 25.754633715798764, "grad_norm": 0.7026869058609009, "learning_rate": 5.156749798414658e-06, "loss": 0.01614003, "memory(GiB)": 15.03, "step": 14590, "train_speed(iter/s)": 1.471941 }, { "acc": 0.99906664, "epoch": 25.763459841129745, "grad_norm": 1.1820560693740845, "learning_rate": 5.153830166193485e-06, "loss": 0.00901518, "memory(GiB)": 15.03, "step": 14595, "train_speed(iter/s)": 1.471954 }, { "acc": 0.99898148, "epoch": 25.772285966460725, "grad_norm": 0.6446002721786499, "learning_rate": 5.15091048163068e-06, "loss": 0.01329349, "memory(GiB)": 15.03, "step": 14600, "train_speed(iter/s)": 1.471975 }, { "acc": 0.99825859, "epoch": 25.781112091791705, "grad_norm": 0.31684139370918274, "learning_rate": 5.147990745722921e-06, "loss": 0.01571763, "memory(GiB)": 15.03, "step": 14605, "train_speed(iter/s)": 1.472 }, { "acc": 0.99810371, "epoch": 25.789938217122682, "grad_norm": 1.5076161623001099, "learning_rate": 5.145070959466908e-06, "loss": 0.01546865, "memory(GiB)": 15.03, "step": 14610, "train_speed(iter/s)": 1.471992 }, { "acc": 0.99722195, "epoch": 25.798764342453662, "grad_norm": 2.3936831951141357, "learning_rate": 5.142151123859352e-06, "loss": 0.02691727, "memory(GiB)": 15.03, "step": 14615, "train_speed(iter/s)": 1.472009 }, { "acc": 0.99675341, "epoch": 25.807590467784642, "grad_norm": 3.9804701805114746, "learning_rate": 5.139231239896986e-06, "loss": 0.03146849, "memory(GiB)": 15.03, "step": 14620, "train_speed(iter/s)": 1.472032 }, { "acc": 0.99731894, "epoch": 25.816416593115623, "grad_norm": 1.6902120113372803, "learning_rate": 5.136311308576555e-06, "loss": 0.02558983, "memory(GiB)": 15.03, "step": 14625, "train_speed(iter/s)": 1.472046 }, { "acc": 0.99795475, "epoch": 25.825242718446603, "grad_norm": 2.2463557720184326, "learning_rate": 5.133391330894829e-06, "loss": 0.0140237, "memory(GiB)": 15.03, "step": 14630, "train_speed(iter/s)": 1.472063 }, { "acc": 0.99584131, "epoch": 25.834068843777583, "grad_norm": 0.9588068723678589, "learning_rate": 5.130471307848581e-06, "loss": 0.02519169, "memory(GiB)": 15.03, "step": 14635, "train_speed(iter/s)": 1.472076 }, { "acc": 0.99869251, "epoch": 25.84289496910856, "grad_norm": 2.7529799938201904, "learning_rate": 5.127551240434611e-06, "loss": 0.01262374, "memory(GiB)": 15.03, "step": 14640, "train_speed(iter/s)": 1.472088 }, { "acc": 0.99950447, "epoch": 25.85172109443954, "grad_norm": 1.6707799434661865, "learning_rate": 5.124631129649727e-06, "loss": 0.00790476, "memory(GiB)": 15.03, "step": 14645, "train_speed(iter/s)": 1.47209 }, { "acc": 0.9976593, "epoch": 25.86054721977052, "grad_norm": 3.385000228881836, "learning_rate": 5.121710976490754e-06, "loss": 0.02315534, "memory(GiB)": 15.03, "step": 14650, "train_speed(iter/s)": 1.472082 }, { "acc": 0.99897308, "epoch": 25.8693733451015, "grad_norm": 0.4413639307022095, "learning_rate": 5.118790781954534e-06, "loss": 0.01266468, "memory(GiB)": 15.03, "step": 14655, "train_speed(iter/s)": 1.472107 }, { "acc": 0.99842739, "epoch": 25.87819947043248, "grad_norm": 2.2875330448150635, "learning_rate": 5.115870547037919e-06, "loss": 0.01888636, "memory(GiB)": 15.03, "step": 14660, "train_speed(iter/s)": 1.472129 }, { "acc": 0.99828987, "epoch": 25.88702559576346, "grad_norm": 1.153674840927124, "learning_rate": 5.112950272737774e-06, "loss": 0.01422951, "memory(GiB)": 15.03, "step": 14665, "train_speed(iter/s)": 1.472141 }, { "acc": 0.99617863, "epoch": 25.895851721094438, "grad_norm": 2.844456195831299, "learning_rate": 5.110029960050985e-06, "loss": 0.02670752, "memory(GiB)": 15.03, "step": 14670, "train_speed(iter/s)": 1.472153 }, { "acc": 0.9977747, "epoch": 25.904677846425418, "grad_norm": 2.0607056617736816, "learning_rate": 5.107109609974441e-06, "loss": 0.01596329, "memory(GiB)": 15.03, "step": 14675, "train_speed(iter/s)": 1.472149 }, { "acc": 0.99815521, "epoch": 25.9135039717564, "grad_norm": 1.2878937721252441, "learning_rate": 5.1041892235050525e-06, "loss": 0.01603893, "memory(GiB)": 15.03, "step": 14680, "train_speed(iter/s)": 1.47213 }, { "acc": 0.99709387, "epoch": 25.92233009708738, "grad_norm": 1.14156973361969, "learning_rate": 5.101268801639738e-06, "loss": 0.02346681, "memory(GiB)": 15.03, "step": 14685, "train_speed(iter/s)": 1.472133 }, { "acc": 0.99881573, "epoch": 25.93115622241836, "grad_norm": 1.1692078113555908, "learning_rate": 5.09834834537543e-06, "loss": 0.00929488, "memory(GiB)": 15.03, "step": 14690, "train_speed(iter/s)": 1.47212 }, { "acc": 0.99847851, "epoch": 25.93998234774934, "grad_norm": 1.1223139762878418, "learning_rate": 5.095427855709068e-06, "loss": 0.01277819, "memory(GiB)": 15.03, "step": 14695, "train_speed(iter/s)": 1.472124 }, { "acc": 0.99796543, "epoch": 25.94880847308032, "grad_norm": 4.228378772735596, "learning_rate": 5.092507333637609e-06, "loss": 0.01936389, "memory(GiB)": 15.03, "step": 14700, "train_speed(iter/s)": 1.47213 }, { "acc": 0.99882088, "epoch": 25.957634598411296, "grad_norm": 1.9596304893493652, "learning_rate": 5.089586780158019e-06, "loss": 0.02040281, "memory(GiB)": 15.03, "step": 14705, "train_speed(iter/s)": 1.472142 }, { "acc": 0.99775639, "epoch": 25.966460723742276, "grad_norm": 2.308884382247925, "learning_rate": 5.086666196267272e-06, "loss": 0.01722652, "memory(GiB)": 15.03, "step": 14710, "train_speed(iter/s)": 1.472137 }, { "acc": 0.99754353, "epoch": 25.975286849073257, "grad_norm": 3.048579454421997, "learning_rate": 5.083745582962357e-06, "loss": 0.02112283, "memory(GiB)": 15.03, "step": 14715, "train_speed(iter/s)": 1.472148 }, { "acc": 0.99677792, "epoch": 25.984112974404237, "grad_norm": 0.6625285148620605, "learning_rate": 5.08082494124027e-06, "loss": 0.02511514, "memory(GiB)": 15.03, "step": 14720, "train_speed(iter/s)": 1.472151 }, { "acc": 0.99946899, "epoch": 25.992939099735217, "grad_norm": 0.5394542217254639, "learning_rate": 5.077904272098017e-06, "loss": 0.01480581, "memory(GiB)": 15.03, "step": 14725, "train_speed(iter/s)": 1.472161 }, { "acc": 0.99793453, "epoch": 26.001765225066197, "grad_norm": 0.6259333491325378, "learning_rate": 5.074983576532614e-06, "loss": 0.0130883, "memory(GiB)": 15.03, "step": 14730, "train_speed(iter/s)": 1.472083 }, { "acc": 0.99776392, "epoch": 26.010591350397174, "grad_norm": 0.7246370315551758, "learning_rate": 5.072062855541087e-06, "loss": 0.02801254, "memory(GiB)": 15.03, "step": 14735, "train_speed(iter/s)": 1.472106 }, { "acc": 0.9987751, "epoch": 26.019417475728154, "grad_norm": 1.2366784811019897, "learning_rate": 5.069142110120467e-06, "loss": 0.01162338, "memory(GiB)": 15.03, "step": 14740, "train_speed(iter/s)": 1.472107 }, { "acc": 0.99799585, "epoch": 26.028243601059135, "grad_norm": 2.2994792461395264, "learning_rate": 5.066221341267796e-06, "loss": 0.01494629, "memory(GiB)": 15.03, "step": 14745, "train_speed(iter/s)": 1.472093 }, { "acc": 0.99876184, "epoch": 26.037069726390115, "grad_norm": 0.7126723527908325, "learning_rate": 5.063300549980124e-06, "loss": 0.01129669, "memory(GiB)": 15.03, "step": 14750, "train_speed(iter/s)": 1.472107 }, { "acc": 0.999016, "epoch": 26.045895851721095, "grad_norm": 1.1050403118133545, "learning_rate": 5.060379737254513e-06, "loss": 0.01781835, "memory(GiB)": 15.03, "step": 14755, "train_speed(iter/s)": 1.472117 }, { "acc": 0.99773693, "epoch": 26.054721977052075, "grad_norm": 1.63379967212677, "learning_rate": 5.0574589040880206e-06, "loss": 0.02014797, "memory(GiB)": 15.03, "step": 14760, "train_speed(iter/s)": 1.472143 }, { "acc": 0.99852629, "epoch": 26.063548102383052, "grad_norm": 1.0575259923934937, "learning_rate": 5.054538051477725e-06, "loss": 0.00967941, "memory(GiB)": 15.03, "step": 14765, "train_speed(iter/s)": 1.472175 }, { "acc": 0.99858952, "epoch": 26.072374227714032, "grad_norm": 0.38848674297332764, "learning_rate": 5.0516171804206985e-06, "loss": 0.01472813, "memory(GiB)": 15.03, "step": 14770, "train_speed(iter/s)": 1.472158 }, { "acc": 0.99747066, "epoch": 26.081200353045013, "grad_norm": 4.260964870452881, "learning_rate": 5.048696291914031e-06, "loss": 0.02610105, "memory(GiB)": 15.03, "step": 14775, "train_speed(iter/s)": 1.472165 }, { "acc": 0.9989872, "epoch": 26.090026478375993, "grad_norm": 0.4630887806415558, "learning_rate": 5.045775386954809e-06, "loss": 0.00896683, "memory(GiB)": 15.03, "step": 14780, "train_speed(iter/s)": 1.472157 }, { "acc": 0.99850311, "epoch": 26.098852603706973, "grad_norm": 2.852480888366699, "learning_rate": 5.042854466540133e-06, "loss": 0.01241532, "memory(GiB)": 15.03, "step": 14785, "train_speed(iter/s)": 1.472165 }, { "acc": 0.99779606, "epoch": 26.107678729037954, "grad_norm": 0.556108295917511, "learning_rate": 5.039933531667101e-06, "loss": 0.01277273, "memory(GiB)": 15.03, "step": 14790, "train_speed(iter/s)": 1.472181 }, { "acc": 0.99598312, "epoch": 26.116504854368934, "grad_norm": 1.9465385675430298, "learning_rate": 5.037012583332821e-06, "loss": 0.02616531, "memory(GiB)": 15.03, "step": 14795, "train_speed(iter/s)": 1.472189 }, { "acc": 0.99855022, "epoch": 26.12533097969991, "grad_norm": 1.821815848350525, "learning_rate": 5.0340916225344025e-06, "loss": 0.0234304, "memory(GiB)": 15.03, "step": 14800, "train_speed(iter/s)": 1.472195 }, { "acc": 0.99854431, "epoch": 26.13415710503089, "grad_norm": 4.3652215003967285, "learning_rate": 5.031170650268964e-06, "loss": 0.01108631, "memory(GiB)": 15.03, "step": 14805, "train_speed(iter/s)": 1.472214 }, { "acc": 0.9988184, "epoch": 26.14298323036187, "grad_norm": 1.4830458164215088, "learning_rate": 5.028249667533621e-06, "loss": 0.00874462, "memory(GiB)": 15.03, "step": 14810, "train_speed(iter/s)": 1.472186 }, { "acc": 0.99861116, "epoch": 26.15180935569285, "grad_norm": 0.6599021553993225, "learning_rate": 5.0253286753254994e-06, "loss": 0.01653768, "memory(GiB)": 15.03, "step": 14815, "train_speed(iter/s)": 1.472194 }, { "acc": 0.99808817, "epoch": 26.16063548102383, "grad_norm": 5.145630359649658, "learning_rate": 5.022407674641724e-06, "loss": 0.01564245, "memory(GiB)": 15.03, "step": 14820, "train_speed(iter/s)": 1.47219 }, { "acc": 0.99832945, "epoch": 26.169461606354812, "grad_norm": 5.2988386154174805, "learning_rate": 5.019486666479424e-06, "loss": 0.0147928, "memory(GiB)": 15.03, "step": 14825, "train_speed(iter/s)": 1.472181 }, { "acc": 0.998913, "epoch": 26.17828773168579, "grad_norm": 2.2449705600738525, "learning_rate": 5.0165656518357305e-06, "loss": 0.01431201, "memory(GiB)": 15.03, "step": 14830, "train_speed(iter/s)": 1.472197 }, { "acc": 0.99745655, "epoch": 26.18711385701677, "grad_norm": 0.34193548560142517, "learning_rate": 5.013644631707776e-06, "loss": 0.01866721, "memory(GiB)": 15.03, "step": 14835, "train_speed(iter/s)": 1.472206 }, { "acc": 0.99818592, "epoch": 26.19593998234775, "grad_norm": 1.1126083135604858, "learning_rate": 5.0107236070926975e-06, "loss": 0.01760177, "memory(GiB)": 15.03, "step": 14840, "train_speed(iter/s)": 1.472208 }, { "acc": 0.99840679, "epoch": 26.20476610767873, "grad_norm": 0.39012259244918823, "learning_rate": 5.007802578987633e-06, "loss": 0.01094352, "memory(GiB)": 15.03, "step": 14845, "train_speed(iter/s)": 1.472217 }, { "acc": 0.99709539, "epoch": 26.21359223300971, "grad_norm": 1.2398250102996826, "learning_rate": 5.00488154838972e-06, "loss": 0.02823944, "memory(GiB)": 15.03, "step": 14850, "train_speed(iter/s)": 1.472197 }, { "acc": 0.99939728, "epoch": 26.22241835834069, "grad_norm": 1.8490314483642578, "learning_rate": 5.001960516296097e-06, "loss": 0.01441697, "memory(GiB)": 15.03, "step": 14855, "train_speed(iter/s)": 1.472218 }, { "acc": 0.99718122, "epoch": 26.231244483671667, "grad_norm": 0.3233621418476105, "learning_rate": 4.999039483703905e-06, "loss": 0.02130922, "memory(GiB)": 15.03, "step": 14860, "train_speed(iter/s)": 1.472237 }, { "acc": 0.99778805, "epoch": 26.240070609002647, "grad_norm": 2.8600387573242188, "learning_rate": 4.996118451610281e-06, "loss": 0.01510334, "memory(GiB)": 15.03, "step": 14865, "train_speed(iter/s)": 1.472258 }, { "acc": 0.99816265, "epoch": 26.248896734333627, "grad_norm": 0.8902212977409363, "learning_rate": 4.993197421012368e-06, "loss": 0.0181149, "memory(GiB)": 15.03, "step": 14870, "train_speed(iter/s)": 1.472288 }, { "acc": 0.99784241, "epoch": 26.257722859664607, "grad_norm": 2.5287628173828125, "learning_rate": 4.990276392907302e-06, "loss": 0.01736676, "memory(GiB)": 15.03, "step": 14875, "train_speed(iter/s)": 1.472295 }, { "acc": 0.9970747, "epoch": 26.266548984995588, "grad_norm": 0.29163169860839844, "learning_rate": 4.987355368292225e-06, "loss": 0.01904415, "memory(GiB)": 15.03, "step": 14880, "train_speed(iter/s)": 1.472313 }, { "acc": 0.99872284, "epoch": 26.275375110326568, "grad_norm": 0.6056236028671265, "learning_rate": 4.984434348164272e-06, "loss": 0.00870846, "memory(GiB)": 15.03, "step": 14885, "train_speed(iter/s)": 1.472318 }, { "acc": 1.0, "epoch": 26.284201235657548, "grad_norm": 0.475204735994339, "learning_rate": 4.9815133335205785e-06, "loss": 0.00552207, "memory(GiB)": 15.03, "step": 14890, "train_speed(iter/s)": 1.472331 }, { "acc": 0.99717073, "epoch": 26.293027360988525, "grad_norm": 0.0861678421497345, "learning_rate": 4.978592325358277e-06, "loss": 0.02234814, "memory(GiB)": 15.03, "step": 14895, "train_speed(iter/s)": 1.472343 }, { "acc": 0.99952812, "epoch": 26.301853486319505, "grad_norm": 0.7527661919593811, "learning_rate": 4.975671324674502e-06, "loss": 0.0085631, "memory(GiB)": 15.03, "step": 14900, "train_speed(iter/s)": 1.472361 }, { "acc": 0.99736481, "epoch": 26.310679611650485, "grad_norm": 0.9752123951911926, "learning_rate": 4.9727503324663805e-06, "loss": 0.0197441, "memory(GiB)": 15.03, "step": 14905, "train_speed(iter/s)": 1.472383 }, { "acc": 0.99846878, "epoch": 26.319505736981466, "grad_norm": 1.8391079902648926, "learning_rate": 4.9698293497310385e-06, "loss": 0.01187245, "memory(GiB)": 15.03, "step": 14910, "train_speed(iter/s)": 1.472392 }, { "acc": 0.99905977, "epoch": 26.328331862312446, "grad_norm": 1.8861150741577148, "learning_rate": 4.966908377465599e-06, "loss": 0.00979199, "memory(GiB)": 15.03, "step": 14915, "train_speed(iter/s)": 1.472406 }, { "acc": 0.99735241, "epoch": 26.337157987643426, "grad_norm": 6.280943393707275, "learning_rate": 4.963987416667183e-06, "loss": 0.01819336, "memory(GiB)": 15.03, "step": 14920, "train_speed(iter/s)": 1.472432 }, { "acc": 0.99981613, "epoch": 26.345984112974403, "grad_norm": 0.4702857434749603, "learning_rate": 4.961066468332901e-06, "loss": 0.01180596, "memory(GiB)": 15.03, "step": 14925, "train_speed(iter/s)": 1.472431 }, { "acc": 0.99535856, "epoch": 26.354810238305383, "grad_norm": 2.1760830879211426, "learning_rate": 4.9581455334598685e-06, "loss": 0.0266276, "memory(GiB)": 15.03, "step": 14930, "train_speed(iter/s)": 1.472426 }, { "acc": 0.99772377, "epoch": 26.363636363636363, "grad_norm": 0.5575456619262695, "learning_rate": 4.9552246130451916e-06, "loss": 0.01825286, "memory(GiB)": 15.03, "step": 14935, "train_speed(iter/s)": 1.47244 }, { "acc": 0.99856491, "epoch": 26.372462488967344, "grad_norm": 0.43176698684692383, "learning_rate": 4.952303708085971e-06, "loss": 0.01611756, "memory(GiB)": 15.03, "step": 14940, "train_speed(iter/s)": 1.472436 }, { "acc": 0.99987869, "epoch": 26.381288614298324, "grad_norm": 1.1523240804672241, "learning_rate": 4.949382819579303e-06, "loss": 0.00251687, "memory(GiB)": 15.03, "step": 14945, "train_speed(iter/s)": 1.472447 }, { "acc": 0.99794111, "epoch": 26.390114739629304, "grad_norm": 1.2351446151733398, "learning_rate": 4.946461948522277e-06, "loss": 0.02873668, "memory(GiB)": 15.03, "step": 14950, "train_speed(iter/s)": 1.472452 }, { "acc": 0.99888592, "epoch": 26.39894086496028, "grad_norm": 5.05573844909668, "learning_rate": 4.943541095911981e-06, "loss": 0.01296311, "memory(GiB)": 15.03, "step": 14955, "train_speed(iter/s)": 1.472472 }, { "acc": 0.99780798, "epoch": 26.40776699029126, "grad_norm": 1.9186488389968872, "learning_rate": 4.940620262745488e-06, "loss": 0.02540637, "memory(GiB)": 15.03, "step": 14960, "train_speed(iter/s)": 1.472454 }, { "acc": 0.99804821, "epoch": 26.41659311562224, "grad_norm": 0.6315779089927673, "learning_rate": 4.937699450019875e-06, "loss": 0.01641231, "memory(GiB)": 15.03, "step": 14965, "train_speed(iter/s)": 1.472477 }, { "acc": 0.99927073, "epoch": 26.42541924095322, "grad_norm": 0.3652247190475464, "learning_rate": 4.934778658732205e-06, "loss": 0.00886083, "memory(GiB)": 15.03, "step": 14970, "train_speed(iter/s)": 1.472495 }, { "acc": 0.99894047, "epoch": 26.434245366284202, "grad_norm": 0.37199175357818604, "learning_rate": 4.931857889879535e-06, "loss": 0.00903277, "memory(GiB)": 15.03, "step": 14975, "train_speed(iter/s)": 1.472491 }, { "acc": 0.99713535, "epoch": 26.443071491615182, "grad_norm": 1.4759061336517334, "learning_rate": 4.928937144458915e-06, "loss": 0.02556006, "memory(GiB)": 15.03, "step": 14980, "train_speed(iter/s)": 1.472487 }, { "acc": 0.99847498, "epoch": 26.451897616946162, "grad_norm": 1.3749593496322632, "learning_rate": 4.926016423467388e-06, "loss": 0.0130153, "memory(GiB)": 15.03, "step": 14985, "train_speed(iter/s)": 1.472472 }, { "acc": 0.9981493, "epoch": 26.46072374227714, "grad_norm": 2.5794198513031006, "learning_rate": 4.923095727901984e-06, "loss": 0.00782074, "memory(GiB)": 15.03, "step": 14990, "train_speed(iter/s)": 1.472457 }, { "acc": 0.99831858, "epoch": 26.46954986760812, "grad_norm": 0.8063109517097473, "learning_rate": 4.920175058759732e-06, "loss": 0.01277789, "memory(GiB)": 15.03, "step": 14995, "train_speed(iter/s)": 1.472481 }, { "acc": 0.99924917, "epoch": 26.4783759929391, "grad_norm": 0.2634279727935791, "learning_rate": 4.917254417037644e-06, "loss": 0.01109067, "memory(GiB)": 15.03, "step": 15000, "train_speed(iter/s)": 1.472482 }, { "acc": 0.99840555, "epoch": 26.48720211827008, "grad_norm": 0.5507003664970398, "learning_rate": 4.914333803732729e-06, "loss": 0.0139255, "memory(GiB)": 15.03, "step": 15005, "train_speed(iter/s)": 1.472512 }, { "acc": 0.99905605, "epoch": 26.49602824360106, "grad_norm": 2.03314471244812, "learning_rate": 4.911413219841982e-06, "loss": 0.00829774, "memory(GiB)": 15.03, "step": 15010, "train_speed(iter/s)": 1.472506 }, { "acc": 0.99843578, "epoch": 26.50485436893204, "grad_norm": 0.7054815888404846, "learning_rate": 4.908492666362393e-06, "loss": 0.01613079, "memory(GiB)": 15.03, "step": 15015, "train_speed(iter/s)": 1.472505 }, { "acc": 0.99738293, "epoch": 26.513680494263017, "grad_norm": 2.29378604888916, "learning_rate": 4.905572144290933e-06, "loss": 0.00993295, "memory(GiB)": 15.03, "step": 15020, "train_speed(iter/s)": 1.472497 }, { "acc": 0.99797783, "epoch": 26.522506619593997, "grad_norm": 0.9350422024726868, "learning_rate": 4.902651654624573e-06, "loss": 0.01133786, "memory(GiB)": 15.03, "step": 15025, "train_speed(iter/s)": 1.472523 }, { "acc": 0.9979557, "epoch": 26.531332744924978, "grad_norm": 0.4159752428531647, "learning_rate": 4.899731198360263e-06, "loss": 0.02430324, "memory(GiB)": 15.03, "step": 15030, "train_speed(iter/s)": 1.472525 }, { "acc": 0.99827232, "epoch": 26.540158870255958, "grad_norm": 0.9369404911994934, "learning_rate": 4.896810776494948e-06, "loss": 0.01864146, "memory(GiB)": 15.03, "step": 15035, "train_speed(iter/s)": 1.47255 }, { "acc": 0.99697075, "epoch": 26.548984995586938, "grad_norm": 0.4689289629459381, "learning_rate": 4.893890390025561e-06, "loss": 0.02078296, "memory(GiB)": 15.03, "step": 15040, "train_speed(iter/s)": 1.472553 }, { "acc": 0.99763231, "epoch": 26.55781112091792, "grad_norm": 1.161107063293457, "learning_rate": 4.890970039949017e-06, "loss": 0.02126895, "memory(GiB)": 15.03, "step": 15045, "train_speed(iter/s)": 1.472543 }, { "acc": 0.99887438, "epoch": 26.566637246248895, "grad_norm": 0.10710795223712921, "learning_rate": 4.888049727262228e-06, "loss": 0.01241445, "memory(GiB)": 15.03, "step": 15050, "train_speed(iter/s)": 1.472532 }, { "acc": 0.99899893, "epoch": 26.575463371579875, "grad_norm": 1.0375208854675293, "learning_rate": 4.8851294529620835e-06, "loss": 0.01432197, "memory(GiB)": 15.03, "step": 15055, "train_speed(iter/s)": 1.472512 }, { "acc": 0.99861584, "epoch": 26.584289496910856, "grad_norm": 0.38438931107521057, "learning_rate": 4.8822092180454685e-06, "loss": 0.01929628, "memory(GiB)": 15.03, "step": 15060, "train_speed(iter/s)": 1.472527 }, { "acc": 0.99825344, "epoch": 26.593115622241836, "grad_norm": 2.709287166595459, "learning_rate": 4.879289023509246e-06, "loss": 0.03394618, "memory(GiB)": 15.03, "step": 15065, "train_speed(iter/s)": 1.472522 }, { "acc": 0.9935812, "epoch": 26.601941747572816, "grad_norm": 1.5367662906646729, "learning_rate": 4.876368870350275e-06, "loss": 0.05060331, "memory(GiB)": 15.03, "step": 15070, "train_speed(iter/s)": 1.472527 }, { "acc": 0.9964138, "epoch": 26.610767872903796, "grad_norm": 3.1703295707702637, "learning_rate": 4.87344875956539e-06, "loss": 0.02426716, "memory(GiB)": 15.03, "step": 15075, "train_speed(iter/s)": 1.472528 }, { "acc": 0.99895, "epoch": 26.619593998234777, "grad_norm": 0.9182618260383606, "learning_rate": 4.87052869215142e-06, "loss": 0.01334883, "memory(GiB)": 15.03, "step": 15080, "train_speed(iter/s)": 1.472539 }, { "acc": 0.99865885, "epoch": 26.628420123565753, "grad_norm": 0.11233992129564285, "learning_rate": 4.8676086691051725e-06, "loss": 0.02242466, "memory(GiB)": 15.03, "step": 15085, "train_speed(iter/s)": 1.472551 }, { "acc": 0.99678726, "epoch": 26.637246248896734, "grad_norm": 0.2627080976963043, "learning_rate": 4.864688691423445e-06, "loss": 0.02859911, "memory(GiB)": 15.03, "step": 15090, "train_speed(iter/s)": 1.472562 }, { "acc": 0.99856949, "epoch": 26.646072374227714, "grad_norm": 0.7188271880149841, "learning_rate": 4.861768760103015e-06, "loss": 0.01570342, "memory(GiB)": 15.03, "step": 15095, "train_speed(iter/s)": 1.472548 }, { "acc": 0.9992857, "epoch": 26.654898499558694, "grad_norm": 1.4709105491638184, "learning_rate": 4.85884887614065e-06, "loss": 0.01174985, "memory(GiB)": 15.03, "step": 15100, "train_speed(iter/s)": 1.472561 }, { "acc": 0.9991951, "epoch": 26.663724624889674, "grad_norm": 0.4457523822784424, "learning_rate": 4.855929040533094e-06, "loss": 0.01156629, "memory(GiB)": 15.03, "step": 15105, "train_speed(iter/s)": 1.472544 }, { "acc": 0.99837761, "epoch": 26.672550750220655, "grad_norm": 3.4910149574279785, "learning_rate": 4.85300925427708e-06, "loss": 0.01068892, "memory(GiB)": 15.03, "step": 15110, "train_speed(iter/s)": 1.472548 }, { "acc": 0.99958668, "epoch": 26.68137687555163, "grad_norm": 2.2104008197784424, "learning_rate": 4.8500895183693214e-06, "loss": 0.00384494, "memory(GiB)": 15.03, "step": 15115, "train_speed(iter/s)": 1.472515 }, { "acc": 0.99778786, "epoch": 26.69020300088261, "grad_norm": 0.9901514053344727, "learning_rate": 4.847169833806517e-06, "loss": 0.02047781, "memory(GiB)": 15.03, "step": 15120, "train_speed(iter/s)": 1.472521 }, { "acc": 0.99721127, "epoch": 26.699029126213592, "grad_norm": 1.520809531211853, "learning_rate": 4.844250201585344e-06, "loss": 0.02176121, "memory(GiB)": 15.03, "step": 15125, "train_speed(iter/s)": 1.472516 }, { "acc": 0.99873524, "epoch": 26.707855251544572, "grad_norm": 3.4069759845733643, "learning_rate": 4.841330622702467e-06, "loss": 0.01102518, "memory(GiB)": 15.03, "step": 15130, "train_speed(iter/s)": 1.472525 }, { "acc": 0.99859056, "epoch": 26.716681376875552, "grad_norm": 0.918088972568512, "learning_rate": 4.838411098154526e-06, "loss": 0.00983883, "memory(GiB)": 15.03, "step": 15135, "train_speed(iter/s)": 1.472529 }, { "acc": 0.99959755, "epoch": 26.725507502206533, "grad_norm": 2.420886754989624, "learning_rate": 4.835491628938151e-06, "loss": 0.01148112, "memory(GiB)": 15.03, "step": 15140, "train_speed(iter/s)": 1.47254 }, { "acc": 0.99887638, "epoch": 26.73433362753751, "grad_norm": 2.1296310424804688, "learning_rate": 4.832572216049944e-06, "loss": 0.01393708, "memory(GiB)": 15.03, "step": 15145, "train_speed(iter/s)": 1.472551 }, { "acc": 0.99947567, "epoch": 26.74315975286849, "grad_norm": 0.3570752441883087, "learning_rate": 4.829652860486492e-06, "loss": 0.00634897, "memory(GiB)": 15.03, "step": 15150, "train_speed(iter/s)": 1.472531 }, { "acc": 0.99835205, "epoch": 26.75198587819947, "grad_norm": 1.3631948232650757, "learning_rate": 4.826733563244366e-06, "loss": 0.021818, "memory(GiB)": 15.03, "step": 15155, "train_speed(iter/s)": 1.472537 }, { "acc": 0.99821806, "epoch": 26.76081200353045, "grad_norm": 1.155410647392273, "learning_rate": 4.8238143253201095e-06, "loss": 0.02154604, "memory(GiB)": 15.03, "step": 15160, "train_speed(iter/s)": 1.47255 }, { "acc": 0.99920444, "epoch": 26.76963812886143, "grad_norm": 1.970543622970581, "learning_rate": 4.8208951477102525e-06, "loss": 0.01007177, "memory(GiB)": 15.03, "step": 15165, "train_speed(iter/s)": 1.472578 }, { "acc": 0.99754105, "epoch": 26.77846425419241, "grad_norm": 0.34992995858192444, "learning_rate": 4.817976031411298e-06, "loss": 0.01847099, "memory(GiB)": 15.03, "step": 15170, "train_speed(iter/s)": 1.472559 }, { "acc": 0.99812117, "epoch": 26.78729037952339, "grad_norm": 1.365127682685852, "learning_rate": 4.815056977419736e-06, "loss": 0.02010553, "memory(GiB)": 15.03, "step": 15175, "train_speed(iter/s)": 1.472562 }, { "acc": 0.99804955, "epoch": 26.796116504854368, "grad_norm": 3.1142420768737793, "learning_rate": 4.812137986732028e-06, "loss": 0.01824751, "memory(GiB)": 15.03, "step": 15180, "train_speed(iter/s)": 1.472578 }, { "acc": 0.99866362, "epoch": 26.804942630185348, "grad_norm": 0.9200199246406555, "learning_rate": 4.80921906034462e-06, "loss": 0.01946043, "memory(GiB)": 15.03, "step": 15185, "train_speed(iter/s)": 1.472566 }, { "acc": 0.99980774, "epoch": 26.813768755516328, "grad_norm": 0.4498727023601532, "learning_rate": 4.806300199253927e-06, "loss": 0.00430386, "memory(GiB)": 15.03, "step": 15190, "train_speed(iter/s)": 1.47256 }, { "acc": 0.99820652, "epoch": 26.82259488084731, "grad_norm": 0.5489145517349243, "learning_rate": 4.803381404456355e-06, "loss": 0.02595051, "memory(GiB)": 15.03, "step": 15195, "train_speed(iter/s)": 1.472561 }, { "acc": 0.99858017, "epoch": 26.83142100617829, "grad_norm": 0.8654347658157349, "learning_rate": 4.8004626769482735e-06, "loss": 0.01352707, "memory(GiB)": 15.03, "step": 15200, "train_speed(iter/s)": 1.472589 }, { "acc": 0.99913235, "epoch": 26.84024713150927, "grad_norm": 0.6178064942359924, "learning_rate": 4.797544017726039e-06, "loss": 0.01272549, "memory(GiB)": 15.03, "step": 15205, "train_speed(iter/s)": 1.472597 }, { "acc": 0.99766521, "epoch": 26.849073256840246, "grad_norm": 0.30740758776664734, "learning_rate": 4.794625427785979e-06, "loss": 0.01371945, "memory(GiB)": 15.03, "step": 15210, "train_speed(iter/s)": 1.472578 }, { "acc": 0.99806738, "epoch": 26.857899382171226, "grad_norm": 0.4665021300315857, "learning_rate": 4.7917069081244015e-06, "loss": 0.01831853, "memory(GiB)": 15.03, "step": 15215, "train_speed(iter/s)": 1.472564 }, { "acc": 0.99691916, "epoch": 26.866725507502206, "grad_norm": 2.5775279998779297, "learning_rate": 4.7887884597375865e-06, "loss": 0.01804144, "memory(GiB)": 15.03, "step": 15220, "train_speed(iter/s)": 1.472567 }, { "acc": 0.9986824, "epoch": 26.875551632833186, "grad_norm": 1.4141160249710083, "learning_rate": 4.7858700836217935e-06, "loss": 0.01038628, "memory(GiB)": 15.03, "step": 15225, "train_speed(iter/s)": 1.472589 }, { "acc": 0.99891653, "epoch": 26.884377758164167, "grad_norm": 1.5978634357452393, "learning_rate": 4.782951780773255e-06, "loss": 0.01496728, "memory(GiB)": 15.03, "step": 15230, "train_speed(iter/s)": 1.47259 }, { "acc": 0.99807301, "epoch": 26.893203883495147, "grad_norm": 1.2007684707641602, "learning_rate": 4.780033552188179e-06, "loss": 0.0182211, "memory(GiB)": 15.03, "step": 15235, "train_speed(iter/s)": 1.47254 }, { "acc": 0.99794998, "epoch": 26.902030008826124, "grad_norm": 4.184398651123047, "learning_rate": 4.777115398862747e-06, "loss": 0.01565674, "memory(GiB)": 15.03, "step": 15240, "train_speed(iter/s)": 1.472557 }, { "acc": 0.99812183, "epoch": 26.910856134157104, "grad_norm": 1.9374629259109497, "learning_rate": 4.774197321793118e-06, "loss": 0.02103944, "memory(GiB)": 15.03, "step": 15245, "train_speed(iter/s)": 1.472571 }, { "acc": 0.9982605, "epoch": 26.919682259488084, "grad_norm": 3.890886068344116, "learning_rate": 4.771279321975421e-06, "loss": 0.01231563, "memory(GiB)": 15.03, "step": 15250, "train_speed(iter/s)": 1.472582 }, { "acc": 0.99875412, "epoch": 26.928508384819065, "grad_norm": 0.2619456648826599, "learning_rate": 4.76836140040576e-06, "loss": 0.01558692, "memory(GiB)": 15.03, "step": 15255, "train_speed(iter/s)": 1.472571 }, { "acc": 0.9983408, "epoch": 26.937334510150045, "grad_norm": 0.5161765217781067, "learning_rate": 4.7654435580802165e-06, "loss": 0.01394684, "memory(GiB)": 15.03, "step": 15260, "train_speed(iter/s)": 1.472577 }, { "acc": 0.99818516, "epoch": 26.946160635481025, "grad_norm": 1.5960863828659058, "learning_rate": 4.762525795994837e-06, "loss": 0.01542528, "memory(GiB)": 15.03, "step": 15265, "train_speed(iter/s)": 1.472538 }, { "acc": 0.9975934, "epoch": 26.954986760812005, "grad_norm": 0.9274076819419861, "learning_rate": 4.759608115145648e-06, "loss": 0.02075213, "memory(GiB)": 15.03, "step": 15270, "train_speed(iter/s)": 1.472547 }, { "acc": 0.9998106, "epoch": 26.963812886142982, "grad_norm": 0.7483316659927368, "learning_rate": 4.756690516528644e-06, "loss": 0.00772649, "memory(GiB)": 15.03, "step": 15275, "train_speed(iter/s)": 1.47256 }, { "acc": 0.99838676, "epoch": 26.972639011473962, "grad_norm": 3.346216917037964, "learning_rate": 4.7537730011397925e-06, "loss": 0.01998308, "memory(GiB)": 15.03, "step": 15280, "train_speed(iter/s)": 1.472534 }, { "acc": 0.99919271, "epoch": 26.981465136804943, "grad_norm": 0.4567294418811798, "learning_rate": 4.750855569975032e-06, "loss": 0.00763977, "memory(GiB)": 15.03, "step": 15285, "train_speed(iter/s)": 1.472531 }, { "acc": 0.99974489, "epoch": 26.990291262135923, "grad_norm": 0.3855397403240204, "learning_rate": 4.747938224030274e-06, "loss": 0.00659413, "memory(GiB)": 15.03, "step": 15290, "train_speed(iter/s)": 1.472549 }, { "acc": 0.99937029, "epoch": 26.999117387466903, "grad_norm": 1.9249173402786255, "learning_rate": 4.745020964301399e-06, "loss": 0.01166364, "memory(GiB)": 15.03, "step": 15295, "train_speed(iter/s)": 1.472511 }, { "acc": 0.9979517, "epoch": 27.007943512797883, "grad_norm": 0.3613361120223999, "learning_rate": 4.742103791784261e-06, "loss": 0.01715961, "memory(GiB)": 15.03, "step": 15300, "train_speed(iter/s)": 1.472463 }, { "acc": 0.99927835, "epoch": 27.01676963812886, "grad_norm": 0.6837934255599976, "learning_rate": 4.7391867074746786e-06, "loss": 0.00607608, "memory(GiB)": 15.03, "step": 15305, "train_speed(iter/s)": 1.47248 }, { "acc": 0.99922867, "epoch": 27.02559576345984, "grad_norm": 0.4815046191215515, "learning_rate": 4.736269712368447e-06, "loss": 0.01143628, "memory(GiB)": 15.03, "step": 15310, "train_speed(iter/s)": 1.472482 }, { "acc": 0.99873142, "epoch": 27.03442188879082, "grad_norm": 0.08874252438545227, "learning_rate": 4.733352807461326e-06, "loss": 0.01624837, "memory(GiB)": 15.03, "step": 15315, "train_speed(iter/s)": 1.472474 }, { "acc": 0.99901304, "epoch": 27.0432480141218, "grad_norm": 0.7208616137504578, "learning_rate": 4.730435993749047e-06, "loss": 0.02110039, "memory(GiB)": 15.03, "step": 15320, "train_speed(iter/s)": 1.472477 }, { "acc": 0.99604959, "epoch": 27.05207413945278, "grad_norm": 0.9840213060379028, "learning_rate": 4.727519272227309e-06, "loss": 0.02849439, "memory(GiB)": 15.03, "step": 15325, "train_speed(iter/s)": 1.472482 }, { "acc": 0.99843206, "epoch": 27.06090026478376, "grad_norm": 1.0925958156585693, "learning_rate": 4.724602643891782e-06, "loss": 0.01800532, "memory(GiB)": 15.03, "step": 15330, "train_speed(iter/s)": 1.47248 }, { "acc": 0.99909897, "epoch": 27.069726390114738, "grad_norm": 2.363844633102417, "learning_rate": 4.721686109738098e-06, "loss": 0.01165482, "memory(GiB)": 15.03, "step": 15335, "train_speed(iter/s)": 1.47246 }, { "acc": 0.99923878, "epoch": 27.07855251544572, "grad_norm": 0.6255009770393372, "learning_rate": 4.718769670761868e-06, "loss": 0.00717502, "memory(GiB)": 15.03, "step": 15340, "train_speed(iter/s)": 1.472475 }, { "acc": 0.99743719, "epoch": 27.0873786407767, "grad_norm": 2.8095574378967285, "learning_rate": 4.715853327958658e-06, "loss": 0.01569133, "memory(GiB)": 15.03, "step": 15345, "train_speed(iter/s)": 1.472482 }, { "acc": 0.99748306, "epoch": 27.09620476610768, "grad_norm": 2.377380847930908, "learning_rate": 4.71293708232401e-06, "loss": 0.01734805, "memory(GiB)": 15.03, "step": 15350, "train_speed(iter/s)": 1.472491 }, { "acc": 0.99897079, "epoch": 27.10503089143866, "grad_norm": 0.5296412706375122, "learning_rate": 4.710020934853428e-06, "loss": 0.01212245, "memory(GiB)": 15.03, "step": 15355, "train_speed(iter/s)": 1.472494 }, { "acc": 0.9986063, "epoch": 27.11385701676964, "grad_norm": 0.5102086663246155, "learning_rate": 4.707104886542384e-06, "loss": 0.01483759, "memory(GiB)": 15.03, "step": 15360, "train_speed(iter/s)": 1.472472 }, { "acc": 0.99854317, "epoch": 27.12268314210062, "grad_norm": 3.839294195175171, "learning_rate": 4.704188938386318e-06, "loss": 0.01443487, "memory(GiB)": 15.03, "step": 15365, "train_speed(iter/s)": 1.472509 }, { "acc": 0.99893274, "epoch": 27.131509267431596, "grad_norm": 1.2099984884262085, "learning_rate": 4.701273091380633e-06, "loss": 0.01597686, "memory(GiB)": 15.03, "step": 15370, "train_speed(iter/s)": 1.472512 }, { "acc": 0.99946918, "epoch": 27.140335392762577, "grad_norm": 0.3485753834247589, "learning_rate": 4.6983573465207e-06, "loss": 0.01287174, "memory(GiB)": 15.03, "step": 15375, "train_speed(iter/s)": 1.472527 }, { "acc": 0.99920654, "epoch": 27.149161518093557, "grad_norm": 0.4139713644981384, "learning_rate": 4.695441704801851e-06, "loss": 0.01551648, "memory(GiB)": 15.03, "step": 15380, "train_speed(iter/s)": 1.472536 }, { "acc": 0.99900017, "epoch": 27.157987643424537, "grad_norm": 2.058993101119995, "learning_rate": 4.692526167219389e-06, "loss": 0.01659934, "memory(GiB)": 15.03, "step": 15385, "train_speed(iter/s)": 1.472536 }, { "acc": 0.99867773, "epoch": 27.166813768755517, "grad_norm": 2.11993145942688, "learning_rate": 4.689610734768576e-06, "loss": 0.01405914, "memory(GiB)": 15.03, "step": 15390, "train_speed(iter/s)": 1.472548 }, { "acc": 0.99857845, "epoch": 27.175639894086498, "grad_norm": 0.09816326200962067, "learning_rate": 4.686695408444641e-06, "loss": 0.020008, "memory(GiB)": 15.03, "step": 15395, "train_speed(iter/s)": 1.472568 }, { "acc": 0.99765854, "epoch": 27.184466019417474, "grad_norm": 1.4266725778579712, "learning_rate": 4.683780189242774e-06, "loss": 0.02072064, "memory(GiB)": 15.03, "step": 15400, "train_speed(iter/s)": 1.472576 }, { "acc": 0.99846191, "epoch": 27.193292144748455, "grad_norm": 0.6976628303527832, "learning_rate": 4.6808650781581335e-06, "loss": 0.01323286, "memory(GiB)": 15.03, "step": 15405, "train_speed(iter/s)": 1.472595 }, { "acc": 0.99931059, "epoch": 27.202118270079435, "grad_norm": 0.923958957195282, "learning_rate": 4.6779500761858345e-06, "loss": 0.01289061, "memory(GiB)": 15.03, "step": 15410, "train_speed(iter/s)": 1.472615 }, { "acc": 0.99840612, "epoch": 27.210944395410415, "grad_norm": 0.45356419682502747, "learning_rate": 4.675035184320962e-06, "loss": 0.01642329, "memory(GiB)": 15.03, "step": 15415, "train_speed(iter/s)": 1.472626 }, { "acc": 0.99981346, "epoch": 27.219770520741395, "grad_norm": 0.07528354227542877, "learning_rate": 4.672120403558554e-06, "loss": 0.01082794, "memory(GiB)": 15.03, "step": 15420, "train_speed(iter/s)": 1.472649 }, { "acc": 0.99820995, "epoch": 27.228596646072376, "grad_norm": 1.388299584388733, "learning_rate": 4.669205734893622e-06, "loss": 0.01092848, "memory(GiB)": 15.03, "step": 15425, "train_speed(iter/s)": 1.47265 }, { "acc": 0.99894123, "epoch": 27.237422771403352, "grad_norm": 1.8877966403961182, "learning_rate": 4.6662911793211305e-06, "loss": 0.01112055, "memory(GiB)": 15.03, "step": 15430, "train_speed(iter/s)": 1.472658 }, { "acc": 0.99869184, "epoch": 27.246248896734333, "grad_norm": 0.2740573287010193, "learning_rate": 4.66337673783601e-06, "loss": 0.01134716, "memory(GiB)": 15.03, "step": 15435, "train_speed(iter/s)": 1.472678 }, { "acc": 0.99914684, "epoch": 27.255075022065313, "grad_norm": 0.660354495048523, "learning_rate": 4.660462411433148e-06, "loss": 0.00988912, "memory(GiB)": 15.03, "step": 15440, "train_speed(iter/s)": 1.472702 }, { "acc": 0.99904385, "epoch": 27.263901147396293, "grad_norm": 0.16125857830047607, "learning_rate": 4.657548201107397e-06, "loss": 0.0177148, "memory(GiB)": 15.03, "step": 15445, "train_speed(iter/s)": 1.472741 }, { "acc": 0.99884624, "epoch": 27.272727272727273, "grad_norm": 1.1230618953704834, "learning_rate": 4.6546341078535666e-06, "loss": 0.01120591, "memory(GiB)": 15.03, "step": 15450, "train_speed(iter/s)": 1.472749 }, { "acc": 0.99857512, "epoch": 27.281553398058254, "grad_norm": 2.3929240703582764, "learning_rate": 4.651720132666428e-06, "loss": 0.01501578, "memory(GiB)": 15.03, "step": 15455, "train_speed(iter/s)": 1.472744 }, { "acc": 0.99842873, "epoch": 27.290379523389234, "grad_norm": 4.569150924682617, "learning_rate": 4.6488062765407126e-06, "loss": 0.0118453, "memory(GiB)": 15.03, "step": 15460, "train_speed(iter/s)": 1.472759 }, { "acc": 0.99825449, "epoch": 27.29920564872021, "grad_norm": 3.6679794788360596, "learning_rate": 4.6458925404711116e-06, "loss": 0.014802, "memory(GiB)": 15.03, "step": 15465, "train_speed(iter/s)": 1.472767 }, { "acc": 0.99924746, "epoch": 27.30803177405119, "grad_norm": 0.9242889285087585, "learning_rate": 4.64297892545227e-06, "loss": 0.00652582, "memory(GiB)": 15.03, "step": 15470, "train_speed(iter/s)": 1.472765 }, { "acc": 0.9990675, "epoch": 27.31685789938217, "grad_norm": 1.0815073251724243, "learning_rate": 4.640065432478798e-06, "loss": 0.01087885, "memory(GiB)": 15.03, "step": 15475, "train_speed(iter/s)": 1.47276 }, { "acc": 0.99895439, "epoch": 27.32568402471315, "grad_norm": 1.2620493173599243, "learning_rate": 4.637152062545263e-06, "loss": 0.01021711, "memory(GiB)": 15.03, "step": 15480, "train_speed(iter/s)": 1.47275 }, { "acc": 0.9992157, "epoch": 27.33451015004413, "grad_norm": 4.699481964111328, "learning_rate": 4.634238816646187e-06, "loss": 0.01240083, "memory(GiB)": 15.03, "step": 15485, "train_speed(iter/s)": 1.472753 }, { "acc": 0.9968091, "epoch": 27.343336275375112, "grad_norm": 0.9666380286216736, "learning_rate": 4.631325695776054e-06, "loss": 0.03127686, "memory(GiB)": 15.03, "step": 15490, "train_speed(iter/s)": 1.472772 }, { "acc": 0.99847126, "epoch": 27.35216240070609, "grad_norm": 1.9285739660263062, "learning_rate": 4.6284127009293e-06, "loss": 0.01876963, "memory(GiB)": 15.03, "step": 15495, "train_speed(iter/s)": 1.472771 }, { "acc": 0.99917107, "epoch": 27.36098852603707, "grad_norm": 0.6924996972084045, "learning_rate": 4.625499833100324e-06, "loss": 0.01001744, "memory(GiB)": 15.03, "step": 15500, "train_speed(iter/s)": 1.472779 }, { "acc": 0.99837189, "epoch": 27.36981465136805, "grad_norm": 1.2451304197311401, "learning_rate": 4.622587093283474e-06, "loss": 0.02229192, "memory(GiB)": 15.03, "step": 15505, "train_speed(iter/s)": 1.472773 }, { "acc": 0.99895821, "epoch": 27.37864077669903, "grad_norm": 0.27818378806114197, "learning_rate": 4.619674482473065e-06, "loss": 0.00926776, "memory(GiB)": 15.03, "step": 15510, "train_speed(iter/s)": 1.472753 }, { "acc": 0.99827881, "epoch": 27.38746690203001, "grad_norm": 4.148451805114746, "learning_rate": 4.616762001663358e-06, "loss": 0.01449135, "memory(GiB)": 15.03, "step": 15515, "train_speed(iter/s)": 1.47276 }, { "acc": 0.99824257, "epoch": 27.39629302736099, "grad_norm": 0.8733861446380615, "learning_rate": 4.613849651848575e-06, "loss": 0.00969471, "memory(GiB)": 15.03, "step": 15520, "train_speed(iter/s)": 1.47276 }, { "acc": 0.99847507, "epoch": 27.405119152691967, "grad_norm": 0.5502099394798279, "learning_rate": 4.610937434022891e-06, "loss": 0.0095328, "memory(GiB)": 15.03, "step": 15525, "train_speed(iter/s)": 1.472782 }, { "acc": 0.9982295, "epoch": 27.413945278022947, "grad_norm": 0.09558270126581192, "learning_rate": 4.6080253491804385e-06, "loss": 0.01701204, "memory(GiB)": 15.03, "step": 15530, "train_speed(iter/s)": 1.472791 }, { "acc": 0.99779291, "epoch": 27.422771403353927, "grad_norm": 2.7783362865448, "learning_rate": 4.6051133983153e-06, "loss": 0.01658373, "memory(GiB)": 15.03, "step": 15535, "train_speed(iter/s)": 1.472759 }, { "acc": 0.998349, "epoch": 27.431597528684907, "grad_norm": 3.4587242603302, "learning_rate": 4.602201582421518e-06, "loss": 0.01592762, "memory(GiB)": 15.03, "step": 15540, "train_speed(iter/s)": 1.472759 }, { "acc": 0.99908018, "epoch": 27.440423654015888, "grad_norm": 0.9012109637260437, "learning_rate": 4.5992899024930835e-06, "loss": 0.01817785, "memory(GiB)": 15.03, "step": 15545, "train_speed(iter/s)": 1.472773 }, { "acc": 0.99897442, "epoch": 27.449249779346868, "grad_norm": 0.26804274320602417, "learning_rate": 4.5963783595239455e-06, "loss": 0.00498547, "memory(GiB)": 15.03, "step": 15550, "train_speed(iter/s)": 1.47276 }, { "acc": 0.99841461, "epoch": 27.458075904677848, "grad_norm": 2.0270416736602783, "learning_rate": 4.593466954508003e-06, "loss": 0.01080524, "memory(GiB)": 15.03, "step": 15555, "train_speed(iter/s)": 1.472764 }, { "acc": 0.9996212, "epoch": 27.466902030008825, "grad_norm": 0.8411340713500977, "learning_rate": 4.590555688439112e-06, "loss": 0.01366043, "memory(GiB)": 15.03, "step": 15560, "train_speed(iter/s)": 1.472789 }, { "acc": 0.99886398, "epoch": 27.475728155339805, "grad_norm": 3.2753727436065674, "learning_rate": 4.587644562311076e-06, "loss": 0.01456225, "memory(GiB)": 15.03, "step": 15565, "train_speed(iter/s)": 1.472801 }, { "acc": 0.99786577, "epoch": 27.484554280670785, "grad_norm": 0.3639867901802063, "learning_rate": 4.584733577117653e-06, "loss": 0.01673396, "memory(GiB)": 15.03, "step": 15570, "train_speed(iter/s)": 1.472823 }, { "acc": 0.99824505, "epoch": 27.493380406001766, "grad_norm": 1.6154276132583618, "learning_rate": 4.581822733852553e-06, "loss": 0.00939635, "memory(GiB)": 15.03, "step": 15575, "train_speed(iter/s)": 1.472841 }, { "acc": 0.99899712, "epoch": 27.502206531332746, "grad_norm": 1.5618581771850586, "learning_rate": 4.578912033509438e-06, "loss": 0.00957749, "memory(GiB)": 15.03, "step": 15580, "train_speed(iter/s)": 1.472851 }, { "acc": 0.99770966, "epoch": 27.511032656663726, "grad_norm": 1.303484559059143, "learning_rate": 4.5760014770819225e-06, "loss": 0.02275469, "memory(GiB)": 15.03, "step": 15585, "train_speed(iter/s)": 1.472844 }, { "acc": 0.99938402, "epoch": 27.519858781994703, "grad_norm": 1.0687588453292847, "learning_rate": 4.573091065563566e-06, "loss": 0.00969139, "memory(GiB)": 15.03, "step": 15590, "train_speed(iter/s)": 1.472849 }, { "acc": 0.99835997, "epoch": 27.528684907325683, "grad_norm": 0.4672389626502991, "learning_rate": 4.570180799947888e-06, "loss": 0.01690982, "memory(GiB)": 15.03, "step": 15595, "train_speed(iter/s)": 1.472846 }, { "acc": 0.99794827, "epoch": 27.537511032656663, "grad_norm": 0.5066556334495544, "learning_rate": 4.567270681228348e-06, "loss": 0.01578472, "memory(GiB)": 15.03, "step": 15600, "train_speed(iter/s)": 1.472836 }, { "acc": 0.99812832, "epoch": 27.546337157987644, "grad_norm": 1.4668399095535278, "learning_rate": 4.5643607103983645e-06, "loss": 0.01409185, "memory(GiB)": 15.03, "step": 15605, "train_speed(iter/s)": 1.47286 }, { "acc": 0.99825859, "epoch": 27.555163283318624, "grad_norm": 2.236375093460083, "learning_rate": 4.561450888451297e-06, "loss": 0.01036625, "memory(GiB)": 15.03, "step": 15610, "train_speed(iter/s)": 1.472851 }, { "acc": 0.99868536, "epoch": 27.563989408649604, "grad_norm": 0.6597432494163513, "learning_rate": 4.558541216380462e-06, "loss": 0.0101231, "memory(GiB)": 15.03, "step": 15615, "train_speed(iter/s)": 1.472851 }, { "acc": 0.99921894, "epoch": 27.57281553398058, "grad_norm": 0.19554929435253143, "learning_rate": 4.555631695179117e-06, "loss": 0.01224385, "memory(GiB)": 15.03, "step": 15620, "train_speed(iter/s)": 1.47287 }, { "acc": 0.99917011, "epoch": 27.58164165931156, "grad_norm": 0.4447701871395111, "learning_rate": 4.552722325840477e-06, "loss": 0.00869977, "memory(GiB)": 15.03, "step": 15625, "train_speed(iter/s)": 1.472896 }, { "acc": 0.99823475, "epoch": 27.59046778464254, "grad_norm": 0.680227518081665, "learning_rate": 4.549813109357697e-06, "loss": 0.01642948, "memory(GiB)": 15.03, "step": 15630, "train_speed(iter/s)": 1.472922 }, { "acc": 0.99953709, "epoch": 27.59929390997352, "grad_norm": 3.326449394226074, "learning_rate": 4.5469040467238855e-06, "loss": 0.00564656, "memory(GiB)": 15.03, "step": 15635, "train_speed(iter/s)": 1.472925 }, { "acc": 0.99980164, "epoch": 27.608120035304502, "grad_norm": 0.2733563184738159, "learning_rate": 4.543995138932094e-06, "loss": 0.00453387, "memory(GiB)": 15.03, "step": 15640, "train_speed(iter/s)": 1.472926 }, { "acc": 0.99720135, "epoch": 27.616946160635482, "grad_norm": 3.7874302864074707, "learning_rate": 4.541086386975325e-06, "loss": 0.0162496, "memory(GiB)": 15.03, "step": 15645, "train_speed(iter/s)": 1.47293 }, { "acc": 0.99901323, "epoch": 27.625772285966463, "grad_norm": 3.2449405193328857, "learning_rate": 4.538177791846525e-06, "loss": 0.0101658, "memory(GiB)": 15.03, "step": 15650, "train_speed(iter/s)": 1.47295 }, { "acc": 0.99832478, "epoch": 27.63459841129744, "grad_norm": 0.3231232464313507, "learning_rate": 4.535269354538591e-06, "loss": 0.01857048, "memory(GiB)": 15.03, "step": 15655, "train_speed(iter/s)": 1.472972 }, { "acc": 0.99794445, "epoch": 27.64342453662842, "grad_norm": 3.6671247482299805, "learning_rate": 4.532361076044359e-06, "loss": 0.01304086, "memory(GiB)": 15.03, "step": 15660, "train_speed(iter/s)": 1.472976 }, { "acc": 0.99927177, "epoch": 27.6522506619594, "grad_norm": 0.7653417587280273, "learning_rate": 4.529452957356618e-06, "loss": 0.01224927, "memory(GiB)": 15.03, "step": 15665, "train_speed(iter/s)": 1.473004 }, { "acc": 0.99864635, "epoch": 27.66107678729038, "grad_norm": 2.1366524696350098, "learning_rate": 4.526544999468096e-06, "loss": 0.0108873, "memory(GiB)": 15.03, "step": 15670, "train_speed(iter/s)": 1.473029 }, { "acc": 0.99798431, "epoch": 27.66990291262136, "grad_norm": 0.9252285361289978, "learning_rate": 4.5236372033714745e-06, "loss": 0.01620501, "memory(GiB)": 15.03, "step": 15675, "train_speed(iter/s)": 1.473051 }, { "acc": 0.9985774, "epoch": 27.67872903795234, "grad_norm": 1.5319390296936035, "learning_rate": 4.52072957005937e-06, "loss": 0.01504219, "memory(GiB)": 15.03, "step": 15680, "train_speed(iter/s)": 1.473063 }, { "acc": 0.99832611, "epoch": 27.687555163283317, "grad_norm": 2.457514524459839, "learning_rate": 4.5178221005243535e-06, "loss": 0.02222934, "memory(GiB)": 15.03, "step": 15685, "train_speed(iter/s)": 1.473056 }, { "acc": 0.99920864, "epoch": 27.696381288614297, "grad_norm": 0.35395264625549316, "learning_rate": 4.51491479575893e-06, "loss": 0.01090293, "memory(GiB)": 15.03, "step": 15690, "train_speed(iter/s)": 1.473061 }, { "acc": 0.99952049, "epoch": 27.705207413945278, "grad_norm": 0.29431915283203125, "learning_rate": 4.512007656755555e-06, "loss": 0.00395287, "memory(GiB)": 15.03, "step": 15695, "train_speed(iter/s)": 1.473082 }, { "acc": 0.99846115, "epoch": 27.714033539276258, "grad_norm": 2.8336408138275146, "learning_rate": 4.5091006845066274e-06, "loss": 0.01060167, "memory(GiB)": 15.03, "step": 15700, "train_speed(iter/s)": 1.473077 }, { "acc": 0.99974995, "epoch": 27.72285966460724, "grad_norm": 1.7072405815124512, "learning_rate": 4.506193880004484e-06, "loss": 0.01142796, "memory(GiB)": 15.03, "step": 15705, "train_speed(iter/s)": 1.473085 }, { "acc": 0.99887877, "epoch": 27.73168578993822, "grad_norm": 0.28752774000167847, "learning_rate": 4.503287244241411e-06, "loss": 0.01356058, "memory(GiB)": 15.03, "step": 15710, "train_speed(iter/s)": 1.473104 }, { "acc": 0.99854603, "epoch": 27.740511915269195, "grad_norm": 1.3247700929641724, "learning_rate": 4.5003807782096285e-06, "loss": 0.00903123, "memory(GiB)": 15.03, "step": 15715, "train_speed(iter/s)": 1.473137 }, { "acc": 0.99914513, "epoch": 27.749338040600176, "grad_norm": 1.1900966167449951, "learning_rate": 4.49747448290131e-06, "loss": 0.01204786, "memory(GiB)": 15.03, "step": 15720, "train_speed(iter/s)": 1.473145 }, { "acc": 0.99875584, "epoch": 27.758164165931156, "grad_norm": 0.6574587821960449, "learning_rate": 4.494568359308561e-06, "loss": 0.01474251, "memory(GiB)": 15.03, "step": 15725, "train_speed(iter/s)": 1.473176 }, { "acc": 0.99842262, "epoch": 27.766990291262136, "grad_norm": 2.327944278717041, "learning_rate": 4.491662408423433e-06, "loss": 0.01076189, "memory(GiB)": 15.03, "step": 15730, "train_speed(iter/s)": 1.473172 }, { "acc": 0.99824429, "epoch": 27.775816416593116, "grad_norm": 0.3027351498603821, "learning_rate": 4.488756631237916e-06, "loss": 0.01689659, "memory(GiB)": 15.03, "step": 15735, "train_speed(iter/s)": 1.473166 }, { "acc": 0.9987771, "epoch": 27.784642541924097, "grad_norm": 1.4385740756988525, "learning_rate": 4.4858510287439435e-06, "loss": 0.01349946, "memory(GiB)": 15.03, "step": 15740, "train_speed(iter/s)": 1.473171 }, { "acc": 0.99774799, "epoch": 27.793468667255077, "grad_norm": 0.4095074236392975, "learning_rate": 4.482945601933388e-06, "loss": 0.0239337, "memory(GiB)": 15.03, "step": 15745, "train_speed(iter/s)": 1.473173 }, { "acc": 0.99842138, "epoch": 27.802294792586054, "grad_norm": 1.1742750406265259, "learning_rate": 4.480040351798062e-06, "loss": 0.01476011, "memory(GiB)": 15.03, "step": 15750, "train_speed(iter/s)": 1.473172 }, { "acc": 0.99873047, "epoch": 27.811120917917034, "grad_norm": 0.15836574137210846, "learning_rate": 4.477135279329716e-06, "loss": 0.01145777, "memory(GiB)": 15.03, "step": 15755, "train_speed(iter/s)": 1.473181 }, { "acc": 0.99911213, "epoch": 27.819947043248014, "grad_norm": 1.3203060626983643, "learning_rate": 4.474230385520045e-06, "loss": 0.01098924, "memory(GiB)": 15.03, "step": 15760, "train_speed(iter/s)": 1.473187 }, { "acc": 0.99879246, "epoch": 27.828773168578994, "grad_norm": 0.7865884900093079, "learning_rate": 4.471325671360677e-06, "loss": 0.01179361, "memory(GiB)": 15.03, "step": 15765, "train_speed(iter/s)": 1.4732 }, { "acc": 0.99800024, "epoch": 27.837599293909975, "grad_norm": 2.0858700275421143, "learning_rate": 4.4684211378431825e-06, "loss": 0.02233781, "memory(GiB)": 15.03, "step": 15770, "train_speed(iter/s)": 1.473205 }, { "acc": 0.9963563, "epoch": 27.846425419240955, "grad_norm": 0.8911840915679932, "learning_rate": 4.465516785959067e-06, "loss": 0.02840212, "memory(GiB)": 15.03, "step": 15775, "train_speed(iter/s)": 1.473212 }, { "acc": 0.9979393, "epoch": 27.85525154457193, "grad_norm": 1.1896071434020996, "learning_rate": 4.46261261669978e-06, "loss": 0.01759881, "memory(GiB)": 15.03, "step": 15780, "train_speed(iter/s)": 1.473193 }, { "acc": 0.99857769, "epoch": 27.864077669902912, "grad_norm": 0.3737759590148926, "learning_rate": 4.459708631056701e-06, "loss": 0.01227753, "memory(GiB)": 15.03, "step": 15785, "train_speed(iter/s)": 1.473218 }, { "acc": 0.99799595, "epoch": 27.872903795233892, "grad_norm": 0.23551660776138306, "learning_rate": 4.456804830021152e-06, "loss": 0.01670752, "memory(GiB)": 15.03, "step": 15790, "train_speed(iter/s)": 1.473222 }, { "acc": 0.99790916, "epoch": 27.881729920564872, "grad_norm": 0.46850767731666565, "learning_rate": 4.453901214584389e-06, "loss": 0.00978119, "memory(GiB)": 15.03, "step": 15795, "train_speed(iter/s)": 1.473235 }, { "acc": 0.99784117, "epoch": 27.890556045895853, "grad_norm": 1.3086082935333252, "learning_rate": 4.45099778573761e-06, "loss": 0.01349096, "memory(GiB)": 15.03, "step": 15800, "train_speed(iter/s)": 1.473217 }, { "acc": 0.99924784, "epoch": 27.899382171226833, "grad_norm": 0.41166242957115173, "learning_rate": 4.448094544471943e-06, "loss": 0.01077703, "memory(GiB)": 15.03, "step": 15805, "train_speed(iter/s)": 1.47322 }, { "acc": 0.99951334, "epoch": 27.90820829655781, "grad_norm": 0.5542597770690918, "learning_rate": 4.445191491778455e-06, "loss": 0.00821814, "memory(GiB)": 15.03, "step": 15810, "train_speed(iter/s)": 1.473231 }, { "acc": 0.99922333, "epoch": 27.91703442188879, "grad_norm": 3.459670066833496, "learning_rate": 4.442288628648148e-06, "loss": 0.01243253, "memory(GiB)": 15.03, "step": 15815, "train_speed(iter/s)": 1.473245 }, { "acc": 0.99832821, "epoch": 27.92586054721977, "grad_norm": 0.9472993612289429, "learning_rate": 4.439385956071959e-06, "loss": 0.0158063, "memory(GiB)": 15.03, "step": 15820, "train_speed(iter/s)": 1.473235 }, { "acc": 0.99826994, "epoch": 27.93468667255075, "grad_norm": 2.153503894805908, "learning_rate": 4.436483475040762e-06, "loss": 0.01593329, "memory(GiB)": 15.03, "step": 15825, "train_speed(iter/s)": 1.47324 }, { "acc": 0.99850655, "epoch": 27.94351279788173, "grad_norm": 2.567146062850952, "learning_rate": 4.433581186545361e-06, "loss": 0.02707001, "memory(GiB)": 15.03, "step": 15830, "train_speed(iter/s)": 1.473266 }, { "acc": 0.99848747, "epoch": 27.95233892321271, "grad_norm": 1.2242578268051147, "learning_rate": 4.430679091576499e-06, "loss": 0.01154658, "memory(GiB)": 15.03, "step": 15835, "train_speed(iter/s)": 1.473251 }, { "acc": 0.9991395, "epoch": 27.96116504854369, "grad_norm": 0.7981659173965454, "learning_rate": 4.427777191124851e-06, "loss": 0.00515034, "memory(GiB)": 15.03, "step": 15840, "train_speed(iter/s)": 1.473263 }, { "acc": 0.99938717, "epoch": 27.969991173874668, "grad_norm": 0.9506338834762573, "learning_rate": 4.424875486181027e-06, "loss": 0.01284226, "memory(GiB)": 15.03, "step": 15845, "train_speed(iter/s)": 1.47327 }, { "acc": 0.9994235, "epoch": 27.978817299205648, "grad_norm": 0.6351320147514343, "learning_rate": 4.421973977735566e-06, "loss": 0.00898256, "memory(GiB)": 15.03, "step": 15850, "train_speed(iter/s)": 1.473264 }, { "acc": 0.99940624, "epoch": 27.98764342453663, "grad_norm": 0.49696820974349976, "learning_rate": 4.419072666778947e-06, "loss": 0.00978615, "memory(GiB)": 15.03, "step": 15855, "train_speed(iter/s)": 1.473258 }, { "acc": 0.99852285, "epoch": 27.99646954986761, "grad_norm": 0.3136531412601471, "learning_rate": 4.4161715543015735e-06, "loss": 0.01454004, "memory(GiB)": 15.03, "step": 15860, "train_speed(iter/s)": 1.47327 }, { "acc": 0.99838104, "epoch": 28.00529567519859, "grad_norm": 1.2437399625778198, "learning_rate": 4.4132706412937894e-06, "loss": 0.01387346, "memory(GiB)": 15.03, "step": 15865, "train_speed(iter/s)": 1.473205 }, { "acc": 0.99812765, "epoch": 28.01412180052957, "grad_norm": 1.7638325691223145, "learning_rate": 4.410369928745861e-06, "loss": 0.01642378, "memory(GiB)": 15.03, "step": 15870, "train_speed(iter/s)": 1.473193 }, { "acc": 0.99905815, "epoch": 28.022947925860546, "grad_norm": 0.47998201847076416, "learning_rate": 4.407469417647998e-06, "loss": 0.00885534, "memory(GiB)": 15.03, "step": 15875, "train_speed(iter/s)": 1.473198 }, { "acc": 0.99599953, "epoch": 28.031774051191526, "grad_norm": 1.8038946390151978, "learning_rate": 4.404569108990328e-06, "loss": 0.02804548, "memory(GiB)": 15.03, "step": 15880, "train_speed(iter/s)": 1.473178 }, { "acc": 0.99724178, "epoch": 28.040600176522506, "grad_norm": 2.539503574371338, "learning_rate": 4.4016690037629235e-06, "loss": 0.01277876, "memory(GiB)": 15.03, "step": 15885, "train_speed(iter/s)": 1.473167 }, { "acc": 0.997719, "epoch": 28.049426301853487, "grad_norm": 1.8393133878707886, "learning_rate": 4.398769102955774e-06, "loss": 0.01394249, "memory(GiB)": 15.03, "step": 15890, "train_speed(iter/s)": 1.473136 }, { "acc": 0.99925194, "epoch": 28.058252427184467, "grad_norm": 1.1989063024520874, "learning_rate": 4.395869407558811e-06, "loss": 0.00936835, "memory(GiB)": 15.03, "step": 15895, "train_speed(iter/s)": 1.473122 }, { "acc": 0.99962616, "epoch": 28.067078552515447, "grad_norm": 0.5308052897453308, "learning_rate": 4.392969918561887e-06, "loss": 0.0111597, "memory(GiB)": 15.03, "step": 15900, "train_speed(iter/s)": 1.47312 }, { "acc": 0.99804029, "epoch": 28.075904677846424, "grad_norm": 2.10447096824646, "learning_rate": 4.390070636954788e-06, "loss": 0.01339591, "memory(GiB)": 15.03, "step": 15905, "train_speed(iter/s)": 1.473118 }, { "acc": 0.99823799, "epoch": 28.084730803177404, "grad_norm": 0.3721534311771393, "learning_rate": 4.3871715637272315e-06, "loss": 0.00734085, "memory(GiB)": 15.03, "step": 15910, "train_speed(iter/s)": 1.473123 }, { "acc": 0.99883928, "epoch": 28.093556928508384, "grad_norm": 1.272714376449585, "learning_rate": 4.3842726998688575e-06, "loss": 0.01312942, "memory(GiB)": 15.03, "step": 15915, "train_speed(iter/s)": 1.473128 }, { "acc": 0.99889984, "epoch": 28.102383053839365, "grad_norm": 0.5159773230552673, "learning_rate": 4.381374046369241e-06, "loss": 0.00924094, "memory(GiB)": 15.03, "step": 15920, "train_speed(iter/s)": 1.473124 }, { "acc": 0.99856615, "epoch": 28.111209179170345, "grad_norm": 2.9331166744232178, "learning_rate": 4.3784756042178794e-06, "loss": 0.01217443, "memory(GiB)": 15.03, "step": 15925, "train_speed(iter/s)": 1.473144 }, { "acc": 0.99701118, "epoch": 28.120035304501325, "grad_norm": 0.539864718914032, "learning_rate": 4.375577374404205e-06, "loss": 0.01700809, "memory(GiB)": 15.03, "step": 15930, "train_speed(iter/s)": 1.47315 }, { "acc": 0.99766588, "epoch": 28.128861429832302, "grad_norm": 0.39452049136161804, "learning_rate": 4.3726793579175705e-06, "loss": 0.01442121, "memory(GiB)": 15.03, "step": 15935, "train_speed(iter/s)": 1.473135 }, { "acc": 0.99859734, "epoch": 28.137687555163282, "grad_norm": 1.5775697231292725, "learning_rate": 4.36978155574726e-06, "loss": 0.01061221, "memory(GiB)": 15.03, "step": 15940, "train_speed(iter/s)": 1.473148 }, { "acc": 0.9991086, "epoch": 28.146513680494262, "grad_norm": 0.647124707698822, "learning_rate": 4.366883968882482e-06, "loss": 0.01074217, "memory(GiB)": 15.03, "step": 15945, "train_speed(iter/s)": 1.473161 }, { "acc": 0.99903831, "epoch": 28.155339805825243, "grad_norm": 0.8528969287872314, "learning_rate": 4.363986598312375e-06, "loss": 0.01127885, "memory(GiB)": 15.03, "step": 15950, "train_speed(iter/s)": 1.473167 }, { "acc": 1.0, "epoch": 28.164165931156223, "grad_norm": 0.35860419273376465, "learning_rate": 4.361089445025999e-06, "loss": 0.00583336, "memory(GiB)": 15.03, "step": 15955, "train_speed(iter/s)": 1.47319 }, { "acc": 0.99801474, "epoch": 28.172992056487203, "grad_norm": 1.1247655153274536, "learning_rate": 4.358192510012344e-06, "loss": 0.01299504, "memory(GiB)": 15.03, "step": 15960, "train_speed(iter/s)": 1.473206 }, { "acc": 0.99842243, "epoch": 28.181818181818183, "grad_norm": 0.10565304011106491, "learning_rate": 4.355295794260321e-06, "loss": 0.01543751, "memory(GiB)": 15.03, "step": 15965, "train_speed(iter/s)": 1.473211 }, { "acc": 1.0, "epoch": 28.19064430714916, "grad_norm": 0.21073856949806213, "learning_rate": 4.352399298758774e-06, "loss": 0.00509567, "memory(GiB)": 15.03, "step": 15970, "train_speed(iter/s)": 1.473203 }, { "acc": 0.99958649, "epoch": 28.19947043248014, "grad_norm": 0.6939207315444946, "learning_rate": 4.349503024496462e-06, "loss": 0.00866162, "memory(GiB)": 15.03, "step": 15975, "train_speed(iter/s)": 1.473236 }, { "acc": 0.99793377, "epoch": 28.20829655781112, "grad_norm": 0.3109021484851837, "learning_rate": 4.3466069724620754e-06, "loss": 0.0188575, "memory(GiB)": 15.03, "step": 15980, "train_speed(iter/s)": 1.473246 }, { "acc": 0.99906311, "epoch": 28.2171226831421, "grad_norm": 0.048257455229759216, "learning_rate": 4.343711143644225e-06, "loss": 0.00939847, "memory(GiB)": 15.03, "step": 15985, "train_speed(iter/s)": 1.473255 }, { "acc": 0.99909, "epoch": 28.22594880847308, "grad_norm": 1.1734850406646729, "learning_rate": 4.34081553903145e-06, "loss": 0.00935748, "memory(GiB)": 15.03, "step": 15990, "train_speed(iter/s)": 1.473258 }, { "acc": 0.99870014, "epoch": 28.23477493380406, "grad_norm": 0.5159811973571777, "learning_rate": 4.337920159612207e-06, "loss": 0.01008144, "memory(GiB)": 15.03, "step": 15995, "train_speed(iter/s)": 1.473234 }, { "acc": 0.99777966, "epoch": 28.243601059135038, "grad_norm": 0.044988103210926056, "learning_rate": 4.335025006374879e-06, "loss": 0.01708677, "memory(GiB)": 15.03, "step": 16000, "train_speed(iter/s)": 1.473244 }, { "acc": 0.99875631, "epoch": 28.25242718446602, "grad_norm": 1.1039361953735352, "learning_rate": 4.332130080307771e-06, "loss": 0.01150341, "memory(GiB)": 15.03, "step": 16005, "train_speed(iter/s)": 1.473241 }, { "acc": 0.99969511, "epoch": 28.261253309797, "grad_norm": 0.04556847736239433, "learning_rate": 4.329235382399115e-06, "loss": 0.00603003, "memory(GiB)": 15.03, "step": 16010, "train_speed(iter/s)": 1.473255 }, { "acc": 0.99891691, "epoch": 28.27007943512798, "grad_norm": 0.28925448656082153, "learning_rate": 4.326340913637056e-06, "loss": 0.00787625, "memory(GiB)": 15.03, "step": 16015, "train_speed(iter/s)": 1.473238 }, { "acc": 0.9988534, "epoch": 28.27890556045896, "grad_norm": 2.576453447341919, "learning_rate": 4.32344667500967e-06, "loss": 0.00646422, "memory(GiB)": 15.03, "step": 16020, "train_speed(iter/s)": 1.473255 }, { "acc": 0.99943466, "epoch": 28.28773168578994, "grad_norm": 0.42774251103401184, "learning_rate": 4.320552667504946e-06, "loss": 0.01302228, "memory(GiB)": 15.03, "step": 16025, "train_speed(iter/s)": 1.473246 }, { "acc": 0.9984293, "epoch": 28.296557811120916, "grad_norm": 1.3688398599624634, "learning_rate": 4.317658892110803e-06, "loss": 0.01435407, "memory(GiB)": 15.03, "step": 16030, "train_speed(iter/s)": 1.473246 }, { "acc": 0.99771099, "epoch": 28.305383936451896, "grad_norm": 2.1379354000091553, "learning_rate": 4.314765349815073e-06, "loss": 0.01885264, "memory(GiB)": 15.03, "step": 16035, "train_speed(iter/s)": 1.473271 }, { "acc": 0.99945116, "epoch": 28.314210061782877, "grad_norm": 0.195917010307312, "learning_rate": 4.311872041605512e-06, "loss": 0.00369617, "memory(GiB)": 15.03, "step": 16040, "train_speed(iter/s)": 1.473282 }, { "acc": 0.99839849, "epoch": 28.323036187113857, "grad_norm": 1.283966302871704, "learning_rate": 4.308978968469797e-06, "loss": 0.01701314, "memory(GiB)": 15.03, "step": 16045, "train_speed(iter/s)": 1.473271 }, { "acc": 0.99926405, "epoch": 28.331862312444837, "grad_norm": 5.212954044342041, "learning_rate": 4.306086131395521e-06, "loss": 0.00644119, "memory(GiB)": 15.03, "step": 16050, "train_speed(iter/s)": 1.473278 }, { "acc": 0.99922924, "epoch": 28.340688437775817, "grad_norm": 0.9361178278923035, "learning_rate": 4.303193531370202e-06, "loss": 0.01685038, "memory(GiB)": 15.03, "step": 16055, "train_speed(iter/s)": 1.473279 }, { "acc": 0.9984663, "epoch": 28.349514563106798, "grad_norm": 0.2920852601528168, "learning_rate": 4.300301169381272e-06, "loss": 0.01401645, "memory(GiB)": 15.03, "step": 16060, "train_speed(iter/s)": 1.473283 }, { "acc": 0.99847603, "epoch": 28.358340688437774, "grad_norm": 0.23729689419269562, "learning_rate": 4.297409046416084e-06, "loss": 0.01205557, "memory(GiB)": 15.03, "step": 16065, "train_speed(iter/s)": 1.473286 }, { "acc": 0.99767017, "epoch": 28.367166813768755, "grad_norm": 0.7482087016105652, "learning_rate": 4.294517163461909e-06, "loss": 0.01959453, "memory(GiB)": 15.03, "step": 16070, "train_speed(iter/s)": 1.473291 }, { "acc": 0.99924183, "epoch": 28.375992939099735, "grad_norm": 0.40081143379211426, "learning_rate": 4.2916255215059364e-06, "loss": 0.0117275, "memory(GiB)": 15.03, "step": 16075, "train_speed(iter/s)": 1.47331 }, { "acc": 0.99881697, "epoch": 28.384819064430715, "grad_norm": 1.9268752336502075, "learning_rate": 4.288734121535273e-06, "loss": 0.01363678, "memory(GiB)": 15.03, "step": 16080, "train_speed(iter/s)": 1.473329 }, { "acc": 0.99890976, "epoch": 28.393645189761695, "grad_norm": 0.27998286485671997, "learning_rate": 4.285842964536943e-06, "loss": 0.01041147, "memory(GiB)": 15.03, "step": 16085, "train_speed(iter/s)": 1.473317 }, { "acc": 0.99827538, "epoch": 28.402471315092676, "grad_norm": 2.682664155960083, "learning_rate": 4.282952051497885e-06, "loss": 0.01594337, "memory(GiB)": 15.03, "step": 16090, "train_speed(iter/s)": 1.473329 }, { "acc": 0.9983799, "epoch": 28.411297440423652, "grad_norm": 1.51548171043396, "learning_rate": 4.280061383404964e-06, "loss": 0.01546923, "memory(GiB)": 15.03, "step": 16095, "train_speed(iter/s)": 1.473344 }, { "acc": 0.99827614, "epoch": 28.420123565754633, "grad_norm": 0.5575498938560486, "learning_rate": 4.277170961244948e-06, "loss": 0.02108505, "memory(GiB)": 15.03, "step": 16100, "train_speed(iter/s)": 1.473316 }, { "acc": 0.99794369, "epoch": 28.428949691085613, "grad_norm": 1.4818788766860962, "learning_rate": 4.274280786004531e-06, "loss": 0.01678239, "memory(GiB)": 15.03, "step": 16105, "train_speed(iter/s)": 1.473302 }, { "acc": 0.99938822, "epoch": 28.437775816416593, "grad_norm": 0.5798857808113098, "learning_rate": 4.2713908586703164e-06, "loss": 0.01189719, "memory(GiB)": 15.03, "step": 16110, "train_speed(iter/s)": 1.473307 }, { "acc": 0.99705076, "epoch": 28.446601941747574, "grad_norm": 1.656603455543518, "learning_rate": 4.268501180228829e-06, "loss": 0.02725499, "memory(GiB)": 15.03, "step": 16115, "train_speed(iter/s)": 1.473322 }, { "acc": 0.99927244, "epoch": 28.455428067078554, "grad_norm": 0.9700446128845215, "learning_rate": 4.2656117516665025e-06, "loss": 0.0078047, "memory(GiB)": 15.03, "step": 16120, "train_speed(iter/s)": 1.473334 }, { "acc": 0.9987751, "epoch": 28.46425419240953, "grad_norm": 2.3492183685302734, "learning_rate": 4.262722573969688e-06, "loss": 0.01614992, "memory(GiB)": 15.03, "step": 16125, "train_speed(iter/s)": 1.473328 }, { "acc": 0.99969006, "epoch": 28.47308031774051, "grad_norm": 0.29803919792175293, "learning_rate": 4.259833648124654e-06, "loss": 0.00593528, "memory(GiB)": 15.03, "step": 16130, "train_speed(iter/s)": 1.473324 }, { "acc": 0.9990427, "epoch": 28.48190644307149, "grad_norm": 0.7883985638618469, "learning_rate": 4.256944975117578e-06, "loss": 0.01037577, "memory(GiB)": 15.03, "step": 16135, "train_speed(iter/s)": 1.47334 }, { "acc": 0.99865236, "epoch": 28.49073256840247, "grad_norm": 0.7918667793273926, "learning_rate": 4.254056555934556e-06, "loss": 0.02290533, "memory(GiB)": 15.03, "step": 16140, "train_speed(iter/s)": 1.473339 }, { "acc": 0.99960785, "epoch": 28.49955869373345, "grad_norm": 1.395592212677002, "learning_rate": 4.251168391561592e-06, "loss": 0.01145529, "memory(GiB)": 15.03, "step": 16145, "train_speed(iter/s)": 1.473344 }, { "acc": 0.9981946, "epoch": 28.508384819064432, "grad_norm": 0.8696362376213074, "learning_rate": 4.248280482984606e-06, "loss": 0.01990932, "memory(GiB)": 15.03, "step": 16150, "train_speed(iter/s)": 1.47336 }, { "acc": 0.99868917, "epoch": 28.517210944395412, "grad_norm": 0.35630664229393005, "learning_rate": 4.245392831189431e-06, "loss": 0.00849167, "memory(GiB)": 15.03, "step": 16155, "train_speed(iter/s)": 1.473379 }, { "acc": 0.99848757, "epoch": 28.52603706972639, "grad_norm": 0.44274890422821045, "learning_rate": 4.242505437161813e-06, "loss": 0.0107589, "memory(GiB)": 15.03, "step": 16160, "train_speed(iter/s)": 1.473407 }, { "acc": 0.9981143, "epoch": 28.53486319505737, "grad_norm": 0.438616007566452, "learning_rate": 4.239618301887406e-06, "loss": 0.01716821, "memory(GiB)": 15.03, "step": 16165, "train_speed(iter/s)": 1.473409 }, { "acc": 0.99828968, "epoch": 28.54368932038835, "grad_norm": 0.09512067586183548, "learning_rate": 4.236731426351781e-06, "loss": 0.0108444, "memory(GiB)": 15.03, "step": 16170, "train_speed(iter/s)": 1.473409 }, { "acc": 0.99837332, "epoch": 28.55251544571933, "grad_norm": 0.12320201098918915, "learning_rate": 4.233844811540415e-06, "loss": 0.01961195, "memory(GiB)": 15.03, "step": 16175, "train_speed(iter/s)": 1.473405 }, { "acc": 0.9988884, "epoch": 28.56134157105031, "grad_norm": 0.23086567223072052, "learning_rate": 4.230958458438702e-06, "loss": 0.00864553, "memory(GiB)": 15.03, "step": 16180, "train_speed(iter/s)": 1.473408 }, { "acc": 0.99761772, "epoch": 28.57016769638129, "grad_norm": 0.2638692855834961, "learning_rate": 4.2280723680319416e-06, "loss": 0.01196086, "memory(GiB)": 15.03, "step": 16185, "train_speed(iter/s)": 1.473415 }, { "acc": 0.99809132, "epoch": 28.578993821712267, "grad_norm": 1.1383999586105347, "learning_rate": 4.225186541305347e-06, "loss": 0.0235733, "memory(GiB)": 15.03, "step": 16190, "train_speed(iter/s)": 1.473427 }, { "acc": 0.99934063, "epoch": 28.587819947043247, "grad_norm": 0.5455592274665833, "learning_rate": 4.2223009792440375e-06, "loss": 0.00987331, "memory(GiB)": 15.03, "step": 16195, "train_speed(iter/s)": 1.473445 }, { "acc": 0.99781713, "epoch": 28.596646072374227, "grad_norm": 1.428947925567627, "learning_rate": 4.219415682833048e-06, "loss": 0.02132027, "memory(GiB)": 15.03, "step": 16200, "train_speed(iter/s)": 1.473463 }, { "acc": 0.99821682, "epoch": 28.605472197705208, "grad_norm": 0.7298922538757324, "learning_rate": 4.216530653057316e-06, "loss": 0.01489399, "memory(GiB)": 15.03, "step": 16205, "train_speed(iter/s)": 1.473452 }, { "acc": 0.99938726, "epoch": 28.614298323036188, "grad_norm": 0.04777241870760918, "learning_rate": 4.213645890901692e-06, "loss": 0.00482232, "memory(GiB)": 15.03, "step": 16210, "train_speed(iter/s)": 1.473468 }, { "acc": 0.9994792, "epoch": 28.623124448367168, "grad_norm": 2.1560699939727783, "learning_rate": 4.210761397350933e-06, "loss": 0.01036191, "memory(GiB)": 15.03, "step": 16215, "train_speed(iter/s)": 1.473484 }, { "acc": 0.99881496, "epoch": 28.63195057369815, "grad_norm": 0.09721553325653076, "learning_rate": 4.207877173389711e-06, "loss": 0.01201066, "memory(GiB)": 15.03, "step": 16220, "train_speed(iter/s)": 1.473477 }, { "acc": 0.99771767, "epoch": 28.640776699029125, "grad_norm": 0.7806895971298218, "learning_rate": 4.204993220002595e-06, "loss": 0.02916134, "memory(GiB)": 15.03, "step": 16225, "train_speed(iter/s)": 1.473485 }, { "acc": 0.99938259, "epoch": 28.649602824360105, "grad_norm": 0.4635048508644104, "learning_rate": 4.20210953817407e-06, "loss": 0.00802065, "memory(GiB)": 15.03, "step": 16230, "train_speed(iter/s)": 1.473479 }, { "acc": 0.99938049, "epoch": 28.658428949691086, "grad_norm": 0.24578626453876495, "learning_rate": 4.199226128888524e-06, "loss": 0.00972834, "memory(GiB)": 15.03, "step": 16235, "train_speed(iter/s)": 1.473492 }, { "acc": 0.99812918, "epoch": 28.667255075022066, "grad_norm": 0.02081729844212532, "learning_rate": 4.196342993130255e-06, "loss": 0.01593383, "memory(GiB)": 15.03, "step": 16240, "train_speed(iter/s)": 1.473506 }, { "acc": 0.99804058, "epoch": 28.676081200353046, "grad_norm": 0.6649013161659241, "learning_rate": 4.1934601318834664e-06, "loss": 0.02337461, "memory(GiB)": 15.03, "step": 16245, "train_speed(iter/s)": 1.473499 }, { "acc": 0.99790878, "epoch": 28.684907325684026, "grad_norm": 0.27596715092658997, "learning_rate": 4.190577546132265e-06, "loss": 0.01884783, "memory(GiB)": 15.03, "step": 16250, "train_speed(iter/s)": 1.473499 }, { "acc": 0.99800625, "epoch": 28.693733451015003, "grad_norm": 0.6774300336837769, "learning_rate": 4.187695236860669e-06, "loss": 0.01363583, "memory(GiB)": 15.03, "step": 16255, "train_speed(iter/s)": 1.473491 }, { "acc": 0.99855833, "epoch": 28.702559576345983, "grad_norm": 1.0033546686172485, "learning_rate": 4.184813205052598e-06, "loss": 0.01766647, "memory(GiB)": 15.03, "step": 16260, "train_speed(iter/s)": 1.473492 }, { "acc": 0.99734688, "epoch": 28.711385701676964, "grad_norm": 1.969375729560852, "learning_rate": 4.18193145169188e-06, "loss": 0.02318279, "memory(GiB)": 15.03, "step": 16265, "train_speed(iter/s)": 1.473492 }, { "acc": 0.99938564, "epoch": 28.720211827007944, "grad_norm": 0.2752729058265686, "learning_rate": 4.179049977762243e-06, "loss": 0.00756451, "memory(GiB)": 15.03, "step": 16270, "train_speed(iter/s)": 1.4735 }, { "acc": 0.99897652, "epoch": 28.729037952338924, "grad_norm": 0.2737491726875305, "learning_rate": 4.176168784247326e-06, "loss": 0.01494652, "memory(GiB)": 15.03, "step": 16275, "train_speed(iter/s)": 1.473512 }, { "acc": 0.99888268, "epoch": 28.737864077669904, "grad_norm": 0.3139168918132782, "learning_rate": 4.173287872130667e-06, "loss": 0.00718122, "memory(GiB)": 15.03, "step": 16280, "train_speed(iter/s)": 1.473512 }, { "acc": 0.99730577, "epoch": 28.74669020300088, "grad_norm": 2.9797112941741943, "learning_rate": 4.170407242395713e-06, "loss": 0.02152449, "memory(GiB)": 15.03, "step": 16285, "train_speed(iter/s)": 1.473498 }, { "acc": 0.9997282, "epoch": 28.75551632833186, "grad_norm": 0.867626428604126, "learning_rate": 4.167526896025809e-06, "loss": 0.00911563, "memory(GiB)": 15.03, "step": 16290, "train_speed(iter/s)": 1.47352 }, { "acc": 0.9989522, "epoch": 28.76434245366284, "grad_norm": 0.7498857378959656, "learning_rate": 4.1646468340042065e-06, "loss": 0.00592781, "memory(GiB)": 15.03, "step": 16295, "train_speed(iter/s)": 1.473519 }, { "acc": 1.0, "epoch": 28.773168578993822, "grad_norm": 0.12461160868406296, "learning_rate": 4.161767057314057e-06, "loss": 0.00327418, "memory(GiB)": 15.03, "step": 16300, "train_speed(iter/s)": 1.473536 }, { "acc": 0.99732132, "epoch": 28.781994704324802, "grad_norm": 1.2814395427703857, "learning_rate": 4.158887566938423e-06, "loss": 0.01867098, "memory(GiB)": 15.03, "step": 16305, "train_speed(iter/s)": 1.473526 }, { "acc": 0.99873028, "epoch": 28.790820829655782, "grad_norm": 0.13157212734222412, "learning_rate": 4.156008363860258e-06, "loss": 0.00948636, "memory(GiB)": 15.03, "step": 16310, "train_speed(iter/s)": 1.473519 }, { "acc": 0.99797897, "epoch": 28.799646954986763, "grad_norm": 1.643083930015564, "learning_rate": 4.153129449062426e-06, "loss": 0.01592854, "memory(GiB)": 15.03, "step": 16315, "train_speed(iter/s)": 1.473535 }, { "acc": 0.99907207, "epoch": 28.80847308031774, "grad_norm": 1.9868842363357544, "learning_rate": 4.150250823527687e-06, "loss": 0.01099613, "memory(GiB)": 15.03, "step": 16320, "train_speed(iter/s)": 1.473558 }, { "acc": 0.99870529, "epoch": 28.81729920564872, "grad_norm": 0.3525596559047699, "learning_rate": 4.147372488238705e-06, "loss": 0.00742435, "memory(GiB)": 15.03, "step": 16325, "train_speed(iter/s)": 1.473538 }, { "acc": 0.99793396, "epoch": 28.8261253309797, "grad_norm": 0.7839908003807068, "learning_rate": 4.144494444178044e-06, "loss": 0.01813431, "memory(GiB)": 15.03, "step": 16330, "train_speed(iter/s)": 1.473518 }, { "acc": 0.99896984, "epoch": 28.83495145631068, "grad_norm": 1.1335783004760742, "learning_rate": 4.141616692328171e-06, "loss": 0.00803751, "memory(GiB)": 15.03, "step": 16335, "train_speed(iter/s)": 1.473511 }, { "acc": 0.99891882, "epoch": 28.84377758164166, "grad_norm": 2.783290147781372, "learning_rate": 4.138739233671449e-06, "loss": 0.01349519, "memory(GiB)": 15.03, "step": 16340, "train_speed(iter/s)": 1.473532 }, { "acc": 0.99857731, "epoch": 28.85260370697264, "grad_norm": 0.9926166534423828, "learning_rate": 4.1358620691901445e-06, "loss": 0.01300436, "memory(GiB)": 15.03, "step": 16345, "train_speed(iter/s)": 1.473542 }, { "acc": 0.99943314, "epoch": 28.861429832303617, "grad_norm": 0.5696312785148621, "learning_rate": 4.132985199866422e-06, "loss": 0.00962128, "memory(GiB)": 15.03, "step": 16350, "train_speed(iter/s)": 1.473546 }, { "acc": 0.99880781, "epoch": 28.870255957634598, "grad_norm": 0.1540168821811676, "learning_rate": 4.130108626682345e-06, "loss": 0.00961946, "memory(GiB)": 15.03, "step": 16355, "train_speed(iter/s)": 1.473535 }, { "acc": 0.99821224, "epoch": 28.879082082965578, "grad_norm": 0.6314690113067627, "learning_rate": 4.127232350619878e-06, "loss": 0.02162425, "memory(GiB)": 15.03, "step": 16360, "train_speed(iter/s)": 1.473552 }, { "acc": 0.99873142, "epoch": 28.887908208296558, "grad_norm": 1.8629142045974731, "learning_rate": 4.124356372660881e-06, "loss": 0.01504089, "memory(GiB)": 15.03, "step": 16365, "train_speed(iter/s)": 1.473532 }, { "acc": 0.99834137, "epoch": 28.89673433362754, "grad_norm": 0.532900333404541, "learning_rate": 4.121480693787114e-06, "loss": 0.01483288, "memory(GiB)": 15.03, "step": 16370, "train_speed(iter/s)": 1.473552 }, { "acc": 0.9994318, "epoch": 28.90556045895852, "grad_norm": 3.616629123687744, "learning_rate": 4.118605314980234e-06, "loss": 0.00600181, "memory(GiB)": 15.03, "step": 16375, "train_speed(iter/s)": 1.473556 }, { "acc": 0.99907579, "epoch": 28.914386584289495, "grad_norm": 0.7602140307426453, "learning_rate": 4.115730237221799e-06, "loss": 0.01374862, "memory(GiB)": 15.03, "step": 16380, "train_speed(iter/s)": 1.473581 }, { "acc": 0.99846315, "epoch": 28.923212709620476, "grad_norm": 1.0194525718688965, "learning_rate": 4.112855461493256e-06, "loss": 0.01543135, "memory(GiB)": 15.03, "step": 16385, "train_speed(iter/s)": 1.473596 }, { "acc": 0.9988719, "epoch": 28.932038834951456, "grad_norm": 0.17611321806907654, "learning_rate": 4.10998098877596e-06, "loss": 0.01299606, "memory(GiB)": 15.03, "step": 16390, "train_speed(iter/s)": 1.473611 }, { "acc": 0.99920454, "epoch": 28.940864960282436, "grad_norm": 2.180164337158203, "learning_rate": 4.107106820051154e-06, "loss": 0.01025786, "memory(GiB)": 15.03, "step": 16395, "train_speed(iter/s)": 1.473632 }, { "acc": 0.99866123, "epoch": 28.949691085613416, "grad_norm": 1.0816707611083984, "learning_rate": 4.104232956299982e-06, "loss": 0.01210294, "memory(GiB)": 15.03, "step": 16400, "train_speed(iter/s)": 1.473633 }, { "acc": 0.99884872, "epoch": 28.958517210944397, "grad_norm": 0.7471379637718201, "learning_rate": 4.101359398503479e-06, "loss": 0.01725514, "memory(GiB)": 15.03, "step": 16405, "train_speed(iter/s)": 1.473627 }, { "acc": 0.99903231, "epoch": 28.967343336275377, "grad_norm": 0.044081464409828186, "learning_rate": 4.098486147642581e-06, "loss": 0.01088103, "memory(GiB)": 15.03, "step": 16410, "train_speed(iter/s)": 1.473644 }, { "acc": 0.99985638, "epoch": 28.976169461606354, "grad_norm": 0.09426778554916382, "learning_rate": 4.0956132046981155e-06, "loss": 0.00611186, "memory(GiB)": 15.03, "step": 16415, "train_speed(iter/s)": 1.473683 }, { "acc": 0.99887867, "epoch": 28.984995586937334, "grad_norm": 1.6977529525756836, "learning_rate": 4.092740570650808e-06, "loss": 0.00887525, "memory(GiB)": 15.03, "step": 16420, "train_speed(iter/s)": 1.473689 }, { "acc": 0.99893513, "epoch": 28.993821712268314, "grad_norm": 2.6030285358428955, "learning_rate": 4.089868246481273e-06, "loss": 0.01317191, "memory(GiB)": 15.03, "step": 16425, "train_speed(iter/s)": 1.473681 }, { "acc": 0.9994442, "epoch": 29.002647837599294, "grad_norm": 0.11285725235939026, "learning_rate": 4.086996233170028e-06, "loss": 0.01093685, "memory(GiB)": 15.03, "step": 16430, "train_speed(iter/s)": 1.473627 }, { "acc": 0.9974472, "epoch": 29.011473962930275, "grad_norm": 1.0864720344543457, "learning_rate": 4.084124531697475e-06, "loss": 0.02409737, "memory(GiB)": 15.03, "step": 16435, "train_speed(iter/s)": 1.473614 }, { "acc": 0.99936447, "epoch": 29.020300088261255, "grad_norm": 0.2066522240638733, "learning_rate": 4.081253143043917e-06, "loss": 0.00821337, "memory(GiB)": 15.03, "step": 16440, "train_speed(iter/s)": 1.473631 }, { "acc": 1.0, "epoch": 29.02912621359223, "grad_norm": 0.06895989924669266, "learning_rate": 4.078382068189544e-06, "loss": 0.00738868, "memory(GiB)": 15.03, "step": 16445, "train_speed(iter/s)": 1.473627 }, { "acc": 0.99790735, "epoch": 29.037952338923212, "grad_norm": 0.2716447710990906, "learning_rate": 4.0755113081144445e-06, "loss": 0.01699197, "memory(GiB)": 15.03, "step": 16450, "train_speed(iter/s)": 1.473621 }, { "acc": 0.99782181, "epoch": 29.046778464254192, "grad_norm": 2.193918228149414, "learning_rate": 4.072640863798597e-06, "loss": 0.01290005, "memory(GiB)": 15.03, "step": 16455, "train_speed(iter/s)": 1.473608 }, { "acc": 0.99910603, "epoch": 29.055604589585172, "grad_norm": 0.904459536075592, "learning_rate": 4.06977073622187e-06, "loss": 0.02062414, "memory(GiB)": 15.03, "step": 16460, "train_speed(iter/s)": 1.473596 }, { "acc": 0.9983696, "epoch": 29.064430714916153, "grad_norm": 1.5063828229904175, "learning_rate": 4.0669009263640295e-06, "loss": 0.01478682, "memory(GiB)": 15.03, "step": 16465, "train_speed(iter/s)": 1.4736 }, { "acc": 0.99917736, "epoch": 29.073256840247133, "grad_norm": 0.2639874517917633, "learning_rate": 4.064031435204726e-06, "loss": 0.00753726, "memory(GiB)": 15.03, "step": 16470, "train_speed(iter/s)": 1.473591 }, { "acc": 0.99884205, "epoch": 29.08208296557811, "grad_norm": 2.9412600994110107, "learning_rate": 4.061162263723508e-06, "loss": 0.01468429, "memory(GiB)": 15.03, "step": 16475, "train_speed(iter/s)": 1.473618 }, { "acc": 1.0, "epoch": 29.09090909090909, "grad_norm": 0.8472464680671692, "learning_rate": 4.05829341289981e-06, "loss": 0.01105104, "memory(GiB)": 15.03, "step": 16480, "train_speed(iter/s)": 1.473618 }, { "acc": 0.99935722, "epoch": 29.09973521624007, "grad_norm": 0.0781542956829071, "learning_rate": 4.0554248837129585e-06, "loss": 0.00683911, "memory(GiB)": 15.03, "step": 16485, "train_speed(iter/s)": 1.47363 }, { "acc": 0.99904537, "epoch": 29.10856134157105, "grad_norm": 0.5944424867630005, "learning_rate": 4.052556677142171e-06, "loss": 0.00882033, "memory(GiB)": 15.03, "step": 16490, "train_speed(iter/s)": 1.473646 }, { "acc": 0.99768658, "epoch": 29.11738746690203, "grad_norm": 1.336392879486084, "learning_rate": 4.0496887941665565e-06, "loss": 0.01628945, "memory(GiB)": 15.03, "step": 16495, "train_speed(iter/s)": 1.47365 }, { "acc": 0.99883785, "epoch": 29.12621359223301, "grad_norm": 0.4663721024990082, "learning_rate": 4.046821235765106e-06, "loss": 0.01029034, "memory(GiB)": 15.03, "step": 16500, "train_speed(iter/s)": 1.473666 }, { "acc": 0.99864483, "epoch": 29.135039717563988, "grad_norm": 2.046786308288574, "learning_rate": 4.043954002916711e-06, "loss": 0.01756079, "memory(GiB)": 15.03, "step": 16505, "train_speed(iter/s)": 1.473664 }, { "acc": 0.99908867, "epoch": 29.143865842894968, "grad_norm": 0.5995051264762878, "learning_rate": 4.041087096600141e-06, "loss": 0.01194989, "memory(GiB)": 15.03, "step": 16510, "train_speed(iter/s)": 1.47368 }, { "acc": 0.99906368, "epoch": 29.152691968225948, "grad_norm": 1.3334369659423828, "learning_rate": 4.038220517794063e-06, "loss": 0.00903707, "memory(GiB)": 15.03, "step": 16515, "train_speed(iter/s)": 1.473678 }, { "acc": 0.99872179, "epoch": 29.16151809355693, "grad_norm": 0.4551894962787628, "learning_rate": 4.035354267477027e-06, "loss": 0.01303024, "memory(GiB)": 15.03, "step": 16520, "train_speed(iter/s)": 1.473678 }, { "acc": 0.99879513, "epoch": 29.17034421888791, "grad_norm": 0.2905570864677429, "learning_rate": 4.03248834662747e-06, "loss": 0.01715617, "memory(GiB)": 15.03, "step": 16525, "train_speed(iter/s)": 1.473657 }, { "acc": 0.99951763, "epoch": 29.17917034421889, "grad_norm": 0.4708096981048584, "learning_rate": 4.02962275622372e-06, "loss": 0.00895682, "memory(GiB)": 15.03, "step": 16530, "train_speed(iter/s)": 1.473681 }, { "acc": 0.999191, "epoch": 29.18799646954987, "grad_norm": 0.23578383028507233, "learning_rate": 4.0267574972439925e-06, "loss": 0.00717594, "memory(GiB)": 15.03, "step": 16535, "train_speed(iter/s)": 1.473683 }, { "acc": 0.99948893, "epoch": 29.196822594880846, "grad_norm": 3.3554983139038086, "learning_rate": 4.023892570666386e-06, "loss": 0.00440198, "memory(GiB)": 15.03, "step": 16540, "train_speed(iter/s)": 1.473696 }, { "acc": 0.99862471, "epoch": 29.205648720211826, "grad_norm": 1.1194900274276733, "learning_rate": 4.021027977468889e-06, "loss": 0.01041246, "memory(GiB)": 15.03, "step": 16545, "train_speed(iter/s)": 1.473694 }, { "acc": 0.9990797, "epoch": 29.214474845542806, "grad_norm": 0.1983688771724701, "learning_rate": 4.018163718629371e-06, "loss": 0.0090965, "memory(GiB)": 15.03, "step": 16550, "train_speed(iter/s)": 1.473721 }, { "acc": 0.99938087, "epoch": 29.223300970873787, "grad_norm": 0.19200502336025238, "learning_rate": 4.015299795125596e-06, "loss": 0.0056199, "memory(GiB)": 15.03, "step": 16555, "train_speed(iter/s)": 1.473695 }, { "acc": 0.9994318, "epoch": 29.232127096204767, "grad_norm": 0.0585259273648262, "learning_rate": 4.012436207935207e-06, "loss": 0.00665655, "memory(GiB)": 15.03, "step": 16560, "train_speed(iter/s)": 1.47373 }, { "acc": 0.9980381, "epoch": 29.240953221535747, "grad_norm": 2.6446425914764404, "learning_rate": 4.009572958035734e-06, "loss": 0.01543758, "memory(GiB)": 15.03, "step": 16565, "train_speed(iter/s)": 1.473738 }, { "acc": 0.99929285, "epoch": 29.249779346866724, "grad_norm": 0.7129902243614197, "learning_rate": 4.006710046404592e-06, "loss": 0.00975298, "memory(GiB)": 15.03, "step": 16570, "train_speed(iter/s)": 1.473749 }, { "acc": 0.99894037, "epoch": 29.258605472197704, "grad_norm": 0.3416258692741394, "learning_rate": 4.00384747401908e-06, "loss": 0.00599918, "memory(GiB)": 15.03, "step": 16575, "train_speed(iter/s)": 1.473743 }, { "acc": 0.99856548, "epoch": 29.267431597528685, "grad_norm": 0.3412103056907654, "learning_rate": 4.000985241856383e-06, "loss": 0.0159366, "memory(GiB)": 15.03, "step": 16580, "train_speed(iter/s)": 1.473732 }, { "acc": 0.99783592, "epoch": 29.276257722859665, "grad_norm": 1.4302036762237549, "learning_rate": 3.998123350893563e-06, "loss": 0.01720909, "memory(GiB)": 15.03, "step": 16585, "train_speed(iter/s)": 1.473745 }, { "acc": 0.99877949, "epoch": 29.285083848190645, "grad_norm": 1.2337566614151, "learning_rate": 3.995261802107579e-06, "loss": 0.01464461, "memory(GiB)": 15.03, "step": 16590, "train_speed(iter/s)": 1.473744 }, { "acc": 0.99908123, "epoch": 29.293909973521625, "grad_norm": 0.3178923428058624, "learning_rate": 3.992400596475259e-06, "loss": 0.00702348, "memory(GiB)": 15.03, "step": 16595, "train_speed(iter/s)": 1.473751 }, { "acc": 0.9991375, "epoch": 29.302736098852602, "grad_norm": 0.45686075091362, "learning_rate": 3.989539734973324e-06, "loss": 0.00907677, "memory(GiB)": 15.03, "step": 16600, "train_speed(iter/s)": 1.473769 }, { "acc": 0.9974165, "epoch": 29.311562224183582, "grad_norm": 0.6667450666427612, "learning_rate": 3.9866792185783694e-06, "loss": 0.02521013, "memory(GiB)": 15.03, "step": 16605, "train_speed(iter/s)": 1.473772 }, { "acc": 0.99889441, "epoch": 29.320388349514563, "grad_norm": 0.3185840845108032, "learning_rate": 3.983819048266881e-06, "loss": 0.00895609, "memory(GiB)": 15.03, "step": 16610, "train_speed(iter/s)": 1.473768 }, { "acc": 0.99873352, "epoch": 29.329214474845543, "grad_norm": 1.6118860244750977, "learning_rate": 3.980959225015219e-06, "loss": 0.01944126, "memory(GiB)": 15.03, "step": 16615, "train_speed(iter/s)": 1.473765 }, { "acc": 0.99881821, "epoch": 29.338040600176523, "grad_norm": 0.3713066279888153, "learning_rate": 3.97809974979963e-06, "loss": 0.00889847, "memory(GiB)": 15.03, "step": 16620, "train_speed(iter/s)": 1.473763 }, { "acc": 0.99944191, "epoch": 29.346866725507503, "grad_norm": 0.4746209383010864, "learning_rate": 3.97524062359624e-06, "loss": 0.01081853, "memory(GiB)": 15.03, "step": 16625, "train_speed(iter/s)": 1.473773 }, { "acc": 0.99707098, "epoch": 29.355692850838484, "grad_norm": 0.6231221556663513, "learning_rate": 3.972381847381057e-06, "loss": 0.02569894, "memory(GiB)": 15.03, "step": 16630, "train_speed(iter/s)": 1.473768 }, { "acc": 0.99917831, "epoch": 29.36451897616946, "grad_norm": 1.4361616373062134, "learning_rate": 3.969523422129966e-06, "loss": 0.00808227, "memory(GiB)": 15.03, "step": 16635, "train_speed(iter/s)": 1.473769 }, { "acc": 0.9991806, "epoch": 29.37334510150044, "grad_norm": 0.5952984690666199, "learning_rate": 3.966665348818738e-06, "loss": 0.01090493, "memory(GiB)": 15.03, "step": 16640, "train_speed(iter/s)": 1.473781 }, { "acc": 0.99937515, "epoch": 29.38217122683142, "grad_norm": 1.1533141136169434, "learning_rate": 3.963807628423019e-06, "loss": 0.00959962, "memory(GiB)": 15.03, "step": 16645, "train_speed(iter/s)": 1.473771 }, { "acc": 0.99976854, "epoch": 29.3909973521624, "grad_norm": 0.05337279662489891, "learning_rate": 3.960950261918337e-06, "loss": 0.0139346, "memory(GiB)": 15.03, "step": 16650, "train_speed(iter/s)": 1.47378 }, { "acc": 0.99970932, "epoch": 29.39982347749338, "grad_norm": 0.7094597220420837, "learning_rate": 3.958093250280098e-06, "loss": 0.00659231, "memory(GiB)": 15.03, "step": 16655, "train_speed(iter/s)": 1.47377 }, { "acc": 0.99824991, "epoch": 29.40864960282436, "grad_norm": 0.2480144053697586, "learning_rate": 3.9552365944835874e-06, "loss": 0.0121726, "memory(GiB)": 15.03, "step": 16660, "train_speed(iter/s)": 1.473775 }, { "acc": 0.99803581, "epoch": 29.41747572815534, "grad_norm": 2.2744181156158447, "learning_rate": 3.952380295503968e-06, "loss": 0.01253339, "memory(GiB)": 15.03, "step": 16665, "train_speed(iter/s)": 1.473752 }, { "acc": 0.99908075, "epoch": 29.42630185348632, "grad_norm": 0.06948085874319077, "learning_rate": 3.949524354316282e-06, "loss": 0.01148444, "memory(GiB)": 15.03, "step": 16670, "train_speed(iter/s)": 1.47375 }, { "acc": 0.99938488, "epoch": 29.4351279788173, "grad_norm": 1.5319390296936035, "learning_rate": 3.946668771895451e-06, "loss": 0.01614432, "memory(GiB)": 15.03, "step": 16675, "train_speed(iter/s)": 1.473753 }, { "acc": 0.99921627, "epoch": 29.44395410414828, "grad_norm": 0.4465184807777405, "learning_rate": 3.943813549216272e-06, "loss": 0.00855913, "memory(GiB)": 15.03, "step": 16680, "train_speed(iter/s)": 1.473735 }, { "acc": 0.99836731, "epoch": 29.45278022947926, "grad_norm": 2.712343215942383, "learning_rate": 3.940958687253419e-06, "loss": 0.01455445, "memory(GiB)": 15.03, "step": 16685, "train_speed(iter/s)": 1.473714 }, { "acc": 1.0, "epoch": 29.46160635481024, "grad_norm": 0.2668696343898773, "learning_rate": 3.938104186981443e-06, "loss": 0.00597113, "memory(GiB)": 15.03, "step": 16690, "train_speed(iter/s)": 1.473714 }, { "acc": 0.99866409, "epoch": 29.470432480141216, "grad_norm": 2.4179458618164062, "learning_rate": 3.935250049374773e-06, "loss": 0.01610183, "memory(GiB)": 15.03, "step": 16695, "train_speed(iter/s)": 1.473708 }, { "acc": 0.99903936, "epoch": 29.479258605472197, "grad_norm": 0.3648051917552948, "learning_rate": 3.932396275407712e-06, "loss": 0.0097805, "memory(GiB)": 15.03, "step": 16700, "train_speed(iter/s)": 1.473692 }, { "acc": 0.99915133, "epoch": 29.488084730803177, "grad_norm": 3.3247101306915283, "learning_rate": 3.929542866054441e-06, "loss": 0.00795835, "memory(GiB)": 15.03, "step": 16705, "train_speed(iter/s)": 1.473694 }, { "acc": 0.99964952, "epoch": 29.496910856134157, "grad_norm": 0.518415093421936, "learning_rate": 3.926689822289014e-06, "loss": 0.01122842, "memory(GiB)": 15.03, "step": 16710, "train_speed(iter/s)": 1.473686 }, { "acc": 0.99857721, "epoch": 29.505736981465137, "grad_norm": 0.7078486084938049, "learning_rate": 3.923837145085363e-06, "loss": 0.01507604, "memory(GiB)": 15.03, "step": 16715, "train_speed(iter/s)": 1.473696 }, { "acc": 0.99963474, "epoch": 29.514563106796118, "grad_norm": 0.8546894192695618, "learning_rate": 3.9209848354172946e-06, "loss": 0.00953987, "memory(GiB)": 15.03, "step": 16720, "train_speed(iter/s)": 1.473714 }, { "acc": 0.99778385, "epoch": 29.523389232127098, "grad_norm": 0.6225610375404358, "learning_rate": 3.918132894258488e-06, "loss": 0.01749751, "memory(GiB)": 15.03, "step": 16725, "train_speed(iter/s)": 1.473714 }, { "acc": 0.99824333, "epoch": 29.532215357458075, "grad_norm": 1.2037144899368286, "learning_rate": 3.915281322582498e-06, "loss": 0.01430285, "memory(GiB)": 15.03, "step": 16730, "train_speed(iter/s)": 1.47371 }, { "acc": 0.99934454, "epoch": 29.541041482789055, "grad_norm": 0.9517191648483276, "learning_rate": 3.912430121362752e-06, "loss": 0.00822341, "memory(GiB)": 15.03, "step": 16735, "train_speed(iter/s)": 1.473712 }, { "acc": 0.99749908, "epoch": 29.549867608120035, "grad_norm": 0.18405228853225708, "learning_rate": 3.909579291572553e-06, "loss": 0.01955793, "memory(GiB)": 15.03, "step": 16740, "train_speed(iter/s)": 1.473741 }, { "acc": 0.99923077, "epoch": 29.558693733451015, "grad_norm": 1.0858194828033447, "learning_rate": 3.9067288341850754e-06, "loss": 0.00801703, "memory(GiB)": 15.03, "step": 16745, "train_speed(iter/s)": 1.473752 }, { "acc": 0.99877186, "epoch": 29.567519858781996, "grad_norm": 0.30302032828330994, "learning_rate": 3.903878750173366e-06, "loss": 0.01264779, "memory(GiB)": 15.03, "step": 16750, "train_speed(iter/s)": 1.473767 }, { "acc": 0.99907055, "epoch": 29.576345984112976, "grad_norm": 0.5262671113014221, "learning_rate": 3.9010290405103466e-06, "loss": 0.01205133, "memory(GiB)": 15.03, "step": 16755, "train_speed(iter/s)": 1.473762 }, { "acc": 0.99858732, "epoch": 29.585172109443953, "grad_norm": 0.6427792310714722, "learning_rate": 3.898179706168809e-06, "loss": 0.01122045, "memory(GiB)": 15.03, "step": 16760, "train_speed(iter/s)": 1.473757 }, { "acc": 0.9993269, "epoch": 29.593998234774933, "grad_norm": 0.68845134973526, "learning_rate": 3.895330748121419e-06, "loss": 0.01085377, "memory(GiB)": 15.03, "step": 16765, "train_speed(iter/s)": 1.473773 }, { "acc": 0.99950905, "epoch": 29.602824360105913, "grad_norm": 1.2871984243392944, "learning_rate": 3.89248216734071e-06, "loss": 0.01571371, "memory(GiB)": 15.03, "step": 16770, "train_speed(iter/s)": 1.473777 }, { "acc": 0.9986187, "epoch": 29.611650485436893, "grad_norm": 1.4357903003692627, "learning_rate": 3.889633964799094e-06, "loss": 0.01479416, "memory(GiB)": 15.03, "step": 16775, "train_speed(iter/s)": 1.473777 }, { "acc": 0.99825287, "epoch": 29.620476610767874, "grad_norm": 0.9702668786048889, "learning_rate": 3.886786141468845e-06, "loss": 0.0181152, "memory(GiB)": 15.03, "step": 16780, "train_speed(iter/s)": 1.473802 }, { "acc": 0.99864798, "epoch": 29.629302736098854, "grad_norm": 0.6230557560920715, "learning_rate": 3.883938698322112e-06, "loss": 0.01022204, "memory(GiB)": 15.03, "step": 16785, "train_speed(iter/s)": 1.473796 }, { "acc": 0.99853439, "epoch": 29.63812886142983, "grad_norm": 2.2259702682495117, "learning_rate": 3.881091636330917e-06, "loss": 0.01435342, "memory(GiB)": 15.03, "step": 16790, "train_speed(iter/s)": 1.473796 }, { "acc": 0.99873619, "epoch": 29.64695498676081, "grad_norm": 0.5720205903053284, "learning_rate": 3.878244956467143e-06, "loss": 0.01009723, "memory(GiB)": 15.03, "step": 16795, "train_speed(iter/s)": 1.47379 }, { "acc": 0.99783745, "epoch": 29.65578111209179, "grad_norm": 0.33864468336105347, "learning_rate": 3.875398659702556e-06, "loss": 0.02281389, "memory(GiB)": 15.03, "step": 16800, "train_speed(iter/s)": 1.473812 }, { "acc": 0.99926701, "epoch": 29.66460723742277, "grad_norm": 0.2173641175031662, "learning_rate": 3.8725527470087784e-06, "loss": 0.012614, "memory(GiB)": 15.03, "step": 16805, "train_speed(iter/s)": 1.473798 }, { "acc": 0.99800367, "epoch": 29.67343336275375, "grad_norm": 0.45898666977882385, "learning_rate": 3.869707219357309e-06, "loss": 0.02392733, "memory(GiB)": 15.03, "step": 16810, "train_speed(iter/s)": 1.473783 }, { "acc": 0.99926805, "epoch": 29.682259488084732, "grad_norm": 2.5068304538726807, "learning_rate": 3.86686207771951e-06, "loss": 0.01023634, "memory(GiB)": 15.03, "step": 16815, "train_speed(iter/s)": 1.473783 }, { "acc": 0.99924126, "epoch": 29.691085613415712, "grad_norm": 0.7182238698005676, "learning_rate": 3.8640173230666175e-06, "loss": 0.01123324, "memory(GiB)": 15.03, "step": 16820, "train_speed(iter/s)": 1.473807 }, { "acc": 0.99790354, "epoch": 29.69991173874669, "grad_norm": 2.3990437984466553, "learning_rate": 3.861172956369731e-06, "loss": 0.02637596, "memory(GiB)": 15.03, "step": 16825, "train_speed(iter/s)": 1.473805 }, { "acc": 1.0, "epoch": 29.70873786407767, "grad_norm": 0.513251006603241, "learning_rate": 3.858328978599821e-06, "loss": 0.01356073, "memory(GiB)": 15.03, "step": 16830, "train_speed(iter/s)": 1.473801 }, { "acc": 0.99857903, "epoch": 29.71756398940865, "grad_norm": 2.1667048931121826, "learning_rate": 3.855485390727721e-06, "loss": 0.01133237, "memory(GiB)": 15.03, "step": 16835, "train_speed(iter/s)": 1.473809 }, { "acc": 0.99813251, "epoch": 29.72639011473963, "grad_norm": 0.4708534777164459, "learning_rate": 3.852642193724137e-06, "loss": 0.01172903, "memory(GiB)": 15.03, "step": 16840, "train_speed(iter/s)": 1.473825 }, { "acc": 0.9987875, "epoch": 29.73521624007061, "grad_norm": 1.5623949766159058, "learning_rate": 3.849799388559634e-06, "loss": 0.01006894, "memory(GiB)": 15.03, "step": 16845, "train_speed(iter/s)": 1.473824 }, { "acc": 0.99841347, "epoch": 29.74404236540159, "grad_norm": 0.8226460218429565, "learning_rate": 3.846956976204653e-06, "loss": 0.01082755, "memory(GiB)": 15.03, "step": 16850, "train_speed(iter/s)": 1.47381 }, { "acc": 0.9990202, "epoch": 29.752868490732567, "grad_norm": 0.352660208940506, "learning_rate": 3.844114957629491e-06, "loss": 0.00696422, "memory(GiB)": 15.03, "step": 16855, "train_speed(iter/s)": 1.473799 }, { "acc": 0.99904118, "epoch": 29.761694616063547, "grad_norm": 1.3468912839889526, "learning_rate": 3.8412733338043185e-06, "loss": 0.01939898, "memory(GiB)": 15.03, "step": 16860, "train_speed(iter/s)": 1.473816 }, { "acc": 0.99873333, "epoch": 29.770520741394527, "grad_norm": 0.4881133735179901, "learning_rate": 3.838432105699166e-06, "loss": 0.0176496, "memory(GiB)": 15.03, "step": 16865, "train_speed(iter/s)": 1.473815 }, { "acc": 0.99905186, "epoch": 29.779346866725508, "grad_norm": 0.5651824474334717, "learning_rate": 3.835591274283932e-06, "loss": 0.01394438, "memory(GiB)": 15.03, "step": 16870, "train_speed(iter/s)": 1.473831 }, { "acc": 0.99811344, "epoch": 29.788172992056488, "grad_norm": 0.6764354109764099, "learning_rate": 3.832750840528376e-06, "loss": 0.01848145, "memory(GiB)": 15.03, "step": 16875, "train_speed(iter/s)": 1.473822 }, { "acc": 0.9997282, "epoch": 29.796999117387468, "grad_norm": 0.5005140900611877, "learning_rate": 3.8299108054021286e-06, "loss": 0.00742196, "memory(GiB)": 15.03, "step": 16880, "train_speed(iter/s)": 1.47382 }, { "acc": 0.99893036, "epoch": 29.805825242718445, "grad_norm": 0.436547189950943, "learning_rate": 3.827071169874673e-06, "loss": 0.01206717, "memory(GiB)": 15.03, "step": 16885, "train_speed(iter/s)": 1.47385 }, { "acc": 0.99905224, "epoch": 29.814651368049425, "grad_norm": 0.1225903257727623, "learning_rate": 3.82423193491537e-06, "loss": 0.00981991, "memory(GiB)": 15.03, "step": 16890, "train_speed(iter/s)": 1.473858 }, { "acc": 0.99951124, "epoch": 29.823477493380405, "grad_norm": 0.793908417224884, "learning_rate": 3.821393101493434e-06, "loss": 0.0093222, "memory(GiB)": 15.03, "step": 16895, "train_speed(iter/s)": 1.473869 }, { "acc": 0.99906082, "epoch": 29.832303618711386, "grad_norm": 3.7438600063323975, "learning_rate": 3.8185546705779436e-06, "loss": 0.01300521, "memory(GiB)": 15.03, "step": 16900, "train_speed(iter/s)": 1.473867 }, { "acc": 0.99851456, "epoch": 29.841129744042366, "grad_norm": 0.23412317037582397, "learning_rate": 3.815716643137843e-06, "loss": 0.01050709, "memory(GiB)": 15.03, "step": 16905, "train_speed(iter/s)": 1.473886 }, { "acc": 0.99934168, "epoch": 29.849955869373346, "grad_norm": 0.29907992482185364, "learning_rate": 3.812879020141935e-06, "loss": 0.00422298, "memory(GiB)": 15.03, "step": 16910, "train_speed(iter/s)": 1.473877 }, { "acc": 0.99960957, "epoch": 29.858781994704326, "grad_norm": 0.07465013861656189, "learning_rate": 3.810041802558888e-06, "loss": 0.01012234, "memory(GiB)": 15.03, "step": 16915, "train_speed(iter/s)": 1.473887 }, { "acc": 0.99956894, "epoch": 29.867608120035303, "grad_norm": 0.33062538504600525, "learning_rate": 3.807204991357228e-06, "loss": 0.00980427, "memory(GiB)": 15.03, "step": 16920, "train_speed(iter/s)": 1.473898 }, { "acc": 0.99873333, "epoch": 29.876434245366283, "grad_norm": 0.5093902945518494, "learning_rate": 3.8043685875053484e-06, "loss": 0.0095515, "memory(GiB)": 15.03, "step": 16925, "train_speed(iter/s)": 1.473907 }, { "acc": 1.0, "epoch": 29.885260370697264, "grad_norm": 0.04859641566872597, "learning_rate": 3.801532591971496e-06, "loss": 0.00791809, "memory(GiB)": 15.03, "step": 16930, "train_speed(iter/s)": 1.473905 }, { "acc": 0.9985714, "epoch": 29.894086496028244, "grad_norm": 0.4092571437358856, "learning_rate": 3.7986970057237843e-06, "loss": 0.01287647, "memory(GiB)": 15.03, "step": 16935, "train_speed(iter/s)": 1.473926 }, { "acc": 0.999617, "epoch": 29.902912621359224, "grad_norm": 0.12415459752082825, "learning_rate": 3.795861829730183e-06, "loss": 0.00641507, "memory(GiB)": 15.03, "step": 16940, "train_speed(iter/s)": 1.47395 }, { "acc": 0.99949379, "epoch": 29.911738746690204, "grad_norm": 0.32125967741012573, "learning_rate": 3.7930270649585256e-06, "loss": 0.00931053, "memory(GiB)": 15.03, "step": 16945, "train_speed(iter/s)": 1.473977 }, { "acc": 0.99951925, "epoch": 29.92056487202118, "grad_norm": 2.7449920177459717, "learning_rate": 3.7901927123764998e-06, "loss": 0.00793721, "memory(GiB)": 15.03, "step": 16950, "train_speed(iter/s)": 1.473991 }, { "acc": 0.99904957, "epoch": 29.92939099735216, "grad_norm": 2.380582094192505, "learning_rate": 3.7873587729516588e-06, "loss": 0.01131135, "memory(GiB)": 15.03, "step": 16955, "train_speed(iter/s)": 1.473986 }, { "acc": 0.99915886, "epoch": 29.93821712268314, "grad_norm": 0.18888580799102783, "learning_rate": 3.78452524765141e-06, "loss": 0.00671951, "memory(GiB)": 15.03, "step": 16960, "train_speed(iter/s)": 1.473987 }, { "acc": 0.9990696, "epoch": 29.947043248014122, "grad_norm": 1.3608238697052002, "learning_rate": 3.781692137443023e-06, "loss": 0.01506731, "memory(GiB)": 15.03, "step": 16965, "train_speed(iter/s)": 1.473995 }, { "acc": 0.99890041, "epoch": 29.955869373345102, "grad_norm": 4.234920501708984, "learning_rate": 3.778859443293621e-06, "loss": 0.00944352, "memory(GiB)": 15.03, "step": 16970, "train_speed(iter/s)": 1.47399 }, { "acc": 0.99943867, "epoch": 29.964695498676083, "grad_norm": 0.5832836031913757, "learning_rate": 3.776027166170192e-06, "loss": 0.01103914, "memory(GiB)": 15.03, "step": 16975, "train_speed(iter/s)": 1.473992 }, { "acc": 0.99822702, "epoch": 29.97352162400706, "grad_norm": 0.31209149956703186, "learning_rate": 3.773195307039575e-06, "loss": 0.01538163, "memory(GiB)": 15.03, "step": 16980, "train_speed(iter/s)": 1.47398 }, { "acc": 0.99873304, "epoch": 29.98234774933804, "grad_norm": 0.3648718297481537, "learning_rate": 3.7703638668684704e-06, "loss": 0.01110875, "memory(GiB)": 15.03, "step": 16985, "train_speed(iter/s)": 1.47397 }, { "acc": 0.99894123, "epoch": 29.99117387466902, "grad_norm": 0.2520592510700226, "learning_rate": 3.7675328466234335e-06, "loss": 0.0064636, "memory(GiB)": 15.03, "step": 16990, "train_speed(iter/s)": 1.473973 }, { "acc": 0.99930973, "epoch": 30.0, "grad_norm": 0.6250980496406555, "learning_rate": 3.764702247270877e-06, "loss": 0.00810805, "memory(GiB)": 15.03, "step": 16995, "train_speed(iter/s)": 1.473977 }, { "acc": 0.99955158, "epoch": 30.00882612533098, "grad_norm": 1.344116449356079, "learning_rate": 3.7618720697770716e-06, "loss": 0.0042317, "memory(GiB)": 15.03, "step": 17000, "train_speed(iter/s)": 1.47394 }, { "acc": 0.99938335, "epoch": 30.01765225066196, "grad_norm": 0.3825420141220093, "learning_rate": 3.7590423151081384e-06, "loss": 0.01887943, "memory(GiB)": 15.03, "step": 17005, "train_speed(iter/s)": 1.473963 }, { "acc": 0.99740791, "epoch": 30.02647837599294, "grad_norm": 1.7862098217010498, "learning_rate": 3.756212984230063e-06, "loss": 0.02304429, "memory(GiB)": 15.03, "step": 17010, "train_speed(iter/s)": 1.473976 }, { "acc": 0.99871254, "epoch": 30.035304501323917, "grad_norm": 0.27199485898017883, "learning_rate": 3.7533840781086777e-06, "loss": 0.00830028, "memory(GiB)": 15.03, "step": 17015, "train_speed(iter/s)": 1.473989 }, { "acc": 0.99899235, "epoch": 30.044130626654898, "grad_norm": 0.15275318920612335, "learning_rate": 3.7505555977096763e-06, "loss": 0.00683605, "memory(GiB)": 15.03, "step": 17020, "train_speed(iter/s)": 1.473996 }, { "acc": 0.99966221, "epoch": 30.052956751985878, "grad_norm": 0.31459474563598633, "learning_rate": 3.7477275439986016e-06, "loss": 0.01815262, "memory(GiB)": 15.03, "step": 17025, "train_speed(iter/s)": 1.474013 }, { "acc": 0.99937878, "epoch": 30.06178287731686, "grad_norm": 0.2870390713214874, "learning_rate": 3.744899917940857e-06, "loss": 0.01258891, "memory(GiB)": 15.03, "step": 17030, "train_speed(iter/s)": 1.474031 }, { "acc": 0.99982395, "epoch": 30.07060900264784, "grad_norm": 1.354213833808899, "learning_rate": 3.7420727205016932e-06, "loss": 0.00254773, "memory(GiB)": 15.03, "step": 17035, "train_speed(iter/s)": 1.474037 }, { "acc": 0.99923668, "epoch": 30.07943512797882, "grad_norm": 2.2274532318115234, "learning_rate": 3.7392459526462204e-06, "loss": 0.01271825, "memory(GiB)": 15.03, "step": 17040, "train_speed(iter/s)": 1.474051 }, { "acc": 0.99909782, "epoch": 30.088261253309796, "grad_norm": 2.01053786277771, "learning_rate": 3.736419615339396e-06, "loss": 0.00648232, "memory(GiB)": 15.03, "step": 17045, "train_speed(iter/s)": 1.474065 }, { "acc": 0.99908323, "epoch": 30.097087378640776, "grad_norm": 0.046651385724544525, "learning_rate": 3.73359370954604e-06, "loss": 0.00822069, "memory(GiB)": 15.03, "step": 17050, "train_speed(iter/s)": 1.47408 }, { "acc": 0.99905872, "epoch": 30.105913503971756, "grad_norm": 1.4015311002731323, "learning_rate": 3.7307682362308147e-06, "loss": 0.01239335, "memory(GiB)": 15.03, "step": 17055, "train_speed(iter/s)": 1.474096 }, { "acc": 0.99956188, "epoch": 30.114739629302736, "grad_norm": 2.010672092437744, "learning_rate": 3.7279431963582423e-06, "loss": 0.01068225, "memory(GiB)": 15.03, "step": 17060, "train_speed(iter/s)": 1.474115 }, { "acc": 0.99889631, "epoch": 30.123565754633717, "grad_norm": 2.0582115650177, "learning_rate": 3.7251185908926906e-06, "loss": 0.01104024, "memory(GiB)": 15.03, "step": 17065, "train_speed(iter/s)": 1.474113 }, { "acc": 0.9976428, "epoch": 30.132391879964697, "grad_norm": 1.8341305255889893, "learning_rate": 3.7222944207983853e-06, "loss": 0.01711619, "memory(GiB)": 15.03, "step": 17070, "train_speed(iter/s)": 1.474139 }, { "acc": 0.99878912, "epoch": 30.141218005295674, "grad_norm": 0.20152169466018677, "learning_rate": 3.7194706870393987e-06, "loss": 0.01154144, "memory(GiB)": 15.03, "step": 17075, "train_speed(iter/s)": 1.474139 }, { "acc": 0.99800596, "epoch": 30.150044130626654, "grad_norm": 0.6483573913574219, "learning_rate": 3.7166473905796592e-06, "loss": 0.01687961, "memory(GiB)": 15.03, "step": 17080, "train_speed(iter/s)": 1.47414 }, { "acc": 0.99897156, "epoch": 30.158870255957634, "grad_norm": 0.43253517150878906, "learning_rate": 3.7138245323829385e-06, "loss": 0.0121434, "memory(GiB)": 15.03, "step": 17085, "train_speed(iter/s)": 1.47414 }, { "acc": 0.99835205, "epoch": 30.167696381288614, "grad_norm": 2.0904159545898438, "learning_rate": 3.7110021134128664e-06, "loss": 0.01994619, "memory(GiB)": 15.03, "step": 17090, "train_speed(iter/s)": 1.474142 }, { "acc": 0.99979839, "epoch": 30.176522506619595, "grad_norm": 0.11691398173570633, "learning_rate": 3.708180134632918e-06, "loss": 0.00315788, "memory(GiB)": 15.03, "step": 17095, "train_speed(iter/s)": 1.474145 }, { "acc": 0.99814053, "epoch": 30.185348631950575, "grad_norm": 2.050363540649414, "learning_rate": 3.7053585970064203e-06, "loss": 0.01417287, "memory(GiB)": 15.03, "step": 17100, "train_speed(iter/s)": 1.47415 }, { "acc": 0.99943647, "epoch": 30.194174757281555, "grad_norm": 0.22508250176906586, "learning_rate": 3.7025375014965486e-06, "loss": 0.01214245, "memory(GiB)": 15.03, "step": 17105, "train_speed(iter/s)": 1.474161 }, { "acc": 0.99956884, "epoch": 30.203000882612532, "grad_norm": 0.41498100757598877, "learning_rate": 3.6997168490663277e-06, "loss": 0.00639237, "memory(GiB)": 15.03, "step": 17110, "train_speed(iter/s)": 1.474135 }, { "acc": 0.99978447, "epoch": 30.211827007943512, "grad_norm": 0.3153226375579834, "learning_rate": 3.696896640678632e-06, "loss": 0.00514999, "memory(GiB)": 15.03, "step": 17115, "train_speed(iter/s)": 1.474136 }, { "acc": 0.99912167, "epoch": 30.220653133274492, "grad_norm": 0.20765303075313568, "learning_rate": 3.6940768772961822e-06, "loss": 0.00797534, "memory(GiB)": 15.03, "step": 17120, "train_speed(iter/s)": 1.474129 }, { "acc": 0.99773102, "epoch": 30.229479258605473, "grad_norm": 0.01441156305372715, "learning_rate": 3.691257559881549e-06, "loss": 0.0245439, "memory(GiB)": 15.03, "step": 17125, "train_speed(iter/s)": 1.474139 }, { "acc": 0.99932804, "epoch": 30.238305383936453, "grad_norm": 0.20394036173820496, "learning_rate": 3.6884386893971485e-06, "loss": 0.00693307, "memory(GiB)": 15.03, "step": 17130, "train_speed(iter/s)": 1.474151 }, { "acc": 0.99888496, "epoch": 30.247131509267433, "grad_norm": 1.320525884628296, "learning_rate": 3.685620266805249e-06, "loss": 0.01822188, "memory(GiB)": 15.03, "step": 17135, "train_speed(iter/s)": 1.47415 }, { "acc": 0.9973938, "epoch": 30.25595763459841, "grad_norm": 0.5694645643234253, "learning_rate": 3.6828022930679603e-06, "loss": 0.01987962, "memory(GiB)": 15.03, "step": 17140, "train_speed(iter/s)": 1.47415 }, { "acc": 0.99866734, "epoch": 30.26478375992939, "grad_norm": 1.046366810798645, "learning_rate": 3.6799847691472427e-06, "loss": 0.01662565, "memory(GiB)": 15.03, "step": 17145, "train_speed(iter/s)": 1.474161 }, { "acc": 0.99837914, "epoch": 30.27360988526037, "grad_norm": 0.5700646638870239, "learning_rate": 3.6771676960049008e-06, "loss": 0.02344516, "memory(GiB)": 15.03, "step": 17150, "train_speed(iter/s)": 1.474163 }, { "acc": 0.99980774, "epoch": 30.28243601059135, "grad_norm": 0.45946428179740906, "learning_rate": 3.6743510746025874e-06, "loss": 0.0114418, "memory(GiB)": 15.03, "step": 17155, "train_speed(iter/s)": 1.474156 }, { "acc": 0.999582, "epoch": 30.29126213592233, "grad_norm": 0.29363593459129333, "learning_rate": 3.671534905901798e-06, "loss": 0.00738429, "memory(GiB)": 15.03, "step": 17160, "train_speed(iter/s)": 1.474153 }, { "acc": 0.99892349, "epoch": 30.30008826125331, "grad_norm": 0.7695024609565735, "learning_rate": 3.668719190863879e-06, "loss": 0.00873608, "memory(GiB)": 15.03, "step": 17165, "train_speed(iter/s)": 1.474167 }, { "acc": 0.99913759, "epoch": 30.308914386584288, "grad_norm": 1.5152629613876343, "learning_rate": 3.6659039304500138e-06, "loss": 0.01329105, "memory(GiB)": 15.03, "step": 17170, "train_speed(iter/s)": 1.474187 }, { "acc": 0.99871426, "epoch": 30.317740511915268, "grad_norm": 0.45822757482528687, "learning_rate": 3.6630891256212398e-06, "loss": 0.01298726, "memory(GiB)": 15.03, "step": 17175, "train_speed(iter/s)": 1.474195 }, { "acc": 0.99973955, "epoch": 30.32656663724625, "grad_norm": 0.15507614612579346, "learning_rate": 3.66027477733843e-06, "loss": 0.00234969, "memory(GiB)": 15.03, "step": 17180, "train_speed(iter/s)": 1.474193 }, { "acc": 0.9995472, "epoch": 30.33539276257723, "grad_norm": 1.2342498302459717, "learning_rate": 3.6574608865623095e-06, "loss": 0.01204998, "memory(GiB)": 15.03, "step": 17185, "train_speed(iter/s)": 1.474186 }, { "acc": 0.99722805, "epoch": 30.34421888790821, "grad_norm": 1.2736796140670776, "learning_rate": 3.654647454253442e-06, "loss": 0.02821025, "memory(GiB)": 15.03, "step": 17190, "train_speed(iter/s)": 1.474193 }, { "acc": 0.99853601, "epoch": 30.35304501323919, "grad_norm": 0.20673604309558868, "learning_rate": 3.6518344813722366e-06, "loss": 0.00983078, "memory(GiB)": 15.03, "step": 17195, "train_speed(iter/s)": 1.474197 }, { "acc": 0.99864168, "epoch": 30.36187113857017, "grad_norm": 0.49696049094200134, "learning_rate": 3.649021968878946e-06, "loss": 0.0132506, "memory(GiB)": 15.03, "step": 17200, "train_speed(iter/s)": 1.47417 }, { "acc": 0.99888668, "epoch": 30.370697263901146, "grad_norm": 0.7877345681190491, "learning_rate": 3.6462099177336633e-06, "loss": 0.01211423, "memory(GiB)": 15.03, "step": 17205, "train_speed(iter/s)": 1.474177 }, { "acc": 0.99917412, "epoch": 30.379523389232126, "grad_norm": 0.2953098714351654, "learning_rate": 3.643398328896327e-06, "loss": 0.01241673, "memory(GiB)": 15.03, "step": 17210, "train_speed(iter/s)": 1.47418 }, { "acc": 0.99919558, "epoch": 30.388349514563107, "grad_norm": 0.6007687449455261, "learning_rate": 3.6405872033267148e-06, "loss": 0.01005491, "memory(GiB)": 15.03, "step": 17215, "train_speed(iter/s)": 1.4742 }, { "acc": 0.99855032, "epoch": 30.397175639894087, "grad_norm": 1.960715413093567, "learning_rate": 3.637776541984452e-06, "loss": 0.01683781, "memory(GiB)": 15.03, "step": 17220, "train_speed(iter/s)": 1.474202 }, { "acc": 1.0, "epoch": 30.406001765225067, "grad_norm": 0.530565083026886, "learning_rate": 3.6349663458289976e-06, "loss": 0.00989882, "memory(GiB)": 15.03, "step": 17225, "train_speed(iter/s)": 1.474214 }, { "acc": 1.0, "epoch": 30.414827890556047, "grad_norm": 0.2534385323524475, "learning_rate": 3.632156615819658e-06, "loss": 0.00497842, "memory(GiB)": 15.03, "step": 17230, "train_speed(iter/s)": 1.474229 }, { "acc": 0.99913006, "epoch": 30.423654015887024, "grad_norm": 0.424477219581604, "learning_rate": 3.6293473529155765e-06, "loss": 0.00922771, "memory(GiB)": 15.03, "step": 17235, "train_speed(iter/s)": 1.474215 }, { "acc": 0.99843674, "epoch": 30.432480141218004, "grad_norm": 1.5936360359191895, "learning_rate": 3.6265385580757407e-06, "loss": 0.01177749, "memory(GiB)": 15.03, "step": 17240, "train_speed(iter/s)": 1.474197 }, { "acc": 0.99889526, "epoch": 30.441306266548985, "grad_norm": 0.1416434496641159, "learning_rate": 3.623730232258974e-06, "loss": 0.00680257, "memory(GiB)": 15.03, "step": 17245, "train_speed(iter/s)": 1.474199 }, { "acc": 0.99989033, "epoch": 30.450132391879965, "grad_norm": 0.21064814925193787, "learning_rate": 3.6209223764239436e-06, "loss": 0.00521761, "memory(GiB)": 15.03, "step": 17250, "train_speed(iter/s)": 1.474213 }, { "acc": 0.99822226, "epoch": 30.458958517210945, "grad_norm": 2.7239186763763428, "learning_rate": 3.6181149915291525e-06, "loss": 0.00971552, "memory(GiB)": 15.03, "step": 17255, "train_speed(iter/s)": 1.474225 }, { "acc": 0.99893456, "epoch": 30.467784642541925, "grad_norm": 0.4034622311592102, "learning_rate": 3.615308078532949e-06, "loss": 0.01170672, "memory(GiB)": 15.03, "step": 17260, "train_speed(iter/s)": 1.474221 }, { "acc": 0.99932213, "epoch": 30.476610767872902, "grad_norm": 0.20190270245075226, "learning_rate": 3.612501638393513e-06, "loss": 0.00913145, "memory(GiB)": 15.03, "step": 17265, "train_speed(iter/s)": 1.474233 }, { "acc": 0.99955769, "epoch": 30.485436893203882, "grad_norm": 0.33712831139564514, "learning_rate": 3.609695672068869e-06, "loss": 0.00682355, "memory(GiB)": 15.03, "step": 17270, "train_speed(iter/s)": 1.474227 }, { "acc": 0.99937258, "epoch": 30.494263018534863, "grad_norm": 0.47640588879585266, "learning_rate": 3.6068901805168736e-06, "loss": 0.00681776, "memory(GiB)": 15.03, "step": 17275, "train_speed(iter/s)": 1.474223 }, { "acc": 0.99881611, "epoch": 30.503089143865843, "grad_norm": 2.483633518218994, "learning_rate": 3.604085164695229e-06, "loss": 0.01515535, "memory(GiB)": 15.03, "step": 17280, "train_speed(iter/s)": 1.474192 }, { "acc": 0.99934521, "epoch": 30.511915269196823, "grad_norm": 0.6431764960289001, "learning_rate": 3.601280625561468e-06, "loss": 0.00865277, "memory(GiB)": 15.03, "step": 17285, "train_speed(iter/s)": 1.4742 }, { "acc": 0.99938793, "epoch": 30.520741394527803, "grad_norm": 1.0443387031555176, "learning_rate": 3.5984765640729656e-06, "loss": 0.00856991, "memory(GiB)": 15.03, "step": 17290, "train_speed(iter/s)": 1.474186 }, { "acc": 0.99890747, "epoch": 30.529567519858784, "grad_norm": 1.3873252868652344, "learning_rate": 3.5956729811869303e-06, "loss": 0.00787514, "memory(GiB)": 15.03, "step": 17295, "train_speed(iter/s)": 1.474192 }, { "acc": 0.99967947, "epoch": 30.53839364518976, "grad_norm": 0.6474394202232361, "learning_rate": 3.5928698778604087e-06, "loss": 0.01080838, "memory(GiB)": 15.03, "step": 17300, "train_speed(iter/s)": 1.474201 }, { "acc": 0.9985796, "epoch": 30.54721977052074, "grad_norm": 1.0150182247161865, "learning_rate": 3.5900672550502825e-06, "loss": 0.01379482, "memory(GiB)": 15.03, "step": 17305, "train_speed(iter/s)": 1.474225 }, { "acc": 0.99861336, "epoch": 30.55604589585172, "grad_norm": 1.9460017681121826, "learning_rate": 3.5872651137132745e-06, "loss": 0.02110481, "memory(GiB)": 15.03, "step": 17310, "train_speed(iter/s)": 1.47423 }, { "acc": 0.99836006, "epoch": 30.5648720211827, "grad_norm": 0.09822236746549606, "learning_rate": 3.5844634548059353e-06, "loss": 0.00771322, "memory(GiB)": 15.03, "step": 17315, "train_speed(iter/s)": 1.474235 }, { "acc": 0.999259, "epoch": 30.57369814651368, "grad_norm": 0.46634626388549805, "learning_rate": 3.581662279284656e-06, "loss": 0.00727947, "memory(GiB)": 15.03, "step": 17320, "train_speed(iter/s)": 1.474242 }, { "acc": 0.9994957, "epoch": 30.58252427184466, "grad_norm": 0.4895884394645691, "learning_rate": 3.5788615881056624e-06, "loss": 0.00709697, "memory(GiB)": 15.03, "step": 17325, "train_speed(iter/s)": 1.474241 }, { "acc": 0.99889183, "epoch": 30.59135039717564, "grad_norm": 0.40630531311035156, "learning_rate": 3.576061382225011e-06, "loss": 0.0136903, "memory(GiB)": 15.03, "step": 17330, "train_speed(iter/s)": 1.474231 }, { "acc": 0.99895763, "epoch": 30.60017652250662, "grad_norm": 0.3345783054828644, "learning_rate": 3.5732616625986e-06, "loss": 0.01261804, "memory(GiB)": 15.03, "step": 17335, "train_speed(iter/s)": 1.474236 }, { "acc": 0.99943733, "epoch": 30.6090026478376, "grad_norm": 0.1958661675453186, "learning_rate": 3.5704624301821517e-06, "loss": 0.00480332, "memory(GiB)": 15.03, "step": 17340, "train_speed(iter/s)": 1.474235 }, { "acc": 0.99907246, "epoch": 30.61782877316858, "grad_norm": 0.4325885772705078, "learning_rate": 3.567663685931233e-06, "loss": 0.0107888, "memory(GiB)": 15.03, "step": 17345, "train_speed(iter/s)": 1.474214 }, { "acc": 0.99945927, "epoch": 30.62665489849956, "grad_norm": 0.32369929552078247, "learning_rate": 3.564865430801235e-06, "loss": 0.00869426, "memory(GiB)": 15.03, "step": 17350, "train_speed(iter/s)": 1.474212 }, { "acc": 0.99942598, "epoch": 30.63548102383054, "grad_norm": 1.2278088331222534, "learning_rate": 3.562067665747387e-06, "loss": 0.01328557, "memory(GiB)": 15.03, "step": 17355, "train_speed(iter/s)": 1.474227 }, { "acc": 0.99919968, "epoch": 30.644307149161516, "grad_norm": 3.027247905731201, "learning_rate": 3.5592703917247483e-06, "loss": 0.0063359, "memory(GiB)": 15.03, "step": 17360, "train_speed(iter/s)": 1.47423 }, { "acc": 0.99929762, "epoch": 30.653133274492497, "grad_norm": 0.20024386048316956, "learning_rate": 3.556473609688214e-06, "loss": 0.00493113, "memory(GiB)": 15.03, "step": 17365, "train_speed(iter/s)": 1.474249 }, { "acc": 0.99824619, "epoch": 30.661959399823477, "grad_norm": 0.126887708902359, "learning_rate": 3.5536773205925074e-06, "loss": 0.01722056, "memory(GiB)": 15.03, "step": 17370, "train_speed(iter/s)": 1.474243 }, { "acc": 0.99774284, "epoch": 30.670785525154457, "grad_norm": 0.48421353101730347, "learning_rate": 3.550881525392186e-06, "loss": 0.02427603, "memory(GiB)": 15.03, "step": 17375, "train_speed(iter/s)": 1.474224 }, { "acc": 0.99957523, "epoch": 30.679611650485437, "grad_norm": 0.2176526039838791, "learning_rate": 3.5480862250416353e-06, "loss": 0.00755497, "memory(GiB)": 15.03, "step": 17380, "train_speed(iter/s)": 1.474217 }, { "acc": 0.99960079, "epoch": 30.688437775816418, "grad_norm": 0.2066309005022049, "learning_rate": 3.5452914204950793e-06, "loss": 0.00536587, "memory(GiB)": 15.03, "step": 17385, "train_speed(iter/s)": 1.474225 }, { "acc": 0.9986414, "epoch": 30.697263901147398, "grad_norm": 0.5198217630386353, "learning_rate": 3.5424971127065645e-06, "loss": 0.01110607, "memory(GiB)": 15.03, "step": 17390, "train_speed(iter/s)": 1.474238 }, { "acc": 0.99953346, "epoch": 30.706090026478375, "grad_norm": 0.24453400075435638, "learning_rate": 3.539703302629972e-06, "loss": 0.01062232, "memory(GiB)": 15.03, "step": 17395, "train_speed(iter/s)": 1.474264 }, { "acc": 0.99771671, "epoch": 30.714916151809355, "grad_norm": 1.5858347415924072, "learning_rate": 3.5369099912190117e-06, "loss": 0.02636093, "memory(GiB)": 15.03, "step": 17400, "train_speed(iter/s)": 1.474266 }, { "acc": 0.99773521, "epoch": 30.723742277140335, "grad_norm": 1.227353572845459, "learning_rate": 3.534117179427226e-06, "loss": 0.02292922, "memory(GiB)": 15.03, "step": 17405, "train_speed(iter/s)": 1.474249 }, { "acc": 0.99972219, "epoch": 30.732568402471315, "grad_norm": 0.2249934822320938, "learning_rate": 3.5313248682079813e-06, "loss": 0.00318493, "memory(GiB)": 15.03, "step": 17410, "train_speed(iter/s)": 1.474251 }, { "acc": 0.9998311, "epoch": 30.741394527802296, "grad_norm": 0.01625652238726616, "learning_rate": 3.5285330585144794e-06, "loss": 0.01045587, "memory(GiB)": 15.03, "step": 17415, "train_speed(iter/s)": 1.474264 }, { "acc": 0.99890633, "epoch": 30.750220653133276, "grad_norm": 1.2417569160461426, "learning_rate": 3.5257417512997462e-06, "loss": 0.00599176, "memory(GiB)": 15.03, "step": 17420, "train_speed(iter/s)": 1.474234 }, { "acc": 0.99956799, "epoch": 30.759046778464253, "grad_norm": 0.20844914019107819, "learning_rate": 3.522950947516638e-06, "loss": 0.00814292, "memory(GiB)": 15.03, "step": 17425, "train_speed(iter/s)": 1.474233 }, { "acc": 0.99917679, "epoch": 30.767872903795233, "grad_norm": 1.3166559934616089, "learning_rate": 3.5201606481178373e-06, "loss": 0.01215774, "memory(GiB)": 15.03, "step": 17430, "train_speed(iter/s)": 1.474245 }, { "acc": 0.99868803, "epoch": 30.776699029126213, "grad_norm": 0.040575698018074036, "learning_rate": 3.517370854055859e-06, "loss": 0.01421704, "memory(GiB)": 15.03, "step": 17435, "train_speed(iter/s)": 1.474275 }, { "acc": 0.99969873, "epoch": 30.785525154457194, "grad_norm": 0.03203370049595833, "learning_rate": 3.5145815662830424e-06, "loss": 0.01098429, "memory(GiB)": 15.03, "step": 17440, "train_speed(iter/s)": 1.474285 }, { "acc": 0.99916153, "epoch": 30.794351279788174, "grad_norm": 0.416576623916626, "learning_rate": 3.511792785751553e-06, "loss": 0.01290686, "memory(GiB)": 15.03, "step": 17445, "train_speed(iter/s)": 1.4743 }, { "acc": 0.99983978, "epoch": 30.803177405119154, "grad_norm": 0.3404448330402374, "learning_rate": 3.5090045134133855e-06, "loss": 0.00472764, "memory(GiB)": 15.03, "step": 17450, "train_speed(iter/s)": 1.47431 }, { "acc": 1.0, "epoch": 30.81200353045013, "grad_norm": 0.48619765043258667, "learning_rate": 3.506216750220358e-06, "loss": 0.00377908, "memory(GiB)": 15.03, "step": 17455, "train_speed(iter/s)": 1.474344 }, { "acc": 0.99888163, "epoch": 30.82082965578111, "grad_norm": 2.3558361530303955, "learning_rate": 3.503429497124119e-06, "loss": 0.01278456, "memory(GiB)": 15.03, "step": 17460, "train_speed(iter/s)": 1.474351 }, { "acc": 0.99969511, "epoch": 30.82965578111209, "grad_norm": 0.5667276382446289, "learning_rate": 3.500642755076137e-06, "loss": 0.00776756, "memory(GiB)": 15.03, "step": 17465, "train_speed(iter/s)": 1.474359 }, { "acc": 1.0, "epoch": 30.83848190644307, "grad_norm": 0.4994390904903412, "learning_rate": 3.4978565250277147e-06, "loss": 0.00660243, "memory(GiB)": 15.03, "step": 17470, "train_speed(iter/s)": 1.474383 }, { "acc": 0.99901276, "epoch": 30.847308031774052, "grad_norm": 0.12842704355716705, "learning_rate": 3.4950708079299705e-06, "loss": 0.00995709, "memory(GiB)": 15.03, "step": 17475, "train_speed(iter/s)": 1.474392 }, { "acc": 0.99941101, "epoch": 30.856134157105032, "grad_norm": 0.06352908164262772, "learning_rate": 3.4922856047338565e-06, "loss": 0.00816749, "memory(GiB)": 15.03, "step": 17480, "train_speed(iter/s)": 1.474364 }, { "acc": 1.0, "epoch": 30.864960282436012, "grad_norm": 0.33188188076019287, "learning_rate": 3.4895009163901406e-06, "loss": 0.01000675, "memory(GiB)": 15.03, "step": 17485, "train_speed(iter/s)": 1.47435 }, { "acc": 0.99908161, "epoch": 30.87378640776699, "grad_norm": 0.34641823172569275, "learning_rate": 3.486716743849423e-06, "loss": 0.00558228, "memory(GiB)": 15.03, "step": 17490, "train_speed(iter/s)": 1.474355 }, { "acc": 0.99900894, "epoch": 30.88261253309797, "grad_norm": 1.325012445449829, "learning_rate": 3.4839330880621214e-06, "loss": 0.00829087, "memory(GiB)": 15.03, "step": 17495, "train_speed(iter/s)": 1.474357 }, { "acc": 0.99756393, "epoch": 30.89143865842895, "grad_norm": 0.6918007731437683, "learning_rate": 3.481149949978482e-06, "loss": 0.01783471, "memory(GiB)": 15.03, "step": 17500, "train_speed(iter/s)": 1.474371 }, { "acc": 0.99875937, "epoch": 30.90026478375993, "grad_norm": 6.039026260375977, "learning_rate": 3.4783673305485706e-06, "loss": 0.01100167, "memory(GiB)": 15.03, "step": 17505, "train_speed(iter/s)": 1.47435 }, { "acc": 0.99912596, "epoch": 30.90909090909091, "grad_norm": 0.15536515414714813, "learning_rate": 3.4755852307222788e-06, "loss": 0.0096382, "memory(GiB)": 15.03, "step": 17510, "train_speed(iter/s)": 1.474328 }, { "acc": 0.99930315, "epoch": 30.91791703442189, "grad_norm": 1.668602705001831, "learning_rate": 3.472803651449319e-06, "loss": 0.0128047, "memory(GiB)": 15.03, "step": 17515, "train_speed(iter/s)": 1.474314 }, { "acc": 0.998316, "epoch": 30.926743159752867, "grad_norm": 0.9786022305488586, "learning_rate": 3.4700225936792276e-06, "loss": 0.0159916, "memory(GiB)": 15.03, "step": 17520, "train_speed(iter/s)": 1.47434 }, { "acc": 0.99963017, "epoch": 30.935569285083847, "grad_norm": 0.03900575637817383, "learning_rate": 3.4672420583613603e-06, "loss": 0.00640708, "memory(GiB)": 15.03, "step": 17525, "train_speed(iter/s)": 1.474358 }, { "acc": 0.9998311, "epoch": 30.944395410414828, "grad_norm": 0.6283926963806152, "learning_rate": 3.4644620464448962e-06, "loss": 0.00773678, "memory(GiB)": 15.03, "step": 17530, "train_speed(iter/s)": 1.474353 }, { "acc": 0.99854078, "epoch": 30.953221535745808, "grad_norm": 0.49220696091651917, "learning_rate": 3.461682558878836e-06, "loss": 0.01269403, "memory(GiB)": 15.03, "step": 17535, "train_speed(iter/s)": 1.47435 }, { "acc": 0.99929247, "epoch": 30.962047661076788, "grad_norm": 0.317823201417923, "learning_rate": 3.4589035966120003e-06, "loss": 0.00715988, "memory(GiB)": 15.03, "step": 17540, "train_speed(iter/s)": 1.474363 }, { "acc": 0.99969511, "epoch": 30.97087378640777, "grad_norm": 0.022727783769369125, "learning_rate": 3.456125160593033e-06, "loss": 0.0051218, "memory(GiB)": 15.03, "step": 17545, "train_speed(iter/s)": 1.474339 }, { "acc": 0.99913597, "epoch": 30.979699911738745, "grad_norm": 0.1277775764465332, "learning_rate": 3.4533472517703926e-06, "loss": 0.01266499, "memory(GiB)": 15.03, "step": 17550, "train_speed(iter/s)": 1.474366 }, { "acc": 0.99913177, "epoch": 30.988526037069725, "grad_norm": 1.5707734823226929, "learning_rate": 3.4505698710923657e-06, "loss": 0.01105264, "memory(GiB)": 15.03, "step": 17555, "train_speed(iter/s)": 1.474366 }, { "acc": 0.99913731, "epoch": 30.997352162400706, "grad_norm": 0.8645176291465759, "learning_rate": 3.4477930195070513e-06, "loss": 0.01469287, "memory(GiB)": 15.03, "step": 17560, "train_speed(iter/s)": 1.474378 }, { "acc": 0.99899292, "epoch": 31.006178287731686, "grad_norm": 0.6549779176712036, "learning_rate": 3.4450166979623734e-06, "loss": 0.02233045, "memory(GiB)": 15.03, "step": 17565, "train_speed(iter/s)": 1.474348 }, { "acc": 0.99920216, "epoch": 31.015004413062666, "grad_norm": 0.47934576869010925, "learning_rate": 3.4422409074060694e-06, "loss": 0.00616006, "memory(GiB)": 15.03, "step": 17570, "train_speed(iter/s)": 1.474359 }, { "acc": 0.99781284, "epoch": 31.023830538393646, "grad_norm": 0.7060786485671997, "learning_rate": 3.4394656487857007e-06, "loss": 0.01618712, "memory(GiB)": 15.03, "step": 17575, "train_speed(iter/s)": 1.474372 }, { "acc": 0.99939947, "epoch": 31.032656663724627, "grad_norm": 1.038880467414856, "learning_rate": 3.436690923048644e-06, "loss": 0.00747546, "memory(GiB)": 15.03, "step": 17580, "train_speed(iter/s)": 1.474395 }, { "acc": 0.99983559, "epoch": 31.041482789055603, "grad_norm": 0.19940373301506042, "learning_rate": 3.433916731142096e-06, "loss": 0.00771412, "memory(GiB)": 15.03, "step": 17585, "train_speed(iter/s)": 1.474396 }, { "acc": 0.99938908, "epoch": 31.050308914386584, "grad_norm": 2.0571627616882324, "learning_rate": 3.431143074013067e-06, "loss": 0.0162762, "memory(GiB)": 15.03, "step": 17590, "train_speed(iter/s)": 1.474382 }, { "acc": 0.99922075, "epoch": 31.059135039717564, "grad_norm": 0.36986562609672546, "learning_rate": 3.4283699526083923e-06, "loss": 0.00601839, "memory(GiB)": 15.03, "step": 17595, "train_speed(iter/s)": 1.474382 }, { "acc": 0.99957628, "epoch": 31.067961165048544, "grad_norm": 1.8502092361450195, "learning_rate": 3.4255973678747167e-06, "loss": 0.01046253, "memory(GiB)": 15.03, "step": 17600, "train_speed(iter/s)": 1.474374 }, { "acc": 0.99949045, "epoch": 31.076787290379524, "grad_norm": 0.4278129041194916, "learning_rate": 3.422825320758508e-06, "loss": 0.0074069, "memory(GiB)": 15.03, "step": 17605, "train_speed(iter/s)": 1.474371 }, { "acc": 0.99910717, "epoch": 31.085613415710505, "grad_norm": 0.5707171559333801, "learning_rate": 3.4200538122060444e-06, "loss": 0.0100201, "memory(GiB)": 15.03, "step": 17610, "train_speed(iter/s)": 1.474342 }, { "acc": 0.99951763, "epoch": 31.09443954104148, "grad_norm": 0.49128586053848267, "learning_rate": 3.4172828431634265e-06, "loss": 0.0057721, "memory(GiB)": 15.03, "step": 17615, "train_speed(iter/s)": 1.474357 }, { "acc": 0.99954443, "epoch": 31.10326566637246, "grad_norm": 0.22713841497898102, "learning_rate": 3.4145124145765648e-06, "loss": 0.0060826, "memory(GiB)": 15.03, "step": 17620, "train_speed(iter/s)": 1.47436 }, { "acc": 0.99950504, "epoch": 31.112091791703442, "grad_norm": 0.270831823348999, "learning_rate": 3.411742527391189e-06, "loss": 0.01042523, "memory(GiB)": 15.03, "step": 17625, "train_speed(iter/s)": 1.474375 }, { "acc": 0.99904079, "epoch": 31.120917917034422, "grad_norm": 2.979966402053833, "learning_rate": 3.408973182552844e-06, "loss": 0.00941696, "memory(GiB)": 15.03, "step": 17630, "train_speed(iter/s)": 1.474396 }, { "acc": 0.99968224, "epoch": 31.129744042365402, "grad_norm": 0.4663883447647095, "learning_rate": 3.406204381006888e-06, "loss": 0.00446607, "memory(GiB)": 15.03, "step": 17635, "train_speed(iter/s)": 1.474413 }, { "acc": 0.99911108, "epoch": 31.138570167696383, "grad_norm": 3.610180616378784, "learning_rate": 3.4034361236984923e-06, "loss": 0.01326562, "memory(GiB)": 15.03, "step": 17640, "train_speed(iter/s)": 1.474394 }, { "acc": 0.99908981, "epoch": 31.14739629302736, "grad_norm": 0.393912672996521, "learning_rate": 3.400668411572649e-06, "loss": 0.0103963, "memory(GiB)": 15.03, "step": 17645, "train_speed(iter/s)": 1.4744 }, { "acc": 0.99781551, "epoch": 31.15622241835834, "grad_norm": 0.0607539527118206, "learning_rate": 3.397901245574156e-06, "loss": 0.01640662, "memory(GiB)": 15.03, "step": 17650, "train_speed(iter/s)": 1.474378 }, { "acc": 0.9995142, "epoch": 31.16504854368932, "grad_norm": 0.9380244016647339, "learning_rate": 3.39513462664763e-06, "loss": 0.00829846, "memory(GiB)": 15.03, "step": 17655, "train_speed(iter/s)": 1.47438 }, { "acc": 0.99937725, "epoch": 31.1738746690203, "grad_norm": 0.5274990200996399, "learning_rate": 3.3923685557374996e-06, "loss": 0.00994879, "memory(GiB)": 15.03, "step": 17660, "train_speed(iter/s)": 1.474394 }, { "acc": 0.99921217, "epoch": 31.18270079435128, "grad_norm": 0.9208670854568481, "learning_rate": 3.389603033788004e-06, "loss": 0.01606364, "memory(GiB)": 15.03, "step": 17665, "train_speed(iter/s)": 1.474405 }, { "acc": 0.99875698, "epoch": 31.19152691968226, "grad_norm": 0.6521402597427368, "learning_rate": 3.3868380617431996e-06, "loss": 0.0125856, "memory(GiB)": 15.03, "step": 17670, "train_speed(iter/s)": 1.474417 }, { "acc": 1.0, "epoch": 31.20035304501324, "grad_norm": 0.5712131261825562, "learning_rate": 3.3840736405469495e-06, "loss": 0.00601209, "memory(GiB)": 15.03, "step": 17675, "train_speed(iter/s)": 1.474419 }, { "acc": 0.99935713, "epoch": 31.209179170344218, "grad_norm": 0.7366440296173096, "learning_rate": 3.3813097711429356e-06, "loss": 0.01147451, "memory(GiB)": 15.03, "step": 17680, "train_speed(iter/s)": 1.474415 }, { "acc": 0.99840775, "epoch": 31.218005295675198, "grad_norm": 1.3173965215682983, "learning_rate": 3.3785464544746443e-06, "loss": 0.01230726, "memory(GiB)": 15.03, "step": 17685, "train_speed(iter/s)": 1.474425 }, { "acc": 0.99932289, "epoch": 31.226831421006178, "grad_norm": 0.2528935968875885, "learning_rate": 3.3757836914853797e-06, "loss": 0.01064018, "memory(GiB)": 15.03, "step": 17690, "train_speed(iter/s)": 1.474411 }, { "acc": 0.99908009, "epoch": 31.23565754633716, "grad_norm": 0.33173859119415283, "learning_rate": 3.3730214831182507e-06, "loss": 0.00619737, "memory(GiB)": 15.03, "step": 17695, "train_speed(iter/s)": 1.474408 }, { "acc": 0.99913483, "epoch": 31.24448367166814, "grad_norm": 0.381234347820282, "learning_rate": 3.3702598303161833e-06, "loss": 0.00840758, "memory(GiB)": 15.03, "step": 17700, "train_speed(iter/s)": 1.47439 }, { "acc": 0.99905357, "epoch": 31.25330979699912, "grad_norm": 0.1401083916425705, "learning_rate": 3.3674987340219066e-06, "loss": 0.00876533, "memory(GiB)": 15.03, "step": 17705, "train_speed(iter/s)": 1.474411 }, { "acc": 0.99889679, "epoch": 31.262135922330096, "grad_norm": 0.007109501399099827, "learning_rate": 3.3647381951779675e-06, "loss": 0.0087396, "memory(GiB)": 15.03, "step": 17710, "train_speed(iter/s)": 1.474411 }, { "acc": 0.99960556, "epoch": 31.270962047661076, "grad_norm": 0.11604813486337662, "learning_rate": 3.3619782147267146e-06, "loss": 0.00445918, "memory(GiB)": 15.03, "step": 17715, "train_speed(iter/s)": 1.474396 }, { "acc": 0.99981613, "epoch": 31.279788172992056, "grad_norm": 0.18889188766479492, "learning_rate": 3.359218793610315e-06, "loss": 0.00342791, "memory(GiB)": 15.03, "step": 17720, "train_speed(iter/s)": 1.474391 }, { "acc": 0.99969511, "epoch": 31.288614298323036, "grad_norm": 0.3278385102748871, "learning_rate": 3.3564599327707363e-06, "loss": 0.00478234, "memory(GiB)": 15.03, "step": 17725, "train_speed(iter/s)": 1.474406 }, { "acc": 0.99951267, "epoch": 31.297440423654017, "grad_norm": 0.5210908055305481, "learning_rate": 3.3537016331497614e-06, "loss": 0.0092902, "memory(GiB)": 15.03, "step": 17730, "train_speed(iter/s)": 1.474426 }, { "acc": 0.99865646, "epoch": 31.306266548984997, "grad_norm": 0.34686189889907837, "learning_rate": 3.350943895688975e-06, "loss": 0.01366419, "memory(GiB)": 15.03, "step": 17735, "train_speed(iter/s)": 1.474417 }, { "acc": 0.99936066, "epoch": 31.315092674315974, "grad_norm": 0.29115554690361023, "learning_rate": 3.3481867213297766e-06, "loss": 0.01110061, "memory(GiB)": 15.03, "step": 17740, "train_speed(iter/s)": 1.474428 }, { "acc": 0.99934216, "epoch": 31.323918799646954, "grad_norm": 0.5594719648361206, "learning_rate": 3.34543011101337e-06, "loss": 0.00855054, "memory(GiB)": 15.03, "step": 17745, "train_speed(iter/s)": 1.474429 }, { "acc": 0.99948959, "epoch": 31.332744924977934, "grad_norm": 0.008340579457581043, "learning_rate": 3.342674065680766e-06, "loss": 0.00461514, "memory(GiB)": 15.03, "step": 17750, "train_speed(iter/s)": 1.474448 }, { "acc": 0.99854507, "epoch": 31.341571050308914, "grad_norm": 0.3552601635456085, "learning_rate": 3.339918586272783e-06, "loss": 0.01482536, "memory(GiB)": 15.03, "step": 17755, "train_speed(iter/s)": 1.474458 }, { "acc": 0.99961882, "epoch": 31.350397175639895, "grad_norm": 0.31322723627090454, "learning_rate": 3.3371636737300465e-06, "loss": 0.0059231, "memory(GiB)": 15.03, "step": 17760, "train_speed(iter/s)": 1.474444 }, { "acc": 0.99906025, "epoch": 31.359223300970875, "grad_norm": 0.38944685459136963, "learning_rate": 3.3344093289929917e-06, "loss": 0.00961075, "memory(GiB)": 15.03, "step": 17765, "train_speed(iter/s)": 1.474444 }, { "acc": 0.99973965, "epoch": 31.368049426301855, "grad_norm": 0.35497361421585083, "learning_rate": 3.331655553001855e-06, "loss": 0.00437502, "memory(GiB)": 15.03, "step": 17770, "train_speed(iter/s)": 1.47445 }, { "acc": 0.99942741, "epoch": 31.376875551632832, "grad_norm": 0.4477832317352295, "learning_rate": 3.32890234669668e-06, "loss": 0.00996907, "memory(GiB)": 15.03, "step": 17775, "train_speed(iter/s)": 1.474443 }, { "acc": 0.99938097, "epoch": 31.385701676963812, "grad_norm": 0.3248356878757477, "learning_rate": 3.3261497110173174e-06, "loss": 0.00951055, "memory(GiB)": 15.03, "step": 17780, "train_speed(iter/s)": 1.474435 }, { "acc": 0.99976854, "epoch": 31.394527802294792, "grad_norm": 0.7199221253395081, "learning_rate": 3.3233976469034224e-06, "loss": 0.00816404, "memory(GiB)": 15.03, "step": 17785, "train_speed(iter/s)": 1.474454 }, { "acc": 0.99891434, "epoch": 31.403353927625773, "grad_norm": 0.060719966888427734, "learning_rate": 3.3206461552944526e-06, "loss": 0.00569419, "memory(GiB)": 15.03, "step": 17790, "train_speed(iter/s)": 1.474461 }, { "acc": 0.99943466, "epoch": 31.412180052956753, "grad_norm": 0.07889757305383682, "learning_rate": 3.3178952371296752e-06, "loss": 0.00768059, "memory(GiB)": 15.03, "step": 17795, "train_speed(iter/s)": 1.474463 }, { "acc": 0.99975967, "epoch": 31.421006178287733, "grad_norm": 0.5529935359954834, "learning_rate": 3.315144893348155e-06, "loss": 0.00300614, "memory(GiB)": 15.03, "step": 17800, "train_speed(iter/s)": 1.474446 }, { "acc": 0.99836855, "epoch": 31.42983230361871, "grad_norm": 5.547778129577637, "learning_rate": 3.31239512488877e-06, "loss": 0.01641202, "memory(GiB)": 15.03, "step": 17805, "train_speed(iter/s)": 1.474437 }, { "acc": 0.99890976, "epoch": 31.43865842894969, "grad_norm": 1.1666467189788818, "learning_rate": 3.309645932690192e-06, "loss": 0.01353566, "memory(GiB)": 15.03, "step": 17810, "train_speed(iter/s)": 1.474449 }, { "acc": 0.99924107, "epoch": 31.44748455428067, "grad_norm": 1.3326669931411743, "learning_rate": 3.3068973176909027e-06, "loss": 0.01084407, "memory(GiB)": 15.03, "step": 17815, "train_speed(iter/s)": 1.474434 }, { "acc": 1.0, "epoch": 31.45631067961165, "grad_norm": 0.8646693825721741, "learning_rate": 3.304149280829183e-06, "loss": 0.00149051, "memory(GiB)": 15.03, "step": 17820, "train_speed(iter/s)": 1.474419 }, { "acc": 0.99865036, "epoch": 31.46513680494263, "grad_norm": 2.8243162631988525, "learning_rate": 3.3014018230431193e-06, "loss": 0.012027, "memory(GiB)": 15.03, "step": 17825, "train_speed(iter/s)": 1.474408 }, { "acc": 0.99825354, "epoch": 31.47396293027361, "grad_norm": 3.641777753829956, "learning_rate": 3.298654945270597e-06, "loss": 0.02362848, "memory(GiB)": 15.03, "step": 17830, "train_speed(iter/s)": 1.474423 }, { "acc": 0.99960785, "epoch": 31.482789055604588, "grad_norm": 0.8343186974525452, "learning_rate": 3.295908648449308e-06, "loss": 0.00907449, "memory(GiB)": 15.03, "step": 17835, "train_speed(iter/s)": 1.474441 }, { "acc": 1.0, "epoch": 31.491615180935568, "grad_norm": 0.06970841437578201, "learning_rate": 3.2931629335167407e-06, "loss": 0.00402673, "memory(GiB)": 15.03, "step": 17840, "train_speed(iter/s)": 1.474447 }, { "acc": 0.99949474, "epoch": 31.50044130626655, "grad_norm": 0.8928570747375488, "learning_rate": 3.2904178014101913e-06, "loss": 0.01170678, "memory(GiB)": 15.03, "step": 17845, "train_speed(iter/s)": 1.474467 }, { "acc": 0.99955959, "epoch": 31.50926743159753, "grad_norm": 0.7087474465370178, "learning_rate": 3.287673253066748e-06, "loss": 0.00721568, "memory(GiB)": 15.03, "step": 17850, "train_speed(iter/s)": 1.474455 }, { "acc": 0.99964285, "epoch": 31.51809355692851, "grad_norm": 0.8302301168441772, "learning_rate": 3.28492928942331e-06, "loss": 0.00819524, "memory(GiB)": 15.03, "step": 17855, "train_speed(iter/s)": 1.474468 }, { "acc": 0.99974995, "epoch": 31.52691968225949, "grad_norm": 0.16413627564907074, "learning_rate": 3.2821859114165695e-06, "loss": 0.00416669, "memory(GiB)": 15.03, "step": 17860, "train_speed(iter/s)": 1.474469 }, { "acc": 1.0, "epoch": 31.535745807590466, "grad_norm": 0.5691476464271545, "learning_rate": 3.279443119983021e-06, "loss": 0.01001098, "memory(GiB)": 15.03, "step": 17865, "train_speed(iter/s)": 1.474471 }, { "acc": 0.99937248, "epoch": 31.544571932921446, "grad_norm": 1.3242193460464478, "learning_rate": 3.27670091605896e-06, "loss": 0.02018854, "memory(GiB)": 15.03, "step": 17870, "train_speed(iter/s)": 1.474483 }, { "acc": 0.99977684, "epoch": 31.553398058252426, "grad_norm": 0.5317384600639343, "learning_rate": 3.27395930058048e-06, "loss": 0.01191862, "memory(GiB)": 15.03, "step": 17875, "train_speed(iter/s)": 1.474454 }, { "acc": 0.99964437, "epoch": 31.562224183583407, "grad_norm": 0.4056932330131531, "learning_rate": 3.271218274483473e-06, "loss": 0.00613865, "memory(GiB)": 15.03, "step": 17880, "train_speed(iter/s)": 1.474416 }, { "acc": 0.99967947, "epoch": 31.571050308914387, "grad_norm": 0.7241507768630981, "learning_rate": 3.2684778387036303e-06, "loss": 0.01307234, "memory(GiB)": 15.03, "step": 17885, "train_speed(iter/s)": 1.474413 }, { "acc": 0.99978809, "epoch": 31.579876434245367, "grad_norm": 0.38954100012779236, "learning_rate": 3.2657379941764466e-06, "loss": 0.00739274, "memory(GiB)": 15.03, "step": 17890, "train_speed(iter/s)": 1.474415 }, { "acc": 0.99973402, "epoch": 31.588702559576348, "grad_norm": 0.2766460180282593, "learning_rate": 3.262998741837205e-06, "loss": 0.0064548, "memory(GiB)": 15.03, "step": 17895, "train_speed(iter/s)": 1.474418 }, { "acc": 0.99928102, "epoch": 31.597528684907324, "grad_norm": 0.5072278380393982, "learning_rate": 3.260260082620995e-06, "loss": 0.0134619, "memory(GiB)": 15.03, "step": 17900, "train_speed(iter/s)": 1.474433 }, { "acc": 1.0, "epoch": 31.606354810238305, "grad_norm": 0.2405182421207428, "learning_rate": 3.2575220174626992e-06, "loss": 0.00441857, "memory(GiB)": 15.03, "step": 17905, "train_speed(iter/s)": 1.474421 }, { "acc": 0.99915962, "epoch": 31.615180935569285, "grad_norm": 0.4678500294685364, "learning_rate": 3.2547845472969995e-06, "loss": 0.00891373, "memory(GiB)": 15.03, "step": 17910, "train_speed(iter/s)": 1.474413 }, { "acc": 0.99963665, "epoch": 31.624007060900265, "grad_norm": 4.279112339019775, "learning_rate": 3.2520476730583733e-06, "loss": 0.01671486, "memory(GiB)": 15.03, "step": 17915, "train_speed(iter/s)": 1.474422 }, { "acc": 0.99900055, "epoch": 31.632833186231245, "grad_norm": 1.6098735332489014, "learning_rate": 3.2493113956810957e-06, "loss": 0.01204088, "memory(GiB)": 15.03, "step": 17920, "train_speed(iter/s)": 1.47441 }, { "acc": 0.99982147, "epoch": 31.641659311562226, "grad_norm": 0.3814529478549957, "learning_rate": 3.2465757160992345e-06, "loss": 0.00818552, "memory(GiB)": 15.03, "step": 17925, "train_speed(iter/s)": 1.474415 }, { "acc": 0.99930105, "epoch": 31.650485436893202, "grad_norm": 0.554873526096344, "learning_rate": 3.243840635246661e-06, "loss": 0.00625193, "memory(GiB)": 15.03, "step": 17930, "train_speed(iter/s)": 1.474416 }, { "acc": 0.99941187, "epoch": 31.659311562224183, "grad_norm": 0.126388818025589, "learning_rate": 3.241106154057035e-06, "loss": 0.01073126, "memory(GiB)": 15.03, "step": 17935, "train_speed(iter/s)": 1.474424 }, { "acc": 0.99900427, "epoch": 31.668137687555163, "grad_norm": 0.9917049407958984, "learning_rate": 3.2383722734638144e-06, "loss": 0.00655367, "memory(GiB)": 15.03, "step": 17940, "train_speed(iter/s)": 1.474407 }, { "acc": 0.9982378, "epoch": 31.676963812886143, "grad_norm": 2.306837320327759, "learning_rate": 3.2356389944002512e-06, "loss": 0.01251309, "memory(GiB)": 15.03, "step": 17945, "train_speed(iter/s)": 1.474408 }, { "acc": 0.99870033, "epoch": 31.685789938217123, "grad_norm": 0.3118349313735962, "learning_rate": 3.2329063177993945e-06, "loss": 0.01695338, "memory(GiB)": 15.03, "step": 17950, "train_speed(iter/s)": 1.47441 }, { "acc": 0.99871092, "epoch": 31.694616063548104, "grad_norm": 0.28511643409729004, "learning_rate": 3.230174244594084e-06, "loss": 0.01634724, "memory(GiB)": 15.03, "step": 17955, "train_speed(iter/s)": 1.474404 }, { "acc": 0.99928055, "epoch": 31.70344218887908, "grad_norm": 0.5083217024803162, "learning_rate": 3.2274427757169563e-06, "loss": 0.00749095, "memory(GiB)": 15.03, "step": 17960, "train_speed(iter/s)": 1.4744 }, { "acc": 1.0, "epoch": 31.71226831421006, "grad_norm": 1.0978190898895264, "learning_rate": 3.224711912100439e-06, "loss": 0.00301291, "memory(GiB)": 15.03, "step": 17965, "train_speed(iter/s)": 1.4744 }, { "acc": 0.99952812, "epoch": 31.72109443954104, "grad_norm": 0.6417052149772644, "learning_rate": 3.2219816546767573e-06, "loss": 0.00572926, "memory(GiB)": 15.03, "step": 17970, "train_speed(iter/s)": 1.474401 }, { "acc": 0.9989645, "epoch": 31.72992056487202, "grad_norm": 0.41997280716896057, "learning_rate": 3.2192520043779256e-06, "loss": 0.0101878, "memory(GiB)": 15.03, "step": 17975, "train_speed(iter/s)": 1.474411 }, { "acc": 0.99985123, "epoch": 31.738746690203, "grad_norm": 0.33124229311943054, "learning_rate": 3.2165229621357524e-06, "loss": 0.00986808, "memory(GiB)": 15.03, "step": 17980, "train_speed(iter/s)": 1.474425 }, { "acc": 0.99899158, "epoch": 31.74757281553398, "grad_norm": 0.4386896789073944, "learning_rate": 3.2137945288818393e-06, "loss": 0.00947646, "memory(GiB)": 15.03, "step": 17985, "train_speed(iter/s)": 1.474433 }, { "acc": 0.99949493, "epoch": 31.756398940864962, "grad_norm": 0.19448785483837128, "learning_rate": 3.2110667055475787e-06, "loss": 0.00702339, "memory(GiB)": 15.03, "step": 17990, "train_speed(iter/s)": 1.474451 }, { "acc": 0.99914255, "epoch": 31.76522506619594, "grad_norm": 0.5766091346740723, "learning_rate": 3.2083394930641567e-06, "loss": 0.00714418, "memory(GiB)": 15.03, "step": 17995, "train_speed(iter/s)": 1.474447 }, { "acc": 0.99967108, "epoch": 31.77405119152692, "grad_norm": 0.37996846437454224, "learning_rate": 3.2056128923625475e-06, "loss": 0.00731859, "memory(GiB)": 15.03, "step": 18000, "train_speed(iter/s)": 1.474452 }, { "acc": 0.99944134, "epoch": 31.7828773168579, "grad_norm": 0.4082198441028595, "learning_rate": 3.2028869043735206e-06, "loss": 0.00736569, "memory(GiB)": 15.03, "step": 18005, "train_speed(iter/s)": 1.47444 }, { "acc": 0.99895687, "epoch": 31.79170344218888, "grad_norm": 0.21823018789291382, "learning_rate": 3.2001615300276314e-06, "loss": 0.01074059, "memory(GiB)": 15.03, "step": 18010, "train_speed(iter/s)": 1.474447 }, { "acc": 0.99960823, "epoch": 31.80052956751986, "grad_norm": 0.40762418508529663, "learning_rate": 3.197436770255232e-06, "loss": 0.0105849, "memory(GiB)": 15.03, "step": 18015, "train_speed(iter/s)": 1.474451 }, { "acc": 0.99937115, "epoch": 31.80935569285084, "grad_norm": 0.31413859128952026, "learning_rate": 3.19471262598646e-06, "loss": 0.01317574, "memory(GiB)": 15.03, "step": 18020, "train_speed(iter/s)": 1.474447 }, { "acc": 0.99903927, "epoch": 31.818181818181817, "grad_norm": 1.3106911182403564, "learning_rate": 3.1919890981512464e-06, "loss": 0.0084449, "memory(GiB)": 15.03, "step": 18025, "train_speed(iter/s)": 1.47445 }, { "acc": 0.99979506, "epoch": 31.827007943512797, "grad_norm": 0.19639529287815094, "learning_rate": 3.1892661876793056e-06, "loss": 0.00498082, "memory(GiB)": 15.03, "step": 18030, "train_speed(iter/s)": 1.474468 }, { "acc": 0.99770222, "epoch": 31.835834068843777, "grad_norm": 0.5238794088363647, "learning_rate": 3.1865438955001492e-06, "loss": 0.01758661, "memory(GiB)": 15.03, "step": 18035, "train_speed(iter/s)": 1.474479 }, { "acc": 0.99822178, "epoch": 31.844660194174757, "grad_norm": 0.38014551997184753, "learning_rate": 3.1838222225430708e-06, "loss": 0.01326111, "memory(GiB)": 15.03, "step": 18040, "train_speed(iter/s)": 1.474477 }, { "acc": 0.99861364, "epoch": 31.853486319505738, "grad_norm": 0.6141208410263062, "learning_rate": 3.1811011697371586e-06, "loss": 0.01638894, "memory(GiB)": 15.03, "step": 18045, "train_speed(iter/s)": 1.47449 }, { "acc": 0.99806089, "epoch": 31.862312444836718, "grad_norm": 2.2800395488739014, "learning_rate": 3.178380738011283e-06, "loss": 0.01634993, "memory(GiB)": 15.03, "step": 18050, "train_speed(iter/s)": 1.474482 }, { "acc": 0.99915829, "epoch": 31.871138570167695, "grad_norm": 0.16380329430103302, "learning_rate": 3.17566092829411e-06, "loss": 0.01104881, "memory(GiB)": 15.03, "step": 18055, "train_speed(iter/s)": 1.474492 }, { "acc": 0.999471, "epoch": 31.879964695498675, "grad_norm": 0.4055873155593872, "learning_rate": 3.172941741514084e-06, "loss": 0.01187287, "memory(GiB)": 15.03, "step": 18060, "train_speed(iter/s)": 1.474484 }, { "acc": 0.99831896, "epoch": 31.888790820829655, "grad_norm": 0.042700063437223434, "learning_rate": 3.170223178599446e-06, "loss": 0.01157289, "memory(GiB)": 15.03, "step": 18065, "train_speed(iter/s)": 1.474488 }, { "acc": 0.99974995, "epoch": 31.897616946160635, "grad_norm": 0.04772741720080376, "learning_rate": 3.1675052404782163e-06, "loss": 0.00441035, "memory(GiB)": 15.03, "step": 18070, "train_speed(iter/s)": 1.474503 }, { "acc": 0.99975967, "epoch": 31.906443071491616, "grad_norm": 0.4871412217617035, "learning_rate": 3.1647879280782074e-06, "loss": 0.0068038, "memory(GiB)": 15.03, "step": 18075, "train_speed(iter/s)": 1.4745 }, { "acc": 0.99871912, "epoch": 31.915269196822596, "grad_norm": 0.30067506432533264, "learning_rate": 3.1620712423270143e-06, "loss": 0.00674992, "memory(GiB)": 15.03, "step": 18080, "train_speed(iter/s)": 1.474525 }, { "acc": 0.99838409, "epoch": 31.924095322153576, "grad_norm": 1.0067812204360962, "learning_rate": 3.1593551841520215e-06, "loss": 0.02068809, "memory(GiB)": 15.03, "step": 18085, "train_speed(iter/s)": 1.474542 }, { "acc": 0.99982872, "epoch": 31.932921447484553, "grad_norm": 0.248103529214859, "learning_rate": 3.1566397544803973e-06, "loss": 0.00918182, "memory(GiB)": 15.03, "step": 18090, "train_speed(iter/s)": 1.474538 }, { "acc": 0.99982643, "epoch": 31.941747572815533, "grad_norm": 0.45098957419395447, "learning_rate": 3.1539249542390938e-06, "loss": 0.00631234, "memory(GiB)": 15.03, "step": 18095, "train_speed(iter/s)": 1.474552 }, { "acc": 0.99902916, "epoch": 31.950573698146513, "grad_norm": 1.769364833831787, "learning_rate": 3.151210784354854e-06, "loss": 0.00763269, "memory(GiB)": 15.03, "step": 18100, "train_speed(iter/s)": 1.474548 }, { "acc": 0.99981613, "epoch": 31.959399823477494, "grad_norm": 0.3916313350200653, "learning_rate": 3.1484972457541986e-06, "loss": 0.01271055, "memory(GiB)": 15.03, "step": 18105, "train_speed(iter/s)": 1.474566 }, { "acc": 0.99969196, "epoch": 31.968225948808474, "grad_norm": 0.3507344126701355, "learning_rate": 3.1457843393634378e-06, "loss": 0.00513673, "memory(GiB)": 15.03, "step": 18110, "train_speed(iter/s)": 1.474579 }, { "acc": 0.99852476, "epoch": 31.977052074139454, "grad_norm": 0.019037960097193718, "learning_rate": 3.143072066108664e-06, "loss": 0.01144783, "memory(GiB)": 15.03, "step": 18115, "train_speed(iter/s)": 1.474594 }, { "acc": 0.99973402, "epoch": 31.98587819947043, "grad_norm": 0.3701723515987396, "learning_rate": 3.1403604269157534e-06, "loss": 0.0114295, "memory(GiB)": 15.03, "step": 18120, "train_speed(iter/s)": 1.47458 }, { "acc": 0.99960384, "epoch": 31.99470432480141, "grad_norm": 0.019288353621959686, "learning_rate": 3.137649422710366e-06, "loss": 0.00502953, "memory(GiB)": 15.03, "step": 18125, "train_speed(iter/s)": 1.474572 }, { "acc": 0.99953709, "epoch": 32.003530450132395, "grad_norm": 0.6623662114143372, "learning_rate": 3.134939054417947e-06, "loss": 0.01413929, "memory(GiB)": 15.03, "step": 18130, "train_speed(iter/s)": 1.474511 }, { "acc": 0.9988431, "epoch": 32.01235657546337, "grad_norm": 2.5953116416931152, "learning_rate": 3.1322293229637186e-06, "loss": 0.01069215, "memory(GiB)": 15.03, "step": 18135, "train_speed(iter/s)": 1.474512 }, { "acc": 0.99974995, "epoch": 32.02118270079435, "grad_norm": 0.33504727482795715, "learning_rate": 3.1295202292726945e-06, "loss": 0.01112119, "memory(GiB)": 15.03, "step": 18140, "train_speed(iter/s)": 1.47453 }, { "acc": 0.99984179, "epoch": 32.03000882612533, "grad_norm": 0.19499242305755615, "learning_rate": 3.1268117742696623e-06, "loss": 0.00845528, "memory(GiB)": 15.03, "step": 18145, "train_speed(iter/s)": 1.474533 }, { "acc": 0.99779091, "epoch": 32.03883495145631, "grad_norm": 1.9278756380081177, "learning_rate": 3.1241039588791982e-06, "loss": 0.01484069, "memory(GiB)": 15.03, "step": 18150, "train_speed(iter/s)": 1.474523 }, { "acc": 0.99959335, "epoch": 32.04766107678729, "grad_norm": 0.3929632008075714, "learning_rate": 3.121396784025654e-06, "loss": 0.01033637, "memory(GiB)": 15.03, "step": 18155, "train_speed(iter/s)": 1.474507 }, { "acc": 1.0, "epoch": 32.05648720211827, "grad_norm": 0.6588337421417236, "learning_rate": 3.1186902506331684e-06, "loss": 0.00729671, "memory(GiB)": 15.03, "step": 18160, "train_speed(iter/s)": 1.474517 }, { "acc": 1.0, "epoch": 32.06531332744925, "grad_norm": 0.5345266461372375, "learning_rate": 3.115984359625657e-06, "loss": 0.0042188, "memory(GiB)": 15.03, "step": 18165, "train_speed(iter/s)": 1.474531 }, { "acc": 0.99911537, "epoch": 32.07413945278023, "grad_norm": 0.5431683659553528, "learning_rate": 3.1132791119268193e-06, "loss": 0.00829813, "memory(GiB)": 15.03, "step": 18170, "train_speed(iter/s)": 1.474526 }, { "acc": 0.9997282, "epoch": 32.08296557811121, "grad_norm": 0.3204798698425293, "learning_rate": 3.1105745084601297e-06, "loss": 0.00570428, "memory(GiB)": 15.03, "step": 18175, "train_speed(iter/s)": 1.474509 }, { "acc": 0.99966211, "epoch": 32.09179170344219, "grad_norm": 0.09486634284257889, "learning_rate": 3.107870550148852e-06, "loss": 0.00664928, "memory(GiB)": 15.03, "step": 18180, "train_speed(iter/s)": 1.474523 }, { "acc": 1.0, "epoch": 32.10061782877317, "grad_norm": 0.22745360434055328, "learning_rate": 3.105167237916021e-06, "loss": 0.00453571, "memory(GiB)": 15.03, "step": 18185, "train_speed(iter/s)": 1.474536 }, { "acc": 0.99946375, "epoch": 32.10944395410415, "grad_norm": 0.5142778754234314, "learning_rate": 3.102464572684455e-06, "loss": 0.01421088, "memory(GiB)": 15.03, "step": 18190, "train_speed(iter/s)": 1.474543 }, { "acc": 0.9995533, "epoch": 32.11827007943513, "grad_norm": 0.5007224082946777, "learning_rate": 3.0997625553767507e-06, "loss": 0.00861605, "memory(GiB)": 15.03, "step": 18195, "train_speed(iter/s)": 1.47455 }, { "acc": 0.99975967, "epoch": 32.127096204766104, "grad_norm": 0.19153133034706116, "learning_rate": 3.0970611869152835e-06, "loss": 0.00697244, "memory(GiB)": 15.03, "step": 18200, "train_speed(iter/s)": 1.474548 }, { "acc": 0.99988213, "epoch": 32.13592233009709, "grad_norm": 0.1689414083957672, "learning_rate": 3.094360468222208e-06, "loss": 0.00616304, "memory(GiB)": 15.03, "step": 18205, "train_speed(iter/s)": 1.474541 }, { "acc": 0.99951639, "epoch": 32.144748455428065, "grad_norm": 0.028906237334012985, "learning_rate": 3.0916604002194555e-06, "loss": 0.00248439, "memory(GiB)": 15.03, "step": 18210, "train_speed(iter/s)": 1.474547 }, { "acc": 1.0, "epoch": 32.15357458075905, "grad_norm": 0.33599331974983215, "learning_rate": 3.0889609838287387e-06, "loss": 0.0058312, "memory(GiB)": 15.03, "step": 18215, "train_speed(iter/s)": 1.474546 }, { "acc": 0.99958363, "epoch": 32.162400706090025, "grad_norm": 0.6015697121620178, "learning_rate": 3.0862622199715403e-06, "loss": 0.00435567, "memory(GiB)": 15.03, "step": 18220, "train_speed(iter/s)": 1.474558 }, { "acc": 0.999543, "epoch": 32.17122683142101, "grad_norm": 0.03524408116936684, "learning_rate": 3.0835641095691305e-06, "loss": 0.00556789, "memory(GiB)": 15.03, "step": 18225, "train_speed(iter/s)": 1.474563 }, { "acc": 0.99978809, "epoch": 32.180052956751986, "grad_norm": 0.4078781306743622, "learning_rate": 3.0808666535425484e-06, "loss": 0.00358037, "memory(GiB)": 15.03, "step": 18230, "train_speed(iter/s)": 1.474549 }, { "acc": 0.99883556, "epoch": 32.18887908208296, "grad_norm": 0.777277410030365, "learning_rate": 3.078169852812614e-06, "loss": 0.00977422, "memory(GiB)": 15.03, "step": 18235, "train_speed(iter/s)": 1.474536 }, { "acc": 0.99987869, "epoch": 32.19770520741395, "grad_norm": 0.367713063955307, "learning_rate": 3.0754737082999203e-06, "loss": 0.00484847, "memory(GiB)": 15.03, "step": 18240, "train_speed(iter/s)": 1.474527 }, { "acc": 1.0, "epoch": 32.20653133274492, "grad_norm": 0.5351564288139343, "learning_rate": 3.0727782209248404e-06, "loss": 0.00703654, "memory(GiB)": 15.03, "step": 18245, "train_speed(iter/s)": 1.47454 }, { "acc": 0.99957762, "epoch": 32.21535745807591, "grad_norm": 0.584762454032898, "learning_rate": 3.0700833916075175e-06, "loss": 0.00989859, "memory(GiB)": 15.03, "step": 18250, "train_speed(iter/s)": 1.474542 }, { "acc": 0.99932194, "epoch": 32.224183583406884, "grad_norm": 0.010212801396846771, "learning_rate": 3.067389221267876e-06, "loss": 0.00672136, "memory(GiB)": 15.03, "step": 18255, "train_speed(iter/s)": 1.474536 }, { "acc": 0.99967613, "epoch": 32.23300970873787, "grad_norm": 0.31296294927597046, "learning_rate": 3.064695710825611e-06, "loss": 0.00579248, "memory(GiB)": 15.03, "step": 18260, "train_speed(iter/s)": 1.474549 }, { "acc": 0.99891796, "epoch": 32.241835834068844, "grad_norm": 0.2233838140964508, "learning_rate": 3.0620028612001973e-06, "loss": 0.00835734, "memory(GiB)": 15.03, "step": 18265, "train_speed(iter/s)": 1.474559 }, { "acc": 0.99981613, "epoch": 32.25066195939982, "grad_norm": 0.09541478008031845, "learning_rate": 3.0593106733108773e-06, "loss": 0.00518341, "memory(GiB)": 15.03, "step": 18270, "train_speed(iter/s)": 1.474566 }, { "acc": 0.99978447, "epoch": 32.259488084730805, "grad_norm": 0.3450442850589752, "learning_rate": 3.056619148076673e-06, "loss": 0.00543332, "memory(GiB)": 15.03, "step": 18275, "train_speed(iter/s)": 1.47457 }, { "acc": 0.9998106, "epoch": 32.26831421006178, "grad_norm": 0.048618633300065994, "learning_rate": 3.053928286416378e-06, "loss": 0.00731792, "memory(GiB)": 15.03, "step": 18280, "train_speed(iter/s)": 1.474591 }, { "acc": 1.0, "epoch": 32.277140335392765, "grad_norm": 0.036896854639053345, "learning_rate": 3.0512380892485607e-06, "loss": 0.00140526, "memory(GiB)": 15.03, "step": 18285, "train_speed(iter/s)": 1.474605 }, { "acc": 0.99920063, "epoch": 32.28596646072374, "grad_norm": 0.3200854957103729, "learning_rate": 3.048548557491559e-06, "loss": 0.01630294, "memory(GiB)": 15.03, "step": 18290, "train_speed(iter/s)": 1.474624 }, { "acc": 0.99921169, "epoch": 32.29479258605472, "grad_norm": 0.78890061378479, "learning_rate": 3.0458596920634896e-06, "loss": 0.01933061, "memory(GiB)": 15.03, "step": 18295, "train_speed(iter/s)": 1.47465 }, { "acc": 0.99919462, "epoch": 32.3036187113857, "grad_norm": 2.2510764598846436, "learning_rate": 3.043171493882234e-06, "loss": 0.00798339, "memory(GiB)": 15.03, "step": 18300, "train_speed(iter/s)": 1.474664 }, { "acc": 0.99866457, "epoch": 32.31244483671668, "grad_norm": 0.5607331395149231, "learning_rate": 3.040483963865455e-06, "loss": 0.01318967, "memory(GiB)": 15.03, "step": 18305, "train_speed(iter/s)": 1.474669 }, { "acc": 0.99880848, "epoch": 32.32127096204766, "grad_norm": 0.7104424834251404, "learning_rate": 3.0377971029305807e-06, "loss": 0.01565959, "memory(GiB)": 15.03, "step": 18310, "train_speed(iter/s)": 1.474677 }, { "acc": 0.99981346, "epoch": 32.33009708737864, "grad_norm": 0.2324729710817337, "learning_rate": 3.035110911994813e-06, "loss": 0.00419935, "memory(GiB)": 15.03, "step": 18315, "train_speed(iter/s)": 1.474691 }, { "acc": 0.99899426, "epoch": 32.338923212709624, "grad_norm": 0.35106468200683594, "learning_rate": 3.0324253919751258e-06, "loss": 0.01498562, "memory(GiB)": 15.03, "step": 18320, "train_speed(iter/s)": 1.474711 }, { "acc": 0.99935007, "epoch": 32.3477493380406, "grad_norm": 0.667900562286377, "learning_rate": 3.029740543788261e-06, "loss": 0.01111558, "memory(GiB)": 15.03, "step": 18325, "train_speed(iter/s)": 1.474715 }, { "acc": 0.99969511, "epoch": 32.35657546337158, "grad_norm": 0.4317042827606201, "learning_rate": 3.027056368350736e-06, "loss": 0.00544476, "memory(GiB)": 15.03, "step": 18330, "train_speed(iter/s)": 1.474744 }, { "acc": 0.99926863, "epoch": 32.36540158870256, "grad_norm": 0.7461350560188293, "learning_rate": 3.024372866578832e-06, "loss": 0.00892134, "memory(GiB)": 15.03, "step": 18335, "train_speed(iter/s)": 1.474741 }, { "acc": 1.0, "epoch": 32.37422771403354, "grad_norm": 0.3379417955875397, "learning_rate": 3.021690039388608e-06, "loss": 0.00653269, "memory(GiB)": 15.03, "step": 18340, "train_speed(iter/s)": 1.474747 }, { "acc": 0.99911613, "epoch": 32.38305383936452, "grad_norm": 0.4248613715171814, "learning_rate": 3.0190078876958843e-06, "loss": 0.00928908, "memory(GiB)": 15.03, "step": 18345, "train_speed(iter/s)": 1.474734 }, { "acc": 0.99901123, "epoch": 32.3918799646955, "grad_norm": 0.3672352433204651, "learning_rate": 3.0163264124162594e-06, "loss": 0.0080897, "memory(GiB)": 15.03, "step": 18350, "train_speed(iter/s)": 1.474741 }, { "acc": 0.99928818, "epoch": 32.40070609002648, "grad_norm": 0.9113860130310059, "learning_rate": 3.0136456144650943e-06, "loss": 0.0087422, "memory(GiB)": 15.03, "step": 18355, "train_speed(iter/s)": 1.474753 }, { "acc": 1.0, "epoch": 32.40953221535746, "grad_norm": 0.9455984234809875, "learning_rate": 3.010965494757522e-06, "loss": 0.00362885, "memory(GiB)": 15.03, "step": 18360, "train_speed(iter/s)": 1.474769 }, { "acc": 0.99912872, "epoch": 32.418358340688435, "grad_norm": 0.5263042449951172, "learning_rate": 3.008286054208441e-06, "loss": 0.00665107, "memory(GiB)": 15.03, "step": 18365, "train_speed(iter/s)": 1.474765 }, { "acc": 0.99985638, "epoch": 32.42718446601942, "grad_norm": 0.013002390041947365, "learning_rate": 3.005607293732523e-06, "loss": 0.00463443, "memory(GiB)": 15.03, "step": 18370, "train_speed(iter/s)": 1.474777 }, { "acc": 0.99877567, "epoch": 32.436010591350396, "grad_norm": 0.09757982939481735, "learning_rate": 3.0029292142442013e-06, "loss": 0.00630263, "memory(GiB)": 15.03, "step": 18375, "train_speed(iter/s)": 1.474788 }, { "acc": 0.99917402, "epoch": 32.44483671668138, "grad_norm": 0.3526676297187805, "learning_rate": 3.0002518166576817e-06, "loss": 0.00903375, "memory(GiB)": 15.03, "step": 18380, "train_speed(iter/s)": 1.474783 }, { "acc": 1.0, "epoch": 32.453662842012356, "grad_norm": 0.5064834356307983, "learning_rate": 2.9975751018869342e-06, "loss": 0.00767567, "memory(GiB)": 15.03, "step": 18385, "train_speed(iter/s)": 1.474792 }, { "acc": 1.0, "epoch": 32.46248896734333, "grad_norm": 0.15727101266384125, "learning_rate": 2.9948990708456986e-06, "loss": 0.00353978, "memory(GiB)": 15.03, "step": 18390, "train_speed(iter/s)": 1.474785 }, { "acc": 0.99916019, "epoch": 32.47131509267432, "grad_norm": 0.4229678809642792, "learning_rate": 2.99222372444748e-06, "loss": 0.00674726, "memory(GiB)": 15.03, "step": 18395, "train_speed(iter/s)": 1.47479 }, { "acc": 0.99930096, "epoch": 32.48014121800529, "grad_norm": 0.23650102317333221, "learning_rate": 2.9895490636055474e-06, "loss": 0.00598652, "memory(GiB)": 15.03, "step": 18400, "train_speed(iter/s)": 1.474786 }, { "acc": 0.99946423, "epoch": 32.48896734333628, "grad_norm": 0.23501421511173248, "learning_rate": 2.9868750892329386e-06, "loss": 0.00597072, "memory(GiB)": 15.03, "step": 18405, "train_speed(iter/s)": 1.474795 }, { "acc": 0.99947414, "epoch": 32.497793468667254, "grad_norm": 0.1991998702287674, "learning_rate": 2.9842018022424557e-06, "loss": 0.00881896, "memory(GiB)": 15.03, "step": 18410, "train_speed(iter/s)": 1.474792 }, { "acc": 1.0, "epoch": 32.50661959399824, "grad_norm": 0.4621860682964325, "learning_rate": 2.9815292035466686e-06, "loss": 0.00824858, "memory(GiB)": 15.03, "step": 18415, "train_speed(iter/s)": 1.474812 }, { "acc": 0.99952126, "epoch": 32.515445719329215, "grad_norm": 0.6335383057594299, "learning_rate": 2.9788572940579085e-06, "loss": 0.00440025, "memory(GiB)": 15.03, "step": 18420, "train_speed(iter/s)": 1.474808 }, { "acc": 1.0, "epoch": 32.52427184466019, "grad_norm": 0.013718298636376858, "learning_rate": 2.9761860746882736e-06, "loss": 0.00270109, "memory(GiB)": 15.03, "step": 18425, "train_speed(iter/s)": 1.47482 }, { "acc": 0.99966803, "epoch": 32.533097969991175, "grad_norm": 0.3988581597805023, "learning_rate": 2.9735155463496267e-06, "loss": 0.00716336, "memory(GiB)": 15.03, "step": 18430, "train_speed(iter/s)": 1.474811 }, { "acc": 0.99951859, "epoch": 32.54192409532215, "grad_norm": 0.08108774572610855, "learning_rate": 2.970845709953595e-06, "loss": 0.00650006, "memory(GiB)": 15.03, "step": 18435, "train_speed(iter/s)": 1.47483 }, { "acc": 0.99980774, "epoch": 32.550750220653136, "grad_norm": 0.30215662717819214, "learning_rate": 2.968176566411566e-06, "loss": 0.00599578, "memory(GiB)": 15.03, "step": 18440, "train_speed(iter/s)": 1.474831 }, { "acc": 0.99933758, "epoch": 32.55957634598411, "grad_norm": 0.9263208508491516, "learning_rate": 2.9655081166346954e-06, "loss": 0.00742442, "memory(GiB)": 15.03, "step": 18445, "train_speed(iter/s)": 1.474842 }, { "acc": 1.0, "epoch": 32.568402471315096, "grad_norm": 0.05062907189130783, "learning_rate": 2.9628403615338984e-06, "loss": 0.00307859, "memory(GiB)": 15.03, "step": 18450, "train_speed(iter/s)": 1.474835 }, { "acc": 0.99907532, "epoch": 32.57722859664607, "grad_norm": 0.7346553802490234, "learning_rate": 2.9601733020198565e-06, "loss": 0.01270945, "memory(GiB)": 15.03, "step": 18455, "train_speed(iter/s)": 1.474847 }, { "acc": 0.99929228, "epoch": 32.58605472197705, "grad_norm": 0.5565964579582214, "learning_rate": 2.9575069390030094e-06, "loss": 0.00697148, "memory(GiB)": 15.03, "step": 18460, "train_speed(iter/s)": 1.474835 }, { "acc": 0.99979172, "epoch": 32.59488084730803, "grad_norm": 0.19053833186626434, "learning_rate": 2.9548412733935643e-06, "loss": 0.00608734, "memory(GiB)": 15.03, "step": 18465, "train_speed(iter/s)": 1.474831 }, { "acc": 0.99973402, "epoch": 32.60370697263901, "grad_norm": 0.24875614047050476, "learning_rate": 2.952176306101485e-06, "loss": 0.00760932, "memory(GiB)": 15.03, "step": 18470, "train_speed(iter/s)": 1.474837 }, { "acc": 0.9986001, "epoch": 32.612533097969994, "grad_norm": 1.8401868343353271, "learning_rate": 2.949512038036501e-06, "loss": 0.01580933, "memory(GiB)": 15.03, "step": 18475, "train_speed(iter/s)": 1.474842 }, { "acc": 0.99890232, "epoch": 32.62135922330097, "grad_norm": 1.5940183401107788, "learning_rate": 2.9468484701081015e-06, "loss": 0.0068489, "memory(GiB)": 15.03, "step": 18480, "train_speed(iter/s)": 1.47487 }, { "acc": 0.99859352, "epoch": 32.63018534863195, "grad_norm": 0.22927364706993103, "learning_rate": 2.9441856032255366e-06, "loss": 0.00769424, "memory(GiB)": 15.03, "step": 18485, "train_speed(iter/s)": 1.47488 }, { "acc": 0.99952793, "epoch": 32.63901147396293, "grad_norm": 0.22242248058319092, "learning_rate": 2.9415234382978166e-06, "loss": 0.0088831, "memory(GiB)": 15.03, "step": 18490, "train_speed(iter/s)": 1.474895 }, { "acc": 0.99914627, "epoch": 32.64783759929391, "grad_norm": 0.03737509623169899, "learning_rate": 2.938861976233714e-06, "loss": 0.00743134, "memory(GiB)": 15.03, "step": 18495, "train_speed(iter/s)": 1.474889 }, { "acc": 0.99946184, "epoch": 32.65666372462489, "grad_norm": 0.021858932450413704, "learning_rate": 2.9362012179417588e-06, "loss": 0.00425646, "memory(GiB)": 15.03, "step": 18500, "train_speed(iter/s)": 1.474886 }, { "acc": 0.99948177, "epoch": 32.66548984995587, "grad_norm": 0.0667419284582138, "learning_rate": 2.9335411643302444e-06, "loss": 0.00996531, "memory(GiB)": 15.03, "step": 18505, "train_speed(iter/s)": 1.474896 }, { "acc": 1.0, "epoch": 32.67431597528685, "grad_norm": 0.023161567747592926, "learning_rate": 2.9308818163072174e-06, "loss": 0.00306709, "memory(GiB)": 15.03, "step": 18510, "train_speed(iter/s)": 1.474893 }, { "acc": 0.99977684, "epoch": 32.68314210061783, "grad_norm": 0.2786540389060974, "learning_rate": 2.928223174780492e-06, "loss": 0.00510986, "memory(GiB)": 15.03, "step": 18515, "train_speed(iter/s)": 1.474863 }, { "acc": 1.0, "epoch": 32.691968225948806, "grad_norm": 0.4681176543235779, "learning_rate": 2.925565240657634e-06, "loss": 0.00811928, "memory(GiB)": 15.03, "step": 18520, "train_speed(iter/s)": 1.474882 }, { "acc": 0.99972219, "epoch": 32.70079435127979, "grad_norm": 1.5783883333206177, "learning_rate": 2.922908014845971e-06, "loss": 0.00972234, "memory(GiB)": 15.03, "step": 18525, "train_speed(iter/s)": 1.474871 }, { "acc": 0.99917326, "epoch": 32.709620476610766, "grad_norm": 0.07853662967681885, "learning_rate": 2.9202514982525897e-06, "loss": 0.00705669, "memory(GiB)": 15.03, "step": 18530, "train_speed(iter/s)": 1.47487 }, { "acc": 0.99779778, "epoch": 32.71844660194175, "grad_norm": 0.930868923664093, "learning_rate": 2.91759569178433e-06, "loss": 0.02085824, "memory(GiB)": 15.03, "step": 18535, "train_speed(iter/s)": 1.474873 }, { "acc": 0.99921637, "epoch": 32.72727272727273, "grad_norm": 0.9240405559539795, "learning_rate": 2.9149405963477965e-06, "loss": 0.01294514, "memory(GiB)": 15.03, "step": 18540, "train_speed(iter/s)": 1.47488 }, { "acc": 0.99916687, "epoch": 32.73609885260371, "grad_norm": 0.3353053331375122, "learning_rate": 2.9122862128493466e-06, "loss": 0.00627209, "memory(GiB)": 15.03, "step": 18545, "train_speed(iter/s)": 1.474879 }, { "acc": 0.99900379, "epoch": 32.74492497793469, "grad_norm": 1.3787472248077393, "learning_rate": 2.909632542195094e-06, "loss": 0.01133658, "memory(GiB)": 15.03, "step": 18550, "train_speed(iter/s)": 1.474879 }, { "acc": 0.99978447, "epoch": 32.753751103265664, "grad_norm": 0.35878002643585205, "learning_rate": 2.906979585290908e-06, "loss": 0.00349008, "memory(GiB)": 15.03, "step": 18555, "train_speed(iter/s)": 1.474911 }, { "acc": 0.99955359, "epoch": 32.76257722859665, "grad_norm": 0.07501870393753052, "learning_rate": 2.904327343042422e-06, "loss": 0.00599486, "memory(GiB)": 15.03, "step": 18560, "train_speed(iter/s)": 1.474935 }, { "acc": 0.99973116, "epoch": 32.771403353927624, "grad_norm": 1.025280237197876, "learning_rate": 2.9016758163550174e-06, "loss": 0.00890039, "memory(GiB)": 15.03, "step": 18565, "train_speed(iter/s)": 1.474948 }, { "acc": 0.9991827, "epoch": 32.78022947925861, "grad_norm": 0.5566368103027344, "learning_rate": 2.8990250061338333e-06, "loss": 0.0093998, "memory(GiB)": 15.03, "step": 18570, "train_speed(iter/s)": 1.474966 }, { "acc": 0.99977226, "epoch": 32.789055604589585, "grad_norm": 0.36280015110969543, "learning_rate": 2.8963749132837644e-06, "loss": 0.00409286, "memory(GiB)": 15.03, "step": 18575, "train_speed(iter/s)": 1.474984 }, { "acc": 0.99936619, "epoch": 32.79788172992056, "grad_norm": 0.5970382690429688, "learning_rate": 2.893725538709464e-06, "loss": 0.00940866, "memory(GiB)": 15.03, "step": 18580, "train_speed(iter/s)": 1.47498 }, { "acc": 0.99911194, "epoch": 32.806707855251545, "grad_norm": 0.016790563240647316, "learning_rate": 2.891076883315333e-06, "loss": 0.01334991, "memory(GiB)": 15.03, "step": 18585, "train_speed(iter/s)": 1.475004 }, { "acc": 0.99943752, "epoch": 32.81553398058252, "grad_norm": 0.8525774478912354, "learning_rate": 2.8884289480055355e-06, "loss": 0.00904837, "memory(GiB)": 15.03, "step": 18590, "train_speed(iter/s)": 1.475006 }, { "acc": 0.99915085, "epoch": 32.824360105913506, "grad_norm": 0.291221559047699, "learning_rate": 2.885781733683984e-06, "loss": 0.01020974, "memory(GiB)": 15.03, "step": 18595, "train_speed(iter/s)": 1.47501 }, { "acc": 0.99965277, "epoch": 32.83318623124448, "grad_norm": 0.2914293110370636, "learning_rate": 2.883135241254345e-06, "loss": 0.00613951, "memory(GiB)": 15.03, "step": 18600, "train_speed(iter/s)": 1.475029 }, { "acc": 1.0, "epoch": 32.84201235657547, "grad_norm": 0.1932823210954666, "learning_rate": 2.880489471620039e-06, "loss": 0.00234072, "memory(GiB)": 15.03, "step": 18605, "train_speed(iter/s)": 1.475045 }, { "acc": 1.0, "epoch": 32.85083848190644, "grad_norm": 0.4004295766353607, "learning_rate": 2.8778444256842437e-06, "loss": 0.00237017, "memory(GiB)": 15.03, "step": 18610, "train_speed(iter/s)": 1.475027 }, { "acc": 0.99990673, "epoch": 32.85966460723742, "grad_norm": 0.6967677474021912, "learning_rate": 2.875200104349882e-06, "loss": 0.00260117, "memory(GiB)": 15.03, "step": 18615, "train_speed(iter/s)": 1.475032 }, { "acc": 0.99966068, "epoch": 32.868490732568404, "grad_norm": 0.4429885149002075, "learning_rate": 2.87255650851964e-06, "loss": 0.00553878, "memory(GiB)": 15.03, "step": 18620, "train_speed(iter/s)": 1.475017 }, { "acc": 0.99988537, "epoch": 32.87731685789938, "grad_norm": 0.10902397334575653, "learning_rate": 2.869913639095946e-06, "loss": 0.0060737, "memory(GiB)": 15.03, "step": 18625, "train_speed(iter/s)": 1.475018 }, { "acc": 1.0, "epoch": 32.886142983230364, "grad_norm": 0.3949187099933624, "learning_rate": 2.867271496980984e-06, "loss": 0.00475435, "memory(GiB)": 15.03, "step": 18630, "train_speed(iter/s)": 1.475038 }, { "acc": 0.99955235, "epoch": 32.89496910856134, "grad_norm": 0.5561807155609131, "learning_rate": 2.864630083076694e-06, "loss": 0.0067808, "memory(GiB)": 15.03, "step": 18635, "train_speed(iter/s)": 1.475062 }, { "acc": 0.99863644, "epoch": 32.903795233892325, "grad_norm": 0.3445765972137451, "learning_rate": 2.8619893982847608e-06, "loss": 0.01089824, "memory(GiB)": 15.03, "step": 18640, "train_speed(iter/s)": 1.475066 }, { "acc": 0.99924889, "epoch": 32.9126213592233, "grad_norm": 0.3691254258155823, "learning_rate": 2.859349443506624e-06, "loss": 0.00748463, "memory(GiB)": 15.03, "step": 18645, "train_speed(iter/s)": 1.475072 }, { "acc": 0.99853497, "epoch": 32.92144748455428, "grad_norm": 1.273218035697937, "learning_rate": 2.856710219643471e-06, "loss": 0.01117948, "memory(GiB)": 15.03, "step": 18650, "train_speed(iter/s)": 1.475078 }, { "acc": 0.99984179, "epoch": 32.93027360988526, "grad_norm": 0.303171843290329, "learning_rate": 2.854071727596246e-06, "loss": 0.00629357, "memory(GiB)": 15.03, "step": 18655, "train_speed(iter/s)": 1.475101 }, { "acc": 1.0, "epoch": 32.93909973521624, "grad_norm": 0.10513585060834885, "learning_rate": 2.8514339682656345e-06, "loss": 0.00615008, "memory(GiB)": 15.03, "step": 18660, "train_speed(iter/s)": 1.475107 }, { "acc": 0.99935436, "epoch": 32.94792586054722, "grad_norm": 0.7142825126647949, "learning_rate": 2.848796942552081e-06, "loss": 0.00946655, "memory(GiB)": 15.03, "step": 18665, "train_speed(iter/s)": 1.47511 }, { "acc": 1.0, "epoch": 32.9567519858782, "grad_norm": 0.5064623355865479, "learning_rate": 2.846160651355774e-06, "loss": 0.00666032, "memory(GiB)": 15.03, "step": 18670, "train_speed(iter/s)": 1.475115 }, { "acc": 0.99833574, "epoch": 32.965578111209176, "grad_norm": 0.8369842171669006, "learning_rate": 2.8435250955766508e-06, "loss": 0.01764584, "memory(GiB)": 15.03, "step": 18675, "train_speed(iter/s)": 1.475112 }, { "acc": 0.99947195, "epoch": 32.97440423654016, "grad_norm": 0.5446107387542725, "learning_rate": 2.8408902761143982e-06, "loss": 0.01016438, "memory(GiB)": 15.03, "step": 18680, "train_speed(iter/s)": 1.475118 }, { "acc": 0.99955702, "epoch": 32.983230361871136, "grad_norm": 0.04292583465576172, "learning_rate": 2.838256193868457e-06, "loss": 0.005251, "memory(GiB)": 15.03, "step": 18685, "train_speed(iter/s)": 1.475115 }, { "acc": 0.99951925, "epoch": 32.99205648720212, "grad_norm": 0.02216477505862713, "learning_rate": 2.8356228497380094e-06, "loss": 0.00658311, "memory(GiB)": 15.03, "step": 18690, "train_speed(iter/s)": 1.47513 }, { "acc": 0.99987745, "epoch": 33.0008826125331, "grad_norm": 0.2353648543357849, "learning_rate": 2.8329902446219894e-06, "loss": 0.00740957, "memory(GiB)": 15.03, "step": 18695, "train_speed(iter/s)": 1.47509 }, { "acc": 0.99894657, "epoch": 33.00970873786408, "grad_norm": 0.30848264694213867, "learning_rate": 2.830358379419074e-06, "loss": 0.0097137, "memory(GiB)": 15.03, "step": 18700, "train_speed(iter/s)": 1.475108 }, { "acc": 0.99900684, "epoch": 33.01853486319506, "grad_norm": 0.684176504611969, "learning_rate": 2.827727255027697e-06, "loss": 0.014068, "memory(GiB)": 15.03, "step": 18705, "train_speed(iter/s)": 1.475099 }, { "acc": 1.0, "epoch": 33.027360988526034, "grad_norm": 0.7083001136779785, "learning_rate": 2.825096872346028e-06, "loss": 0.0110539, "memory(GiB)": 15.03, "step": 18710, "train_speed(iter/s)": 1.4751 }, { "acc": 1.0, "epoch": 33.03618711385702, "grad_norm": 0.5218945741653442, "learning_rate": 2.8224672322719948e-06, "loss": 0.00643306, "memory(GiB)": 15.03, "step": 18715, "train_speed(iter/s)": 1.475113 }, { "acc": 0.99817104, "epoch": 33.045013239187995, "grad_norm": 0.35598164796829224, "learning_rate": 2.8198383357032617e-06, "loss": 0.01332065, "memory(GiB)": 15.03, "step": 18720, "train_speed(iter/s)": 1.475114 }, { "acc": 0.99833851, "epoch": 33.05383936451898, "grad_norm": 0.23216435313224792, "learning_rate": 2.8172101835372447e-06, "loss": 0.01497711, "memory(GiB)": 15.03, "step": 18725, "train_speed(iter/s)": 1.475117 }, { "acc": 0.99947186, "epoch": 33.062665489849955, "grad_norm": 0.3036978244781494, "learning_rate": 2.8145827766711016e-06, "loss": 0.00307484, "memory(GiB)": 15.03, "step": 18730, "train_speed(iter/s)": 1.47513 }, { "acc": 0.99956617, "epoch": 33.07149161518094, "grad_norm": 0.2369050532579422, "learning_rate": 2.811956116001743e-06, "loss": 0.00296967, "memory(GiB)": 15.03, "step": 18735, "train_speed(iter/s)": 1.475137 }, { "acc": 0.99944057, "epoch": 33.080317740511916, "grad_norm": 0.16569902002811432, "learning_rate": 2.8093302024258166e-06, "loss": 0.00853085, "memory(GiB)": 15.03, "step": 18740, "train_speed(iter/s)": 1.475141 }, { "acc": 0.9997159, "epoch": 33.08914386584289, "grad_norm": 0.37179675698280334, "learning_rate": 2.806705036839723e-06, "loss": 0.00656526, "memory(GiB)": 15.03, "step": 18745, "train_speed(iter/s)": 1.475132 }, { "acc": 0.99935932, "epoch": 33.097969991173876, "grad_norm": 0.5288481116294861, "learning_rate": 2.8040806201396003e-06, "loss": 0.00826252, "memory(GiB)": 15.03, "step": 18750, "train_speed(iter/s)": 1.475147 }, { "acc": 0.99910555, "epoch": 33.10679611650485, "grad_norm": 1.3665916919708252, "learning_rate": 2.8014569532213316e-06, "loss": 0.01321889, "memory(GiB)": 15.03, "step": 18755, "train_speed(iter/s)": 1.475123 }, { "acc": 0.99944754, "epoch": 33.11562224183584, "grad_norm": 0.09904374927282333, "learning_rate": 2.7988340369805516e-06, "loss": 0.00452971, "memory(GiB)": 15.03, "step": 18760, "train_speed(iter/s)": 1.475118 }, { "acc": 0.99962978, "epoch": 33.12444836716681, "grad_norm": 0.12551160156726837, "learning_rate": 2.7962118723126304e-06, "loss": 0.00542181, "memory(GiB)": 15.03, "step": 18765, "train_speed(iter/s)": 1.475125 }, { "acc": 0.99944096, "epoch": 33.13327449249779, "grad_norm": 0.517675518989563, "learning_rate": 2.793590460112686e-06, "loss": 0.00907784, "memory(GiB)": 15.03, "step": 18770, "train_speed(iter/s)": 1.475126 }, { "acc": 0.99860783, "epoch": 33.142100617828774, "grad_norm": 2.579441785812378, "learning_rate": 2.790969801275574e-06, "loss": 0.0113189, "memory(GiB)": 15.03, "step": 18775, "train_speed(iter/s)": 1.47512 }, { "acc": 0.99968748, "epoch": 33.15092674315975, "grad_norm": 0.4570350646972656, "learning_rate": 2.7883498966959034e-06, "loss": 0.01051791, "memory(GiB)": 15.03, "step": 18780, "train_speed(iter/s)": 1.475125 }, { "acc": 0.99899998, "epoch": 33.159752868490735, "grad_norm": 0.017921915277838707, "learning_rate": 2.785730747268013e-06, "loss": 0.01542263, "memory(GiB)": 15.03, "step": 18785, "train_speed(iter/s)": 1.475114 }, { "acc": 0.99980164, "epoch": 33.16857899382171, "grad_norm": 0.0262431763112545, "learning_rate": 2.783112353885996e-06, "loss": 0.00414122, "memory(GiB)": 15.03, "step": 18790, "train_speed(iter/s)": 1.475131 }, { "acc": 0.99931965, "epoch": 33.177405119152695, "grad_norm": 3.5050926208496094, "learning_rate": 2.7804947174436785e-06, "loss": 0.01235118, "memory(GiB)": 15.03, "step": 18795, "train_speed(iter/s)": 1.475115 }, { "acc": 1.0, "epoch": 33.18623124448367, "grad_norm": 0.6929489970207214, "learning_rate": 2.7778778388346324e-06, "loss": 0.00694146, "memory(GiB)": 15.03, "step": 18800, "train_speed(iter/s)": 1.475132 }, { "acc": 0.99913979, "epoch": 33.19505736981465, "grad_norm": 0.20260411500930786, "learning_rate": 2.775261718952167e-06, "loss": 0.01322337, "memory(GiB)": 15.03, "step": 18805, "train_speed(iter/s)": 1.47512 }, { "acc": 0.99950676, "epoch": 33.20388349514563, "grad_norm": 0.4398316740989685, "learning_rate": 2.772646358689339e-06, "loss": 0.00676005, "memory(GiB)": 15.03, "step": 18810, "train_speed(iter/s)": 1.47513 }, { "acc": 0.99837341, "epoch": 33.21270962047661, "grad_norm": 2.479499101638794, "learning_rate": 2.7700317589389406e-06, "loss": 0.02091443, "memory(GiB)": 15.03, "step": 18815, "train_speed(iter/s)": 1.475137 }, { "acc": 0.99979172, "epoch": 33.22153574580759, "grad_norm": 0.31252521276474, "learning_rate": 2.7674179205935077e-06, "loss": 0.00316846, "memory(GiB)": 15.03, "step": 18820, "train_speed(iter/s)": 1.47515 }, { "acc": 0.99965525, "epoch": 33.23036187113857, "grad_norm": 0.3161948621273041, "learning_rate": 2.76480484454531e-06, "loss": 0.00408223, "memory(GiB)": 15.03, "step": 18825, "train_speed(iter/s)": 1.475175 }, { "acc": 0.99920378, "epoch": 33.23918799646955, "grad_norm": 0.6463243961334229, "learning_rate": 2.7621925316863674e-06, "loss": 0.00787506, "memory(GiB)": 15.03, "step": 18830, "train_speed(iter/s)": 1.475177 }, { "acc": 0.99936438, "epoch": 33.24801412180053, "grad_norm": 0.054816555231809616, "learning_rate": 2.759580982908428e-06, "loss": 0.00657171, "memory(GiB)": 15.03, "step": 18835, "train_speed(iter/s)": 1.475197 }, { "acc": 0.99969511, "epoch": 33.25684024713151, "grad_norm": 0.6461918354034424, "learning_rate": 2.7569701991029895e-06, "loss": 0.00330861, "memory(GiB)": 15.03, "step": 18840, "train_speed(iter/s)": 1.475201 }, { "acc": 0.99911556, "epoch": 33.26566637246249, "grad_norm": 0.05190647020936012, "learning_rate": 2.754360181161282e-06, "loss": 0.00827834, "memory(GiB)": 15.03, "step": 18845, "train_speed(iter/s)": 1.47519 }, { "acc": 0.99979172, "epoch": 33.27449249779347, "grad_norm": 0.10378647595643997, "learning_rate": 2.751750929974273e-06, "loss": 0.00378598, "memory(GiB)": 15.03, "step": 18850, "train_speed(iter/s)": 1.475178 }, { "acc": 0.99973402, "epoch": 33.28331862312445, "grad_norm": 0.13099542260169983, "learning_rate": 2.749142446432674e-06, "loss": 0.01059505, "memory(GiB)": 15.03, "step": 18855, "train_speed(iter/s)": 1.475186 }, { "acc": 0.9993825, "epoch": 33.29214474845543, "grad_norm": 0.6742151379585266, "learning_rate": 2.7465347314269285e-06, "loss": 0.00825879, "memory(GiB)": 15.03, "step": 18860, "train_speed(iter/s)": 1.475178 }, { "acc": 1.0, "epoch": 33.300970873786405, "grad_norm": 0.2988441586494446, "learning_rate": 2.743927785847225e-06, "loss": 0.00116078, "memory(GiB)": 15.03, "step": 18865, "train_speed(iter/s)": 1.475199 }, { "acc": 0.99930735, "epoch": 33.30979699911739, "grad_norm": 0.41389355063438416, "learning_rate": 2.741321610583481e-06, "loss": 0.01253596, "memory(GiB)": 15.03, "step": 18870, "train_speed(iter/s)": 1.475211 }, { "acc": 0.9993927, "epoch": 33.318623124448365, "grad_norm": 1.5196171998977661, "learning_rate": 2.7387162065253564e-06, "loss": 0.00865254, "memory(GiB)": 15.03, "step": 18875, "train_speed(iter/s)": 1.47521 }, { "acc": 0.99926472, "epoch": 33.32744924977935, "grad_norm": 0.07944286614656448, "learning_rate": 2.7361115745622433e-06, "loss": 0.01439542, "memory(GiB)": 15.03, "step": 18880, "train_speed(iter/s)": 1.475222 }, { "acc": 0.99978065, "epoch": 33.336275375110326, "grad_norm": 0.24905584752559662, "learning_rate": 2.733507715583279e-06, "loss": 0.01211265, "memory(GiB)": 15.03, "step": 18885, "train_speed(iter/s)": 1.475224 }, { "acc": 0.99955931, "epoch": 33.34510150044131, "grad_norm": 0.37074482440948486, "learning_rate": 2.7309046304773272e-06, "loss": 0.00396227, "memory(GiB)": 15.03, "step": 18890, "train_speed(iter/s)": 1.475233 }, { "acc": 0.99931946, "epoch": 33.353927625772286, "grad_norm": 0.023587817326188087, "learning_rate": 2.728302320132993e-06, "loss": 0.00918183, "memory(GiB)": 15.03, "step": 18895, "train_speed(iter/s)": 1.475237 }, { "acc": 0.99953709, "epoch": 33.36275375110326, "grad_norm": 0.27451804280281067, "learning_rate": 2.725700785438612e-06, "loss": 0.00784803, "memory(GiB)": 15.03, "step": 18900, "train_speed(iter/s)": 1.47525 }, { "acc": 0.99909992, "epoch": 33.37157987643425, "grad_norm": 0.7406628727912903, "learning_rate": 2.7231000272822627e-06, "loss": 0.01562772, "memory(GiB)": 15.03, "step": 18905, "train_speed(iter/s)": 1.475256 }, { "acc": 0.99911251, "epoch": 33.38040600176522, "grad_norm": 0.4460514783859253, "learning_rate": 2.7205000465517518e-06, "loss": 0.0084352, "memory(GiB)": 15.03, "step": 18910, "train_speed(iter/s)": 1.475268 }, { "acc": 0.99985294, "epoch": 33.38923212709621, "grad_norm": 0.4826985001564026, "learning_rate": 2.717900844134625e-06, "loss": 0.00618441, "memory(GiB)": 15.03, "step": 18915, "train_speed(iter/s)": 1.475257 }, { "acc": 0.99919853, "epoch": 33.398058252427184, "grad_norm": 0.5174635052680969, "learning_rate": 2.71530242091816e-06, "loss": 0.00916165, "memory(GiB)": 15.03, "step": 18920, "train_speed(iter/s)": 1.475257 }, { "acc": 0.9998579, "epoch": 33.40688437775817, "grad_norm": 0.16717647016048431, "learning_rate": 2.712704777789368e-06, "loss": 0.00678531, "memory(GiB)": 15.03, "step": 18925, "train_speed(iter/s)": 1.475262 }, { "acc": 0.99864197, "epoch": 33.415710503089144, "grad_norm": 7.027989387512207, "learning_rate": 2.7101079156349936e-06, "loss": 0.01907008, "memory(GiB)": 15.03, "step": 18930, "train_speed(iter/s)": 1.475264 }, { "acc": 0.99920235, "epoch": 33.42453662842012, "grad_norm": 0.45898592472076416, "learning_rate": 2.707511835341519e-06, "loss": 0.00756073, "memory(GiB)": 15.03, "step": 18935, "train_speed(iter/s)": 1.475273 }, { "acc": 0.99948463, "epoch": 33.433362753751105, "grad_norm": 0.6473753452301025, "learning_rate": 2.704916537795156e-06, "loss": 0.00529045, "memory(GiB)": 15.03, "step": 18940, "train_speed(iter/s)": 1.475271 }, { "acc": 0.99941406, "epoch": 33.44218887908208, "grad_norm": 0.5202497243881226, "learning_rate": 2.7023220238818476e-06, "loss": 0.00573552, "memory(GiB)": 15.03, "step": 18945, "train_speed(iter/s)": 1.475268 }, { "acc": 0.99900646, "epoch": 33.451015004413065, "grad_norm": 0.48201918601989746, "learning_rate": 2.6997282944872715e-06, "loss": 0.00775977, "memory(GiB)": 15.03, "step": 18950, "train_speed(iter/s)": 1.475265 }, { "acc": 0.99943571, "epoch": 33.45984112974404, "grad_norm": 0.20521707832813263, "learning_rate": 2.697135350496838e-06, "loss": 0.01109671, "memory(GiB)": 15.03, "step": 18955, "train_speed(iter/s)": 1.475272 }, { "acc": 1.0, "epoch": 33.46866725507502, "grad_norm": 0.3855040669441223, "learning_rate": 2.6945431927956923e-06, "loss": 0.0012565, "memory(GiB)": 15.03, "step": 18960, "train_speed(iter/s)": 1.475278 }, { "acc": 0.99960938, "epoch": 33.477493380406, "grad_norm": 0.6814864277839661, "learning_rate": 2.6919518222687036e-06, "loss": 0.00641263, "memory(GiB)": 15.03, "step": 18965, "train_speed(iter/s)": 1.475271 }, { "acc": 1.0, "epoch": 33.48631950573698, "grad_norm": 0.19899816811084747, "learning_rate": 2.689361239800478e-06, "loss": 0.00179108, "memory(GiB)": 15.03, "step": 18970, "train_speed(iter/s)": 1.475264 }, { "acc": 0.99980469, "epoch": 33.49514563106796, "grad_norm": 1.3636479377746582, "learning_rate": 2.6867714462753486e-06, "loss": 0.00949557, "memory(GiB)": 15.03, "step": 18975, "train_speed(iter/s)": 1.475257 }, { "acc": 0.99933224, "epoch": 33.50397175639894, "grad_norm": 0.45769819617271423, "learning_rate": 2.684182442577386e-06, "loss": 0.00557433, "memory(GiB)": 15.03, "step": 18980, "train_speed(iter/s)": 1.475267 }, { "acc": 0.99872551, "epoch": 33.512797881729924, "grad_norm": 0.3878409266471863, "learning_rate": 2.681594229590382e-06, "loss": 0.00952666, "memory(GiB)": 15.03, "step": 18985, "train_speed(iter/s)": 1.475277 }, { "acc": 0.99937992, "epoch": 33.5216240070609, "grad_norm": 0.16463063657283783, "learning_rate": 2.679006808197868e-06, "loss": 0.00466601, "memory(GiB)": 15.03, "step": 18990, "train_speed(iter/s)": 1.475268 }, { "acc": 0.99989405, "epoch": 33.53045013239188, "grad_norm": 0.3601459562778473, "learning_rate": 2.676420179283099e-06, "loss": 0.00676351, "memory(GiB)": 15.03, "step": 18995, "train_speed(iter/s)": 1.475285 }, { "acc": 0.9993371, "epoch": 33.53927625772286, "grad_norm": 0.6067730188369751, "learning_rate": 2.6738343437290593e-06, "loss": 0.00931116, "memory(GiB)": 15.03, "step": 19000, "train_speed(iter/s)": 1.475298 }, { "acc": 0.99974489, "epoch": 33.54810238305384, "grad_norm": 0.7955930829048157, "learning_rate": 2.6712493024184625e-06, "loss": 0.0099228, "memory(GiB)": 15.03, "step": 19005, "train_speed(iter/s)": 1.475299 }, { "acc": 0.99934902, "epoch": 33.55692850838482, "grad_norm": 0.048212260007858276, "learning_rate": 2.668665056233757e-06, "loss": 0.00659049, "memory(GiB)": 15.03, "step": 19010, "train_speed(iter/s)": 1.475283 }, { "acc": 0.99955616, "epoch": 33.5657546337158, "grad_norm": 0.35906463861465454, "learning_rate": 2.6660816060571143e-06, "loss": 0.01042913, "memory(GiB)": 15.03, "step": 19015, "train_speed(iter/s)": 1.475284 }, { "acc": 0.99961882, "epoch": 33.57458075904678, "grad_norm": 0.3064565360546112, "learning_rate": 2.663498952770434e-06, "loss": 0.00665913, "memory(GiB)": 15.03, "step": 19020, "train_speed(iter/s)": 1.475286 }, { "acc": 0.99954309, "epoch": 33.58340688437776, "grad_norm": 2.002455472946167, "learning_rate": 2.6609170972553435e-06, "loss": 0.00508801, "memory(GiB)": 15.03, "step": 19025, "train_speed(iter/s)": 1.475283 }, { "acc": 0.99988422, "epoch": 33.592233009708735, "grad_norm": 0.033551234751939774, "learning_rate": 2.6583360403932023e-06, "loss": 0.00606429, "memory(GiB)": 15.03, "step": 19030, "train_speed(iter/s)": 1.475289 }, { "acc": 0.99934969, "epoch": 33.60105913503972, "grad_norm": 0.5377687215805054, "learning_rate": 2.6557557830650905e-06, "loss": 0.00735734, "memory(GiB)": 15.03, "step": 19035, "train_speed(iter/s)": 1.475279 }, { "acc": 0.99867582, "epoch": 33.609885260370696, "grad_norm": 0.13296456634998322, "learning_rate": 2.6531763261518236e-06, "loss": 0.01126805, "memory(GiB)": 15.03, "step": 19040, "train_speed(iter/s)": 1.475293 }, { "acc": 1.0, "epoch": 33.61871138570168, "grad_norm": 0.17258238792419434, "learning_rate": 2.6505976705339377e-06, "loss": 0.01668475, "memory(GiB)": 15.03, "step": 19045, "train_speed(iter/s)": 1.475276 }, { "acc": 0.9989687, "epoch": 33.627537511032656, "grad_norm": 0.24675510823726654, "learning_rate": 2.6480198170916952e-06, "loss": 0.00556964, "memory(GiB)": 15.03, "step": 19050, "train_speed(iter/s)": 1.475285 }, { "acc": 1.0, "epoch": 33.63636363636363, "grad_norm": 0.06344983726739883, "learning_rate": 2.6454427667050868e-06, "loss": 0.0077005, "memory(GiB)": 15.03, "step": 19055, "train_speed(iter/s)": 1.475277 }, { "acc": 1.0, "epoch": 33.64518976169462, "grad_norm": 0.02695801667869091, "learning_rate": 2.64286652025383e-06, "loss": 0.00491974, "memory(GiB)": 15.03, "step": 19060, "train_speed(iter/s)": 1.475299 }, { "acc": 0.99945927, "epoch": 33.654015887025594, "grad_norm": 0.4100799858570099, "learning_rate": 2.6402910786173663e-06, "loss": 0.00624471, "memory(GiB)": 15.03, "step": 19065, "train_speed(iter/s)": 1.475319 }, { "acc": 1.0, "epoch": 33.66284201235658, "grad_norm": 0.3295927941799164, "learning_rate": 2.6377164426748603e-06, "loss": 0.00642931, "memory(GiB)": 15.03, "step": 19070, "train_speed(iter/s)": 1.475333 }, { "acc": 0.99972219, "epoch": 33.671668137687554, "grad_norm": 0.4286789298057556, "learning_rate": 2.6351426133052087e-06, "loss": 0.00249806, "memory(GiB)": 15.03, "step": 19075, "train_speed(iter/s)": 1.475337 }, { "acc": 0.99979506, "epoch": 33.68049426301854, "grad_norm": 0.19238440692424774, "learning_rate": 2.6325695913870224e-06, "loss": 0.00350147, "memory(GiB)": 15.03, "step": 19080, "train_speed(iter/s)": 1.475352 }, { "acc": 0.9995266, "epoch": 33.689320388349515, "grad_norm": 0.6239384412765503, "learning_rate": 2.629997377798648e-06, "loss": 0.00607401, "memory(GiB)": 15.03, "step": 19085, "train_speed(iter/s)": 1.475352 }, { "acc": 1.0, "epoch": 33.69814651368049, "grad_norm": 0.04920322448015213, "learning_rate": 2.627425973418147e-06, "loss": 0.00105288, "memory(GiB)": 15.03, "step": 19090, "train_speed(iter/s)": 1.475347 }, { "acc": 0.99963379, "epoch": 33.706972639011475, "grad_norm": 1.2605068683624268, "learning_rate": 2.6248553791233094e-06, "loss": 0.01330207, "memory(GiB)": 15.03, "step": 19095, "train_speed(iter/s)": 1.475363 }, { "acc": 0.99916172, "epoch": 33.71579876434245, "grad_norm": 0.35705068707466125, "learning_rate": 2.622285595791645e-06, "loss": 0.01803802, "memory(GiB)": 15.03, "step": 19100, "train_speed(iter/s)": 1.475357 }, { "acc": 0.99923973, "epoch": 33.724624889673436, "grad_norm": 1.0195802450180054, "learning_rate": 2.619716624300393e-06, "loss": 0.00530656, "memory(GiB)": 15.03, "step": 19105, "train_speed(iter/s)": 1.475374 }, { "acc": 0.9998476, "epoch": 33.73345101500441, "grad_norm": 0.35015541315078735, "learning_rate": 2.61714846552651e-06, "loss": 0.00549616, "memory(GiB)": 15.03, "step": 19110, "train_speed(iter/s)": 1.475371 }, { "acc": 0.99935036, "epoch": 33.74227714033539, "grad_norm": 0.4837400019168854, "learning_rate": 2.6145811203466775e-06, "loss": 0.01356559, "memory(GiB)": 15.03, "step": 19115, "train_speed(iter/s)": 1.475377 }, { "acc": 0.99954224, "epoch": 33.75110326566637, "grad_norm": 0.5704022645950317, "learning_rate": 2.612014589637296e-06, "loss": 0.00515211, "memory(GiB)": 15.03, "step": 19120, "train_speed(iter/s)": 1.475354 }, { "acc": 0.99984179, "epoch": 33.75992939099735, "grad_norm": 0.3986596167087555, "learning_rate": 2.6094488742744946e-06, "loss": 0.00612408, "memory(GiB)": 15.03, "step": 19125, "train_speed(iter/s)": 1.475338 }, { "acc": 0.99855022, "epoch": 33.76875551632833, "grad_norm": 2.082425355911255, "learning_rate": 2.6068839751341158e-06, "loss": 0.01947937, "memory(GiB)": 15.03, "step": 19130, "train_speed(iter/s)": 1.475335 }, { "acc": 0.99949007, "epoch": 33.77758164165931, "grad_norm": 1.8466401100158691, "learning_rate": 2.6043198930917322e-06, "loss": 0.00673563, "memory(GiB)": 15.03, "step": 19135, "train_speed(iter/s)": 1.475342 }, { "acc": 1.0, "epoch": 33.786407766990294, "grad_norm": 0.5277537703514099, "learning_rate": 2.601756629022631e-06, "loss": 0.00768714, "memory(GiB)": 15.03, "step": 19140, "train_speed(iter/s)": 1.475344 }, { "acc": 0.99980469, "epoch": 33.79523389232127, "grad_norm": 1.9600661993026733, "learning_rate": 2.599194183801822e-06, "loss": 0.0081865, "memory(GiB)": 15.03, "step": 19145, "train_speed(iter/s)": 1.475355 }, { "acc": 0.99964275, "epoch": 33.80406001765225, "grad_norm": 0.21644894778728485, "learning_rate": 2.596632558304034e-06, "loss": 0.00609814, "memory(GiB)": 15.03, "step": 19150, "train_speed(iter/s)": 1.475351 }, { "acc": 0.99956894, "epoch": 33.81288614298323, "grad_norm": 0.3437494933605194, "learning_rate": 2.594071753403721e-06, "loss": 0.00880522, "memory(GiB)": 15.03, "step": 19155, "train_speed(iter/s)": 1.475351 }, { "acc": 0.99907112, "epoch": 33.82171226831421, "grad_norm": 0.21501494944095612, "learning_rate": 2.5915117699750504e-06, "loss": 0.00555588, "memory(GiB)": 15.03, "step": 19160, "train_speed(iter/s)": 1.475343 }, { "acc": 0.99969511, "epoch": 33.83053839364519, "grad_norm": 3.300936222076416, "learning_rate": 2.588952608891916e-06, "loss": 0.00754613, "memory(GiB)": 15.03, "step": 19165, "train_speed(iter/s)": 1.475356 }, { "acc": 1.0, "epoch": 33.83936451897617, "grad_norm": 0.23539476096630096, "learning_rate": 2.5863942710279237e-06, "loss": 0.00670053, "memory(GiB)": 15.03, "step": 19170, "train_speed(iter/s)": 1.475368 }, { "acc": 0.99978065, "epoch": 33.84819064430715, "grad_norm": 1.960307240486145, "learning_rate": 2.5838367572564023e-06, "loss": 0.00523424, "memory(GiB)": 15.03, "step": 19175, "train_speed(iter/s)": 1.475384 }, { "acc": 0.99961224, "epoch": 33.85701676963813, "grad_norm": 0.21228750050067902, "learning_rate": 2.5812800684504007e-06, "loss": 0.00737125, "memory(GiB)": 15.03, "step": 19180, "train_speed(iter/s)": 1.475397 }, { "acc": 0.99985466, "epoch": 33.865842894969106, "grad_norm": 1.0342230796813965, "learning_rate": 2.578724205482684e-06, "loss": 0.00736153, "memory(GiB)": 15.03, "step": 19185, "train_speed(iter/s)": 1.475384 }, { "acc": 0.99925289, "epoch": 33.87466902030009, "grad_norm": 2.203308582305908, "learning_rate": 2.5761691692257346e-06, "loss": 0.00398376, "memory(GiB)": 15.03, "step": 19190, "train_speed(iter/s)": 1.475389 }, { "acc": 1.0, "epoch": 33.883495145631066, "grad_norm": 0.24461309611797333, "learning_rate": 2.5736149605517525e-06, "loss": 0.0033328, "memory(GiB)": 15.03, "step": 19195, "train_speed(iter/s)": 1.475391 }, { "acc": 0.99935608, "epoch": 33.89232127096205, "grad_norm": 0.4249266982078552, "learning_rate": 2.571061580332659e-06, "loss": 0.00577959, "memory(GiB)": 15.03, "step": 19200, "train_speed(iter/s)": 1.475393 }, { "acc": 0.99972582, "epoch": 33.90114739629303, "grad_norm": 0.19218586385250092, "learning_rate": 2.5685090294400883e-06, "loss": 0.00457609, "memory(GiB)": 15.03, "step": 19205, "train_speed(iter/s)": 1.475386 }, { "acc": 0.99858532, "epoch": 33.90997352162401, "grad_norm": 0.21774178743362427, "learning_rate": 2.565957308745396e-06, "loss": 0.01148361, "memory(GiB)": 15.03, "step": 19210, "train_speed(iter/s)": 1.475385 }, { "acc": 0.99949999, "epoch": 33.91879964695499, "grad_norm": 0.013962136581540108, "learning_rate": 2.5634064191196493e-06, "loss": 0.00427859, "memory(GiB)": 15.03, "step": 19215, "train_speed(iter/s)": 1.475384 }, { "acc": 0.99913235, "epoch": 33.927625772285964, "grad_norm": 0.05153771489858627, "learning_rate": 2.560856361433635e-06, "loss": 0.00669796, "memory(GiB)": 15.03, "step": 19220, "train_speed(iter/s)": 1.475366 }, { "acc": 0.99975491, "epoch": 33.93645189761695, "grad_norm": 0.1857522875070572, "learning_rate": 2.5583071365578527e-06, "loss": 0.00342697, "memory(GiB)": 15.03, "step": 19225, "train_speed(iter/s)": 1.475362 }, { "acc": 1.0, "epoch": 33.945278022947925, "grad_norm": 0.21126241981983185, "learning_rate": 2.555758745362523e-06, "loss": 0.00473944, "memory(GiB)": 15.03, "step": 19230, "train_speed(iter/s)": 1.475374 }, { "acc": 0.9997159, "epoch": 33.95410414827891, "grad_norm": 0.2154933512210846, "learning_rate": 2.5532111887175766e-06, "loss": 0.00367481, "memory(GiB)": 15.03, "step": 19235, "train_speed(iter/s)": 1.475385 }, { "acc": 0.99972219, "epoch": 33.962930273609885, "grad_norm": 0.024925032630562782, "learning_rate": 2.5506644674926635e-06, "loss": 0.00516517, "memory(GiB)": 15.03, "step": 19240, "train_speed(iter/s)": 1.475391 }, { "acc": 0.99900961, "epoch": 33.97175639894086, "grad_norm": 0.1049136370420456, "learning_rate": 2.5481185825571423e-06, "loss": 0.00960525, "memory(GiB)": 15.03, "step": 19245, "train_speed(iter/s)": 1.47541 }, { "acc": 0.9997282, "epoch": 33.980582524271846, "grad_norm": 0.030980706214904785, "learning_rate": 2.545573534780096e-06, "loss": 0.00892535, "memory(GiB)": 15.03, "step": 19250, "train_speed(iter/s)": 1.475423 }, { "acc": 0.99939365, "epoch": 33.98940864960282, "grad_norm": 0.4026886224746704, "learning_rate": 2.543029325030311e-06, "loss": 0.00737418, "memory(GiB)": 15.03, "step": 19255, "train_speed(iter/s)": 1.475437 }, { "acc": 0.99984179, "epoch": 33.998234774933806, "grad_norm": 0.10515116900205612, "learning_rate": 2.5404859541762978e-06, "loss": 0.00285431, "memory(GiB)": 15.03, "step": 19260, "train_speed(iter/s)": 1.475436 }, { "acc": 0.99953346, "epoch": 34.00706090026478, "grad_norm": 0.1740943044424057, "learning_rate": 2.537943423086273e-06, "loss": 0.01089431, "memory(GiB)": 15.03, "step": 19265, "train_speed(iter/s)": 1.475367 }, { "acc": 0.99874077, "epoch": 34.01588702559577, "grad_norm": 0.07221147418022156, "learning_rate": 2.5354017326281684e-06, "loss": 0.01392811, "memory(GiB)": 15.03, "step": 19270, "train_speed(iter/s)": 1.475378 }, { "acc": 0.9997282, "epoch": 34.02471315092674, "grad_norm": 0.9150261878967285, "learning_rate": 2.532860883669629e-06, "loss": 0.00870524, "memory(GiB)": 15.03, "step": 19275, "train_speed(iter/s)": 1.475378 }, { "acc": 0.99979506, "epoch": 34.03353927625772, "grad_norm": 0.20292316377162933, "learning_rate": 2.530320877078016e-06, "loss": 0.00550447, "memory(GiB)": 15.03, "step": 19280, "train_speed(iter/s)": 1.475388 }, { "acc": 1.0, "epoch": 34.042365401588704, "grad_norm": 0.114869125187397, "learning_rate": 2.527781713720396e-06, "loss": 0.00459478, "memory(GiB)": 15.03, "step": 19285, "train_speed(iter/s)": 1.475379 }, { "acc": 0.99935455, "epoch": 34.05119152691968, "grad_norm": 0.42411142587661743, "learning_rate": 2.525243394463556e-06, "loss": 0.00867316, "memory(GiB)": 15.03, "step": 19290, "train_speed(iter/s)": 1.475394 }, { "acc": 0.99935884, "epoch": 34.060017652250664, "grad_norm": 0.48628219962120056, "learning_rate": 2.522705920173989e-06, "loss": 0.0099642, "memory(GiB)": 15.03, "step": 19295, "train_speed(iter/s)": 1.475374 }, { "acc": 0.99965792, "epoch": 34.06884377758164, "grad_norm": 0.1970711052417755, "learning_rate": 2.5201692917178995e-06, "loss": 0.00983912, "memory(GiB)": 15.03, "step": 19300, "train_speed(iter/s)": 1.475375 }, { "acc": 0.99923916, "epoch": 34.077669902912625, "grad_norm": 2.307204246520996, "learning_rate": 2.517633509961208e-06, "loss": 0.00611274, "memory(GiB)": 15.03, "step": 19305, "train_speed(iter/s)": 1.475386 }, { "acc": 1.0, "epoch": 34.0864960282436, "grad_norm": 0.4904381334781647, "learning_rate": 2.5150985757695424e-06, "loss": 0.00302659, "memory(GiB)": 15.03, "step": 19310, "train_speed(iter/s)": 1.47538 }, { "acc": 0.99956894, "epoch": 34.09532215357458, "grad_norm": 2.431091070175171, "learning_rate": 2.5125644900082404e-06, "loss": 0.00332735, "memory(GiB)": 15.03, "step": 19315, "train_speed(iter/s)": 1.475388 }, { "acc": 0.99987984, "epoch": 34.10414827890556, "grad_norm": 0.46348387002944946, "learning_rate": 2.51003125354235e-06, "loss": 0.00494338, "memory(GiB)": 15.03, "step": 19320, "train_speed(iter/s)": 1.475382 }, { "acc": 0.99910927, "epoch": 34.11297440423654, "grad_norm": 2.0729753971099854, "learning_rate": 2.5074988672366347e-06, "loss": 0.00429652, "memory(GiB)": 15.03, "step": 19325, "train_speed(iter/s)": 1.475386 }, { "acc": 1.0, "epoch": 34.12180052956752, "grad_norm": 0.8207273483276367, "learning_rate": 2.504967331955561e-06, "loss": 0.01542971, "memory(GiB)": 15.03, "step": 19330, "train_speed(iter/s)": 1.475377 }, { "acc": 0.9998311, "epoch": 34.1306266548985, "grad_norm": 0.40273740887641907, "learning_rate": 2.50243664856331e-06, "loss": 0.00708214, "memory(GiB)": 15.03, "step": 19335, "train_speed(iter/s)": 1.475381 }, { "acc": 1.0, "epoch": 34.139452780229476, "grad_norm": 0.3349788188934326, "learning_rate": 2.499906817923769e-06, "loss": 0.00517513, "memory(GiB)": 15.03, "step": 19340, "train_speed(iter/s)": 1.475365 }, { "acc": 0.99690075, "epoch": 34.14827890556046, "grad_norm": 0.6160609126091003, "learning_rate": 2.4973778409005346e-06, "loss": 0.02128076, "memory(GiB)": 15.03, "step": 19345, "train_speed(iter/s)": 1.475372 }, { "acc": 0.99962244, "epoch": 34.15710503089144, "grad_norm": 0.06940780580043793, "learning_rate": 2.4948497183569104e-06, "loss": 0.00593252, "memory(GiB)": 15.03, "step": 19350, "train_speed(iter/s)": 1.475378 }, { "acc": 0.99942398, "epoch": 34.16593115622242, "grad_norm": 0.5562392473220825, "learning_rate": 2.4923224511559143e-06, "loss": 0.0122248, "memory(GiB)": 15.03, "step": 19355, "train_speed(iter/s)": 1.475391 }, { "acc": 1.0, "epoch": 34.1747572815534, "grad_norm": 0.2523365616798401, "learning_rate": 2.489796040160266e-06, "loss": 0.00296476, "memory(GiB)": 15.03, "step": 19360, "train_speed(iter/s)": 1.475386 }, { "acc": 0.99877129, "epoch": 34.18358340688438, "grad_norm": 0.2293577641248703, "learning_rate": 2.4872704862323955e-06, "loss": 0.00970539, "memory(GiB)": 15.03, "step": 19365, "train_speed(iter/s)": 1.475391 }, { "acc": 0.99967947, "epoch": 34.19240953221536, "grad_norm": 1.2281728982925415, "learning_rate": 2.484745790234437e-06, "loss": 0.00393146, "memory(GiB)": 15.03, "step": 19370, "train_speed(iter/s)": 1.475401 }, { "acc": 0.99881525, "epoch": 34.201235657546334, "grad_norm": 0.03580470755696297, "learning_rate": 2.48222195302824e-06, "loss": 0.01159677, "memory(GiB)": 15.03, "step": 19375, "train_speed(iter/s)": 1.475407 }, { "acc": 0.99904575, "epoch": 34.21006178287732, "grad_norm": 0.19174782931804657, "learning_rate": 2.4796989754753506e-06, "loss": 0.01163825, "memory(GiB)": 15.03, "step": 19380, "train_speed(iter/s)": 1.475407 }, { "acc": 0.99924774, "epoch": 34.218887908208295, "grad_norm": 0.8779622912406921, "learning_rate": 2.4771768584370308e-06, "loss": 0.00842741, "memory(GiB)": 15.03, "step": 19385, "train_speed(iter/s)": 1.475405 }, { "acc": 0.99984179, "epoch": 34.22771403353928, "grad_norm": 0.15444570779800415, "learning_rate": 2.4746556027742418e-06, "loss": 0.00290919, "memory(GiB)": 15.03, "step": 19390, "train_speed(iter/s)": 1.475402 }, { "acc": 0.99982872, "epoch": 34.236540158870255, "grad_norm": 0.4621584415435791, "learning_rate": 2.4721352093476517e-06, "loss": 0.0038515, "memory(GiB)": 15.03, "step": 19395, "train_speed(iter/s)": 1.475416 }, { "acc": 1.0, "epoch": 34.24536628420124, "grad_norm": 0.027316443622112274, "learning_rate": 2.46961567901764e-06, "loss": 0.003668, "memory(GiB)": 15.03, "step": 19400, "train_speed(iter/s)": 1.47541 }, { "acc": 0.99890852, "epoch": 34.254192409532216, "grad_norm": 2.0067341327667236, "learning_rate": 2.4670970126442836e-06, "loss": 0.00598677, "memory(GiB)": 15.03, "step": 19405, "train_speed(iter/s)": 1.475441 }, { "acc": 0.99910603, "epoch": 34.26301853486319, "grad_norm": 1.7178030014038086, "learning_rate": 2.4645792110873725e-06, "loss": 0.01258151, "memory(GiB)": 15.03, "step": 19410, "train_speed(iter/s)": 1.475436 }, { "acc": 0.99966135, "epoch": 34.271844660194176, "grad_norm": 0.8157581686973572, "learning_rate": 2.4620622752063946e-06, "loss": 0.01214305, "memory(GiB)": 15.03, "step": 19415, "train_speed(iter/s)": 1.475429 }, { "acc": 0.99974489, "epoch": 34.28067078552515, "grad_norm": 0.5958316922187805, "learning_rate": 2.459546205860546e-06, "loss": 0.00627618, "memory(GiB)": 15.03, "step": 19420, "train_speed(iter/s)": 1.475444 }, { "acc": 1.0, "epoch": 34.28949691085614, "grad_norm": 0.3542749583721161, "learning_rate": 2.4570310039087235e-06, "loss": 0.00812078, "memory(GiB)": 15.03, "step": 19425, "train_speed(iter/s)": 1.475447 }, { "acc": 1.0, "epoch": 34.298323036187114, "grad_norm": 0.2654886841773987, "learning_rate": 2.454516670209534e-06, "loss": 0.00280451, "memory(GiB)": 15.03, "step": 19430, "train_speed(iter/s)": 1.475445 }, { "acc": 0.99974995, "epoch": 34.30714916151809, "grad_norm": 0.1427929848432541, "learning_rate": 2.4520032056212826e-06, "loss": 0.00513675, "memory(GiB)": 15.03, "step": 19435, "train_speed(iter/s)": 1.475475 }, { "acc": 0.99976854, "epoch": 34.315975286849074, "grad_norm": 0.40517935156822205, "learning_rate": 2.4494906110019807e-06, "loss": 0.00743167, "memory(GiB)": 15.03, "step": 19440, "train_speed(iter/s)": 1.475477 }, { "acc": 0.99975491, "epoch": 34.32480141218005, "grad_norm": 0.6734102368354797, "learning_rate": 2.4469788872093376e-06, "loss": 0.00830709, "memory(GiB)": 15.03, "step": 19445, "train_speed(iter/s)": 1.475464 }, { "acc": 1.0, "epoch": 34.333627537511035, "grad_norm": 0.4586543142795563, "learning_rate": 2.444468035100774e-06, "loss": 0.00571838, "memory(GiB)": 15.03, "step": 19450, "train_speed(iter/s)": 1.475483 }, { "acc": 1.0, "epoch": 34.34245366284201, "grad_norm": 0.5583115220069885, "learning_rate": 2.4419580555334043e-06, "loss": 0.00512527, "memory(GiB)": 15.03, "step": 19455, "train_speed(iter/s)": 1.475487 }, { "acc": 0.99944515, "epoch": 34.351279788172995, "grad_norm": 0.13897104561328888, "learning_rate": 2.4394489493640525e-06, "loss": 0.00423331, "memory(GiB)": 15.03, "step": 19460, "train_speed(iter/s)": 1.475486 }, { "acc": 1.0, "epoch": 34.36010591350397, "grad_norm": 0.19939042627811432, "learning_rate": 2.43694071744924e-06, "loss": 0.00672199, "memory(GiB)": 15.03, "step": 19465, "train_speed(iter/s)": 1.475481 }, { "acc": 0.9994689, "epoch": 34.36893203883495, "grad_norm": 0.10511776059865952, "learning_rate": 2.4344333606451907e-06, "loss": 0.00759193, "memory(GiB)": 15.03, "step": 19470, "train_speed(iter/s)": 1.475479 }, { "acc": 1.0, "epoch": 34.37775816416593, "grad_norm": 0.010140586644411087, "learning_rate": 2.431926879807826e-06, "loss": 0.00395065, "memory(GiB)": 15.03, "step": 19475, "train_speed(iter/s)": 1.475474 }, { "acc": 0.99987116, "epoch": 34.38658428949691, "grad_norm": 0.2764464318752289, "learning_rate": 2.429421275792778e-06, "loss": 0.00333612, "memory(GiB)": 15.03, "step": 19480, "train_speed(iter/s)": 1.475467 }, { "acc": 0.99970236, "epoch": 34.39541041482789, "grad_norm": 0.9468881487846375, "learning_rate": 2.42691654945537e-06, "loss": 0.00241642, "memory(GiB)": 15.03, "step": 19485, "train_speed(iter/s)": 1.475466 }, { "acc": 1.0, "epoch": 34.40423654015887, "grad_norm": 0.02413361705839634, "learning_rate": 2.424412701650631e-06, "loss": 0.00410973, "memory(GiB)": 15.03, "step": 19490, "train_speed(iter/s)": 1.47546 }, { "acc": 1.0, "epoch": 34.413062665489846, "grad_norm": 0.17058038711547852, "learning_rate": 2.421909733233285e-06, "loss": 0.00154829, "memory(GiB)": 15.03, "step": 19495, "train_speed(iter/s)": 1.475449 }, { "acc": 0.99962673, "epoch": 34.42188879082083, "grad_norm": 0.04139883816242218, "learning_rate": 2.419407645057761e-06, "loss": 0.00604359, "memory(GiB)": 15.03, "step": 19500, "train_speed(iter/s)": 1.475457 }, { "acc": 0.99913845, "epoch": 34.43071491615181, "grad_norm": 1.037328839302063, "learning_rate": 2.4169064379781884e-06, "loss": 0.01021543, "memory(GiB)": 15.03, "step": 19505, "train_speed(iter/s)": 1.475445 }, { "acc": 0.99867048, "epoch": 34.43954104148279, "grad_norm": 0.767073392868042, "learning_rate": 2.41440611284839e-06, "loss": 0.01232257, "memory(GiB)": 15.03, "step": 19510, "train_speed(iter/s)": 1.475448 }, { "acc": 0.99988937, "epoch": 34.44836716681377, "grad_norm": 0.27869585156440735, "learning_rate": 2.4119066705218926e-06, "loss": 0.00402484, "memory(GiB)": 15.03, "step": 19515, "train_speed(iter/s)": 1.475441 }, { "acc": 0.9997282, "epoch": 34.45719329214475, "grad_norm": 0.18666328489780426, "learning_rate": 2.4094081118519157e-06, "loss": 0.00635155, "memory(GiB)": 15.03, "step": 19520, "train_speed(iter/s)": 1.475462 }, { "acc": 0.99953861, "epoch": 34.46601941747573, "grad_norm": 0.30208680033683777, "learning_rate": 2.406910437691386e-06, "loss": 0.00946329, "memory(GiB)": 15.03, "step": 19525, "train_speed(iter/s)": 1.475469 }, { "acc": 0.99978065, "epoch": 34.474845542806705, "grad_norm": 0.019637085497379303, "learning_rate": 2.4044136488929194e-06, "loss": 0.00347948, "memory(GiB)": 15.03, "step": 19530, "train_speed(iter/s)": 1.475471 }, { "acc": 0.99759274, "epoch": 34.48367166813769, "grad_norm": 0.595937192440033, "learning_rate": 2.4019177463088363e-06, "loss": 0.0143254, "memory(GiB)": 15.03, "step": 19535, "train_speed(iter/s)": 1.475475 }, { "acc": 0.99927616, "epoch": 34.492497793468665, "grad_norm": 2.2802531719207764, "learning_rate": 2.399422730791152e-06, "loss": 0.00619181, "memory(GiB)": 15.03, "step": 19540, "train_speed(iter/s)": 1.475468 }, { "acc": 0.99920378, "epoch": 34.50132391879965, "grad_norm": 0.35300213098526, "learning_rate": 2.396928603191576e-06, "loss": 0.00722931, "memory(GiB)": 15.03, "step": 19545, "train_speed(iter/s)": 1.475484 }, { "acc": 0.99970236, "epoch": 34.510150044130626, "grad_norm": 0.21861957013607025, "learning_rate": 2.3944353643615186e-06, "loss": 0.00669703, "memory(GiB)": 15.03, "step": 19550, "train_speed(iter/s)": 1.475492 }, { "acc": 0.99872627, "epoch": 34.51897616946161, "grad_norm": 0.3949319124221802, "learning_rate": 2.391943015152087e-06, "loss": 0.00802148, "memory(GiB)": 15.03, "step": 19555, "train_speed(iter/s)": 1.475493 }, { "acc": 0.99953747, "epoch": 34.527802294792586, "grad_norm": 0.468953400850296, "learning_rate": 2.3894515564140833e-06, "loss": 0.00859143, "memory(GiB)": 15.03, "step": 19560, "train_speed(iter/s)": 1.475492 }, { "acc": 0.99949999, "epoch": 34.53662842012356, "grad_norm": 0.7867465019226074, "learning_rate": 2.3869609889980036e-06, "loss": 0.01159563, "memory(GiB)": 15.03, "step": 19565, "train_speed(iter/s)": 1.475504 }, { "acc": 0.99942551, "epoch": 34.54545454545455, "grad_norm": 0.202135369181633, "learning_rate": 2.3844713137540417e-06, "loss": 0.00982931, "memory(GiB)": 15.03, "step": 19570, "train_speed(iter/s)": 1.475502 }, { "acc": 0.99972219, "epoch": 34.55428067078552, "grad_norm": 0.3496208190917969, "learning_rate": 2.3819825315320892e-06, "loss": 0.00680438, "memory(GiB)": 15.03, "step": 19575, "train_speed(iter/s)": 1.47552 }, { "acc": 0.99953594, "epoch": 34.56310679611651, "grad_norm": 0.26764240860939026, "learning_rate": 2.379494643181728e-06, "loss": 0.00551887, "memory(GiB)": 15.03, "step": 19580, "train_speed(iter/s)": 1.475513 }, { "acc": 1.0, "epoch": 34.571932921447484, "grad_norm": 0.017849070951342583, "learning_rate": 2.37700764955224e-06, "loss": 0.00292424, "memory(GiB)": 15.03, "step": 19585, "train_speed(iter/s)": 1.475544 }, { "acc": 0.99945736, "epoch": 34.58075904677847, "grad_norm": 0.021364282816648483, "learning_rate": 2.3745215514925986e-06, "loss": 0.00309783, "memory(GiB)": 15.03, "step": 19590, "train_speed(iter/s)": 1.475531 }, { "acc": 0.99817448, "epoch": 34.589585172109444, "grad_norm": 0.982012927532196, "learning_rate": 2.3720363498514705e-06, "loss": 0.01541276, "memory(GiB)": 15.03, "step": 19595, "train_speed(iter/s)": 1.475541 }, { "acc": 1.0, "epoch": 34.59841129744042, "grad_norm": 0.3675920367240906, "learning_rate": 2.3695520454772172e-06, "loss": 0.00526494, "memory(GiB)": 15.03, "step": 19600, "train_speed(iter/s)": 1.475544 }, { "acc": 0.99981613, "epoch": 34.607237422771405, "grad_norm": 0.5142641067504883, "learning_rate": 2.367068639217898e-06, "loss": 0.00515425, "memory(GiB)": 15.03, "step": 19605, "train_speed(iter/s)": 1.475573 }, { "acc": 0.99948721, "epoch": 34.61606354810238, "grad_norm": 0.015884747728705406, "learning_rate": 2.3645861319212593e-06, "loss": 0.00464729, "memory(GiB)": 15.03, "step": 19610, "train_speed(iter/s)": 1.475563 }, { "acc": 0.99976416, "epoch": 34.624889673433366, "grad_norm": 0.0171107966452837, "learning_rate": 2.362104524434743e-06, "loss": 0.0036089, "memory(GiB)": 15.03, "step": 19615, "train_speed(iter/s)": 1.475569 }, { "acc": 0.99970236, "epoch": 34.63371579876434, "grad_norm": 0.25081339478492737, "learning_rate": 2.3596238176054877e-06, "loss": 0.00519942, "memory(GiB)": 15.03, "step": 19620, "train_speed(iter/s)": 1.475573 }, { "acc": 0.99982395, "epoch": 34.64254192409532, "grad_norm": 0.34696733951568604, "learning_rate": 2.3571440122803173e-06, "loss": 0.00323085, "memory(GiB)": 15.03, "step": 19625, "train_speed(iter/s)": 1.47556 }, { "acc": 0.99960938, "epoch": 34.6513680494263, "grad_norm": 0.438517689704895, "learning_rate": 2.3546651093057556e-06, "loss": 0.00602136, "memory(GiB)": 15.03, "step": 19630, "train_speed(iter/s)": 1.475554 }, { "acc": 0.99955359, "epoch": 34.66019417475728, "grad_norm": 2.5391550064086914, "learning_rate": 2.3521871095280126e-06, "loss": 0.00699155, "memory(GiB)": 15.03, "step": 19635, "train_speed(iter/s)": 1.475556 }, { "acc": 1.0, "epoch": 34.66902030008826, "grad_norm": 0.02682121843099594, "learning_rate": 2.349710013792992e-06, "loss": 0.0050259, "memory(GiB)": 15.03, "step": 19640, "train_speed(iter/s)": 1.475559 }, { "acc": 0.99963379, "epoch": 34.67784642541924, "grad_norm": 0.41866832971572876, "learning_rate": 2.3472338229462872e-06, "loss": 0.00674193, "memory(GiB)": 15.03, "step": 19645, "train_speed(iter/s)": 1.475566 }, { "acc": 0.99876518, "epoch": 34.686672550750224, "grad_norm": 1.98751962184906, "learning_rate": 2.3447585378331884e-06, "loss": 0.01310091, "memory(GiB)": 15.03, "step": 19650, "train_speed(iter/s)": 1.47555 }, { "acc": 0.99956055, "epoch": 34.6954986760812, "grad_norm": 0.40799635648727417, "learning_rate": 2.3422841592986683e-06, "loss": 0.00446787, "memory(GiB)": 15.03, "step": 19655, "train_speed(iter/s)": 1.475549 }, { "acc": 1.0, "epoch": 34.70432480141218, "grad_norm": 0.05987805873155594, "learning_rate": 2.3398106881873983e-06, "loss": 0.00131607, "memory(GiB)": 15.03, "step": 19660, "train_speed(iter/s)": 1.475564 }, { "acc": 0.99949923, "epoch": 34.71315092674316, "grad_norm": 0.10845351219177246, "learning_rate": 2.337338125343735e-06, "loss": 0.00387635, "memory(GiB)": 15.03, "step": 19665, "train_speed(iter/s)": 1.475569 }, { "acc": 0.99969511, "epoch": 34.72197705207414, "grad_norm": 0.30013754963874817, "learning_rate": 2.334866471611725e-06, "loss": 0.00648239, "memory(GiB)": 15.03, "step": 19670, "train_speed(iter/s)": 1.475579 }, { "acc": 1.0, "epoch": 34.73080317740512, "grad_norm": 0.26072707772254944, "learning_rate": 2.332395727835105e-06, "loss": 0.00294593, "memory(GiB)": 15.03, "step": 19675, "train_speed(iter/s)": 1.475596 }, { "acc": 0.99939537, "epoch": 34.7396293027361, "grad_norm": 0.6594151258468628, "learning_rate": 2.329925894857304e-06, "loss": 0.00929389, "memory(GiB)": 15.03, "step": 19680, "train_speed(iter/s)": 1.47559 }, { "acc": 0.99899349, "epoch": 34.748455428067075, "grad_norm": 0.3898424208164215, "learning_rate": 2.3274569735214375e-06, "loss": 0.01172294, "memory(GiB)": 15.03, "step": 19685, "train_speed(iter/s)": 1.475574 }, { "acc": 0.99949999, "epoch": 34.75728155339806, "grad_norm": 0.9318251609802246, "learning_rate": 2.324988964670309e-06, "loss": 0.01248376, "memory(GiB)": 15.03, "step": 19690, "train_speed(iter/s)": 1.475586 }, { "acc": 0.99953079, "epoch": 34.766107678729036, "grad_norm": 0.08090227097272873, "learning_rate": 2.32252186914641e-06, "loss": 0.0041632, "memory(GiB)": 15.03, "step": 19695, "train_speed(iter/s)": 1.475596 }, { "acc": 1.0, "epoch": 34.77493380406002, "grad_norm": 0.019143279641866684, "learning_rate": 2.320055687791926e-06, "loss": 0.00303316, "memory(GiB)": 15.03, "step": 19700, "train_speed(iter/s)": 1.47557 }, { "acc": 0.99982643, "epoch": 34.783759929390996, "grad_norm": 0.7646206617355347, "learning_rate": 2.317590421448722e-06, "loss": 0.00250412, "memory(GiB)": 15.03, "step": 19705, "train_speed(iter/s)": 1.475554 }, { "acc": 0.99906445, "epoch": 34.79258605472198, "grad_norm": 1.3008216619491577, "learning_rate": 2.3151260709583597e-06, "loss": 0.00585786, "memory(GiB)": 15.03, "step": 19710, "train_speed(iter/s)": 1.475538 }, { "acc": 0.99963236, "epoch": 34.80141218005296, "grad_norm": 2.5118706226348877, "learning_rate": 2.312662637162081e-06, "loss": 0.00988334, "memory(GiB)": 15.03, "step": 19715, "train_speed(iter/s)": 1.475545 }, { "acc": 0.9994935, "epoch": 34.81023830538393, "grad_norm": 0.33555173873901367, "learning_rate": 2.3102001209008144e-06, "loss": 0.00943346, "memory(GiB)": 15.03, "step": 19720, "train_speed(iter/s)": 1.475545 }, { "acc": 0.9994442, "epoch": 34.81906443071492, "grad_norm": 0.5532030463218689, "learning_rate": 2.3077385230151835e-06, "loss": 0.0080715, "memory(GiB)": 15.03, "step": 19725, "train_speed(iter/s)": 1.47555 }, { "acc": 1.0, "epoch": 34.827890556045894, "grad_norm": 0.4158782958984375, "learning_rate": 2.3052778443454898e-06, "loss": 0.00531619, "memory(GiB)": 15.03, "step": 19730, "train_speed(iter/s)": 1.475554 }, { "acc": 0.99982147, "epoch": 34.83671668137688, "grad_norm": 0.29526853561401367, "learning_rate": 2.3028180857317247e-06, "loss": 0.00410263, "memory(GiB)": 15.03, "step": 19735, "train_speed(iter/s)": 1.475551 }, { "acc": 0.99976416, "epoch": 34.845542806707854, "grad_norm": 0.00703785615041852, "learning_rate": 2.3003592480135636e-06, "loss": 0.00359963, "memory(GiB)": 15.03, "step": 19740, "train_speed(iter/s)": 1.475577 }, { "acc": 0.99946785, "epoch": 34.85436893203884, "grad_norm": 0.4767099618911743, "learning_rate": 2.2979013320303714e-06, "loss": 0.01263396, "memory(GiB)": 15.03, "step": 19745, "train_speed(iter/s)": 1.475592 }, { "acc": 0.9993124, "epoch": 34.863195057369815, "grad_norm": 0.08283688127994537, "learning_rate": 2.2954443386211926e-06, "loss": 0.00951895, "memory(GiB)": 15.03, "step": 19750, "train_speed(iter/s)": 1.475587 }, { "acc": 1.0, "epoch": 34.87202118270079, "grad_norm": 0.7132420539855957, "learning_rate": 2.2929882686247644e-06, "loss": 0.00548558, "memory(GiB)": 15.03, "step": 19755, "train_speed(iter/s)": 1.475552 }, { "acc": 0.99973402, "epoch": 34.880847308031775, "grad_norm": 0.01201736181974411, "learning_rate": 2.2905331228795023e-06, "loss": 0.00384921, "memory(GiB)": 15.03, "step": 19760, "train_speed(iter/s)": 1.475561 }, { "acc": 1.0, "epoch": 34.88967343336275, "grad_norm": 0.17519040405750275, "learning_rate": 2.2880789022235088e-06, "loss": 0.00214731, "memory(GiB)": 15.03, "step": 19765, "train_speed(iter/s)": 1.475565 }, { "acc": 0.99945726, "epoch": 34.898499558693736, "grad_norm": 0.3283655643463135, "learning_rate": 2.285625607494567e-06, "loss": 0.0105681, "memory(GiB)": 15.03, "step": 19770, "train_speed(iter/s)": 1.47557 }, { "acc": 0.9997282, "epoch": 34.90732568402471, "grad_norm": 0.7356042265892029, "learning_rate": 2.2831732395301524e-06, "loss": 0.00664208, "memory(GiB)": 15.03, "step": 19775, "train_speed(iter/s)": 1.475581 }, { "acc": 0.99987116, "epoch": 34.916151809355696, "grad_norm": 0.6037574410438538, "learning_rate": 2.2807217991674144e-06, "loss": 0.00429642, "memory(GiB)": 15.03, "step": 19780, "train_speed(iter/s)": 1.475601 }, { "acc": 0.99984179, "epoch": 34.92497793468667, "grad_norm": 0.4545719027519226, "learning_rate": 2.2782712872431937e-06, "loss": 0.0081955, "memory(GiB)": 15.03, "step": 19785, "train_speed(iter/s)": 1.475603 }, { "acc": 0.99953709, "epoch": 34.93380406001765, "grad_norm": 0.01514421496540308, "learning_rate": 2.2758217045940096e-06, "loss": 0.0065515, "memory(GiB)": 15.03, "step": 19790, "train_speed(iter/s)": 1.475615 }, { "acc": 1.0, "epoch": 34.942630185348634, "grad_norm": 0.30762457847595215, "learning_rate": 2.2733730520560645e-06, "loss": 0.00390651, "memory(GiB)": 15.03, "step": 19795, "train_speed(iter/s)": 1.475633 }, { "acc": 1.0, "epoch": 34.95145631067961, "grad_norm": 0.21935789287090302, "learning_rate": 2.270925330465242e-06, "loss": 0.00480866, "memory(GiB)": 15.03, "step": 19800, "train_speed(iter/s)": 1.475639 }, { "acc": 1.0, "epoch": 34.960282436010594, "grad_norm": 0.29408982396125793, "learning_rate": 2.2684785406571134e-06, "loss": 0.00332055, "memory(GiB)": 15.03, "step": 19805, "train_speed(iter/s)": 1.475629 }, { "acc": 1.0, "epoch": 34.96910856134157, "grad_norm": 0.273134708404541, "learning_rate": 2.266032683466928e-06, "loss": 0.00456632, "memory(GiB)": 15.03, "step": 19810, "train_speed(iter/s)": 1.475633 }, { "acc": 0.99951477, "epoch": 34.97793468667255, "grad_norm": 0.3789949119091034, "learning_rate": 2.2635877597296155e-06, "loss": 0.00580677, "memory(GiB)": 15.03, "step": 19815, "train_speed(iter/s)": 1.475651 }, { "acc": 0.99917412, "epoch": 34.98676081200353, "grad_norm": 0.012653440237045288, "learning_rate": 2.2611437702797873e-06, "loss": 0.00974101, "memory(GiB)": 15.03, "step": 19820, "train_speed(iter/s)": 1.475662 }, { "acc": 0.99937086, "epoch": 34.99558693733451, "grad_norm": 0.4912550151348114, "learning_rate": 2.258700715951741e-06, "loss": 0.00639766, "memory(GiB)": 15.03, "step": 19825, "train_speed(iter/s)": 1.475664 }, { "acc": 0.99951925, "epoch": 35.00441306266549, "grad_norm": 0.28129658102989197, "learning_rate": 2.2562585975794476e-06, "loss": 0.00516978, "memory(GiB)": 15.03, "step": 19830, "train_speed(iter/s)": 1.475618 }, { "acc": 0.99945555, "epoch": 35.01323918799647, "grad_norm": 0.4967805743217468, "learning_rate": 2.2538174159965653e-06, "loss": 0.00700525, "memory(GiB)": 15.03, "step": 19835, "train_speed(iter/s)": 1.475617 }, { "acc": 0.99924622, "epoch": 35.02206531332745, "grad_norm": 0.1287708729505539, "learning_rate": 2.251377172036428e-06, "loss": 0.00804682, "memory(GiB)": 15.03, "step": 19840, "train_speed(iter/s)": 1.475619 }, { "acc": 0.99982872, "epoch": 35.03089143865843, "grad_norm": 0.4197837710380554, "learning_rate": 2.248937866532048e-06, "loss": 0.00545827, "memory(GiB)": 15.03, "step": 19845, "train_speed(iter/s)": 1.475594 }, { "acc": 0.99917717, "epoch": 35.039717563989406, "grad_norm": 0.24958296120166779, "learning_rate": 2.2464995003161253e-06, "loss": 0.01113764, "memory(GiB)": 15.03, "step": 19850, "train_speed(iter/s)": 1.475592 }, { "acc": 0.99969511, "epoch": 35.04854368932039, "grad_norm": 0.6561144590377808, "learning_rate": 2.244062074221031e-06, "loss": 0.00677436, "memory(GiB)": 15.03, "step": 19855, "train_speed(iter/s)": 1.475607 }, { "acc": 1.0, "epoch": 35.057369814651366, "grad_norm": 0.23689542710781097, "learning_rate": 2.2416255890788184e-06, "loss": 0.00344308, "memory(GiB)": 15.03, "step": 19860, "train_speed(iter/s)": 1.475594 }, { "acc": 0.99978065, "epoch": 35.06619593998235, "grad_norm": 0.2568121552467346, "learning_rate": 2.2391900457212188e-06, "loss": 0.00302183, "memory(GiB)": 15.03, "step": 19865, "train_speed(iter/s)": 1.475596 }, { "acc": 0.99984179, "epoch": 35.07502206531333, "grad_norm": 0.5008816123008728, "learning_rate": 2.2367554449796453e-06, "loss": 0.00318525, "memory(GiB)": 15.03, "step": 19870, "train_speed(iter/s)": 1.475612 }, { "acc": 1.0, "epoch": 35.083848190644304, "grad_norm": 0.009978344663977623, "learning_rate": 2.2343217876851835e-06, "loss": 0.00426431, "memory(GiB)": 15.03, "step": 19875, "train_speed(iter/s)": 1.475623 }, { "acc": 0.99921875, "epoch": 35.09267431597529, "grad_norm": 0.3756400942802429, "learning_rate": 2.2318890746686035e-06, "loss": 0.00952265, "memory(GiB)": 15.03, "step": 19880, "train_speed(iter/s)": 1.475628 }, { "acc": 0.99974489, "epoch": 35.101500441306264, "grad_norm": 0.7501010298728943, "learning_rate": 2.2294573067603484e-06, "loss": 0.0098237, "memory(GiB)": 15.03, "step": 19885, "train_speed(iter/s)": 1.475635 }, { "acc": 0.9998457, "epoch": 35.11032656663725, "grad_norm": 0.3148929476737976, "learning_rate": 2.227026484790539e-06, "loss": 0.00485686, "memory(GiB)": 15.03, "step": 19890, "train_speed(iter/s)": 1.475646 }, { "acc": 0.99916649, "epoch": 35.119152691968225, "grad_norm": 0.2989409863948822, "learning_rate": 2.224596609588973e-06, "loss": 0.01029205, "memory(GiB)": 15.03, "step": 19895, "train_speed(iter/s)": 1.475656 }, { "acc": 0.99885635, "epoch": 35.12797881729921, "grad_norm": 0.04958198592066765, "learning_rate": 2.2221676819851294e-06, "loss": 0.01024513, "memory(GiB)": 15.03, "step": 19900, "train_speed(iter/s)": 1.475658 }, { "acc": 0.99981613, "epoch": 35.136804942630185, "grad_norm": 0.49431219696998596, "learning_rate": 2.219739702808157e-06, "loss": 0.00839211, "memory(GiB)": 15.03, "step": 19905, "train_speed(iter/s)": 1.475656 }, { "acc": 0.99956894, "epoch": 35.14563106796116, "grad_norm": 0.2903295159339905, "learning_rate": 2.217312672886888e-06, "loss": 0.00565063, "memory(GiB)": 15.03, "step": 19910, "train_speed(iter/s)": 1.47567 }, { "acc": 1.0, "epoch": 35.154457193292146, "grad_norm": 0.1527746021747589, "learning_rate": 2.214886593049824e-06, "loss": 0.00417357, "memory(GiB)": 15.03, "step": 19915, "train_speed(iter/s)": 1.475663 }, { "acc": 1.0, "epoch": 35.16328331862312, "grad_norm": 0.5021132230758667, "learning_rate": 2.2124614641251453e-06, "loss": 0.00379223, "memory(GiB)": 15.03, "step": 19920, "train_speed(iter/s)": 1.475656 }, { "acc": 0.99978065, "epoch": 35.172109443954106, "grad_norm": 0.5480843782424927, "learning_rate": 2.210037286940706e-06, "loss": 0.0041277, "memory(GiB)": 15.03, "step": 19925, "train_speed(iter/s)": 1.47566 }, { "acc": 1.0, "epoch": 35.18093556928508, "grad_norm": 0.5658074617385864, "learning_rate": 2.20761406232404e-06, "loss": 0.00416461, "memory(GiB)": 15.03, "step": 19930, "train_speed(iter/s)": 1.475676 }, { "acc": 0.99981613, "epoch": 35.18976169461607, "grad_norm": 0.5590735673904419, "learning_rate": 2.205191791102351e-06, "loss": 0.00507181, "memory(GiB)": 15.03, "step": 19935, "train_speed(iter/s)": 1.475661 }, { "acc": 0.99978065, "epoch": 35.19858781994704, "grad_norm": 0.9225596189498901, "learning_rate": 2.2027704741025166e-06, "loss": 0.00744058, "memory(GiB)": 15.03, "step": 19940, "train_speed(iter/s)": 1.475654 }, { "acc": 0.99981613, "epoch": 35.20741394527802, "grad_norm": 0.5676866173744202, "learning_rate": 2.2003501121510954e-06, "loss": 0.00885589, "memory(GiB)": 15.03, "step": 19945, "train_speed(iter/s)": 1.475647 }, { "acc": 0.99910545, "epoch": 35.216240070609004, "grad_norm": 0.6004440188407898, "learning_rate": 2.1979307060743106e-06, "loss": 0.00986461, "memory(GiB)": 15.03, "step": 19950, "train_speed(iter/s)": 1.475659 }, { "acc": 1.0, "epoch": 35.22506619593998, "grad_norm": 0.10619144886732101, "learning_rate": 2.195512256698069e-06, "loss": 0.00455567, "memory(GiB)": 15.03, "step": 19955, "train_speed(iter/s)": 1.47568 }, { "acc": 0.99896393, "epoch": 35.233892321270964, "grad_norm": 0.2527467906475067, "learning_rate": 2.1930947648479434e-06, "loss": 0.01242752, "memory(GiB)": 15.03, "step": 19960, "train_speed(iter/s)": 1.475679 }, { "acc": 0.99926586, "epoch": 35.24271844660194, "grad_norm": 0.48876410722732544, "learning_rate": 2.190678231349182e-06, "loss": 0.00893586, "memory(GiB)": 15.03, "step": 19965, "train_speed(iter/s)": 1.475694 }, { "acc": 0.99988422, "epoch": 35.251544571932925, "grad_norm": 0.29525047540664673, "learning_rate": 2.188262657026705e-06, "loss": 0.00867642, "memory(GiB)": 15.03, "step": 19970, "train_speed(iter/s)": 1.475665 }, { "acc": 0.9997159, "epoch": 35.2603706972639, "grad_norm": 0.3752363920211792, "learning_rate": 2.1858480427051095e-06, "loss": 0.00612829, "memory(GiB)": 15.03, "step": 19975, "train_speed(iter/s)": 1.475677 }, { "acc": 0.99957628, "epoch": 35.26919682259488, "grad_norm": 0.20273804664611816, "learning_rate": 2.1834343892086588e-06, "loss": 0.0084473, "memory(GiB)": 15.03, "step": 19980, "train_speed(iter/s)": 1.475679 }, { "acc": 1.0, "epoch": 35.27802294792586, "grad_norm": 0.34618571400642395, "learning_rate": 2.181021697361292e-06, "loss": 0.00870955, "memory(GiB)": 15.03, "step": 19985, "train_speed(iter/s)": 1.475675 }, { "acc": 0.99963236, "epoch": 35.28684907325684, "grad_norm": 0.3774063289165497, "learning_rate": 2.1786099679866164e-06, "loss": 0.00765994, "memory(GiB)": 15.03, "step": 19990, "train_speed(iter/s)": 1.47568 }, { "acc": 0.99912071, "epoch": 35.29567519858782, "grad_norm": 0.5836058855056763, "learning_rate": 2.176199201907918e-06, "loss": 0.00696737, "memory(GiB)": 15.03, "step": 19995, "train_speed(iter/s)": 1.47568 }, { "acc": 0.9995862, "epoch": 35.3045013239188, "grad_norm": 0.44613897800445557, "learning_rate": 2.1737893999481446e-06, "loss": 0.0072567, "memory(GiB)": 15.03, "step": 20000, "train_speed(iter/s)": 1.475696 }, { "epoch": 35.3045013239188, "eval_acc": 0.7889144278232035, "eval_loss": 1.664538860321045, "eval_runtime": 30.2115, "eval_samples_per_second": 44.189, "eval_steps_per_second": 5.528, "step": 20000 }, { "acc": 0.99960709, "epoch": 35.313327449249776, "grad_norm": 0.5698481202125549, "learning_rate": 2.171380562929924e-06, "loss": 0.00780919, "memory(GiB)": 15.03, "step": 20005, "train_speed(iter/s)": 1.468965 }, { "acc": 1.0, "epoch": 35.32215357458076, "grad_norm": 0.17221826314926147, "learning_rate": 2.1689726916755478e-06, "loss": 0.00735344, "memory(GiB)": 15.03, "step": 20010, "train_speed(iter/s)": 1.468965 }, { "acc": 0.99970932, "epoch": 35.33097969991174, "grad_norm": 0.6122751832008362, "learning_rate": 2.1665657870069794e-06, "loss": 0.01185014, "memory(GiB)": 15.03, "step": 20015, "train_speed(iter/s)": 1.468963 }, { "acc": 0.99973402, "epoch": 35.33980582524272, "grad_norm": 0.3008663058280945, "learning_rate": 2.1641598497458526e-06, "loss": 0.00819156, "memory(GiB)": 15.03, "step": 20020, "train_speed(iter/s)": 1.468959 }, { "acc": 0.99987621, "epoch": 35.3486319505737, "grad_norm": 0.076087586581707, "learning_rate": 2.161754880713475e-06, "loss": 0.00451463, "memory(GiB)": 15.03, "step": 20025, "train_speed(iter/s)": 1.468965 }, { "acc": 0.99954662, "epoch": 35.35745807590468, "grad_norm": 0.34042608737945557, "learning_rate": 2.159350880730817e-06, "loss": 0.00748178, "memory(GiB)": 15.03, "step": 20030, "train_speed(iter/s)": 1.46897 }, { "acc": 0.99980774, "epoch": 35.36628420123566, "grad_norm": 0.3133780062198639, "learning_rate": 2.156947850618523e-06, "loss": 0.00492019, "memory(GiB)": 15.03, "step": 20035, "train_speed(iter/s)": 1.468996 }, { "acc": 0.99976416, "epoch": 35.375110326566634, "grad_norm": 0.2894154489040375, "learning_rate": 2.1545457911969018e-06, "loss": 0.00604806, "memory(GiB)": 15.03, "step": 20040, "train_speed(iter/s)": 1.468999 }, { "acc": 0.99980774, "epoch": 35.38393645189762, "grad_norm": 0.1261654496192932, "learning_rate": 2.1521447032859356e-06, "loss": 0.00410876, "memory(GiB)": 15.03, "step": 20045, "train_speed(iter/s)": 1.469004 }, { "acc": 1.0, "epoch": 35.392762577228595, "grad_norm": 0.16086852550506592, "learning_rate": 2.149744587705275e-06, "loss": 0.0040939, "memory(GiB)": 15.03, "step": 20050, "train_speed(iter/s)": 1.469004 }, { "acc": 0.99908285, "epoch": 35.40158870255958, "grad_norm": 0.6113322973251343, "learning_rate": 2.1473454452742347e-06, "loss": 0.00530462, "memory(GiB)": 15.03, "step": 20055, "train_speed(iter/s)": 1.469002 }, { "acc": 0.99888592, "epoch": 35.410414827890556, "grad_norm": 0.6359821557998657, "learning_rate": 2.1449472768117997e-06, "loss": 0.0121297, "memory(GiB)": 15.03, "step": 20060, "train_speed(iter/s)": 1.468995 }, { "acc": 1.0, "epoch": 35.41924095322153, "grad_norm": 0.48961761593818665, "learning_rate": 2.1425500831366195e-06, "loss": 0.00170032, "memory(GiB)": 15.03, "step": 20065, "train_speed(iter/s)": 1.469002 }, { "acc": 1.0, "epoch": 35.428067078552516, "grad_norm": 0.42266055941581726, "learning_rate": 2.1401538650670174e-06, "loss": 0.00489246, "memory(GiB)": 15.03, "step": 20070, "train_speed(iter/s)": 1.469017 }, { "acc": 0.9994525, "epoch": 35.43689320388349, "grad_norm": 0.02478148601949215, "learning_rate": 2.137758623420977e-06, "loss": 0.00719012, "memory(GiB)": 15.03, "step": 20075, "train_speed(iter/s)": 1.469034 }, { "acc": 0.99984179, "epoch": 35.44571932921448, "grad_norm": 0.9184127449989319, "learning_rate": 2.1353643590161533e-06, "loss": 0.00219189, "memory(GiB)": 15.03, "step": 20080, "train_speed(iter/s)": 1.469027 }, { "acc": 0.99955177, "epoch": 35.45454545454545, "grad_norm": 1.3381718397140503, "learning_rate": 2.132971072669864e-06, "loss": 0.01211423, "memory(GiB)": 15.03, "step": 20085, "train_speed(iter/s)": 1.469047 }, { "acc": 1.0, "epoch": 35.46337157987644, "grad_norm": 0.30102217197418213, "learning_rate": 2.1305787651990954e-06, "loss": 0.0035553, "memory(GiB)": 15.03, "step": 20090, "train_speed(iter/s)": 1.46905 }, { "acc": 0.99936237, "epoch": 35.472197705207414, "grad_norm": 1.0245232582092285, "learning_rate": 2.1281874374204962e-06, "loss": 0.00720856, "memory(GiB)": 15.03, "step": 20095, "train_speed(iter/s)": 1.469028 }, { "acc": 1.0, "epoch": 35.48102383053839, "grad_norm": 0.1099279373884201, "learning_rate": 2.125797090150387e-06, "loss": 0.0078798, "memory(GiB)": 15.03, "step": 20100, "train_speed(iter/s)": 1.469026 }, { "acc": 0.9998579, "epoch": 35.489849955869374, "grad_norm": 0.18647438287734985, "learning_rate": 2.1234077242047494e-06, "loss": 0.00303982, "memory(GiB)": 15.03, "step": 20105, "train_speed(iter/s)": 1.46902 }, { "acc": 0.99942455, "epoch": 35.49867608120035, "grad_norm": 0.04131558910012245, "learning_rate": 2.121019340399228e-06, "loss": 0.00598904, "memory(GiB)": 15.03, "step": 20110, "train_speed(iter/s)": 1.469019 }, { "acc": 0.99981613, "epoch": 35.507502206531335, "grad_norm": 0.29049205780029297, "learning_rate": 2.118631939549136e-06, "loss": 0.00497948, "memory(GiB)": 15.03, "step": 20115, "train_speed(iter/s)": 1.469038 }, { "acc": 0.99949999, "epoch": 35.51632833186231, "grad_norm": 3.0333051681518555, "learning_rate": 2.116245522469451e-06, "loss": 0.01265059, "memory(GiB)": 15.03, "step": 20120, "train_speed(iter/s)": 1.469049 }, { "acc": 0.99939003, "epoch": 35.525154457193295, "grad_norm": 0.28272518515586853, "learning_rate": 2.1138600899748106e-06, "loss": 0.00653026, "memory(GiB)": 15.03, "step": 20125, "train_speed(iter/s)": 1.469057 }, { "acc": 1.0, "epoch": 35.53398058252427, "grad_norm": 0.1578226536512375, "learning_rate": 2.1114756428795226e-06, "loss": 0.00545085, "memory(GiB)": 15.03, "step": 20130, "train_speed(iter/s)": 1.469065 }, { "acc": 0.99975491, "epoch": 35.54280670785525, "grad_norm": 0.9372988939285278, "learning_rate": 2.1090921819975547e-06, "loss": 0.00544861, "memory(GiB)": 15.03, "step": 20135, "train_speed(iter/s)": 1.469067 }, { "acc": 0.99956284, "epoch": 35.55163283318623, "grad_norm": 0.4355403780937195, "learning_rate": 2.1067097081425363e-06, "loss": 0.0041558, "memory(GiB)": 15.03, "step": 20140, "train_speed(iter/s)": 1.469091 }, { "acc": 0.99923611, "epoch": 35.56045895851721, "grad_norm": 0.8289739489555359, "learning_rate": 2.104328222127761e-06, "loss": 0.01017649, "memory(GiB)": 15.03, "step": 20145, "train_speed(iter/s)": 1.469077 }, { "acc": 0.99979506, "epoch": 35.56928508384819, "grad_norm": 0.36439085006713867, "learning_rate": 2.1019477247661892e-06, "loss": 0.01014499, "memory(GiB)": 15.03, "step": 20150, "train_speed(iter/s)": 1.469097 }, { "acc": 0.99977684, "epoch": 35.57811120917917, "grad_norm": 0.012240749783813953, "learning_rate": 2.099568216870439e-06, "loss": 0.00202937, "memory(GiB)": 15.03, "step": 20155, "train_speed(iter/s)": 1.469103 }, { "acc": 0.9997282, "epoch": 35.586937334510154, "grad_norm": 0.24056795239448547, "learning_rate": 2.097189699252792e-06, "loss": 0.00860753, "memory(GiB)": 15.03, "step": 20160, "train_speed(iter/s)": 1.469111 }, { "acc": 0.99901924, "epoch": 35.59576345984113, "grad_norm": 0.044676803052425385, "learning_rate": 2.094812172725193e-06, "loss": 0.01122778, "memory(GiB)": 15.03, "step": 20165, "train_speed(iter/s)": 1.469124 }, { "acc": 0.99930305, "epoch": 35.60458958517211, "grad_norm": 0.25244736671447754, "learning_rate": 2.092435638099246e-06, "loss": 0.00964131, "memory(GiB)": 15.03, "step": 20170, "train_speed(iter/s)": 1.46913 }, { "acc": 0.99974995, "epoch": 35.61341571050309, "grad_norm": 0.27802175283432007, "learning_rate": 2.0900600961862204e-06, "loss": 0.00222107, "memory(GiB)": 15.03, "step": 20175, "train_speed(iter/s)": 1.469138 }, { "acc": 0.99975967, "epoch": 35.62224183583407, "grad_norm": 0.692671537399292, "learning_rate": 2.0876855477970433e-06, "loss": 0.00535498, "memory(GiB)": 15.03, "step": 20180, "train_speed(iter/s)": 1.469154 }, { "acc": 0.99925871, "epoch": 35.63106796116505, "grad_norm": 0.013550112023949623, "learning_rate": 2.085311993742304e-06, "loss": 0.00540251, "memory(GiB)": 15.03, "step": 20185, "train_speed(iter/s)": 1.469156 }, { "acc": 0.99966221, "epoch": 35.63989408649603, "grad_norm": 0.04050751402974129, "learning_rate": 2.0829394348322498e-06, "loss": 0.00791835, "memory(GiB)": 15.03, "step": 20190, "train_speed(iter/s)": 1.469155 }, { "acc": 1.0, "epoch": 35.648720211827005, "grad_norm": 0.4878508150577545, "learning_rate": 2.080567871876793e-06, "loss": 0.00635286, "memory(GiB)": 15.03, "step": 20195, "train_speed(iter/s)": 1.469155 }, { "acc": 0.99962959, "epoch": 35.65754633715799, "grad_norm": 0.44653353095054626, "learning_rate": 2.0781973056855018e-06, "loss": 0.00444468, "memory(GiB)": 15.03, "step": 20200, "train_speed(iter/s)": 1.469168 }, { "acc": 0.99872808, "epoch": 35.666372462488965, "grad_norm": 0.36158424615859985, "learning_rate": 2.075827737067608e-06, "loss": 0.01362877, "memory(GiB)": 15.03, "step": 20205, "train_speed(iter/s)": 1.46915 }, { "acc": 0.9991991, "epoch": 35.67519858781995, "grad_norm": 1.401997685432434, "learning_rate": 2.073459166831999e-06, "loss": 0.00873863, "memory(GiB)": 15.03, "step": 20210, "train_speed(iter/s)": 1.46914 }, { "acc": 1.0, "epoch": 35.684024713150926, "grad_norm": 0.006066068075597286, "learning_rate": 2.0710915957872236e-06, "loss": 0.00201258, "memory(GiB)": 15.03, "step": 20215, "train_speed(iter/s)": 1.469142 }, { "acc": 0.99958467, "epoch": 35.69285083848191, "grad_norm": 0.47550302743911743, "learning_rate": 2.068725024741487e-06, "loss": 0.00908477, "memory(GiB)": 15.03, "step": 20220, "train_speed(iter/s)": 1.469153 }, { "acc": 0.9991004, "epoch": 35.701676963812886, "grad_norm": 0.25354713201522827, "learning_rate": 2.066359454502658e-06, "loss": 0.00732087, "memory(GiB)": 15.03, "step": 20225, "train_speed(iter/s)": 1.469168 }, { "acc": 0.99942551, "epoch": 35.71050308914386, "grad_norm": 0.7475471496582031, "learning_rate": 2.06399488587826e-06, "loss": 0.01189823, "memory(GiB)": 15.03, "step": 20230, "train_speed(iter/s)": 1.469172 }, { "acc": 1.0, "epoch": 35.71932921447485, "grad_norm": 0.34157589077949524, "learning_rate": 2.0616313196754737e-06, "loss": 0.00380307, "memory(GiB)": 15.03, "step": 20235, "train_speed(iter/s)": 1.469202 }, { "acc": 0.99939041, "epoch": 35.728155339805824, "grad_norm": 0.2855295240879059, "learning_rate": 2.0592687567011384e-06, "loss": 0.00804471, "memory(GiB)": 15.03, "step": 20240, "train_speed(iter/s)": 1.46918 }, { "acc": 1.0, "epoch": 35.73698146513681, "grad_norm": 0.5197140574455261, "learning_rate": 2.0569071977617553e-06, "loss": 0.01027207, "memory(GiB)": 15.03, "step": 20245, "train_speed(iter/s)": 1.469176 }, { "acc": 0.99960938, "epoch": 35.745807590467784, "grad_norm": 0.09132958948612213, "learning_rate": 2.054546643663475e-06, "loss": 0.00427942, "memory(GiB)": 15.03, "step": 20250, "train_speed(iter/s)": 1.4692 }, { "acc": 0.99948254, "epoch": 35.75463371579876, "grad_norm": 1.9075040817260742, "learning_rate": 2.0521870952121135e-06, "loss": 0.00650443, "memory(GiB)": 15.03, "step": 20255, "train_speed(iter/s)": 1.469182 }, { "acc": 1.0, "epoch": 35.763459841129745, "grad_norm": 0.2254795879125595, "learning_rate": 2.0498285532131368e-06, "loss": 0.004716, "memory(GiB)": 15.03, "step": 20260, "train_speed(iter/s)": 1.469188 }, { "acc": 0.99950857, "epoch": 35.77228596646072, "grad_norm": 0.09643606841564178, "learning_rate": 2.0474710184716686e-06, "loss": 0.00664346, "memory(GiB)": 15.03, "step": 20265, "train_speed(iter/s)": 1.46917 }, { "acc": 0.99959669, "epoch": 35.781112091791705, "grad_norm": 0.22967182099819183, "learning_rate": 2.0451144917924928e-06, "loss": 0.0043822, "memory(GiB)": 15.03, "step": 20270, "train_speed(iter/s)": 1.469148 }, { "acc": 1.0, "epoch": 35.78993821712268, "grad_norm": 0.09250381588935852, "learning_rate": 2.0427589739800462e-06, "loss": 0.00495645, "memory(GiB)": 15.03, "step": 20275, "train_speed(iter/s)": 1.46914 }, { "acc": 0.99923277, "epoch": 35.798764342453666, "grad_norm": 0.03947417065501213, "learning_rate": 2.0404044658384194e-06, "loss": 0.00862362, "memory(GiB)": 15.03, "step": 20280, "train_speed(iter/s)": 1.469152 }, { "acc": 0.99970236, "epoch": 35.80759046778464, "grad_norm": 2.7969136238098145, "learning_rate": 2.0380509681713603e-06, "loss": 0.00502271, "memory(GiB)": 15.03, "step": 20285, "train_speed(iter/s)": 1.469166 }, { "acc": 0.99980774, "epoch": 35.81641659311562, "grad_norm": 0.010107004083693027, "learning_rate": 2.035698481782275e-06, "loss": 0.00447589, "memory(GiB)": 15.03, "step": 20290, "train_speed(iter/s)": 1.469158 }, { "acc": 0.9996542, "epoch": 35.8252427184466, "grad_norm": 0.1666458696126938, "learning_rate": 2.033347007474216e-06, "loss": 0.00497562, "memory(GiB)": 15.03, "step": 20295, "train_speed(iter/s)": 1.469182 }, { "acc": 0.99945927, "epoch": 35.83406884377758, "grad_norm": 1.5832619667053223, "learning_rate": 2.0309965460499013e-06, "loss": 0.01370795, "memory(GiB)": 15.03, "step": 20300, "train_speed(iter/s)": 1.469194 }, { "acc": 0.99934072, "epoch": 35.84289496910856, "grad_norm": 0.32102468609809875, "learning_rate": 2.0286470983116945e-06, "loss": 0.00759763, "memory(GiB)": 15.03, "step": 20305, "train_speed(iter/s)": 1.469212 }, { "acc": 0.9994009, "epoch": 35.85172109443954, "grad_norm": 0.38422587513923645, "learning_rate": 2.0262986650616167e-06, "loss": 0.01207238, "memory(GiB)": 15.03, "step": 20310, "train_speed(iter/s)": 1.469212 }, { "acc": 0.99985294, "epoch": 35.860547219770524, "grad_norm": 0.4869580864906311, "learning_rate": 2.023951247101339e-06, "loss": 0.00313685, "memory(GiB)": 15.03, "step": 20315, "train_speed(iter/s)": 1.469203 }, { "acc": 1.0, "epoch": 35.8693733451015, "grad_norm": 0.0075362129136919975, "learning_rate": 2.0216048452321937e-06, "loss": 0.00292867, "memory(GiB)": 15.03, "step": 20320, "train_speed(iter/s)": 1.469207 }, { "acc": 1.0, "epoch": 35.87819947043248, "grad_norm": 0.10938145220279694, "learning_rate": 2.019259460255158e-06, "loss": 0.00338419, "memory(GiB)": 15.03, "step": 20325, "train_speed(iter/s)": 1.469222 }, { "acc": 0.99974995, "epoch": 35.88702559576346, "grad_norm": 0.5261418223381042, "learning_rate": 2.016915092970869e-06, "loss": 0.00281926, "memory(GiB)": 15.03, "step": 20330, "train_speed(iter/s)": 1.469221 }, { "acc": 1.0, "epoch": 35.89585172109444, "grad_norm": 0.3533914089202881, "learning_rate": 2.01457174417961e-06, "loss": 0.00333218, "memory(GiB)": 15.03, "step": 20335, "train_speed(iter/s)": 1.469229 }, { "acc": 0.99972219, "epoch": 35.90467784642542, "grad_norm": 0.4828283488750458, "learning_rate": 2.012229414681321e-06, "loss": 0.0076639, "memory(GiB)": 15.03, "step": 20340, "train_speed(iter/s)": 1.469235 }, { "acc": 1.0, "epoch": 35.9135039717564, "grad_norm": 0.4529750347137451, "learning_rate": 2.0098881052755898e-06, "loss": 0.0016108, "memory(GiB)": 15.03, "step": 20345, "train_speed(iter/s)": 1.469237 }, { "acc": 0.99929905, "epoch": 35.92233009708738, "grad_norm": 2.4999709129333496, "learning_rate": 2.007547816761662e-06, "loss": 0.00827138, "memory(GiB)": 15.03, "step": 20350, "train_speed(iter/s)": 1.469255 }, { "acc": 0.99927549, "epoch": 35.93115622241836, "grad_norm": 0.4833478331565857, "learning_rate": 2.0052085499384306e-06, "loss": 0.00739737, "memory(GiB)": 15.03, "step": 20355, "train_speed(iter/s)": 1.469268 }, { "acc": 1.0, "epoch": 35.939982347749336, "grad_norm": 0.43926897644996643, "learning_rate": 2.0028703056044403e-06, "loss": 0.01345146, "memory(GiB)": 15.03, "step": 20360, "train_speed(iter/s)": 1.469268 }, { "acc": 1.0, "epoch": 35.94880847308032, "grad_norm": 0.5510545372962952, "learning_rate": 2.000533084557885e-06, "loss": 0.00843226, "memory(GiB)": 15.03, "step": 20365, "train_speed(iter/s)": 1.469264 }, { "acc": 0.99958839, "epoch": 35.957634598411296, "grad_norm": 0.28565725684165955, "learning_rate": 1.9981968875966136e-06, "loss": 0.00722842, "memory(GiB)": 15.03, "step": 20370, "train_speed(iter/s)": 1.469266 }, { "acc": 1.0, "epoch": 35.96646072374228, "grad_norm": 0.18570508062839508, "learning_rate": 1.995861715518125e-06, "loss": 0.00157713, "memory(GiB)": 15.03, "step": 20375, "train_speed(iter/s)": 1.469273 }, { "acc": 1.0, "epoch": 35.97528684907326, "grad_norm": 0.12703901529312134, "learning_rate": 1.993527569119564e-06, "loss": 0.00448735, "memory(GiB)": 15.03, "step": 20380, "train_speed(iter/s)": 1.469292 }, { "acc": 0.99979839, "epoch": 35.98411297440423, "grad_norm": 0.3483317196369171, "learning_rate": 1.991194449197728e-06, "loss": 0.00653981, "memory(GiB)": 15.03, "step": 20385, "train_speed(iter/s)": 1.469286 }, { "acc": 1.0, "epoch": 35.99293909973522, "grad_norm": 0.30678072571754456, "learning_rate": 1.9888623565490632e-06, "loss": 0.00311185, "memory(GiB)": 15.03, "step": 20390, "train_speed(iter/s)": 1.469288 }, { "acc": 0.99942341, "epoch": 36.001765225066194, "grad_norm": 1.6987841129302979, "learning_rate": 1.9865312919696673e-06, "loss": 0.00659747, "memory(GiB)": 15.03, "step": 20395, "train_speed(iter/s)": 1.46921 }, { "acc": 0.99963236, "epoch": 36.01059135039718, "grad_norm": 0.5092635750770569, "learning_rate": 1.984201256255284e-06, "loss": 0.00573987, "memory(GiB)": 15.03, "step": 20400, "train_speed(iter/s)": 1.469195 }, { "acc": 0.99947586, "epoch": 36.019417475728154, "grad_norm": 0.1971241980791092, "learning_rate": 1.9818722502013083e-06, "loss": 0.00438973, "memory(GiB)": 15.03, "step": 20405, "train_speed(iter/s)": 1.469207 }, { "acc": 0.999547, "epoch": 36.02824360105914, "grad_norm": 0.3392362892627716, "learning_rate": 1.9795442746027793e-06, "loss": 0.00561229, "memory(GiB)": 15.03, "step": 20410, "train_speed(iter/s)": 1.469212 }, { "acc": 1.0, "epoch": 36.037069726390115, "grad_norm": 0.008953385055065155, "learning_rate": 1.9772173302543927e-06, "loss": 0.0042597, "memory(GiB)": 15.03, "step": 20415, "train_speed(iter/s)": 1.469221 }, { "acc": 1.0, "epoch": 36.04589585172109, "grad_norm": 0.13713321089744568, "learning_rate": 1.9748914179504824e-06, "loss": 0.00203065, "memory(GiB)": 15.03, "step": 20420, "train_speed(iter/s)": 1.469217 }, { "acc": 0.99969511, "epoch": 36.054721977052075, "grad_norm": 0.23562723398208618, "learning_rate": 1.9725665384850385e-06, "loss": 0.00292648, "memory(GiB)": 15.03, "step": 20425, "train_speed(iter/s)": 1.469224 }, { "acc": 1.0, "epoch": 36.06354810238305, "grad_norm": 0.33403971791267395, "learning_rate": 1.9702426926516937e-06, "loss": 0.00272038, "memory(GiB)": 15.03, "step": 20430, "train_speed(iter/s)": 1.469228 }, { "acc": 1.0, "epoch": 36.072374227714036, "grad_norm": 0.06563941389322281, "learning_rate": 1.967919881243727e-06, "loss": 0.00371468, "memory(GiB)": 15.03, "step": 20435, "train_speed(iter/s)": 1.469238 }, { "acc": 0.99977684, "epoch": 36.08120035304501, "grad_norm": 1.1073317527770996, "learning_rate": 1.965598105054066e-06, "loss": 0.00734552, "memory(GiB)": 15.03, "step": 20440, "train_speed(iter/s)": 1.469225 }, { "acc": 0.99945917, "epoch": 36.09002647837599, "grad_norm": 0.44773411750793457, "learning_rate": 1.9632773648752887e-06, "loss": 0.00696445, "memory(GiB)": 15.03, "step": 20445, "train_speed(iter/s)": 1.469235 }, { "acc": 0.99984179, "epoch": 36.09885260370697, "grad_norm": 0.28272321820259094, "learning_rate": 1.9609576614996114e-06, "loss": 0.00327198, "memory(GiB)": 15.03, "step": 20450, "train_speed(iter/s)": 1.469241 }, { "acc": 1.0, "epoch": 36.10767872903795, "grad_norm": 0.3230976164340973, "learning_rate": 1.9586389957189043e-06, "loss": 0.00659513, "memory(GiB)": 15.03, "step": 20455, "train_speed(iter/s)": 1.46924 }, { "acc": 0.99976416, "epoch": 36.116504854368934, "grad_norm": 0.04287204146385193, "learning_rate": 1.956321368324679e-06, "loss": 0.00517884, "memory(GiB)": 15.03, "step": 20460, "train_speed(iter/s)": 1.469246 }, { "acc": 1.0, "epoch": 36.12533097969991, "grad_norm": 0.01822204887866974, "learning_rate": 1.9540047801080927e-06, "loss": 0.00243056, "memory(GiB)": 15.03, "step": 20465, "train_speed(iter/s)": 1.469248 }, { "acc": 0.99978065, "epoch": 36.134157105030894, "grad_norm": 0.317647784948349, "learning_rate": 1.9516892318599475e-06, "loss": 0.00480913, "memory(GiB)": 15.03, "step": 20470, "train_speed(iter/s)": 1.469243 }, { "acc": 0.99974232, "epoch": 36.14298323036187, "grad_norm": 0.31123897433280945, "learning_rate": 1.9493747243706956e-06, "loss": 0.00924099, "memory(GiB)": 15.03, "step": 20475, "train_speed(iter/s)": 1.469239 }, { "acc": 1.0, "epoch": 36.15180935569285, "grad_norm": 0.2670152187347412, "learning_rate": 1.947061258430428e-06, "loss": 0.00507105, "memory(GiB)": 15.03, "step": 20480, "train_speed(iter/s)": 1.469236 }, { "acc": 0.99954033, "epoch": 36.16063548102383, "grad_norm": 0.435815691947937, "learning_rate": 1.94474883482888e-06, "loss": 0.00596658, "memory(GiB)": 15.03, "step": 20485, "train_speed(iter/s)": 1.469226 }, { "acc": 0.99908991, "epoch": 36.16946160635481, "grad_norm": 0.7569391131401062, "learning_rate": 1.9424374543554374e-06, "loss": 0.01220391, "memory(GiB)": 15.03, "step": 20490, "train_speed(iter/s)": 1.469234 }, { "acc": 0.99956894, "epoch": 36.17828773168579, "grad_norm": 0.7247863411903381, "learning_rate": 1.940127117799122e-06, "loss": 0.00835518, "memory(GiB)": 15.03, "step": 20495, "train_speed(iter/s)": 1.469231 }, { "acc": 1.0, "epoch": 36.18711385701677, "grad_norm": 0.44277670979499817, "learning_rate": 1.9378178259486067e-06, "loss": 0.00358846, "memory(GiB)": 15.03, "step": 20500, "train_speed(iter/s)": 1.469248 }, { "acc": 0.9997159, "epoch": 36.19593998234775, "grad_norm": 0.4871448576450348, "learning_rate": 1.9355095795922027e-06, "loss": 0.00775202, "memory(GiB)": 15.03, "step": 20505, "train_speed(iter/s)": 1.469244 }, { "acc": 0.99946346, "epoch": 36.20476610767873, "grad_norm": 0.22687700390815735, "learning_rate": 1.9332023795178643e-06, "loss": 0.00427258, "memory(GiB)": 15.03, "step": 20510, "train_speed(iter/s)": 1.469242 }, { "acc": 0.99960623, "epoch": 36.213592233009706, "grad_norm": 0.16057737171649933, "learning_rate": 1.9308962265131907e-06, "loss": 0.00569406, "memory(GiB)": 15.03, "step": 20515, "train_speed(iter/s)": 1.469238 }, { "acc": 0.99921875, "epoch": 36.22241835834069, "grad_norm": 0.7785775065422058, "learning_rate": 1.9285911213654248e-06, "loss": 0.00633194, "memory(GiB)": 15.03, "step": 20520, "train_speed(iter/s)": 1.469245 }, { "acc": 0.99904366, "epoch": 36.23124448367167, "grad_norm": 2.7865660190582275, "learning_rate": 1.9262870648614488e-06, "loss": 0.00700186, "memory(GiB)": 15.03, "step": 20525, "train_speed(iter/s)": 1.469258 }, { "acc": 0.99969511, "epoch": 36.24007060900265, "grad_norm": 0.4057309329509735, "learning_rate": 1.9239840577877874e-06, "loss": 0.00320527, "memory(GiB)": 15.03, "step": 20530, "train_speed(iter/s)": 1.469266 }, { "acc": 1.0, "epoch": 36.24889673433363, "grad_norm": 0.18062669038772583, "learning_rate": 1.9216821009306076e-06, "loss": 0.00312062, "memory(GiB)": 15.03, "step": 20535, "train_speed(iter/s)": 1.46927 }, { "acc": 0.99979506, "epoch": 36.257722859664604, "grad_norm": 0.21525992453098297, "learning_rate": 1.91938119507572e-06, "loss": 0.00483601, "memory(GiB)": 15.03, "step": 20540, "train_speed(iter/s)": 1.469251 }, { "acc": 0.99964218, "epoch": 36.26654898499559, "grad_norm": 0.5487952828407288, "learning_rate": 1.917081341008572e-06, "loss": 0.00959297, "memory(GiB)": 15.03, "step": 20545, "train_speed(iter/s)": 1.469256 }, { "acc": 0.99960594, "epoch": 36.275375110326564, "grad_norm": 0.2875920534133911, "learning_rate": 1.9147825395142584e-06, "loss": 0.00608635, "memory(GiB)": 15.03, "step": 20550, "train_speed(iter/s)": 1.469262 }, { "acc": 1.0, "epoch": 36.28420123565755, "grad_norm": 0.00985183659940958, "learning_rate": 1.9124847913775077e-06, "loss": 0.0039561, "memory(GiB)": 15.03, "step": 20555, "train_speed(iter/s)": 1.469274 }, { "acc": 0.99967947, "epoch": 36.293027360988525, "grad_norm": 0.3532153367996216, "learning_rate": 1.9101880973826923e-06, "loss": 0.00250053, "memory(GiB)": 15.03, "step": 20560, "train_speed(iter/s)": 1.469274 }, { "acc": 0.99982872, "epoch": 36.30185348631951, "grad_norm": 0.24477225542068481, "learning_rate": 1.9078924583138235e-06, "loss": 0.00491307, "memory(GiB)": 15.03, "step": 20565, "train_speed(iter/s)": 1.469274 }, { "acc": 0.99959869, "epoch": 36.310679611650485, "grad_norm": 0.20878736674785614, "learning_rate": 1.905597874954556e-06, "loss": 0.00530667, "memory(GiB)": 15.03, "step": 20570, "train_speed(iter/s)": 1.469262 }, { "acc": 0.99872227, "epoch": 36.31950573698146, "grad_norm": 0.3699761629104614, "learning_rate": 1.9033043480881788e-06, "loss": 0.01818931, "memory(GiB)": 15.03, "step": 20575, "train_speed(iter/s)": 1.469271 }, { "acc": 0.99970932, "epoch": 36.328331862312446, "grad_norm": 1.8516249656677246, "learning_rate": 1.9010118784976253e-06, "loss": 0.00367217, "memory(GiB)": 15.03, "step": 20580, "train_speed(iter/s)": 1.469267 }, { "acc": 1.0, "epoch": 36.33715798764342, "grad_norm": 0.25991547107696533, "learning_rate": 1.8987204669654647e-06, "loss": 0.00204309, "memory(GiB)": 15.03, "step": 20585, "train_speed(iter/s)": 1.469269 }, { "acc": 0.99965363, "epoch": 36.345984112974406, "grad_norm": 0.24816834926605225, "learning_rate": 1.896430114273903e-06, "loss": 0.00709907, "memory(GiB)": 15.03, "step": 20590, "train_speed(iter/s)": 1.469282 }, { "acc": 1.0, "epoch": 36.35481023830538, "grad_norm": 0.010112470015883446, "learning_rate": 1.8941408212047926e-06, "loss": 0.00042624, "memory(GiB)": 15.03, "step": 20595, "train_speed(iter/s)": 1.4693 }, { "acc": 1.0, "epoch": 36.36363636363637, "grad_norm": 0.26939842104911804, "learning_rate": 1.8918525885396172e-06, "loss": 0.00456234, "memory(GiB)": 15.03, "step": 20600, "train_speed(iter/s)": 1.469306 }, { "acc": 1.0, "epoch": 36.372462488967344, "grad_norm": 0.30945831537246704, "learning_rate": 1.8895654170594995e-06, "loss": 0.00522972, "memory(GiB)": 15.03, "step": 20605, "train_speed(iter/s)": 1.46932 }, { "acc": 0.99949999, "epoch": 36.38128861429832, "grad_norm": 0.35855117440223694, "learning_rate": 1.8872793075452003e-06, "loss": 0.00825767, "memory(GiB)": 15.03, "step": 20610, "train_speed(iter/s)": 1.469312 }, { "acc": 1.0, "epoch": 36.390114739629304, "grad_norm": 0.27565836906433105, "learning_rate": 1.884994260777121e-06, "loss": 0.00511496, "memory(GiB)": 15.03, "step": 20615, "train_speed(iter/s)": 1.469317 }, { "acc": 1.0, "epoch": 36.39894086496028, "grad_norm": 0.29059794545173645, "learning_rate": 1.8827102775352957e-06, "loss": 0.0030125, "memory(GiB)": 15.03, "step": 20620, "train_speed(iter/s)": 1.469328 }, { "acc": 1.0, "epoch": 36.407766990291265, "grad_norm": 0.3742542266845703, "learning_rate": 1.8804273585994007e-06, "loss": 0.00517251, "memory(GiB)": 15.03, "step": 20625, "train_speed(iter/s)": 1.46932 }, { "acc": 1.0, "epoch": 36.41659311562224, "grad_norm": 0.3893791437149048, "learning_rate": 1.8781455047487426e-06, "loss": 0.00455099, "memory(GiB)": 15.03, "step": 20630, "train_speed(iter/s)": 1.469326 }, { "acc": 0.99970932, "epoch": 36.42541924095322, "grad_norm": 0.2904628813266754, "learning_rate": 1.8758647167622695e-06, "loss": 0.00527669, "memory(GiB)": 15.03, "step": 20635, "train_speed(iter/s)": 1.469336 }, { "acc": 1.0, "epoch": 36.4342453662842, "grad_norm": 0.0599629208445549, "learning_rate": 1.8735849954185612e-06, "loss": 0.00337934, "memory(GiB)": 15.03, "step": 20640, "train_speed(iter/s)": 1.469338 }, { "acc": 0.99931335, "epoch": 36.44307149161518, "grad_norm": 0.23557241261005402, "learning_rate": 1.8713063414958395e-06, "loss": 0.01097516, "memory(GiB)": 15.03, "step": 20645, "train_speed(iter/s)": 1.46935 }, { "acc": 1.0, "epoch": 36.45189761694616, "grad_norm": 0.24202889204025269, "learning_rate": 1.8690287557719564e-06, "loss": 0.0045836, "memory(GiB)": 15.03, "step": 20650, "train_speed(iter/s)": 1.469337 }, { "acc": 0.99892807, "epoch": 36.46072374227714, "grad_norm": 0.00856119953095913, "learning_rate": 1.8667522390244015e-06, "loss": 0.01448052, "memory(GiB)": 15.03, "step": 20655, "train_speed(iter/s)": 1.46934 }, { "acc": 1.0, "epoch": 36.46954986760812, "grad_norm": 0.019465064629912376, "learning_rate": 1.8644767920302975e-06, "loss": 0.00509711, "memory(GiB)": 15.03, "step": 20660, "train_speed(iter/s)": 1.46936 }, { "acc": 1.0, "epoch": 36.4783759929391, "grad_norm": 0.40245917439460754, "learning_rate": 1.8622024155664062e-06, "loss": 0.0040669, "memory(GiB)": 15.03, "step": 20665, "train_speed(iter/s)": 1.469374 }, { "acc": 1.0, "epoch": 36.487202118270076, "grad_norm": 0.6591199040412903, "learning_rate": 1.8599291104091191e-06, "loss": 0.00313993, "memory(GiB)": 15.03, "step": 20670, "train_speed(iter/s)": 1.469368 }, { "acc": 0.99985294, "epoch": 36.49602824360106, "grad_norm": 0.04650735855102539, "learning_rate": 1.857656877334467e-06, "loss": 0.00288739, "memory(GiB)": 15.03, "step": 20675, "train_speed(iter/s)": 1.469382 }, { "acc": 0.99978447, "epoch": 36.50485436893204, "grad_norm": 0.13912340998649597, "learning_rate": 1.8553857171181097e-06, "loss": 0.00654191, "memory(GiB)": 15.03, "step": 20680, "train_speed(iter/s)": 1.469393 }, { "acc": 0.99912157, "epoch": 36.51368049426302, "grad_norm": 0.22498691082000732, "learning_rate": 1.8531156305353437e-06, "loss": 0.01075061, "memory(GiB)": 15.03, "step": 20685, "train_speed(iter/s)": 1.469402 }, { "acc": 0.99978809, "epoch": 36.522506619594, "grad_norm": 0.017953813076019287, "learning_rate": 1.8508466183610965e-06, "loss": 0.00659389, "memory(GiB)": 15.03, "step": 20690, "train_speed(iter/s)": 1.469404 }, { "acc": 0.99980469, "epoch": 36.53133274492498, "grad_norm": 0.4426243305206299, "learning_rate": 1.8485786813699338e-06, "loss": 0.0097039, "memory(GiB)": 15.03, "step": 20695, "train_speed(iter/s)": 1.469399 }, { "acc": 0.99932528, "epoch": 36.54015887025596, "grad_norm": 0.06676210463047028, "learning_rate": 1.8463118203360482e-06, "loss": 0.00389721, "memory(GiB)": 15.03, "step": 20700, "train_speed(iter/s)": 1.469392 }, { "acc": 0.99949274, "epoch": 36.548984995586935, "grad_norm": 0.2688200771808624, "learning_rate": 1.8440460360332704e-06, "loss": 0.00707726, "memory(GiB)": 15.03, "step": 20705, "train_speed(iter/s)": 1.469398 }, { "acc": 1.0, "epoch": 36.55781112091792, "grad_norm": 0.09346215426921844, "learning_rate": 1.8417813292350603e-06, "loss": 0.00363492, "memory(GiB)": 15.03, "step": 20710, "train_speed(iter/s)": 1.469413 }, { "acc": 0.99979839, "epoch": 36.566637246248895, "grad_norm": 0.6310887336730957, "learning_rate": 1.8395177007145087e-06, "loss": 0.00628387, "memory(GiB)": 15.03, "step": 20715, "train_speed(iter/s)": 1.469418 }, { "acc": 1.0, "epoch": 36.57546337157988, "grad_norm": 0.3692699074745178, "learning_rate": 1.8372551512443434e-06, "loss": 0.00208522, "memory(GiB)": 15.03, "step": 20720, "train_speed(iter/s)": 1.46943 }, { "acc": 1.0, "epoch": 36.584289496910856, "grad_norm": 1.2801525592803955, "learning_rate": 1.8349936815969198e-06, "loss": 0.00488369, "memory(GiB)": 15.03, "step": 20725, "train_speed(iter/s)": 1.469453 }, { "acc": 1.0, "epoch": 36.59311562224183, "grad_norm": 0.3280809223651886, "learning_rate": 1.8327332925442251e-06, "loss": 0.00146935, "memory(GiB)": 15.03, "step": 20730, "train_speed(iter/s)": 1.469461 }, { "acc": 0.99990673, "epoch": 36.601941747572816, "grad_norm": 0.2115648239850998, "learning_rate": 1.8304739848578768e-06, "loss": 0.00195176, "memory(GiB)": 15.03, "step": 20735, "train_speed(iter/s)": 1.469459 }, { "acc": 1.0, "epoch": 36.61076787290379, "grad_norm": 0.33144232630729675, "learning_rate": 1.8282157593091277e-06, "loss": 0.00331586, "memory(GiB)": 15.03, "step": 20740, "train_speed(iter/s)": 1.469475 }, { "acc": 0.99921312, "epoch": 36.61959399823478, "grad_norm": 0.29480236768722534, "learning_rate": 1.825958616668855e-06, "loss": 0.00752632, "memory(GiB)": 15.03, "step": 20745, "train_speed(iter/s)": 1.469477 }, { "acc": 1.0, "epoch": 36.62842012356575, "grad_norm": 0.30328670144081116, "learning_rate": 1.8237025577075722e-06, "loss": 0.00342527, "memory(GiB)": 15.03, "step": 20750, "train_speed(iter/s)": 1.469492 }, { "acc": 1.0, "epoch": 36.63724624889674, "grad_norm": 0.5336109399795532, "learning_rate": 1.8214475831954198e-06, "loss": 0.00642919, "memory(GiB)": 15.03, "step": 20755, "train_speed(iter/s)": 1.469509 }, { "acc": 0.99953709, "epoch": 36.646072374227714, "grad_norm": 0.302643746137619, "learning_rate": 1.8191936939021671e-06, "loss": 0.00651422, "memory(GiB)": 15.03, "step": 20760, "train_speed(iter/s)": 1.469494 }, { "acc": 1.0, "epoch": 36.65489849955869, "grad_norm": 0.19679082930088043, "learning_rate": 1.8169408905972127e-06, "loss": 0.002238, "memory(GiB)": 15.03, "step": 20765, "train_speed(iter/s)": 1.469482 }, { "acc": 1.0, "epoch": 36.663724624889674, "grad_norm": 0.3198138177394867, "learning_rate": 1.8146891740495894e-06, "loss": 0.00518233, "memory(GiB)": 15.03, "step": 20770, "train_speed(iter/s)": 1.469483 }, { "acc": 1.0, "epoch": 36.67255075022065, "grad_norm": 0.006469623651355505, "learning_rate": 1.8124385450279546e-06, "loss": 0.00317187, "memory(GiB)": 15.03, "step": 20775, "train_speed(iter/s)": 1.469488 }, { "acc": 0.99936724, "epoch": 36.681376875551635, "grad_norm": 0.4812902808189392, "learning_rate": 1.8101890043005946e-06, "loss": 0.00678779, "memory(GiB)": 15.03, "step": 20780, "train_speed(iter/s)": 1.46948 }, { "acc": 1.0, "epoch": 36.69020300088261, "grad_norm": 0.0028452437836676836, "learning_rate": 1.8079405526354232e-06, "loss": 0.001282, "memory(GiB)": 15.03, "step": 20785, "train_speed(iter/s)": 1.469477 }, { "acc": 0.99973965, "epoch": 36.699029126213595, "grad_norm": 0.010689840652048588, "learning_rate": 1.8056931907999885e-06, "loss": 0.00455339, "memory(GiB)": 15.03, "step": 20790, "train_speed(iter/s)": 1.469486 }, { "acc": 0.99950752, "epoch": 36.70785525154457, "grad_norm": 0.44659489393234253, "learning_rate": 1.803446919561459e-06, "loss": 0.00667927, "memory(GiB)": 15.03, "step": 20795, "train_speed(iter/s)": 1.469487 }, { "acc": 1.0, "epoch": 36.71668137687555, "grad_norm": 0.008511505089700222, "learning_rate": 1.8012017396866374e-06, "loss": 0.00469666, "memory(GiB)": 15.03, "step": 20800, "train_speed(iter/s)": 1.469504 }, { "acc": 1.0, "epoch": 36.72550750220653, "grad_norm": 0.005030881613492966, "learning_rate": 1.798957651941948e-06, "loss": 0.00791161, "memory(GiB)": 15.03, "step": 20805, "train_speed(iter/s)": 1.469502 }, { "acc": 0.99917107, "epoch": 36.73433362753751, "grad_norm": 0.0479322224855423, "learning_rate": 1.7967146570934449e-06, "loss": 0.00829047, "memory(GiB)": 15.03, "step": 20810, "train_speed(iter/s)": 1.469489 }, { "acc": 0.99979506, "epoch": 36.74315975286849, "grad_norm": 0.006221085321158171, "learning_rate": 1.7944727559068115e-06, "loss": 0.00302012, "memory(GiB)": 15.03, "step": 20815, "train_speed(iter/s)": 1.469488 }, { "acc": 0.99978065, "epoch": 36.75198587819947, "grad_norm": 0.12247627973556519, "learning_rate": 1.7922319491473544e-06, "loss": 0.00661957, "memory(GiB)": 15.03, "step": 20820, "train_speed(iter/s)": 1.469477 }, { "acc": 0.99946232, "epoch": 36.76081200353045, "grad_norm": 0.034129783511161804, "learning_rate": 1.7899922375800074e-06, "loss": 0.00784751, "memory(GiB)": 15.03, "step": 20825, "train_speed(iter/s)": 1.469469 }, { "acc": 0.99953194, "epoch": 36.76963812886143, "grad_norm": 0.4007125794887543, "learning_rate": 1.7877536219693297e-06, "loss": 0.00881804, "memory(GiB)": 15.03, "step": 20830, "train_speed(iter/s)": 1.469455 }, { "acc": 0.99952059, "epoch": 36.77846425419241, "grad_norm": 0.8708144426345825, "learning_rate": 1.7855161030795103e-06, "loss": 0.00979242, "memory(GiB)": 15.03, "step": 20835, "train_speed(iter/s)": 1.469462 }, { "acc": 0.9998457, "epoch": 36.78729037952339, "grad_norm": 0.11236131191253662, "learning_rate": 1.7832796816743577e-06, "loss": 0.00352135, "memory(GiB)": 15.03, "step": 20840, "train_speed(iter/s)": 1.469476 }, { "acc": 0.99979506, "epoch": 36.79611650485437, "grad_norm": 1.4322459697723389, "learning_rate": 1.7810443585173132e-06, "loss": 0.00925851, "memory(GiB)": 15.03, "step": 20845, "train_speed(iter/s)": 1.469478 }, { "acc": 1.0, "epoch": 36.80494263018535, "grad_norm": 0.34220588207244873, "learning_rate": 1.7788101343714364e-06, "loss": 0.00810039, "memory(GiB)": 15.03, "step": 20850, "train_speed(iter/s)": 1.469483 }, { "acc": 1.0, "epoch": 36.81376875551633, "grad_norm": 0.06893142312765121, "learning_rate": 1.7765770099994146e-06, "loss": 0.00542728, "memory(GiB)": 15.03, "step": 20855, "train_speed(iter/s)": 1.469485 }, { "acc": 0.99945116, "epoch": 36.822594880847305, "grad_norm": 0.3712671101093292, "learning_rate": 1.7743449861635583e-06, "loss": 0.00547673, "memory(GiB)": 15.03, "step": 20860, "train_speed(iter/s)": 1.469503 }, { "acc": 1.0, "epoch": 36.83142100617829, "grad_norm": 0.0175261739641428, "learning_rate": 1.7721140636258064e-06, "loss": 0.00258049, "memory(GiB)": 15.03, "step": 20865, "train_speed(iter/s)": 1.46951 }, { "acc": 0.99961777, "epoch": 36.840247131509265, "grad_norm": 0.16654013097286224, "learning_rate": 1.769884243147716e-06, "loss": 0.00404106, "memory(GiB)": 15.03, "step": 20870, "train_speed(iter/s)": 1.469493 }, { "acc": 1.0, "epoch": 36.84907325684025, "grad_norm": 0.25142771005630493, "learning_rate": 1.7676555254904737e-06, "loss": 0.00146806, "memory(GiB)": 15.03, "step": 20875, "train_speed(iter/s)": 1.469482 }, { "acc": 0.99946709, "epoch": 36.857899382171226, "grad_norm": 0.14548027515411377, "learning_rate": 1.7654279114148858e-06, "loss": 0.00383506, "memory(GiB)": 15.03, "step": 20880, "train_speed(iter/s)": 1.46949 }, { "acc": 0.99929848, "epoch": 36.86672550750221, "grad_norm": 0.03899195045232773, "learning_rate": 1.7632014016813827e-06, "loss": 0.01026102, "memory(GiB)": 15.03, "step": 20885, "train_speed(iter/s)": 1.469491 }, { "acc": 0.99918976, "epoch": 36.87555163283319, "grad_norm": 0.23072968423366547, "learning_rate": 1.7609759970500158e-06, "loss": 0.0049846, "memory(GiB)": 15.03, "step": 20890, "train_speed(iter/s)": 1.469503 }, { "acc": 0.99894695, "epoch": 36.88437775816416, "grad_norm": 0.842979371547699, "learning_rate": 1.7587516982804659e-06, "loss": 0.01087043, "memory(GiB)": 15.03, "step": 20895, "train_speed(iter/s)": 1.469501 }, { "acc": 0.99974995, "epoch": 36.89320388349515, "grad_norm": 0.0057941023260355, "learning_rate": 1.7565285061320297e-06, "loss": 0.00345864, "memory(GiB)": 15.03, "step": 20900, "train_speed(iter/s)": 1.469513 }, { "acc": 0.99990158, "epoch": 36.902030008826124, "grad_norm": 0.24995744228363037, "learning_rate": 1.7543064213636285e-06, "loss": 0.0062291, "memory(GiB)": 15.03, "step": 20905, "train_speed(iter/s)": 1.469528 }, { "acc": 0.99956894, "epoch": 36.91085613415711, "grad_norm": 0.10007430613040924, "learning_rate": 1.752085444733804e-06, "loss": 0.00219804, "memory(GiB)": 15.03, "step": 20910, "train_speed(iter/s)": 1.469534 }, { "acc": 1.0, "epoch": 36.919682259488084, "grad_norm": 0.5618264675140381, "learning_rate": 1.7498655770007214e-06, "loss": 0.00666182, "memory(GiB)": 15.03, "step": 20915, "train_speed(iter/s)": 1.469559 }, { "acc": 0.99894447, "epoch": 36.92850838481906, "grad_norm": 0.2864035665988922, "learning_rate": 1.747646818922171e-06, "loss": 0.00692981, "memory(GiB)": 15.03, "step": 20920, "train_speed(iter/s)": 1.469567 }, { "acc": 1.0, "epoch": 36.937334510150045, "grad_norm": 0.2148773968219757, "learning_rate": 1.745429171255557e-06, "loss": 0.00325189, "memory(GiB)": 15.03, "step": 20925, "train_speed(iter/s)": 1.469565 }, { "acc": 0.99959011, "epoch": 36.94616063548102, "grad_norm": 0.18068380653858185, "learning_rate": 1.7432126347579073e-06, "loss": 0.0071142, "memory(GiB)": 15.03, "step": 20930, "train_speed(iter/s)": 1.469569 }, { "acc": 0.99951248, "epoch": 36.954986760812005, "grad_norm": 0.23701053857803345, "learning_rate": 1.740997210185871e-06, "loss": 0.00665616, "memory(GiB)": 15.03, "step": 20935, "train_speed(iter/s)": 1.46957 }, { "acc": 0.99959011, "epoch": 36.96381288614298, "grad_norm": 0.429525226354599, "learning_rate": 1.7387828982957206e-06, "loss": 0.00660139, "memory(GiB)": 15.03, "step": 20940, "train_speed(iter/s)": 1.469574 }, { "acc": 0.99985466, "epoch": 36.972639011473966, "grad_norm": 0.0027764555998146534, "learning_rate": 1.7365696998433432e-06, "loss": 0.00263122, "memory(GiB)": 15.03, "step": 20945, "train_speed(iter/s)": 1.469595 }, { "acc": 0.99975491, "epoch": 36.98146513680494, "grad_norm": 0.34408217668533325, "learning_rate": 1.7343576155842498e-06, "loss": 0.00392311, "memory(GiB)": 15.03, "step": 20950, "train_speed(iter/s)": 1.469583 }, { "acc": 0.99980164, "epoch": 36.99029126213592, "grad_norm": 0.13866007328033447, "learning_rate": 1.7321466462735672e-06, "loss": 0.0025316, "memory(GiB)": 15.03, "step": 20955, "train_speed(iter/s)": 1.469604 }, { "acc": 1.0, "epoch": 36.9991173874669, "grad_norm": 0.10125969350337982, "learning_rate": 1.729936792666047e-06, "loss": 0.00731516, "memory(GiB)": 15.03, "step": 20960, "train_speed(iter/s)": 1.469615 }, { "acc": 0.99949999, "epoch": 37.00794351279788, "grad_norm": 0.2857555150985718, "learning_rate": 1.7277280555160553e-06, "loss": 0.00596685, "memory(GiB)": 15.03, "step": 20965, "train_speed(iter/s)": 1.469584 }, { "acc": 0.9998457, "epoch": 37.016769638128864, "grad_norm": 0.01605396345257759, "learning_rate": 1.7255204355775803e-06, "loss": 0.00459937, "memory(GiB)": 15.03, "step": 20970, "train_speed(iter/s)": 1.469586 }, { "acc": 0.99979506, "epoch": 37.02559576345984, "grad_norm": 0.3734655976295471, "learning_rate": 1.7233139336042268e-06, "loss": 0.0053152, "memory(GiB)": 15.03, "step": 20975, "train_speed(iter/s)": 1.469564 }, { "acc": 0.9994998, "epoch": 37.034421888790824, "grad_norm": 0.1590978503227234, "learning_rate": 1.721108550349219e-06, "loss": 0.00807005, "memory(GiB)": 15.03, "step": 20980, "train_speed(iter/s)": 1.469572 }, { "acc": 0.99979506, "epoch": 37.0432480141218, "grad_norm": 0.00983121246099472, "learning_rate": 1.7189042865653954e-06, "loss": 0.00284177, "memory(GiB)": 15.03, "step": 20985, "train_speed(iter/s)": 1.469586 }, { "acc": 1.0, "epoch": 37.05207413945278, "grad_norm": 0.20065106451511383, "learning_rate": 1.7167011430052199e-06, "loss": 0.00344139, "memory(GiB)": 15.03, "step": 20990, "train_speed(iter/s)": 1.469585 }, { "acc": 0.99937115, "epoch": 37.06090026478376, "grad_norm": 0.37811797857284546, "learning_rate": 1.7144991204207673e-06, "loss": 0.00414764, "memory(GiB)": 15.03, "step": 20995, "train_speed(iter/s)": 1.469584 }, { "acc": 1.0, "epoch": 37.06972639011474, "grad_norm": 0.3025737702846527, "learning_rate": 1.7122982195637338e-06, "loss": 0.00384917, "memory(GiB)": 15.03, "step": 21000, "train_speed(iter/s)": 1.4696 }, { "acc": 0.99959669, "epoch": 37.07855251544572, "grad_norm": 0.19838108122348785, "learning_rate": 1.7100984411854304e-06, "loss": 0.00457619, "memory(GiB)": 15.03, "step": 21005, "train_speed(iter/s)": 1.469606 }, { "acc": 0.99986973, "epoch": 37.0873786407767, "grad_norm": 0.4599727392196655, "learning_rate": 1.7078997860367854e-06, "loss": 0.00299805, "memory(GiB)": 15.03, "step": 21010, "train_speed(iter/s)": 1.46961 }, { "acc": 0.99956894, "epoch": 37.096204766107675, "grad_norm": 0.04801715537905693, "learning_rate": 1.7057022548683428e-06, "loss": 0.00453378, "memory(GiB)": 15.03, "step": 21015, "train_speed(iter/s)": 1.469601 }, { "acc": 1.0, "epoch": 37.10503089143866, "grad_norm": 0.1841200739145279, "learning_rate": 1.7035058484302666e-06, "loss": 0.00547748, "memory(GiB)": 15.03, "step": 21020, "train_speed(iter/s)": 1.469615 }, { "acc": 0.99984379, "epoch": 37.113857016769636, "grad_norm": 0.1940535455942154, "learning_rate": 1.7013105674723327e-06, "loss": 0.00311792, "memory(GiB)": 15.03, "step": 21025, "train_speed(iter/s)": 1.469627 }, { "acc": 0.9997159, "epoch": 37.12268314210062, "grad_norm": 0.3611815869808197, "learning_rate": 1.6991164127439327e-06, "loss": 0.0074184, "memory(GiB)": 15.03, "step": 21030, "train_speed(iter/s)": 1.469631 }, { "acc": 0.99976416, "epoch": 37.131509267431596, "grad_norm": 0.25646716356277466, "learning_rate": 1.6969233849940788e-06, "loss": 0.00444793, "memory(GiB)": 15.03, "step": 21035, "train_speed(iter/s)": 1.46965 }, { "acc": 1.0, "epoch": 37.14033539276258, "grad_norm": 0.07709137350320816, "learning_rate": 1.6947314849713917e-06, "loss": 0.002689, "memory(GiB)": 15.03, "step": 21040, "train_speed(iter/s)": 1.469652 }, { "acc": 0.99942436, "epoch": 37.14916151809356, "grad_norm": 0.4985784888267517, "learning_rate": 1.6925407134241137e-06, "loss": 0.00572182, "memory(GiB)": 15.03, "step": 21045, "train_speed(iter/s)": 1.469655 }, { "acc": 0.99970932, "epoch": 37.15798764342453, "grad_norm": 0.3898686468601227, "learning_rate": 1.6903510711000975e-06, "loss": 0.0042881, "memory(GiB)": 15.03, "step": 21050, "train_speed(iter/s)": 1.469662 }, { "acc": 1.0, "epoch": 37.16681376875552, "grad_norm": 0.1578664928674698, "learning_rate": 1.6881625587468106e-06, "loss": 0.00333517, "memory(GiB)": 15.03, "step": 21055, "train_speed(iter/s)": 1.469661 }, { "acc": 1.0, "epoch": 37.175639894086494, "grad_norm": 0.257155179977417, "learning_rate": 1.685975177111335e-06, "loss": 0.00337901, "memory(GiB)": 15.03, "step": 21060, "train_speed(iter/s)": 1.46969 }, { "acc": 0.99979506, "epoch": 37.18446601941748, "grad_norm": 0.39772775769233704, "learning_rate": 1.6837889269403698e-06, "loss": 0.00778629, "memory(GiB)": 15.03, "step": 21065, "train_speed(iter/s)": 1.46968 }, { "acc": 0.99858274, "epoch": 37.193292144748455, "grad_norm": 0.530670702457428, "learning_rate": 1.6816038089802244e-06, "loss": 0.01033485, "memory(GiB)": 15.03, "step": 21070, "train_speed(iter/s)": 1.469699 }, { "acc": 1.0, "epoch": 37.20211827007944, "grad_norm": 0.09428568929433823, "learning_rate": 1.6794198239768231e-06, "loss": 0.00382903, "memory(GiB)": 15.03, "step": 21075, "train_speed(iter/s)": 1.469705 }, { "acc": 1.0, "epoch": 37.210944395410415, "grad_norm": 0.2624572515487671, "learning_rate": 1.6772369726757005e-06, "loss": 0.00310834, "memory(GiB)": 15.03, "step": 21080, "train_speed(iter/s)": 1.469715 }, { "acc": 0.99974995, "epoch": 37.21977052074139, "grad_norm": 0.6416389346122742, "learning_rate": 1.6750552558220106e-06, "loss": 0.00301776, "memory(GiB)": 15.03, "step": 21085, "train_speed(iter/s)": 1.469713 }, { "acc": 0.99990158, "epoch": 37.228596646072376, "grad_norm": 0.6565731167793274, "learning_rate": 1.6728746741605129e-06, "loss": 0.00837296, "memory(GiB)": 15.03, "step": 21090, "train_speed(iter/s)": 1.469718 }, { "acc": 0.9997282, "epoch": 37.23742277140335, "grad_norm": 0.055505357682704926, "learning_rate": 1.6706952284355856e-06, "loss": 0.00594216, "memory(GiB)": 15.03, "step": 21095, "train_speed(iter/s)": 1.46972 }, { "acc": 0.99959869, "epoch": 37.246248896734336, "grad_norm": 0.2682381868362427, "learning_rate": 1.6685169193912162e-06, "loss": 0.00398802, "memory(GiB)": 15.03, "step": 21100, "train_speed(iter/s)": 1.469737 }, { "acc": 1.0, "epoch": 37.25507502206531, "grad_norm": 0.017892736941576004, "learning_rate": 1.6663397477710028e-06, "loss": 0.00235841, "memory(GiB)": 15.03, "step": 21105, "train_speed(iter/s)": 1.469742 }, { "acc": 1.0, "epoch": 37.26390114739629, "grad_norm": 0.017667455598711967, "learning_rate": 1.6641637143181557e-06, "loss": 0.00542745, "memory(GiB)": 15.03, "step": 21110, "train_speed(iter/s)": 1.469752 }, { "acc": 0.99957581, "epoch": 37.27272727272727, "grad_norm": 0.28101956844329834, "learning_rate": 1.6619888197755005e-06, "loss": 0.00467757, "memory(GiB)": 15.03, "step": 21115, "train_speed(iter/s)": 1.469744 }, { "acc": 0.99946232, "epoch": 37.28155339805825, "grad_norm": 0.24809806048870087, "learning_rate": 1.6598150648854687e-06, "loss": 0.00707751, "memory(GiB)": 15.03, "step": 21120, "train_speed(iter/s)": 1.469764 }, { "acc": 0.99982872, "epoch": 37.290379523389234, "grad_norm": 0.01668470911681652, "learning_rate": 1.657642450390108e-06, "loss": 0.00508605, "memory(GiB)": 15.03, "step": 21125, "train_speed(iter/s)": 1.469772 }, { "acc": 0.99987621, "epoch": 37.29920564872021, "grad_norm": 0.0463331863284111, "learning_rate": 1.6554709770310731e-06, "loss": 0.00321595, "memory(GiB)": 15.03, "step": 21130, "train_speed(iter/s)": 1.469784 }, { "acc": 0.99970236, "epoch": 37.308031774051194, "grad_norm": 0.18990951776504517, "learning_rate": 1.653300645549628e-06, "loss": 0.01460354, "memory(GiB)": 15.03, "step": 21135, "train_speed(iter/s)": 1.469798 }, { "acc": 0.99969511, "epoch": 37.31685789938217, "grad_norm": 0.28083741664886475, "learning_rate": 1.6511314566866518e-06, "loss": 0.00493207, "memory(GiB)": 15.03, "step": 21140, "train_speed(iter/s)": 1.469802 }, { "acc": 0.99956894, "epoch": 37.32568402471315, "grad_norm": 0.5829600691795349, "learning_rate": 1.6489634111826306e-06, "loss": 0.00685947, "memory(GiB)": 15.03, "step": 21145, "train_speed(iter/s)": 1.469813 }, { "acc": 0.99947224, "epoch": 37.33451015004413, "grad_norm": 0.23293770849704742, "learning_rate": 1.6467965097776588e-06, "loss": 0.00408467, "memory(GiB)": 15.03, "step": 21150, "train_speed(iter/s)": 1.469817 }, { "acc": 1.0, "epoch": 37.34333627537511, "grad_norm": 0.1983867883682251, "learning_rate": 1.6446307532114412e-06, "loss": 0.00160189, "memory(GiB)": 15.03, "step": 21155, "train_speed(iter/s)": 1.469808 }, { "acc": 1.0, "epoch": 37.35216240070609, "grad_norm": 0.3219849765300751, "learning_rate": 1.6424661422232956e-06, "loss": 0.00265182, "memory(GiB)": 15.03, "step": 21160, "train_speed(iter/s)": 1.469807 }, { "acc": 0.99951639, "epoch": 37.36098852603707, "grad_norm": 0.04172181338071823, "learning_rate": 1.6403026775521419e-06, "loss": 0.00622178, "memory(GiB)": 15.03, "step": 21165, "train_speed(iter/s)": 1.46982 }, { "acc": 1.0, "epoch": 37.36981465136805, "grad_norm": 0.05348602309823036, "learning_rate": 1.6381403599365158e-06, "loss": 0.00146007, "memory(GiB)": 15.03, "step": 21170, "train_speed(iter/s)": 1.469828 }, { "acc": 1.0, "epoch": 37.37864077669903, "grad_norm": 0.08967379480600357, "learning_rate": 1.635979190114557e-06, "loss": 0.00269412, "memory(GiB)": 15.03, "step": 21175, "train_speed(iter/s)": 1.469849 }, { "acc": 1.0, "epoch": 37.387466902030006, "grad_norm": 0.7085072994232178, "learning_rate": 1.6338191688240136e-06, "loss": 0.00451379, "memory(GiB)": 15.03, "step": 21180, "train_speed(iter/s)": 1.469857 }, { "acc": 0.99953346, "epoch": 37.39629302736099, "grad_norm": 0.022090477868914604, "learning_rate": 1.6316602968022405e-06, "loss": 0.00780691, "memory(GiB)": 15.03, "step": 21185, "train_speed(iter/s)": 1.46986 }, { "acc": 0.99981346, "epoch": 37.40511915269197, "grad_norm": 0.4826734960079193, "learning_rate": 1.629502574786206e-06, "loss": 0.00905975, "memory(GiB)": 15.03, "step": 21190, "train_speed(iter/s)": 1.469864 }, { "acc": 1.0, "epoch": 37.41394527802295, "grad_norm": 0.28355714678764343, "learning_rate": 1.6273460035124805e-06, "loss": 0.00386405, "memory(GiB)": 15.03, "step": 21195, "train_speed(iter/s)": 1.469859 }, { "acc": 0.99921379, "epoch": 37.42277140335393, "grad_norm": 2.5929019451141357, "learning_rate": 1.6251905837172427e-06, "loss": 0.00858085, "memory(GiB)": 15.03, "step": 21200, "train_speed(iter/s)": 1.469868 }, { "acc": 1.0, "epoch": 37.431597528684904, "grad_norm": 0.17966189980506897, "learning_rate": 1.6230363161362778e-06, "loss": 0.00590208, "memory(GiB)": 15.03, "step": 21205, "train_speed(iter/s)": 1.469889 }, { "acc": 1.0, "epoch": 37.44042365401589, "grad_norm": 0.20473222434520721, "learning_rate": 1.6208832015049802e-06, "loss": 0.00333119, "memory(GiB)": 15.03, "step": 21210, "train_speed(iter/s)": 1.469892 }, { "acc": 1.0, "epoch": 37.449249779346864, "grad_norm": 0.5573592782020569, "learning_rate": 1.6187312405583468e-06, "loss": 0.00596147, "memory(GiB)": 15.03, "step": 21215, "train_speed(iter/s)": 1.469899 }, { "acc": 0.99917164, "epoch": 37.45807590467785, "grad_norm": 0.27108675241470337, "learning_rate": 1.6165804340309865e-06, "loss": 0.0076138, "memory(GiB)": 15.03, "step": 21220, "train_speed(iter/s)": 1.469903 }, { "acc": 0.99962959, "epoch": 37.466902030008825, "grad_norm": 0.45093756914138794, "learning_rate": 1.6144307826571085e-06, "loss": 0.00638301, "memory(GiB)": 15.03, "step": 21225, "train_speed(iter/s)": 1.469908 }, { "acc": 1.0, "epoch": 37.47572815533981, "grad_norm": 0.24007175862789154, "learning_rate": 1.6122822871705294e-06, "loss": 0.00230502, "memory(GiB)": 15.03, "step": 21230, "train_speed(iter/s)": 1.469902 }, { "acc": 1.0, "epoch": 37.484554280670785, "grad_norm": 0.1772460639476776, "learning_rate": 1.6101349483046697e-06, "loss": 0.00359103, "memory(GiB)": 15.03, "step": 21235, "train_speed(iter/s)": 1.469898 }, { "acc": 0.99965277, "epoch": 37.49338040600176, "grad_norm": 0.051468461751937866, "learning_rate": 1.6079887667925598e-06, "loss": 0.0032302, "memory(GiB)": 15.03, "step": 21240, "train_speed(iter/s)": 1.469883 }, { "acc": 1.0, "epoch": 37.502206531332746, "grad_norm": 0.15133561193943024, "learning_rate": 1.6058437433668304e-06, "loss": 0.00492334, "memory(GiB)": 15.03, "step": 21245, "train_speed(iter/s)": 1.469875 }, { "acc": 0.99973965, "epoch": 37.51103265666372, "grad_norm": 0.5643420815467834, "learning_rate": 1.6036998787597206e-06, "loss": 0.00309159, "memory(GiB)": 15.03, "step": 21250, "train_speed(iter/s)": 1.469895 }, { "acc": 0.99970236, "epoch": 37.51985878199471, "grad_norm": 0.19466853141784668, "learning_rate": 1.6015571737030702e-06, "loss": 0.00848654, "memory(GiB)": 15.03, "step": 21255, "train_speed(iter/s)": 1.469885 }, { "acc": 1.0, "epoch": 37.52868490732568, "grad_norm": 0.8553580045700073, "learning_rate": 1.5994156289283236e-06, "loss": 0.00547138, "memory(GiB)": 15.03, "step": 21260, "train_speed(iter/s)": 1.469891 }, { "acc": 1.0, "epoch": 37.53751103265667, "grad_norm": 0.5480849742889404, "learning_rate": 1.5972752451665341e-06, "loss": 0.0032391, "memory(GiB)": 15.03, "step": 21265, "train_speed(iter/s)": 1.4699 }, { "acc": 0.99974318, "epoch": 37.546337157987644, "grad_norm": 0.17345891892910004, "learning_rate": 1.5951360231483522e-06, "loss": 0.00915635, "memory(GiB)": 15.03, "step": 21270, "train_speed(iter/s)": 1.469906 }, { "acc": 0.99940929, "epoch": 37.55516328331862, "grad_norm": 0.5201378464698792, "learning_rate": 1.5929979636040352e-06, "loss": 0.01114029, "memory(GiB)": 15.03, "step": 21275, "train_speed(iter/s)": 1.469923 }, { "acc": 1.0, "epoch": 37.563989408649604, "grad_norm": 0.0746016800403595, "learning_rate": 1.5908610672634412e-06, "loss": 0.00125902, "memory(GiB)": 15.03, "step": 21280, "train_speed(iter/s)": 1.469909 }, { "acc": 0.99953985, "epoch": 37.57281553398058, "grad_norm": 0.24759143590927124, "learning_rate": 1.5887253348560358e-06, "loss": 0.01024701, "memory(GiB)": 15.03, "step": 21285, "train_speed(iter/s)": 1.469911 }, { "acc": 0.99948158, "epoch": 37.581641659311565, "grad_norm": 0.19151835143566132, "learning_rate": 1.5865907671108817e-06, "loss": 0.00578727, "memory(GiB)": 15.03, "step": 21290, "train_speed(iter/s)": 1.469925 }, { "acc": 0.99892807, "epoch": 37.59046778464254, "grad_norm": 0.7739430069923401, "learning_rate": 1.5844573647566496e-06, "loss": 0.01075526, "memory(GiB)": 15.03, "step": 21295, "train_speed(iter/s)": 1.469928 }, { "acc": 0.9997159, "epoch": 37.59929390997352, "grad_norm": 0.5424031019210815, "learning_rate": 1.5823251285216088e-06, "loss": 0.00440936, "memory(GiB)": 15.03, "step": 21300, "train_speed(iter/s)": 1.469935 }, { "acc": 1.0, "epoch": 37.6081200353045, "grad_norm": 0.38455942273139954, "learning_rate": 1.58019405913363e-06, "loss": 0.0044932, "memory(GiB)": 15.03, "step": 21305, "train_speed(iter/s)": 1.469938 }, { "acc": 0.9998106, "epoch": 37.61694616063548, "grad_norm": 0.6413201689720154, "learning_rate": 1.5780641573201857e-06, "loss": 0.00674258, "memory(GiB)": 15.03, "step": 21310, "train_speed(iter/s)": 1.469947 }, { "acc": 1.0, "epoch": 37.62577228596646, "grad_norm": 0.03756264969706535, "learning_rate": 1.5759354238083543e-06, "loss": 0.00409473, "memory(GiB)": 15.03, "step": 21315, "train_speed(iter/s)": 1.469955 }, { "acc": 0.99960098, "epoch": 37.63459841129744, "grad_norm": 0.31711676716804504, "learning_rate": 1.5738078593248103e-06, "loss": 0.00578496, "memory(GiB)": 15.03, "step": 21320, "train_speed(iter/s)": 1.469972 }, { "acc": 0.99919291, "epoch": 37.64342453662842, "grad_norm": 1.9969561100006104, "learning_rate": 1.5716814645958314e-06, "loss": 0.0035133, "memory(GiB)": 15.03, "step": 21325, "train_speed(iter/s)": 1.469983 }, { "acc": 1.0, "epoch": 37.6522506619594, "grad_norm": 0.2560315728187561, "learning_rate": 1.5695562403472933e-06, "loss": 0.0033356, "memory(GiB)": 15.03, "step": 21330, "train_speed(iter/s)": 1.469985 }, { "acc": 1.0, "epoch": 37.661076787290376, "grad_norm": 0.013178855180740356, "learning_rate": 1.5674321873046775e-06, "loss": 0.00355258, "memory(GiB)": 15.03, "step": 21335, "train_speed(iter/s)": 1.470002 }, { "acc": 0.99967947, "epoch": 37.66990291262136, "grad_norm": 0.40159615874290466, "learning_rate": 1.5653093061930585e-06, "loss": 0.00772702, "memory(GiB)": 15.03, "step": 21340, "train_speed(iter/s)": 1.470015 }, { "acc": 1.0, "epoch": 37.67872903795234, "grad_norm": 0.26544710993766785, "learning_rate": 1.56318759773712e-06, "loss": 0.00606239, "memory(GiB)": 15.03, "step": 21345, "train_speed(iter/s)": 1.470011 }, { "acc": 0.9998311, "epoch": 37.68755516328332, "grad_norm": 0.013726743869483471, "learning_rate": 1.5610670626611365e-06, "loss": 0.00343155, "memory(GiB)": 15.03, "step": 21350, "train_speed(iter/s)": 1.470019 }, { "acc": 1.0, "epoch": 37.6963812886143, "grad_norm": 1.2183741331100464, "learning_rate": 1.5589477016889854e-06, "loss": 0.00498698, "memory(GiB)": 15.03, "step": 21355, "train_speed(iter/s)": 1.470026 }, { "acc": 0.99979506, "epoch": 37.70520741394528, "grad_norm": 0.33402538299560547, "learning_rate": 1.5568295155441455e-06, "loss": 0.00381194, "memory(GiB)": 15.03, "step": 21360, "train_speed(iter/s)": 1.470046 }, { "acc": 0.99985123, "epoch": 37.71403353927626, "grad_norm": 0.04630003869533539, "learning_rate": 1.55471250494969e-06, "loss": 0.00506088, "memory(GiB)": 15.03, "step": 21365, "train_speed(iter/s)": 1.470059 }, { "acc": 0.99954996, "epoch": 37.722859664607235, "grad_norm": 0.2564777135848999, "learning_rate": 1.5525966706282964e-06, "loss": 0.0031023, "memory(GiB)": 15.03, "step": 21370, "train_speed(iter/s)": 1.470063 }, { "acc": 1.0, "epoch": 37.73168578993822, "grad_norm": 0.021683113649487495, "learning_rate": 1.5504820133022359e-06, "loss": 0.00087479, "memory(GiB)": 15.03, "step": 21375, "train_speed(iter/s)": 1.470075 }, { "acc": 0.99947052, "epoch": 37.740511915269195, "grad_norm": 0.30444738268852234, "learning_rate": 1.54836853369338e-06, "loss": 0.00599322, "memory(GiB)": 15.03, "step": 21380, "train_speed(iter/s)": 1.470073 }, { "acc": 1.0, "epoch": 37.74933804060018, "grad_norm": 0.6228015422821045, "learning_rate": 1.5462562325231955e-06, "loss": 0.00921501, "memory(GiB)": 15.03, "step": 21385, "train_speed(iter/s)": 1.470063 }, { "acc": 0.99961557, "epoch": 37.758164165931156, "grad_norm": 0.1032637357711792, "learning_rate": 1.5441451105127534e-06, "loss": 0.00539937, "memory(GiB)": 15.03, "step": 21390, "train_speed(iter/s)": 1.470076 }, { "acc": 0.99982643, "epoch": 37.76699029126213, "grad_norm": 0.35694509744644165, "learning_rate": 1.5420351683827154e-06, "loss": 0.00593139, "memory(GiB)": 15.03, "step": 21395, "train_speed(iter/s)": 1.470077 }, { "acc": 1.0, "epoch": 37.775816416593116, "grad_norm": 0.4147913455963135, "learning_rate": 1.5399264068533435e-06, "loss": 0.005218, "memory(GiB)": 15.03, "step": 21400, "train_speed(iter/s)": 1.470104 }, { "acc": 0.99972219, "epoch": 37.78464254192409, "grad_norm": 0.46650880575180054, "learning_rate": 1.5378188266444943e-06, "loss": 0.00725167, "memory(GiB)": 15.03, "step": 21405, "train_speed(iter/s)": 1.470087 }, { "acc": 0.9997282, "epoch": 37.79346866725508, "grad_norm": 0.1596396416425705, "learning_rate": 1.535712428475626e-06, "loss": 0.00835602, "memory(GiB)": 15.03, "step": 21410, "train_speed(iter/s)": 1.470109 }, { "acc": 1.0, "epoch": 37.80229479258605, "grad_norm": 0.08130895346403122, "learning_rate": 1.5336072130657887e-06, "loss": 0.00215094, "memory(GiB)": 15.03, "step": 21415, "train_speed(iter/s)": 1.470105 }, { "acc": 1.0, "epoch": 37.81112091791704, "grad_norm": 0.008451271802186966, "learning_rate": 1.5315031811336314e-06, "loss": 0.00194535, "memory(GiB)": 15.03, "step": 21420, "train_speed(iter/s)": 1.470108 }, { "acc": 1.0, "epoch": 37.819947043248014, "grad_norm": 0.2020994871854782, "learning_rate": 1.5294003333973982e-06, "loss": 0.0023363, "memory(GiB)": 15.03, "step": 21425, "train_speed(iter/s)": 1.47009 }, { "acc": 1.0, "epoch": 37.82877316857899, "grad_norm": 0.08496468514204025, "learning_rate": 1.5272986705749268e-06, "loss": 0.00408497, "memory(GiB)": 15.03, "step": 21430, "train_speed(iter/s)": 1.470095 }, { "acc": 0.99927292, "epoch": 37.837599293909975, "grad_norm": 0.09305769205093384, "learning_rate": 1.525198193383653e-06, "loss": 0.00886542, "memory(GiB)": 15.03, "step": 21435, "train_speed(iter/s)": 1.470092 }, { "acc": 1.0, "epoch": 37.84642541924095, "grad_norm": 0.01578410714864731, "learning_rate": 1.5230989025406098e-06, "loss": 0.00290596, "memory(GiB)": 15.03, "step": 21440, "train_speed(iter/s)": 1.470104 }, { "acc": 0.99969511, "epoch": 37.855251544571935, "grad_norm": 0.42202645540237427, "learning_rate": 1.5210007987624203e-06, "loss": 0.00785025, "memory(GiB)": 15.03, "step": 21445, "train_speed(iter/s)": 1.470104 }, { "acc": 0.99973965, "epoch": 37.86407766990291, "grad_norm": 0.2738766670227051, "learning_rate": 1.5189038827653056e-06, "loss": 0.00729777, "memory(GiB)": 15.03, "step": 21450, "train_speed(iter/s)": 1.470098 }, { "acc": 0.99987497, "epoch": 37.872903795233896, "grad_norm": 0.02979225479066372, "learning_rate": 1.516808155265079e-06, "loss": 0.00396814, "memory(GiB)": 15.03, "step": 21455, "train_speed(iter/s)": 1.470123 }, { "acc": 1.0, "epoch": 37.88172992056487, "grad_norm": 0.094859778881073, "learning_rate": 1.514713616977151e-06, "loss": 0.00138847, "memory(GiB)": 15.03, "step": 21460, "train_speed(iter/s)": 1.470138 }, { "acc": 1.0, "epoch": 37.89055604589585, "grad_norm": 0.26759687066078186, "learning_rate": 1.5126202686165258e-06, "loss": 0.00554419, "memory(GiB)": 15.03, "step": 21465, "train_speed(iter/s)": 1.47015 }, { "acc": 0.99976854, "epoch": 37.89938217122683, "grad_norm": 0.08523917943239212, "learning_rate": 1.5105281108977995e-06, "loss": 0.00306497, "memory(GiB)": 15.03, "step": 21470, "train_speed(iter/s)": 1.470163 }, { "acc": 0.99988317, "epoch": 37.90820829655781, "grad_norm": 0.10467228293418884, "learning_rate": 1.5084371445351625e-06, "loss": 0.00447417, "memory(GiB)": 15.03, "step": 21475, "train_speed(iter/s)": 1.47017 }, { "acc": 1.0, "epoch": 37.91703442188879, "grad_norm": 0.14105574786663055, "learning_rate": 1.5063473702423975e-06, "loss": 0.00249239, "memory(GiB)": 15.03, "step": 21480, "train_speed(iter/s)": 1.470167 }, { "acc": 0.99979706, "epoch": 37.92586054721977, "grad_norm": 0.009704582393169403, "learning_rate": 1.5042587887328833e-06, "loss": 0.00554161, "memory(GiB)": 15.03, "step": 21485, "train_speed(iter/s)": 1.470186 }, { "acc": 1.0, "epoch": 37.93468667255075, "grad_norm": 0.225467711687088, "learning_rate": 1.5021714007195877e-06, "loss": 0.00431273, "memory(GiB)": 15.03, "step": 21490, "train_speed(iter/s)": 1.470209 }, { "acc": 0.99987116, "epoch": 37.94351279788173, "grad_norm": 0.05867612734436989, "learning_rate": 1.5000852069150753e-06, "loss": 0.0044903, "memory(GiB)": 15.03, "step": 21495, "train_speed(iter/s)": 1.470219 }, { "acc": 1.0, "epoch": 37.95233892321271, "grad_norm": 0.368386834859848, "learning_rate": 1.4980002080314999e-06, "loss": 0.00394041, "memory(GiB)": 15.03, "step": 21500, "train_speed(iter/s)": 1.470231 }, { "acc": 0.99942455, "epoch": 37.96116504854369, "grad_norm": 0.24788491427898407, "learning_rate": 1.4959164047806073e-06, "loss": 0.00350792, "memory(GiB)": 15.03, "step": 21505, "train_speed(iter/s)": 1.47024 }, { "acc": 0.99954739, "epoch": 37.96999117387467, "grad_norm": 0.3374648690223694, "learning_rate": 1.4938337978737358e-06, "loss": 0.00669252, "memory(GiB)": 15.03, "step": 21510, "train_speed(iter/s)": 1.470237 }, { "acc": 0.99981613, "epoch": 37.97881729920565, "grad_norm": 0.2720601558685303, "learning_rate": 1.4917523880218176e-06, "loss": 0.00207561, "memory(GiB)": 15.03, "step": 21515, "train_speed(iter/s)": 1.470235 }, { "acc": 0.99982872, "epoch": 37.98764342453663, "grad_norm": 0.4211428463459015, "learning_rate": 1.4896721759353735e-06, "loss": 0.00849007, "memory(GiB)": 15.03, "step": 21520, "train_speed(iter/s)": 1.470229 }, { "acc": 0.99980469, "epoch": 37.996469549867605, "grad_norm": 0.3201862573623657, "learning_rate": 1.4875931623245152e-06, "loss": 0.00513783, "memory(GiB)": 15.03, "step": 21525, "train_speed(iter/s)": 1.470221 }, { "acc": 1.0, "epoch": 38.00529567519859, "grad_norm": 0.03203576058149338, "learning_rate": 1.4855153478989456e-06, "loss": 0.00396245, "memory(GiB)": 15.03, "step": 21530, "train_speed(iter/s)": 1.47018 }, { "acc": 1.0, "epoch": 38.014121800529566, "grad_norm": 0.457798033952713, "learning_rate": 1.483438733367962e-06, "loss": 0.00380628, "memory(GiB)": 15.03, "step": 21535, "train_speed(iter/s)": 1.470169 }, { "acc": 0.99978809, "epoch": 38.02294792586055, "grad_norm": 0.286793053150177, "learning_rate": 1.4813633194404452e-06, "loss": 0.00209302, "memory(GiB)": 15.03, "step": 21540, "train_speed(iter/s)": 1.470167 }, { "acc": 0.99987116, "epoch": 38.031774051191526, "grad_norm": 0.1273355484008789, "learning_rate": 1.4792891068248744e-06, "loss": 0.00350333, "memory(GiB)": 15.03, "step": 21545, "train_speed(iter/s)": 1.470169 }, { "acc": 1.0, "epoch": 38.04060017652251, "grad_norm": 0.16043013334274292, "learning_rate": 1.477216096229311e-06, "loss": 0.00314832, "memory(GiB)": 15.03, "step": 21550, "train_speed(iter/s)": 1.47017 }, { "acc": 1.0, "epoch": 38.04942630185349, "grad_norm": 0.017185691744089127, "learning_rate": 1.4751442883614112e-06, "loss": 0.00078608, "memory(GiB)": 15.03, "step": 21555, "train_speed(iter/s)": 1.470175 }, { "acc": 0.99972372, "epoch": 38.05825242718446, "grad_norm": 0.009447721764445305, "learning_rate": 1.4730736839284155e-06, "loss": 0.00418009, "memory(GiB)": 15.03, "step": 21560, "train_speed(iter/s)": 1.470164 }, { "acc": 0.99966221, "epoch": 38.06707855251545, "grad_norm": 0.21639050543308258, "learning_rate": 1.4710042836371608e-06, "loss": 0.00424793, "memory(GiB)": 15.03, "step": 21565, "train_speed(iter/s)": 1.470153 }, { "acc": 1.0, "epoch": 38.075904677846424, "grad_norm": 0.4339740574359894, "learning_rate": 1.4689360881940674e-06, "loss": 0.00509116, "memory(GiB)": 15.03, "step": 21570, "train_speed(iter/s)": 1.470171 }, { "acc": 1.0, "epoch": 38.08473080317741, "grad_norm": 0.2818065285682678, "learning_rate": 1.4668690983051443e-06, "loss": 0.00355617, "memory(GiB)": 15.03, "step": 21575, "train_speed(iter/s)": 1.470184 }, { "acc": 0.99949999, "epoch": 38.093556928508384, "grad_norm": 0.39865535497665405, "learning_rate": 1.4648033146759938e-06, "loss": 0.00714839, "memory(GiB)": 15.03, "step": 21580, "train_speed(iter/s)": 1.470168 }, { "acc": 0.99954453, "epoch": 38.10238305383936, "grad_norm": 1.701865792274475, "learning_rate": 1.4627387380118006e-06, "loss": 0.00158596, "memory(GiB)": 15.03, "step": 21585, "train_speed(iter/s)": 1.470147 }, { "acc": 1.0, "epoch": 38.111209179170345, "grad_norm": 0.3459719717502594, "learning_rate": 1.460675369017342e-06, "loss": 0.00409778, "memory(GiB)": 15.03, "step": 21590, "train_speed(iter/s)": 1.470148 }, { "acc": 1.0, "epoch": 38.12003530450132, "grad_norm": 0.018117016181349754, "learning_rate": 1.4586132083969804e-06, "loss": 0.00502571, "memory(GiB)": 15.03, "step": 21595, "train_speed(iter/s)": 1.470155 }, { "acc": 1.0, "epoch": 38.128861429832305, "grad_norm": 0.08250527828931808, "learning_rate": 1.4565522568546655e-06, "loss": 0.0057594, "memory(GiB)": 15.03, "step": 21600, "train_speed(iter/s)": 1.470163 }, { "acc": 0.99976854, "epoch": 38.13768755516328, "grad_norm": 0.5063766837120056, "learning_rate": 1.4544925150939345e-06, "loss": 0.00555634, "memory(GiB)": 15.03, "step": 21605, "train_speed(iter/s)": 1.470178 }, { "acc": 1.0, "epoch": 38.146513680494266, "grad_norm": 0.07645954936742783, "learning_rate": 1.4524339838179147e-06, "loss": 0.00160556, "memory(GiB)": 15.03, "step": 21610, "train_speed(iter/s)": 1.470176 }, { "acc": 1.0, "epoch": 38.15533980582524, "grad_norm": 0.11610513925552368, "learning_rate": 1.4503766637293151e-06, "loss": 0.00109347, "memory(GiB)": 15.03, "step": 21615, "train_speed(iter/s)": 1.470163 }, { "acc": 0.99938154, "epoch": 38.16416593115622, "grad_norm": 0.45366692543029785, "learning_rate": 1.4483205555304367e-06, "loss": 0.00764191, "memory(GiB)": 15.03, "step": 21620, "train_speed(iter/s)": 1.470158 }, { "acc": 1.0, "epoch": 38.1729920564872, "grad_norm": 0.008042136207222939, "learning_rate": 1.446265659923162e-06, "loss": 0.00512455, "memory(GiB)": 15.03, "step": 21625, "train_speed(iter/s)": 1.470161 }, { "acc": 1.0, "epoch": 38.18181818181818, "grad_norm": 0.3072381615638733, "learning_rate": 1.4442119776089619e-06, "loss": 0.00218784, "memory(GiB)": 15.03, "step": 21630, "train_speed(iter/s)": 1.47016 }, { "acc": 1.0, "epoch": 38.190644307149164, "grad_norm": 0.06949611753225327, "learning_rate": 1.4421595092888915e-06, "loss": 0.00446151, "memory(GiB)": 15.03, "step": 21635, "train_speed(iter/s)": 1.470144 }, { "acc": 1.0, "epoch": 38.19947043248014, "grad_norm": 0.15091781318187714, "learning_rate": 1.440108255663595e-06, "loss": 0.00415683, "memory(GiB)": 15.03, "step": 21640, "train_speed(iter/s)": 1.470159 }, { "acc": 0.9996212, "epoch": 38.208296557811124, "grad_norm": 0.6279335618019104, "learning_rate": 1.438058217433299e-06, "loss": 0.00498977, "memory(GiB)": 15.03, "step": 21645, "train_speed(iter/s)": 1.470151 }, { "acc": 0.99888783, "epoch": 38.2171226831421, "grad_norm": 1.3370834589004517, "learning_rate": 1.436009395297816e-06, "loss": 0.00969076, "memory(GiB)": 15.03, "step": 21650, "train_speed(iter/s)": 1.470163 }, { "acc": 0.99921875, "epoch": 38.22594880847308, "grad_norm": 0.47076234221458435, "learning_rate": 1.433961789956541e-06, "loss": 0.00962408, "memory(GiB)": 15.03, "step": 21655, "train_speed(iter/s)": 1.470156 }, { "acc": 1.0, "epoch": 38.23477493380406, "grad_norm": 0.17053508758544922, "learning_rate": 1.4319154021084583e-06, "loss": 0.00312216, "memory(GiB)": 15.03, "step": 21660, "train_speed(iter/s)": 1.470175 }, { "acc": 0.9998106, "epoch": 38.24360105913504, "grad_norm": 0.40973854064941406, "learning_rate": 1.4298702324521327e-06, "loss": 0.00614536, "memory(GiB)": 15.03, "step": 21665, "train_speed(iter/s)": 1.470175 }, { "acc": 1.0, "epoch": 38.25242718446602, "grad_norm": 0.15842610597610474, "learning_rate": 1.4278262816857173e-06, "loss": 0.00159503, "memory(GiB)": 15.03, "step": 21670, "train_speed(iter/s)": 1.470195 }, { "acc": 0.99972219, "epoch": 38.261253309797, "grad_norm": 0.0204179547727108, "learning_rate": 1.4257835505069437e-06, "loss": 0.00423747, "memory(GiB)": 15.03, "step": 21675, "train_speed(iter/s)": 1.470197 }, { "acc": 0.99980774, "epoch": 38.270079435127975, "grad_norm": 0.27523648738861084, "learning_rate": 1.4237420396131296e-06, "loss": 0.00654871, "memory(GiB)": 15.03, "step": 21680, "train_speed(iter/s)": 1.470201 }, { "acc": 1.0, "epoch": 38.27890556045896, "grad_norm": 0.04681440815329552, "learning_rate": 1.421701749701179e-06, "loss": 0.00325152, "memory(GiB)": 15.03, "step": 21685, "train_speed(iter/s)": 1.470206 }, { "acc": 0.99987116, "epoch": 38.287731685789936, "grad_norm": 0.31555742025375366, "learning_rate": 1.4196626814675755e-06, "loss": 0.00627667, "memory(GiB)": 15.03, "step": 21690, "train_speed(iter/s)": 1.470202 }, { "acc": 1.0, "epoch": 38.29655781112092, "grad_norm": 0.23916836082935333, "learning_rate": 1.4176248356083858e-06, "loss": 0.00371477, "memory(GiB)": 15.03, "step": 21695, "train_speed(iter/s)": 1.470199 }, { "acc": 1.0, "epoch": 38.305383936451896, "grad_norm": 0.2023865282535553, "learning_rate": 1.4155882128192596e-06, "loss": 0.00345602, "memory(GiB)": 15.03, "step": 21700, "train_speed(iter/s)": 1.470198 }, { "acc": 1.0, "epoch": 38.31421006178288, "grad_norm": 0.052336256951093674, "learning_rate": 1.4135528137954318e-06, "loss": 0.00369059, "memory(GiB)": 15.03, "step": 21705, "train_speed(iter/s)": 1.470207 }, { "acc": 1.0, "epoch": 38.32303618711386, "grad_norm": 0.17720776796340942, "learning_rate": 1.4115186392317153e-06, "loss": 0.00405217, "memory(GiB)": 15.03, "step": 21710, "train_speed(iter/s)": 1.470195 }, { "acc": 1.0, "epoch": 38.331862312444834, "grad_norm": 0.004603615961968899, "learning_rate": 1.4094856898225104e-06, "loss": 0.00040182, "memory(GiB)": 15.03, "step": 21715, "train_speed(iter/s)": 1.470211 }, { "acc": 0.9998457, "epoch": 38.34068843777582, "grad_norm": 0.01516451220959425, "learning_rate": 1.4074539662617937e-06, "loss": 0.00403849, "memory(GiB)": 15.03, "step": 21720, "train_speed(iter/s)": 1.470232 }, { "acc": 0.9994669, "epoch": 38.349514563106794, "grad_norm": 0.006979384459555149, "learning_rate": 1.4054234692431254e-06, "loss": 0.00468994, "memory(GiB)": 15.03, "step": 21725, "train_speed(iter/s)": 1.470251 }, { "acc": 0.99978809, "epoch": 38.35834068843778, "grad_norm": 0.02076650969684124, "learning_rate": 1.4033941994596462e-06, "loss": 0.00491108, "memory(GiB)": 15.03, "step": 21730, "train_speed(iter/s)": 1.470242 }, { "acc": 1.0, "epoch": 38.367166813768755, "grad_norm": 0.33347785472869873, "learning_rate": 1.4013661576040825e-06, "loss": 0.00387146, "memory(GiB)": 15.03, "step": 21735, "train_speed(iter/s)": 1.470249 }, { "acc": 0.99969511, "epoch": 38.37599293909974, "grad_norm": 0.4661696255207062, "learning_rate": 1.3993393443687347e-06, "loss": 0.00589817, "memory(GiB)": 15.03, "step": 21740, "train_speed(iter/s)": 1.470262 }, { "acc": 0.99824352, "epoch": 38.384819064430715, "grad_norm": 1.3839129209518433, "learning_rate": 1.3973137604454882e-06, "loss": 0.01639534, "memory(GiB)": 15.03, "step": 21745, "train_speed(iter/s)": 1.470255 }, { "acc": 1.0, "epoch": 38.39364518976169, "grad_norm": 0.01182895340025425, "learning_rate": 1.3952894065258056e-06, "loss": 0.00246692, "memory(GiB)": 15.03, "step": 21750, "train_speed(iter/s)": 1.470278 }, { "acc": 1.0, "epoch": 38.402471315092676, "grad_norm": 0.008970530703663826, "learning_rate": 1.3932662833007349e-06, "loss": 0.00590722, "memory(GiB)": 15.03, "step": 21755, "train_speed(iter/s)": 1.470276 }, { "acc": 0.99984379, "epoch": 38.41129744042365, "grad_norm": 0.27074113488197327, "learning_rate": 1.3912443914608972e-06, "loss": 0.00411691, "memory(GiB)": 15.03, "step": 21760, "train_speed(iter/s)": 1.470291 }, { "acc": 0.99989033, "epoch": 38.420123565754636, "grad_norm": 0.2917521297931671, "learning_rate": 1.3892237316965e-06, "loss": 0.00886943, "memory(GiB)": 15.03, "step": 21765, "train_speed(iter/s)": 1.470293 }, { "acc": 0.99932384, "epoch": 38.42894969108561, "grad_norm": 0.5893796682357788, "learning_rate": 1.387204304697325e-06, "loss": 0.00789565, "memory(GiB)": 15.03, "step": 21770, "train_speed(iter/s)": 1.470301 }, { "acc": 0.99957619, "epoch": 38.43777581641659, "grad_norm": 0.01196334883570671, "learning_rate": 1.3851861111527349e-06, "loss": 0.00645159, "memory(GiB)": 15.03, "step": 21775, "train_speed(iter/s)": 1.470285 }, { "acc": 1.0, "epoch": 38.44660194174757, "grad_norm": 0.2250034362077713, "learning_rate": 1.3831691517516708e-06, "loss": 0.00234301, "memory(GiB)": 15.03, "step": 21780, "train_speed(iter/s)": 1.470288 }, { "acc": 1.0, "epoch": 38.45542806707855, "grad_norm": 0.28490331768989563, "learning_rate": 1.3811534271826545e-06, "loss": 0.00404866, "memory(GiB)": 15.03, "step": 21785, "train_speed(iter/s)": 1.470282 }, { "acc": 0.99984941, "epoch": 38.464254192409534, "grad_norm": 0.4291517734527588, "learning_rate": 1.379138938133784e-06, "loss": 0.0028638, "memory(GiB)": 15.03, "step": 21790, "train_speed(iter/s)": 1.470268 }, { "acc": 1.0, "epoch": 38.47308031774051, "grad_norm": 0.014813866466283798, "learning_rate": 1.3771256852927382e-06, "loss": 0.00210854, "memory(GiB)": 15.03, "step": 21795, "train_speed(iter/s)": 1.470289 }, { "acc": 0.99974489, "epoch": 38.481906443071495, "grad_norm": 0.5010682940483093, "learning_rate": 1.3751136693467703e-06, "loss": 0.00693816, "memory(GiB)": 15.03, "step": 21800, "train_speed(iter/s)": 1.470304 }, { "acc": 1.0, "epoch": 38.49073256840247, "grad_norm": 0.3096100986003876, "learning_rate": 1.373102890982712e-06, "loss": 0.00408118, "memory(GiB)": 15.03, "step": 21805, "train_speed(iter/s)": 1.470299 }, { "acc": 1.0, "epoch": 38.49955869373345, "grad_norm": 0.03658943623304367, "learning_rate": 1.3710933508869774e-06, "loss": 0.00205471, "memory(GiB)": 15.03, "step": 21810, "train_speed(iter/s)": 1.470312 }, { "acc": 0.99989033, "epoch": 38.50838481906443, "grad_norm": 0.3581581115722656, "learning_rate": 1.3690850497455516e-06, "loss": 0.00513321, "memory(GiB)": 15.03, "step": 21815, "train_speed(iter/s)": 1.470324 }, { "acc": 1.0, "epoch": 38.51721094439541, "grad_norm": 0.3385297954082489, "learning_rate": 1.3670779882440005e-06, "loss": 0.00501942, "memory(GiB)": 15.03, "step": 21820, "train_speed(iter/s)": 1.470348 }, { "acc": 0.99926128, "epoch": 38.52603706972639, "grad_norm": 0.05312838405370712, "learning_rate": 1.3650721670674636e-06, "loss": 0.0028401, "memory(GiB)": 15.03, "step": 21825, "train_speed(iter/s)": 1.470359 }, { "acc": 0.99916668, "epoch": 38.53486319505737, "grad_norm": 0.010730471462011337, "learning_rate": 1.363067586900662e-06, "loss": 0.00782957, "memory(GiB)": 15.03, "step": 21830, "train_speed(iter/s)": 1.470363 }, { "acc": 0.99938898, "epoch": 38.54368932038835, "grad_norm": 0.8263199329376221, "learning_rate": 1.3610642484278877e-06, "loss": 0.0086622, "memory(GiB)": 15.03, "step": 21835, "train_speed(iter/s)": 1.470372 }, { "acc": 0.99974995, "epoch": 38.55251544571933, "grad_norm": 0.2989823818206787, "learning_rate": 1.359062152333014e-06, "loss": 0.00584255, "memory(GiB)": 15.03, "step": 21840, "train_speed(iter/s)": 1.470377 }, { "acc": 0.99976854, "epoch": 38.561341571050306, "grad_norm": 0.16342255473136902, "learning_rate": 1.357061299299486e-06, "loss": 0.0057306, "memory(GiB)": 15.03, "step": 21845, "train_speed(iter/s)": 1.470387 }, { "acc": 0.99970236, "epoch": 38.57016769638129, "grad_norm": 0.4006889760494232, "learning_rate": 1.3550616900103263e-06, "loss": 0.0071524, "memory(GiB)": 15.03, "step": 21850, "train_speed(iter/s)": 1.470398 }, { "acc": 0.99916859, "epoch": 38.57899382171227, "grad_norm": 0.13603466749191284, "learning_rate": 1.3530633251481307e-06, "loss": 0.00764621, "memory(GiB)": 15.03, "step": 21855, "train_speed(iter/s)": 1.470405 }, { "acc": 1.0, "epoch": 38.58781994704325, "grad_norm": 0.049395885318517685, "learning_rate": 1.3510662053950754e-06, "loss": 0.00401341, "memory(GiB)": 15.03, "step": 21860, "train_speed(iter/s)": 1.470415 }, { "acc": 0.99973402, "epoch": 38.59664607237423, "grad_norm": 0.008068983443081379, "learning_rate": 1.349070331432906e-06, "loss": 0.00302558, "memory(GiB)": 15.03, "step": 21865, "train_speed(iter/s)": 1.47042 }, { "acc": 1.0, "epoch": 38.605472197705204, "grad_norm": 0.10527512431144714, "learning_rate": 1.3470757039429461e-06, "loss": 0.00327389, "memory(GiB)": 15.03, "step": 21870, "train_speed(iter/s)": 1.470404 }, { "acc": 1.0, "epoch": 38.61429832303619, "grad_norm": 0.011018842458724976, "learning_rate": 1.3450823236060906e-06, "loss": 0.0014318, "memory(GiB)": 15.03, "step": 21875, "train_speed(iter/s)": 1.470393 }, { "acc": 1.0, "epoch": 38.623124448367165, "grad_norm": 0.37172406911849976, "learning_rate": 1.3430901911028141e-06, "loss": 0.00221075, "memory(GiB)": 15.03, "step": 21880, "train_speed(iter/s)": 1.470396 }, { "acc": 0.99937353, "epoch": 38.63195057369815, "grad_norm": 0.2793061435222626, "learning_rate": 1.3410993071131586e-06, "loss": 0.01141675, "memory(GiB)": 15.03, "step": 21885, "train_speed(iter/s)": 1.470397 }, { "acc": 0.99986706, "epoch": 38.640776699029125, "grad_norm": 0.010318984277546406, "learning_rate": 1.339109672316747e-06, "loss": 0.00379874, "memory(GiB)": 15.03, "step": 21890, "train_speed(iter/s)": 1.470408 }, { "acc": 0.99956894, "epoch": 38.64960282436011, "grad_norm": 2.062422752380371, "learning_rate": 1.33712128739277e-06, "loss": 0.00632004, "memory(GiB)": 15.03, "step": 21895, "train_speed(iter/s)": 1.470415 }, { "acc": 0.99963236, "epoch": 38.658428949691086, "grad_norm": 0.39456290006637573, "learning_rate": 1.335134153019992e-06, "loss": 0.00803705, "memory(GiB)": 15.03, "step": 21900, "train_speed(iter/s)": 1.470402 }, { "acc": 0.99956894, "epoch": 38.66725507502206, "grad_norm": 0.06841760873794556, "learning_rate": 1.3331482698767554e-06, "loss": 0.00619902, "memory(GiB)": 15.03, "step": 21905, "train_speed(iter/s)": 1.470408 }, { "acc": 0.99963236, "epoch": 38.676081200353046, "grad_norm": 0.004592767916619778, "learning_rate": 1.331163638640969e-06, "loss": 0.00693836, "memory(GiB)": 15.03, "step": 21910, "train_speed(iter/s)": 1.470408 }, { "acc": 0.9997159, "epoch": 38.68490732568402, "grad_norm": 0.003426638897508383, "learning_rate": 1.3291802599901216e-06, "loss": 0.00745695, "memory(GiB)": 15.03, "step": 21915, "train_speed(iter/s)": 1.470422 }, { "acc": 0.99978447, "epoch": 38.69373345101501, "grad_norm": 0.44065535068511963, "learning_rate": 1.3271981346012672e-06, "loss": 0.01127586, "memory(GiB)": 15.03, "step": 21920, "train_speed(iter/s)": 1.47043 }, { "acc": 0.99981613, "epoch": 38.70255957634598, "grad_norm": 0.4140743017196655, "learning_rate": 1.325217263151036e-06, "loss": 0.00411149, "memory(GiB)": 15.03, "step": 21925, "train_speed(iter/s)": 1.470437 }, { "acc": 0.99869318, "epoch": 38.71138570167697, "grad_norm": 0.012640483677387238, "learning_rate": 1.3232376463156274e-06, "loss": 0.0049621, "memory(GiB)": 15.03, "step": 21930, "train_speed(iter/s)": 1.470438 }, { "acc": 1.0, "epoch": 38.720211827007944, "grad_norm": 0.26981720328330994, "learning_rate": 1.321259284770817e-06, "loss": 0.00344228, "memory(GiB)": 15.03, "step": 21935, "train_speed(iter/s)": 1.470416 }, { "acc": 1.0, "epoch": 38.72903795233892, "grad_norm": 0.0791802778840065, "learning_rate": 1.3192821791919487e-06, "loss": 0.00658432, "memory(GiB)": 15.03, "step": 21940, "train_speed(iter/s)": 1.470419 }, { "acc": 0.9998188, "epoch": 38.737864077669904, "grad_norm": 0.004274338949471712, "learning_rate": 1.3173063302539377e-06, "loss": 0.00354121, "memory(GiB)": 15.03, "step": 21945, "train_speed(iter/s)": 1.47044 }, { "acc": 0.9997282, "epoch": 38.74669020300088, "grad_norm": 0.1693202555179596, "learning_rate": 1.315331738631269e-06, "loss": 0.00399407, "memory(GiB)": 15.03, "step": 21950, "train_speed(iter/s)": 1.470457 }, { "acc": 0.99958668, "epoch": 38.755516328331865, "grad_norm": 0.3755573332309723, "learning_rate": 1.3133584049980033e-06, "loss": 0.00862174, "memory(GiB)": 15.03, "step": 21955, "train_speed(iter/s)": 1.470477 }, { "acc": 0.99933167, "epoch": 38.76434245366284, "grad_norm": 0.6954278945922852, "learning_rate": 1.311386330027766e-06, "loss": 0.01612415, "memory(GiB)": 15.03, "step": 21960, "train_speed(iter/s)": 1.470479 }, { "acc": 0.99980774, "epoch": 38.77316857899382, "grad_norm": 0.22612278163433075, "learning_rate": 1.309415514393758e-06, "loss": 0.00248081, "memory(GiB)": 15.03, "step": 21965, "train_speed(iter/s)": 1.470495 }, { "acc": 1.0, "epoch": 38.7819947043248, "grad_norm": 0.05627543479204178, "learning_rate": 1.3074459587687464e-06, "loss": 0.00238596, "memory(GiB)": 15.03, "step": 21970, "train_speed(iter/s)": 1.4705 }, { "acc": 0.99985466, "epoch": 38.79082082965578, "grad_norm": 0.3341808617115021, "learning_rate": 1.3054776638250692e-06, "loss": 0.00230406, "memory(GiB)": 15.03, "step": 21975, "train_speed(iter/s)": 1.470501 }, { "acc": 0.99986839, "epoch": 38.79964695498676, "grad_norm": 0.5275895595550537, "learning_rate": 1.303510630234634e-06, "loss": 0.01247396, "memory(GiB)": 15.03, "step": 21980, "train_speed(iter/s)": 1.470503 }, { "acc": 0.99970093, "epoch": 38.80847308031774, "grad_norm": 1.1572753190994263, "learning_rate": 1.3015448586689208e-06, "loss": 0.00762318, "memory(GiB)": 15.03, "step": 21985, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99972219, "epoch": 38.81729920564872, "grad_norm": 0.3044637143611908, "learning_rate": 1.2995803497989746e-06, "loss": 0.0026926, "memory(GiB)": 15.03, "step": 21990, "train_speed(iter/s)": 1.470526 }, { "acc": 1.0, "epoch": 38.8261253309797, "grad_norm": 0.2516695261001587, "learning_rate": 1.2976171042954104e-06, "loss": 0.00268557, "memory(GiB)": 15.03, "step": 21995, "train_speed(iter/s)": 1.470527 }, { "acc": 1.0, "epoch": 38.83495145631068, "grad_norm": 0.32163694500923157, "learning_rate": 1.2956551228284122e-06, "loss": 0.00431298, "memory(GiB)": 15.03, "step": 22000, "train_speed(iter/s)": 1.470531 }, { "acc": 0.99990158, "epoch": 38.84377758164166, "grad_norm": 0.13618160784244537, "learning_rate": 1.2936944060677337e-06, "loss": 0.00253155, "memory(GiB)": 15.03, "step": 22005, "train_speed(iter/s)": 1.47052 }, { "acc": 0.99926243, "epoch": 38.85260370697264, "grad_norm": 0.3920629024505615, "learning_rate": 1.2917349546826973e-06, "loss": 0.01057029, "memory(GiB)": 15.03, "step": 22010, "train_speed(iter/s)": 1.470517 }, { "acc": 1.0, "epoch": 38.86142983230362, "grad_norm": 0.007108214311301708, "learning_rate": 1.2897767693421914e-06, "loss": 0.00211404, "memory(GiB)": 15.03, "step": 22015, "train_speed(iter/s)": 1.470504 }, { "acc": 1.0, "epoch": 38.8702559576346, "grad_norm": 0.06904494762420654, "learning_rate": 1.2878198507146722e-06, "loss": 0.00102902, "memory(GiB)": 15.03, "step": 22020, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99956884, "epoch": 38.87908208296558, "grad_norm": 1.0214966535568237, "learning_rate": 1.2858641994681635e-06, "loss": 0.01131739, "memory(GiB)": 15.03, "step": 22025, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99954472, "epoch": 38.88790820829656, "grad_norm": 0.20908960700035095, "learning_rate": 1.2839098162702596e-06, "loss": 0.00817568, "memory(GiB)": 15.03, "step": 22030, "train_speed(iter/s)": 1.470515 }, { "acc": 0.99966011, "epoch": 38.896734333627535, "grad_norm": 0.05100598186254501, "learning_rate": 1.2819567017881166e-06, "loss": 0.00253714, "memory(GiB)": 15.03, "step": 22035, "train_speed(iter/s)": 1.470513 }, { "acc": 0.99955311, "epoch": 38.90556045895852, "grad_norm": 0.3320994973182678, "learning_rate": 1.2800048566884632e-06, "loss": 0.00288793, "memory(GiB)": 15.03, "step": 22040, "train_speed(iter/s)": 1.470515 }, { "acc": 0.9993845, "epoch": 38.914386584289495, "grad_norm": 0.4190920293331146, "learning_rate": 1.2780542816375917e-06, "loss": 0.0044812, "memory(GiB)": 15.03, "step": 22045, "train_speed(iter/s)": 1.470522 }, { "acc": 0.99979172, "epoch": 38.92321270962048, "grad_norm": 0.5128288865089417, "learning_rate": 1.2761049773013605e-06, "loss": 0.00555502, "memory(GiB)": 15.03, "step": 22050, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99974995, "epoch": 38.932038834951456, "grad_norm": 0.17729581892490387, "learning_rate": 1.2741569443451923e-06, "loss": 0.00605004, "memory(GiB)": 15.03, "step": 22055, "train_speed(iter/s)": 1.470514 }, { "acc": 0.99936218, "epoch": 38.94086496028243, "grad_norm": 0.2258409857749939, "learning_rate": 1.2722101834340828e-06, "loss": 0.00407968, "memory(GiB)": 15.03, "step": 22060, "train_speed(iter/s)": 1.470529 }, { "acc": 0.99952221, "epoch": 38.949691085613416, "grad_norm": 0.6702372431755066, "learning_rate": 1.2702646952325875e-06, "loss": 0.00687094, "memory(GiB)": 15.03, "step": 22065, "train_speed(iter/s)": 1.470559 }, { "acc": 1.0, "epoch": 38.95851721094439, "grad_norm": 0.003607160644605756, "learning_rate": 1.2683204804048283e-06, "loss": 0.00595412, "memory(GiB)": 15.03, "step": 22070, "train_speed(iter/s)": 1.470574 }, { "acc": 0.99881248, "epoch": 38.96734333627538, "grad_norm": 0.1649170070886612, "learning_rate": 1.2663775396144926e-06, "loss": 0.01132291, "memory(GiB)": 15.03, "step": 22075, "train_speed(iter/s)": 1.470582 }, { "acc": 1.0, "epoch": 38.976169461606354, "grad_norm": 0.24964971840381622, "learning_rate": 1.2644358735248353e-06, "loss": 0.00087559, "memory(GiB)": 15.03, "step": 22080, "train_speed(iter/s)": 1.470578 }, { "acc": 0.99980164, "epoch": 38.98499558693734, "grad_norm": 0.34617725014686584, "learning_rate": 1.2624954827986715e-06, "loss": 0.00610419, "memory(GiB)": 15.03, "step": 22085, "train_speed(iter/s)": 1.470564 }, { "acc": 1.0, "epoch": 38.993821712268314, "grad_norm": 0.31189778447151184, "learning_rate": 1.260556368098387e-06, "loss": 0.00584409, "memory(GiB)": 15.03, "step": 22090, "train_speed(iter/s)": 1.470569 }, { "acc": 1.0, "epoch": 39.00264783759929, "grad_norm": 0.008656321093440056, "learning_rate": 1.2586185300859277e-06, "loss": 0.00092372, "memory(GiB)": 15.03, "step": 22095, "train_speed(iter/s)": 1.47053 }, { "acc": 1.0, "epoch": 39.011473962930275, "grad_norm": 0.19088022410869598, "learning_rate": 1.2566819694228038e-06, "loss": 0.00446184, "memory(GiB)": 15.03, "step": 22100, "train_speed(iter/s)": 1.470548 }, { "acc": 1.0, "epoch": 39.02030008826125, "grad_norm": 0.6306004524230957, "learning_rate": 1.2547466867700889e-06, "loss": 0.00436954, "memory(GiB)": 15.03, "step": 22105, "train_speed(iter/s)": 1.470532 }, { "acc": 0.9991148, "epoch": 39.029126213592235, "grad_norm": 0.043872423470020294, "learning_rate": 1.2528126827884254e-06, "loss": 0.00547804, "memory(GiB)": 15.03, "step": 22110, "train_speed(iter/s)": 1.470547 }, { "acc": 1.0, "epoch": 39.03795233892321, "grad_norm": 0.2804846167564392, "learning_rate": 1.250879958138014e-06, "loss": 0.00287633, "memory(GiB)": 15.03, "step": 22115, "train_speed(iter/s)": 1.470561 }, { "acc": 1.0, "epoch": 39.046778464254196, "grad_norm": 0.7893935441970825, "learning_rate": 1.2489485134786184e-06, "loss": 0.00941351, "memory(GiB)": 15.03, "step": 22120, "train_speed(iter/s)": 1.470553 }, { "acc": 1.0, "epoch": 39.05560458958517, "grad_norm": 0.012236339971423149, "learning_rate": 1.2470183494695701e-06, "loss": 0.00248575, "memory(GiB)": 15.03, "step": 22125, "train_speed(iter/s)": 1.470555 }, { "acc": 0.99951925, "epoch": 39.06443071491615, "grad_norm": 0.08905375003814697, "learning_rate": 1.2450894667697582e-06, "loss": 0.00896057, "memory(GiB)": 15.03, "step": 22130, "train_speed(iter/s)": 1.470557 }, { "acc": 1.0, "epoch": 39.07325684024713, "grad_norm": 0.12773719429969788, "learning_rate": 1.2431618660376391e-06, "loss": 0.00400365, "memory(GiB)": 15.03, "step": 22135, "train_speed(iter/s)": 1.470562 }, { "acc": 0.99975491, "epoch": 39.08208296557811, "grad_norm": 0.30891552567481995, "learning_rate": 1.2412355479312282e-06, "loss": 0.00581133, "memory(GiB)": 15.03, "step": 22140, "train_speed(iter/s)": 1.470562 }, { "acc": 0.99987116, "epoch": 39.09090909090909, "grad_norm": 0.27971726655960083, "learning_rate": 1.2393105131081037e-06, "loss": 0.00553441, "memory(GiB)": 15.03, "step": 22145, "train_speed(iter/s)": 1.47056 }, { "acc": 1.0, "epoch": 39.09973521624007, "grad_norm": 0.03276746720075607, "learning_rate": 1.2373867622254044e-06, "loss": 0.00170475, "memory(GiB)": 15.03, "step": 22150, "train_speed(iter/s)": 1.470582 }, { "acc": 0.9997159, "epoch": 39.10856134157105, "grad_norm": 0.0034497149754315615, "learning_rate": 1.2354642959398365e-06, "loss": 0.00295091, "memory(GiB)": 15.03, "step": 22155, "train_speed(iter/s)": 1.470584 }, { "acc": 0.99975491, "epoch": 39.11738746690203, "grad_norm": 0.3982962369918823, "learning_rate": 1.23354311490766e-06, "loss": 0.00737194, "memory(GiB)": 15.03, "step": 22160, "train_speed(iter/s)": 1.470564 }, { "acc": 1.0, "epoch": 39.12621359223301, "grad_norm": 0.004842342343181372, "learning_rate": 1.2316232197847032e-06, "loss": 0.00460757, "memory(GiB)": 15.03, "step": 22165, "train_speed(iter/s)": 1.470593 }, { "acc": 1.0, "epoch": 39.13503971756399, "grad_norm": 0.3552801012992859, "learning_rate": 1.2297046112263502e-06, "loss": 0.0056311, "memory(GiB)": 15.03, "step": 22170, "train_speed(iter/s)": 1.470595 }, { "acc": 0.99980164, "epoch": 39.14386584289497, "grad_norm": 0.038852982223033905, "learning_rate": 1.227787289887548e-06, "loss": 0.00361749, "memory(GiB)": 15.03, "step": 22175, "train_speed(iter/s)": 1.470593 }, { "acc": 0.99987745, "epoch": 39.15269196822595, "grad_norm": 0.5422189831733704, "learning_rate": 1.2258712564228034e-06, "loss": 0.0050593, "memory(GiB)": 15.03, "step": 22180, "train_speed(iter/s)": 1.470595 }, { "acc": 0.99957628, "epoch": 39.16151809355693, "grad_norm": 0.41902807354927063, "learning_rate": 1.2239565114861852e-06, "loss": 0.01045853, "memory(GiB)": 15.03, "step": 22185, "train_speed(iter/s)": 1.470587 }, { "acc": 0.9998188, "epoch": 39.170344218887905, "grad_norm": 0.2993398904800415, "learning_rate": 1.2220430557313214e-06, "loss": 0.00206616, "memory(GiB)": 15.03, "step": 22190, "train_speed(iter/s)": 1.470608 }, { "acc": 0.99940071, "epoch": 39.17917034421889, "grad_norm": 0.49265187978744507, "learning_rate": 1.2201308898113997e-06, "loss": 0.00727085, "memory(GiB)": 15.03, "step": 22195, "train_speed(iter/s)": 1.470617 }, { "acc": 1.0, "epoch": 39.187996469549866, "grad_norm": 0.3765396177768707, "learning_rate": 1.2182200143791658e-06, "loss": 0.0018478, "memory(GiB)": 15.03, "step": 22200, "train_speed(iter/s)": 1.470602 }, { "acc": 1.0, "epoch": 39.19682259488085, "grad_norm": 0.040747348219156265, "learning_rate": 1.2163104300869296e-06, "loss": 0.00266198, "memory(GiB)": 15.03, "step": 22205, "train_speed(iter/s)": 1.470607 }, { "acc": 0.99978809, "epoch": 39.205648720211826, "grad_norm": 0.20248544216156006, "learning_rate": 1.214402137586554e-06, "loss": 0.00477725, "memory(GiB)": 15.03, "step": 22210, "train_speed(iter/s)": 1.470609 }, { "acc": 1.0, "epoch": 39.21447484554281, "grad_norm": 0.005526500754058361, "learning_rate": 1.2124951375294679e-06, "loss": 0.00231221, "memory(GiB)": 15.03, "step": 22215, "train_speed(iter/s)": 1.470621 }, { "acc": 0.99920378, "epoch": 39.22330097087379, "grad_norm": 0.5856613516807556, "learning_rate": 1.210589430566654e-06, "loss": 0.00804488, "memory(GiB)": 15.03, "step": 22220, "train_speed(iter/s)": 1.470618 }, { "acc": 1.0, "epoch": 39.23212709620476, "grad_norm": 0.3566894233226776, "learning_rate": 1.2086850173486529e-06, "loss": 0.00380212, "memory(GiB)": 15.03, "step": 22225, "train_speed(iter/s)": 1.470626 }, { "acc": 1.0, "epoch": 39.24095322153575, "grad_norm": 0.38531628251075745, "learning_rate": 1.206781898525569e-06, "loss": 0.00400043, "memory(GiB)": 15.03, "step": 22230, "train_speed(iter/s)": 1.470637 }, { "acc": 0.9998311, "epoch": 39.249779346866724, "grad_norm": 0.44849875569343567, "learning_rate": 1.20488007474706e-06, "loss": 0.00496629, "memory(GiB)": 15.03, "step": 22235, "train_speed(iter/s)": 1.47065 }, { "acc": 1.0, "epoch": 39.25860547219771, "grad_norm": 0.16356948018074036, "learning_rate": 1.2029795466623428e-06, "loss": 0.00163329, "memory(GiB)": 15.03, "step": 22240, "train_speed(iter/s)": 1.470658 }, { "acc": 0.9998106, "epoch": 39.267431597528685, "grad_norm": 1.4180561304092407, "learning_rate": 1.2010803149201903e-06, "loss": 0.00432293, "memory(GiB)": 15.03, "step": 22245, "train_speed(iter/s)": 1.470654 }, { "acc": 1.0, "epoch": 39.27625772285966, "grad_norm": 0.2617226243019104, "learning_rate": 1.1991823801689383e-06, "loss": 0.00167718, "memory(GiB)": 15.03, "step": 22250, "train_speed(iter/s)": 1.470652 }, { "acc": 1.0, "epoch": 39.285083848190645, "grad_norm": 0.007452644407749176, "learning_rate": 1.1972857430564736e-06, "loss": 0.01174861, "memory(GiB)": 15.03, "step": 22255, "train_speed(iter/s)": 1.470659 }, { "acc": 1.0, "epoch": 39.29390997352162, "grad_norm": 0.10999694466590881, "learning_rate": 1.1953904042302449e-06, "loss": 0.0086017, "memory(GiB)": 15.03, "step": 22260, "train_speed(iter/s)": 1.470648 }, { "acc": 1.0, "epoch": 39.302736098852606, "grad_norm": 0.023900581523776054, "learning_rate": 1.193496364337254e-06, "loss": 0.0028032, "memory(GiB)": 15.03, "step": 22265, "train_speed(iter/s)": 1.470651 }, { "acc": 1.0, "epoch": 39.31156222418358, "grad_norm": 0.10513874143362045, "learning_rate": 1.1916036240240614e-06, "loss": 0.00354737, "memory(GiB)": 15.03, "step": 22270, "train_speed(iter/s)": 1.47065 }, { "acc": 1.0, "epoch": 39.320388349514566, "grad_norm": 0.006772433873265982, "learning_rate": 1.1897121839367823e-06, "loss": 0.0031463, "memory(GiB)": 15.03, "step": 22275, "train_speed(iter/s)": 1.470662 }, { "acc": 1.0, "epoch": 39.32921447484554, "grad_norm": 0.013297559693455696, "learning_rate": 1.1878220447210893e-06, "loss": 0.00041797, "memory(GiB)": 15.03, "step": 22280, "train_speed(iter/s)": 1.470687 }, { "acc": 1.0, "epoch": 39.33804060017652, "grad_norm": 0.5366057753562927, "learning_rate": 1.1859332070222147e-06, "loss": 0.00717786, "memory(GiB)": 15.03, "step": 22285, "train_speed(iter/s)": 1.470682 }, { "acc": 1.0, "epoch": 39.3468667255075, "grad_norm": 0.023785408586263657, "learning_rate": 1.1840456714849365e-06, "loss": 0.00308552, "memory(GiB)": 15.03, "step": 22290, "train_speed(iter/s)": 1.470688 }, { "acc": 1.0, "epoch": 39.35569285083848, "grad_norm": 0.03397442400455475, "learning_rate": 1.1821594387535983e-06, "loss": 0.00550936, "memory(GiB)": 15.03, "step": 22295, "train_speed(iter/s)": 1.470701 }, { "acc": 1.0, "epoch": 39.364518976169464, "grad_norm": 0.3332420289516449, "learning_rate": 1.1802745094720929e-06, "loss": 0.00269229, "memory(GiB)": 15.03, "step": 22300, "train_speed(iter/s)": 1.470702 }, { "acc": 1.0, "epoch": 39.37334510150044, "grad_norm": 0.3875822126865387, "learning_rate": 1.1783908842838738e-06, "loss": 0.00545057, "memory(GiB)": 15.03, "step": 22305, "train_speed(iter/s)": 1.470712 }, { "acc": 1.0, "epoch": 39.382171226831424, "grad_norm": 0.06561661511659622, "learning_rate": 1.1765085638319396e-06, "loss": 0.00184646, "memory(GiB)": 15.03, "step": 22310, "train_speed(iter/s)": 1.4707 }, { "acc": 1.0, "epoch": 39.3909973521624, "grad_norm": 0.2094532996416092, "learning_rate": 1.1746275487588548e-06, "loss": 0.00372709, "memory(GiB)": 15.03, "step": 22315, "train_speed(iter/s)": 1.470699 }, { "acc": 0.99978065, "epoch": 39.39982347749338, "grad_norm": 0.3748449683189392, "learning_rate": 1.1727478397067298e-06, "loss": 0.00914959, "memory(GiB)": 15.03, "step": 22320, "train_speed(iter/s)": 1.470686 }, { "acc": 0.99983559, "epoch": 39.40864960282436, "grad_norm": 0.09329292178153992, "learning_rate": 1.1708694373172366e-06, "loss": 0.0097131, "memory(GiB)": 15.03, "step": 22325, "train_speed(iter/s)": 1.470675 }, { "acc": 0.9998579, "epoch": 39.41747572815534, "grad_norm": 0.5556334257125854, "learning_rate": 1.1689923422315916e-06, "loss": 0.00648165, "memory(GiB)": 15.03, "step": 22330, "train_speed(iter/s)": 1.470692 }, { "acc": 0.99978065, "epoch": 39.42630185348632, "grad_norm": 0.2683987319469452, "learning_rate": 1.1671165550905733e-06, "loss": 0.00530173, "memory(GiB)": 15.03, "step": 22335, "train_speed(iter/s)": 1.470713 }, { "acc": 0.9997159, "epoch": 39.4351279788173, "grad_norm": 0.24683202803134918, "learning_rate": 1.165242076534512e-06, "loss": 0.00437563, "memory(GiB)": 15.03, "step": 22340, "train_speed(iter/s)": 1.470724 }, { "acc": 1.0, "epoch": 39.443954104148276, "grad_norm": 0.5176321268081665, "learning_rate": 1.163368907203287e-06, "loss": 0.004374, "memory(GiB)": 15.03, "step": 22345, "train_speed(iter/s)": 1.470728 }, { "acc": 0.9998106, "epoch": 39.45278022947926, "grad_norm": 0.29870977997779846, "learning_rate": 1.161497047736338e-06, "loss": 0.0089687, "memory(GiB)": 15.03, "step": 22350, "train_speed(iter/s)": 1.470729 }, { "acc": 0.99973402, "epoch": 39.461606354810236, "grad_norm": 0.34212079644203186, "learning_rate": 1.159626498772648e-06, "loss": 0.00995528, "memory(GiB)": 15.03, "step": 22355, "train_speed(iter/s)": 1.470735 }, { "acc": 1.0, "epoch": 39.47043248014122, "grad_norm": 0.00870918482542038, "learning_rate": 1.157757260950762e-06, "loss": 0.00324394, "memory(GiB)": 15.03, "step": 22360, "train_speed(iter/s)": 1.47075 }, { "acc": 1.0, "epoch": 39.4792586054722, "grad_norm": 0.09648479521274567, "learning_rate": 1.1558893349087703e-06, "loss": 0.00621674, "memory(GiB)": 15.03, "step": 22365, "train_speed(iter/s)": 1.470744 }, { "acc": 0.99979172, "epoch": 39.48808473080318, "grad_norm": 0.44091862440109253, "learning_rate": 1.1540227212843232e-06, "loss": 0.00720744, "memory(GiB)": 15.03, "step": 22370, "train_speed(iter/s)": 1.470749 }, { "acc": 0.99970236, "epoch": 39.49691085613416, "grad_norm": 0.4470570981502533, "learning_rate": 1.1521574207146123e-06, "loss": 0.0078141, "memory(GiB)": 15.03, "step": 22375, "train_speed(iter/s)": 1.470761 }, { "acc": 1.0, "epoch": 39.505736981465134, "grad_norm": 0.133452907204628, "learning_rate": 1.1502934338363894e-06, "loss": 0.00241906, "memory(GiB)": 15.03, "step": 22380, "train_speed(iter/s)": 1.470775 }, { "acc": 0.99972219, "epoch": 39.51456310679612, "grad_norm": 0.3254829943180084, "learning_rate": 1.1484307612859572e-06, "loss": 0.00410798, "memory(GiB)": 15.03, "step": 22385, "train_speed(iter/s)": 1.47077 }, { "acc": 1.0, "epoch": 39.523389232127094, "grad_norm": 0.011654489673674107, "learning_rate": 1.1465694036991667e-06, "loss": 0.00361909, "memory(GiB)": 15.03, "step": 22390, "train_speed(iter/s)": 1.470776 }, { "acc": 0.99954958, "epoch": 39.53221535745808, "grad_norm": 0.33905574679374695, "learning_rate": 1.1447093617114204e-06, "loss": 0.00735642, "memory(GiB)": 15.03, "step": 22395, "train_speed(iter/s)": 1.470732 }, { "acc": 1.0, "epoch": 39.541041482789055, "grad_norm": 0.01257950346916914, "learning_rate": 1.1428506359576725e-06, "loss": 0.00468948, "memory(GiB)": 15.03, "step": 22400, "train_speed(iter/s)": 1.470717 }, { "acc": 1.0, "epoch": 39.54986760812004, "grad_norm": 0.17369811236858368, "learning_rate": 1.140993227072428e-06, "loss": 0.00609867, "memory(GiB)": 15.03, "step": 22405, "train_speed(iter/s)": 1.470724 }, { "acc": 1.0, "epoch": 39.558693733451015, "grad_norm": 0.5164003372192383, "learning_rate": 1.1391371356897448e-06, "loss": 0.00671156, "memory(GiB)": 15.03, "step": 22410, "train_speed(iter/s)": 1.470744 }, { "acc": 1.0, "epoch": 39.56751985878199, "grad_norm": 0.4766514301300049, "learning_rate": 1.137282362443224e-06, "loss": 0.00667671, "memory(GiB)": 15.03, "step": 22415, "train_speed(iter/s)": 1.470742 }, { "acc": 0.99976416, "epoch": 39.576345984112976, "grad_norm": 0.012484446167945862, "learning_rate": 1.1354289079660253e-06, "loss": 0.00515961, "memory(GiB)": 15.03, "step": 22420, "train_speed(iter/s)": 1.470734 }, { "acc": 1.0, "epoch": 39.58517210944395, "grad_norm": 0.0036773092579096556, "learning_rate": 1.1335767728908509e-06, "loss": 0.00280683, "memory(GiB)": 15.03, "step": 22425, "train_speed(iter/s)": 1.470748 }, { "acc": 0.99969511, "epoch": 39.593998234774936, "grad_norm": 0.3482222259044647, "learning_rate": 1.1317259578499606e-06, "loss": 0.00645354, "memory(GiB)": 15.03, "step": 22430, "train_speed(iter/s)": 1.470744 }, { "acc": 0.99921875, "epoch": 39.60282436010591, "grad_norm": 1.1186565160751343, "learning_rate": 1.1298764634751523e-06, "loss": 0.00489457, "memory(GiB)": 15.03, "step": 22435, "train_speed(iter/s)": 1.470771 }, { "acc": 1.0, "epoch": 39.61165048543689, "grad_norm": 0.04693704843521118, "learning_rate": 1.1280282903977848e-06, "loss": 0.00274283, "memory(GiB)": 15.03, "step": 22440, "train_speed(iter/s)": 1.470778 }, { "acc": 1.0, "epoch": 39.620476610767874, "grad_norm": 0.23852185904979706, "learning_rate": 1.1261814392487575e-06, "loss": 0.00315633, "memory(GiB)": 15.03, "step": 22445, "train_speed(iter/s)": 1.470795 }, { "acc": 0.99986982, "epoch": 39.62930273609885, "grad_norm": 0.17262142896652222, "learning_rate": 1.124335910658523e-06, "loss": 0.00677653, "memory(GiB)": 15.03, "step": 22450, "train_speed(iter/s)": 1.470794 }, { "acc": 0.99978065, "epoch": 39.638128861429834, "grad_norm": 0.002955957083031535, "learning_rate": 1.1224917052570842e-06, "loss": 0.00288974, "memory(GiB)": 15.03, "step": 22455, "train_speed(iter/s)": 1.470795 }, { "acc": 0.99970236, "epoch": 39.64695498676081, "grad_norm": 0.33389905095100403, "learning_rate": 1.1206488236739831e-06, "loss": 0.00438107, "memory(GiB)": 15.03, "step": 22460, "train_speed(iter/s)": 1.470805 }, { "acc": 0.99957628, "epoch": 39.655781112091795, "grad_norm": 0.036649029701948166, "learning_rate": 1.1188072665383209e-06, "loss": 0.00454641, "memory(GiB)": 15.03, "step": 22465, "train_speed(iter/s)": 1.470806 }, { "acc": 0.9997797, "epoch": 39.66460723742277, "grad_norm": 0.11740145832300186, "learning_rate": 1.1169670344787385e-06, "loss": 0.00469745, "memory(GiB)": 15.03, "step": 22470, "train_speed(iter/s)": 1.470794 }, { "acc": 1.0, "epoch": 39.67343336275375, "grad_norm": 0.050009533762931824, "learning_rate": 1.1151281281234314e-06, "loss": 0.00291277, "memory(GiB)": 15.03, "step": 22475, "train_speed(iter/s)": 1.470792 }, { "acc": 0.99967747, "epoch": 39.68225948808473, "grad_norm": 0.05969711020588875, "learning_rate": 1.1132905481001343e-06, "loss": 0.00464137, "memory(GiB)": 15.03, "step": 22480, "train_speed(iter/s)": 1.470781 }, { "acc": 1.0, "epoch": 39.69108561341571, "grad_norm": 0.4278900623321533, "learning_rate": 1.1114542950361374e-06, "loss": 0.00339375, "memory(GiB)": 15.03, "step": 22485, "train_speed(iter/s)": 1.47078 }, { "acc": 0.99967108, "epoch": 39.69991173874669, "grad_norm": 0.398512601852417, "learning_rate": 1.1096193695582714e-06, "loss": 0.00575619, "memory(GiB)": 15.03, "step": 22490, "train_speed(iter/s)": 1.47077 }, { "acc": 1.0, "epoch": 39.70873786407767, "grad_norm": 0.5288915038108826, "learning_rate": 1.1077857722929204e-06, "loss": 0.0071907, "memory(GiB)": 15.03, "step": 22495, "train_speed(iter/s)": 1.470771 }, { "acc": 1.0, "epoch": 39.71756398940865, "grad_norm": 0.14269310235977173, "learning_rate": 1.1059535038660064e-06, "loss": 0.00852607, "memory(GiB)": 15.03, "step": 22500, "train_speed(iter/s)": 1.470776 }, { "acc": 1.0, "epoch": 39.72639011473963, "grad_norm": 0.3262213170528412, "learning_rate": 1.1041225649030045e-06, "loss": 0.00323902, "memory(GiB)": 15.03, "step": 22505, "train_speed(iter/s)": 1.470778 }, { "acc": 0.99955359, "epoch": 39.735216240070606, "grad_norm": 0.0065835025161504745, "learning_rate": 1.1022929560289364e-06, "loss": 0.00738255, "memory(GiB)": 15.03, "step": 22510, "train_speed(iter/s)": 1.470811 }, { "acc": 0.99957628, "epoch": 39.74404236540159, "grad_norm": 0.46335524320602417, "learning_rate": 1.100464677868366e-06, "loss": 0.0085946, "memory(GiB)": 15.03, "step": 22515, "train_speed(iter/s)": 1.470822 }, { "acc": 1.0, "epoch": 39.75286849073257, "grad_norm": 0.16182903945446014, "learning_rate": 1.0986377310454045e-06, "loss": 0.00433397, "memory(GiB)": 15.03, "step": 22520, "train_speed(iter/s)": 1.470832 }, { "acc": 0.99985123, "epoch": 39.76169461606355, "grad_norm": 0.019545987248420715, "learning_rate": 1.0968121161837066e-06, "loss": 0.00154474, "memory(GiB)": 15.03, "step": 22525, "train_speed(iter/s)": 1.470842 }, { "acc": 0.99970236, "epoch": 39.77052074139453, "grad_norm": 0.22777946293354034, "learning_rate": 1.0949878339064774e-06, "loss": 0.00452493, "memory(GiB)": 15.03, "step": 22530, "train_speed(iter/s)": 1.47085 }, { "acc": 0.99927959, "epoch": 39.779346866725504, "grad_norm": 0.4379991888999939, "learning_rate": 1.0931648848364629e-06, "loss": 0.00950837, "memory(GiB)": 15.03, "step": 22535, "train_speed(iter/s)": 1.470851 }, { "acc": 1.0, "epoch": 39.78817299205649, "grad_norm": 0.0539107583463192, "learning_rate": 1.0913432695959533e-06, "loss": 0.00325183, "memory(GiB)": 15.03, "step": 22540, "train_speed(iter/s)": 1.470838 }, { "acc": 1.0, "epoch": 39.796999117387465, "grad_norm": 0.7417189478874207, "learning_rate": 1.0895229888067877e-06, "loss": 0.00638292, "memory(GiB)": 15.03, "step": 22545, "train_speed(iter/s)": 1.470839 }, { "acc": 0.99955463, "epoch": 39.80582524271845, "grad_norm": 0.38722094893455505, "learning_rate": 1.0877040430903455e-06, "loss": 0.00567941, "memory(GiB)": 15.03, "step": 22550, "train_speed(iter/s)": 1.470851 }, { "acc": 1.0, "epoch": 39.814651368049425, "grad_norm": 0.2805366516113281, "learning_rate": 1.085886433067554e-06, "loss": 0.00356668, "memory(GiB)": 15.03, "step": 22555, "train_speed(iter/s)": 1.470866 }, { "acc": 1.0, "epoch": 39.82347749338041, "grad_norm": 0.2059888243675232, "learning_rate": 1.084070159358881e-06, "loss": 0.00327633, "memory(GiB)": 15.03, "step": 22560, "train_speed(iter/s)": 1.470866 }, { "acc": 0.99971733, "epoch": 39.832303618711386, "grad_norm": 0.38263630867004395, "learning_rate": 1.08225522258434e-06, "loss": 0.00571564, "memory(GiB)": 15.03, "step": 22565, "train_speed(iter/s)": 1.470869 }, { "acc": 0.99918785, "epoch": 39.84112974404236, "grad_norm": 0.4760167598724365, "learning_rate": 1.0804416233634874e-06, "loss": 0.0088979, "memory(GiB)": 15.03, "step": 22570, "train_speed(iter/s)": 1.470888 }, { "acc": 1.0, "epoch": 39.849955869373346, "grad_norm": 0.2355320006608963, "learning_rate": 1.0786293623154233e-06, "loss": 0.00267473, "memory(GiB)": 15.03, "step": 22575, "train_speed(iter/s)": 1.470873 }, { "acc": 1.0, "epoch": 39.85878199470432, "grad_norm": 0.5514572262763977, "learning_rate": 1.0768184400587947e-06, "loss": 0.00245552, "memory(GiB)": 15.03, "step": 22580, "train_speed(iter/s)": 1.470882 }, { "acc": 0.99980469, "epoch": 39.86760812003531, "grad_norm": 0.01280655711889267, "learning_rate": 1.0750088572117825e-06, "loss": 0.00179932, "memory(GiB)": 15.03, "step": 22585, "train_speed(iter/s)": 1.470879 }, { "acc": 1.0, "epoch": 39.87643424536628, "grad_norm": 0.3268848955631256, "learning_rate": 1.0732006143921202e-06, "loss": 0.00707864, "memory(GiB)": 15.03, "step": 22590, "train_speed(iter/s)": 1.47088 }, { "acc": 0.99986553, "epoch": 39.88526037069727, "grad_norm": 0.16525931656360626, "learning_rate": 1.0713937122170762e-06, "loss": 0.00606312, "memory(GiB)": 15.03, "step": 22595, "train_speed(iter/s)": 1.470878 }, { "acc": 1.0, "epoch": 39.894086496028244, "grad_norm": 0.1473475694656372, "learning_rate": 1.069588151303469e-06, "loss": 0.00420564, "memory(GiB)": 15.03, "step": 22600, "train_speed(iter/s)": 1.470889 }, { "acc": 1.0, "epoch": 39.90291262135922, "grad_norm": 0.007536440622061491, "learning_rate": 1.0677839322676503e-06, "loss": 0.00174961, "memory(GiB)": 15.03, "step": 22605, "train_speed(iter/s)": 1.470878 }, { "acc": 0.99885206, "epoch": 39.911738746690204, "grad_norm": 0.03519275411963463, "learning_rate": 1.065981055725521e-06, "loss": 0.00426188, "memory(GiB)": 15.03, "step": 22610, "train_speed(iter/s)": 1.470879 }, { "acc": 1.0, "epoch": 39.92056487202118, "grad_norm": 0.0038317665457725525, "learning_rate": 1.0641795222925185e-06, "loss": 0.00219945, "memory(GiB)": 15.03, "step": 22615, "train_speed(iter/s)": 1.470905 }, { "acc": 0.99956894, "epoch": 39.929390997352165, "grad_norm": 0.012241306714713573, "learning_rate": 1.062379332583629e-06, "loss": 0.00482519, "memory(GiB)": 15.03, "step": 22620, "train_speed(iter/s)": 1.470907 }, { "acc": 0.99885406, "epoch": 39.93821712268314, "grad_norm": 0.016169685870409012, "learning_rate": 1.0605804872133694e-06, "loss": 0.00825649, "memory(GiB)": 15.03, "step": 22625, "train_speed(iter/s)": 1.470914 }, { "acc": 0.998738, "epoch": 39.94704324801412, "grad_norm": 0.7011902332305908, "learning_rate": 1.0587829867958063e-06, "loss": 0.00764363, "memory(GiB)": 15.03, "step": 22630, "train_speed(iter/s)": 1.47091 }, { "acc": 0.99981613, "epoch": 39.9558693733451, "grad_norm": 0.3181643486022949, "learning_rate": 1.0569868319445453e-06, "loss": 0.00297656, "memory(GiB)": 15.03, "step": 22635, "train_speed(iter/s)": 1.47091 }, { "acc": 1.0, "epoch": 39.96469549867608, "grad_norm": 0.25398150086402893, "learning_rate": 1.0551920232727309e-06, "loss": 0.00191602, "memory(GiB)": 15.03, "step": 22640, "train_speed(iter/s)": 1.470907 }, { "acc": 1.0, "epoch": 39.97352162400706, "grad_norm": 0.01315175648778677, "learning_rate": 1.0533985613930484e-06, "loss": 0.00604938, "memory(GiB)": 15.03, "step": 22645, "train_speed(iter/s)": 1.470927 }, { "acc": 1.0, "epoch": 39.98234774933804, "grad_norm": 0.01572241075336933, "learning_rate": 1.0516064469177235e-06, "loss": 0.00619277, "memory(GiB)": 15.03, "step": 22650, "train_speed(iter/s)": 1.470943 }, { "acc": 0.99973402, "epoch": 39.99117387466902, "grad_norm": 0.9744386672973633, "learning_rate": 1.0498156804585237e-06, "loss": 0.00482696, "memory(GiB)": 15.03, "step": 22655, "train_speed(iter/s)": 1.47096 }, { "acc": 1.0, "epoch": 40.0, "grad_norm": 0.01945849321782589, "learning_rate": 1.0480262626267535e-06, "loss": 0.00115618, "memory(GiB)": 15.03, "step": 22660, "train_speed(iter/s)": 1.470954 }, { "acc": 1.0, "epoch": 40.00882612533098, "grad_norm": 0.05358978733420372, "learning_rate": 1.0462381940332612e-06, "loss": 0.0019773, "memory(GiB)": 15.03, "step": 22665, "train_speed(iter/s)": 1.470942 }, { "acc": 0.99970551, "epoch": 40.01765225066196, "grad_norm": 0.4015968441963196, "learning_rate": 1.0444514752884274e-06, "loss": 0.00598913, "memory(GiB)": 15.03, "step": 22670, "train_speed(iter/s)": 1.470956 }, { "acc": 1.0, "epoch": 40.02647837599294, "grad_norm": 0.4030076861381531, "learning_rate": 1.0426661070021785e-06, "loss": 0.00378869, "memory(GiB)": 15.03, "step": 22675, "train_speed(iter/s)": 1.470961 }, { "acc": 1.0, "epoch": 40.03530450132392, "grad_norm": 0.005037008319050074, "learning_rate": 1.0408820897839794e-06, "loss": 0.00141146, "memory(GiB)": 15.03, "step": 22680, "train_speed(iter/s)": 1.470964 }, { "acc": 1.0, "epoch": 40.0441306266549, "grad_norm": 0.28292879462242126, "learning_rate": 1.0390994242428307e-06, "loss": 0.00368673, "memory(GiB)": 15.03, "step": 22685, "train_speed(iter/s)": 1.470972 }, { "acc": 0.99958324, "epoch": 40.05295675198588, "grad_norm": 0.4211520552635193, "learning_rate": 1.0373181109872724e-06, "loss": 0.0080336, "memory(GiB)": 15.03, "step": 22690, "train_speed(iter/s)": 1.470973 }, { "acc": 0.99963236, "epoch": 40.06178287731686, "grad_norm": 0.07151676714420319, "learning_rate": 1.0355381506253824e-06, "loss": 0.00448467, "memory(GiB)": 15.03, "step": 22695, "train_speed(iter/s)": 1.470966 }, { "acc": 0.99973402, "epoch": 40.070609002647835, "grad_norm": 0.3940576910972595, "learning_rate": 1.0337595437647791e-06, "loss": 0.00442565, "memory(GiB)": 15.03, "step": 22700, "train_speed(iter/s)": 1.470965 }, { "acc": 1.0, "epoch": 40.07943512797882, "grad_norm": 0.1576610952615738, "learning_rate": 1.0319822910126202e-06, "loss": 0.00331695, "memory(GiB)": 15.03, "step": 22705, "train_speed(iter/s)": 1.470981 }, { "acc": 0.99950371, "epoch": 40.088261253309796, "grad_norm": 0.37024807929992676, "learning_rate": 1.0302063929755941e-06, "loss": 0.01293155, "memory(GiB)": 15.03, "step": 22710, "train_speed(iter/s)": 1.47099 }, { "acc": 0.99969511, "epoch": 40.09708737864078, "grad_norm": 0.09882461279630661, "learning_rate": 1.0284318502599335e-06, "loss": 0.00775649, "memory(GiB)": 15.03, "step": 22715, "train_speed(iter/s)": 1.471002 }, { "acc": 0.99977684, "epoch": 40.105913503971756, "grad_norm": 0.20593422651290894, "learning_rate": 1.0266586634714047e-06, "loss": 0.00268572, "memory(GiB)": 15.03, "step": 22720, "train_speed(iter/s)": 1.471016 }, { "acc": 0.99980164, "epoch": 40.11473962930273, "grad_norm": 0.22379878163337708, "learning_rate": 1.0248868332153165e-06, "loss": 0.00393872, "memory(GiB)": 15.03, "step": 22725, "train_speed(iter/s)": 1.471023 }, { "acc": 0.99982872, "epoch": 40.12356575463372, "grad_norm": 0.18323664367198944, "learning_rate": 1.0231163600965044e-06, "loss": 0.00649939, "memory(GiB)": 15.03, "step": 22730, "train_speed(iter/s)": 1.471028 }, { "acc": 1.0, "epoch": 40.13239187996469, "grad_norm": 0.3388269543647766, "learning_rate": 1.0213472447193522e-06, "loss": 0.00453871, "memory(GiB)": 15.03, "step": 22735, "train_speed(iter/s)": 1.471028 }, { "acc": 1.0, "epoch": 40.14121800529568, "grad_norm": 0.12817516922950745, "learning_rate": 1.019579487687771e-06, "loss": 0.00296681, "memory(GiB)": 15.03, "step": 22740, "train_speed(iter/s)": 1.471048 }, { "acc": 0.99980164, "epoch": 40.150044130626654, "grad_norm": 0.21779289841651917, "learning_rate": 1.0178130896052162e-06, "loss": 0.00706286, "memory(GiB)": 15.03, "step": 22745, "train_speed(iter/s)": 1.471065 }, { "acc": 0.99899998, "epoch": 40.15887025595764, "grad_norm": 0.31134849786758423, "learning_rate": 1.0160480510746713e-06, "loss": 0.01394545, "memory(GiB)": 15.03, "step": 22750, "train_speed(iter/s)": 1.471072 }, { "acc": 1.0, "epoch": 40.167696381288614, "grad_norm": 0.00819137692451477, "learning_rate": 1.0142843726986605e-06, "loss": 0.00296094, "memory(GiB)": 15.03, "step": 22755, "train_speed(iter/s)": 1.47109 }, { "acc": 0.9994565, "epoch": 40.17652250661959, "grad_norm": 0.004241089802235365, "learning_rate": 1.0125220550792453e-06, "loss": 0.00435035, "memory(GiB)": 15.03, "step": 22760, "train_speed(iter/s)": 1.471079 }, { "acc": 0.99959564, "epoch": 40.185348631950575, "grad_norm": 0.02925514057278633, "learning_rate": 1.0107610988180185e-06, "loss": 0.00381645, "memory(GiB)": 15.03, "step": 22765, "train_speed(iter/s)": 1.471079 }, { "acc": 1.0, "epoch": 40.19417475728155, "grad_norm": 0.012872534804046154, "learning_rate": 1.0090015045161096e-06, "loss": 0.0039453, "memory(GiB)": 15.03, "step": 22770, "train_speed(iter/s)": 1.471091 }, { "acc": 1.0, "epoch": 40.203000882612535, "grad_norm": 0.024881774559617043, "learning_rate": 1.0072432727741825e-06, "loss": 0.0014386, "memory(GiB)": 15.03, "step": 22775, "train_speed(iter/s)": 1.471092 }, { "acc": 0.99978065, "epoch": 40.21182700794351, "grad_norm": 0.36514681577682495, "learning_rate": 1.0054864041924393e-06, "loss": 0.0026508, "memory(GiB)": 15.03, "step": 22780, "train_speed(iter/s)": 1.471073 }, { "acc": 1.0, "epoch": 40.220653133274496, "grad_norm": 0.15036307275295258, "learning_rate": 1.0037308993706116e-06, "loss": 0.00199083, "memory(GiB)": 15.03, "step": 22785, "train_speed(iter/s)": 1.471073 }, { "acc": 1.0, "epoch": 40.22947925860547, "grad_norm": 0.46239250898361206, "learning_rate": 1.001976758907973e-06, "loss": 0.00477753, "memory(GiB)": 15.03, "step": 22790, "train_speed(iter/s)": 1.471081 }, { "acc": 1.0, "epoch": 40.23830538393645, "grad_norm": 0.25105756521224976, "learning_rate": 1.0002239834033199e-06, "loss": 0.0023414, "memory(GiB)": 15.03, "step": 22795, "train_speed(iter/s)": 1.471086 }, { "acc": 0.99979506, "epoch": 40.24713150926743, "grad_norm": 0.007145863492041826, "learning_rate": 9.98472573454993e-07, "loss": 0.00371165, "memory(GiB)": 15.03, "step": 22800, "train_speed(iter/s)": 1.471093 }, { "acc": 0.99985294, "epoch": 40.25595763459841, "grad_norm": 0.1401464343070984, "learning_rate": 9.96722529660865e-07, "loss": 0.00215785, "memory(GiB)": 15.03, "step": 22805, "train_speed(iter/s)": 1.471076 }, { "acc": 1.0, "epoch": 40.264783759929394, "grad_norm": 0.1404687613248825, "learning_rate": 9.949738526183383e-07, "loss": 0.00374272, "memory(GiB)": 15.03, "step": 22810, "train_speed(iter/s)": 1.471087 }, { "acc": 0.99974489, "epoch": 40.27360988526037, "grad_norm": 0.2904708981513977, "learning_rate": 9.932265429243514e-07, "loss": 0.00531659, "memory(GiB)": 15.03, "step": 22815, "train_speed(iter/s)": 1.471103 }, { "acc": 1.0, "epoch": 40.28243601059135, "grad_norm": 0.008103875443339348, "learning_rate": 9.91480601175374e-07, "loss": 0.00331455, "memory(GiB)": 15.03, "step": 22820, "train_speed(iter/s)": 1.471128 }, { "acc": 1.0, "epoch": 40.29126213592233, "grad_norm": 0.16456559300422668, "learning_rate": 9.897360279674124e-07, "loss": 0.00612335, "memory(GiB)": 15.03, "step": 22825, "train_speed(iter/s)": 1.471131 }, { "acc": 0.99993553, "epoch": 40.30008826125331, "grad_norm": 0.5251823663711548, "learning_rate": 9.879928238960054e-07, "loss": 0.00641357, "memory(GiB)": 15.03, "step": 22830, "train_speed(iter/s)": 1.471129 }, { "acc": 0.9998106, "epoch": 40.30891438658429, "grad_norm": 0.17435920238494873, "learning_rate": 9.862509895562183e-07, "loss": 0.0065881, "memory(GiB)": 15.03, "step": 22835, "train_speed(iter/s)": 1.471125 }, { "acc": 1.0, "epoch": 40.31774051191527, "grad_norm": 0.0446833111345768, "learning_rate": 9.845105255426568e-07, "loss": 0.00343782, "memory(GiB)": 15.03, "step": 22840, "train_speed(iter/s)": 1.471129 }, { "acc": 0.99986553, "epoch": 40.32656663724625, "grad_norm": 0.2917977571487427, "learning_rate": 9.827714324494531e-07, "loss": 0.00445301, "memory(GiB)": 15.03, "step": 22845, "train_speed(iter/s)": 1.471137 }, { "acc": 0.99956074, "epoch": 40.33539276257723, "grad_norm": 0.52313232421875, "learning_rate": 9.810337108702773e-07, "loss": 0.00483306, "memory(GiB)": 15.03, "step": 22850, "train_speed(iter/s)": 1.471131 }, { "acc": 1.0, "epoch": 40.344218887908205, "grad_norm": 0.003120261477306485, "learning_rate": 9.79297361398322e-07, "loss": 0.00131643, "memory(GiB)": 15.03, "step": 22855, "train_speed(iter/s)": 1.47115 }, { "acc": 0.99989033, "epoch": 40.35304501323919, "grad_norm": 0.2837827503681183, "learning_rate": 9.775623846263219e-07, "loss": 0.0043128, "memory(GiB)": 15.03, "step": 22860, "train_speed(iter/s)": 1.471141 }, { "acc": 1.0, "epoch": 40.361871138570166, "grad_norm": 0.302255243062973, "learning_rate": 9.75828781146535e-07, "loss": 0.00387364, "memory(GiB)": 15.03, "step": 22865, "train_speed(iter/s)": 1.471142 }, { "acc": 0.99983768, "epoch": 40.37069726390115, "grad_norm": 0.0063803307712078094, "learning_rate": 9.740965515507576e-07, "loss": 0.00650602, "memory(GiB)": 15.03, "step": 22870, "train_speed(iter/s)": 1.471131 }, { "acc": 1.0, "epoch": 40.379523389232126, "grad_norm": 0.011224834248423576, "learning_rate": 9.72365696430308e-07, "loss": 0.00426492, "memory(GiB)": 15.03, "step": 22875, "train_speed(iter/s)": 1.47112 }, { "acc": 1.0, "epoch": 40.38834951456311, "grad_norm": 0.06848262250423431, "learning_rate": 9.70636216376044e-07, "loss": 0.00163661, "memory(GiB)": 15.03, "step": 22880, "train_speed(iter/s)": 1.471116 }, { "acc": 1.0, "epoch": 40.39717563989409, "grad_norm": 0.7160292863845825, "learning_rate": 9.689081119783504e-07, "loss": 0.00891198, "memory(GiB)": 15.03, "step": 22885, "train_speed(iter/s)": 1.471119 }, { "acc": 0.99971485, "epoch": 40.406001765225064, "grad_norm": 0.028021328151226044, "learning_rate": 9.671813838271405e-07, "loss": 0.00346042, "memory(GiB)": 15.03, "step": 22890, "train_speed(iter/s)": 1.471109 }, { "acc": 1.0, "epoch": 40.41482789055605, "grad_norm": 0.27985963225364685, "learning_rate": 9.65456032511864e-07, "loss": 0.00521391, "memory(GiB)": 15.03, "step": 22895, "train_speed(iter/s)": 1.471122 }, { "acc": 1.0, "epoch": 40.423654015887024, "grad_norm": 0.2308196723461151, "learning_rate": 9.637320586214905e-07, "loss": 0.00287791, "memory(GiB)": 15.03, "step": 22900, "train_speed(iter/s)": 1.471142 }, { "acc": 0.99968748, "epoch": 40.43248014121801, "grad_norm": 0.5944979190826416, "learning_rate": 9.620094627445295e-07, "loss": 0.00469148, "memory(GiB)": 15.03, "step": 22905, "train_speed(iter/s)": 1.471138 }, { "acc": 0.99982872, "epoch": 40.441306266548985, "grad_norm": 0.008672822266817093, "learning_rate": 9.602882454690131e-07, "loss": 0.00335726, "memory(GiB)": 15.03, "step": 22910, "train_speed(iter/s)": 1.471132 }, { "acc": 1.0, "epoch": 40.45013239187996, "grad_norm": 0.3081051707267761, "learning_rate": 9.5856840738251e-07, "loss": 0.00284725, "memory(GiB)": 15.03, "step": 22915, "train_speed(iter/s)": 1.471136 }, { "acc": 0.99969473, "epoch": 40.458958517210945, "grad_norm": 0.0020394783932715654, "learning_rate": 9.568499490721076e-07, "loss": 0.00600558, "memory(GiB)": 15.03, "step": 22920, "train_speed(iter/s)": 1.47114 }, { "acc": 0.99988937, "epoch": 40.46778464254192, "grad_norm": 0.09389062970876694, "learning_rate": 9.551328711244314e-07, "loss": 0.00438921, "memory(GiB)": 15.03, "step": 22925, "train_speed(iter/s)": 1.471133 }, { "acc": 1.0, "epoch": 40.476610767872906, "grad_norm": 0.007321173790842295, "learning_rate": 9.534171741256337e-07, "loss": 0.00132558, "memory(GiB)": 15.03, "step": 22930, "train_speed(iter/s)": 1.471138 }, { "acc": 1.0, "epoch": 40.48543689320388, "grad_norm": 0.009223075583577156, "learning_rate": 9.51702858661393e-07, "loss": 0.00263409, "memory(GiB)": 15.03, "step": 22935, "train_speed(iter/s)": 1.471119 }, { "acc": 0.99989033, "epoch": 40.494263018534866, "grad_norm": 0.21506153047084808, "learning_rate": 9.499899253169179e-07, "loss": 0.00636986, "memory(GiB)": 15.03, "step": 22940, "train_speed(iter/s)": 1.471123 }, { "acc": 1.0, "epoch": 40.50308914386584, "grad_norm": 0.1970566213130951, "learning_rate": 9.48278374676943e-07, "loss": 0.00185862, "memory(GiB)": 15.03, "step": 22945, "train_speed(iter/s)": 1.471138 }, { "acc": 1.0, "epoch": 40.51191526919682, "grad_norm": 0.09002802520990372, "learning_rate": 9.465682073257346e-07, "loss": 0.00266228, "memory(GiB)": 15.03, "step": 22950, "train_speed(iter/s)": 1.471156 }, { "acc": 1.0, "epoch": 40.5207413945278, "grad_norm": 0.5229912400245667, "learning_rate": 9.448594238470871e-07, "loss": 0.00297999, "memory(GiB)": 15.03, "step": 22955, "train_speed(iter/s)": 1.471154 }, { "acc": 0.99921875, "epoch": 40.52956751985878, "grad_norm": 0.3936924934387207, "learning_rate": 9.431520248243146e-07, "loss": 0.0066, "memory(GiB)": 15.03, "step": 22960, "train_speed(iter/s)": 1.471128 }, { "acc": 0.99967108, "epoch": 40.538393645189764, "grad_norm": 0.5996826887130737, "learning_rate": 9.414460108402687e-07, "loss": 0.00467018, "memory(GiB)": 15.03, "step": 22965, "train_speed(iter/s)": 1.471132 }, { "acc": 1.0, "epoch": 40.54721977052074, "grad_norm": 0.3716825246810913, "learning_rate": 9.397413824773214e-07, "loss": 0.00494132, "memory(GiB)": 15.03, "step": 22970, "train_speed(iter/s)": 1.471143 }, { "acc": 1.0, "epoch": 40.556045895851724, "grad_norm": 0.15140171349048615, "learning_rate": 9.380381403173771e-07, "loss": 0.00474935, "memory(GiB)": 15.03, "step": 22975, "train_speed(iter/s)": 1.471153 }, { "acc": 1.0, "epoch": 40.5648720211827, "grad_norm": 0.26357021927833557, "learning_rate": 9.363362849418596e-07, "loss": 0.00323078, "memory(GiB)": 15.03, "step": 22980, "train_speed(iter/s)": 1.471153 }, { "acc": 1.0, "epoch": 40.57369814651368, "grad_norm": 0.002360536018386483, "learning_rate": 9.346358169317273e-07, "loss": 0.00028942, "memory(GiB)": 15.03, "step": 22985, "train_speed(iter/s)": 1.471164 }, { "acc": 1.0, "epoch": 40.58252427184466, "grad_norm": 0.06471845507621765, "learning_rate": 9.329367368674586e-07, "loss": 0.00261016, "memory(GiB)": 15.03, "step": 22990, "train_speed(iter/s)": 1.471161 }, { "acc": 1.0, "epoch": 40.59135039717564, "grad_norm": 0.2680796980857849, "learning_rate": 9.312390453290626e-07, "loss": 0.00287977, "memory(GiB)": 15.03, "step": 22995, "train_speed(iter/s)": 1.471162 }, { "acc": 1.0, "epoch": 40.60017652250662, "grad_norm": 0.332591712474823, "learning_rate": 9.295427428960741e-07, "loss": 0.00249012, "memory(GiB)": 15.03, "step": 23000, "train_speed(iter/s)": 1.471169 }, { "acc": 0.99979172, "epoch": 40.6090026478376, "grad_norm": 0.42821139097213745, "learning_rate": 9.278478301475485e-07, "loss": 0.0061795, "memory(GiB)": 15.03, "step": 23005, "train_speed(iter/s)": 1.471163 }, { "acc": 1.0, "epoch": 40.617828773168576, "grad_norm": 0.06425680220127106, "learning_rate": 9.26154307662074e-07, "loss": 0.00245188, "memory(GiB)": 15.03, "step": 23010, "train_speed(iter/s)": 1.471178 }, { "acc": 0.99963703, "epoch": 40.62665489849956, "grad_norm": 0.15096019208431244, "learning_rate": 9.244621760177578e-07, "loss": 0.00717763, "memory(GiB)": 15.03, "step": 23015, "train_speed(iter/s)": 1.471174 }, { "acc": 0.99975491, "epoch": 40.635481023830536, "grad_norm": 0.5212996602058411, "learning_rate": 9.227714357922399e-07, "loss": 0.00355382, "memory(GiB)": 15.03, "step": 23020, "train_speed(iter/s)": 1.471177 }, { "acc": 1.0, "epoch": 40.64430714916152, "grad_norm": 0.1683797538280487, "learning_rate": 9.210820875626742e-07, "loss": 0.00165894, "memory(GiB)": 15.03, "step": 23025, "train_speed(iter/s)": 1.471172 }, { "acc": 1.0, "epoch": 40.6531332744925, "grad_norm": 0.224503755569458, "learning_rate": 9.193941319057517e-07, "loss": 0.00375122, "memory(GiB)": 15.03, "step": 23030, "train_speed(iter/s)": 1.471176 }, { "acc": 0.99976416, "epoch": 40.66195939982348, "grad_norm": 0.22131715714931488, "learning_rate": 9.177075693976779e-07, "loss": 0.00314681, "memory(GiB)": 15.03, "step": 23035, "train_speed(iter/s)": 1.471163 }, { "acc": 0.99933376, "epoch": 40.67078552515446, "grad_norm": 0.2876952290534973, "learning_rate": 9.160224006141926e-07, "loss": 0.00839013, "memory(GiB)": 15.03, "step": 23040, "train_speed(iter/s)": 1.471167 }, { "acc": 0.99921875, "epoch": 40.679611650485434, "grad_norm": 1.1153813600540161, "learning_rate": 9.143386261305479e-07, "loss": 0.00636329, "memory(GiB)": 15.03, "step": 23045, "train_speed(iter/s)": 1.471162 }, { "acc": 1.0, "epoch": 40.68843777581642, "grad_norm": 0.29772815108299255, "learning_rate": 9.126562465215294e-07, "loss": 0.00451161, "memory(GiB)": 15.03, "step": 23050, "train_speed(iter/s)": 1.471167 }, { "acc": 0.99978447, "epoch": 40.697263901147394, "grad_norm": 0.014904489740729332, "learning_rate": 9.109752623614449e-07, "loss": 0.00404183, "memory(GiB)": 15.03, "step": 23055, "train_speed(iter/s)": 1.471173 }, { "acc": 1.0, "epoch": 40.70609002647838, "grad_norm": 0.02856946364045143, "learning_rate": 9.092956742241235e-07, "loss": 0.00244019, "memory(GiB)": 15.03, "step": 23060, "train_speed(iter/s)": 1.471183 }, { "acc": 0.99957628, "epoch": 40.714916151809355, "grad_norm": 0.07468409091234207, "learning_rate": 9.076174826829176e-07, "loss": 0.00451501, "memory(GiB)": 15.03, "step": 23065, "train_speed(iter/s)": 1.471184 }, { "acc": 1.0, "epoch": 40.72374227714034, "grad_norm": 0.25628113746643066, "learning_rate": 9.059406883107036e-07, "loss": 0.00271869, "memory(GiB)": 15.03, "step": 23070, "train_speed(iter/s)": 1.471192 }, { "acc": 0.99989033, "epoch": 40.732568402471315, "grad_norm": 0.5351098775863647, "learning_rate": 9.042652916798822e-07, "loss": 0.00557834, "memory(GiB)": 15.03, "step": 23075, "train_speed(iter/s)": 1.471178 }, { "acc": 1.0, "epoch": 40.74139452780229, "grad_norm": 0.07209683209657669, "learning_rate": 9.025912933623787e-07, "loss": 0.00495595, "memory(GiB)": 15.03, "step": 23080, "train_speed(iter/s)": 1.471176 }, { "acc": 0.99925003, "epoch": 40.750220653133276, "grad_norm": 0.13226626813411713, "learning_rate": 9.009186939296334e-07, "loss": 0.01376365, "memory(GiB)": 15.03, "step": 23085, "train_speed(iter/s)": 1.471161 }, { "acc": 1.0, "epoch": 40.75904677846425, "grad_norm": 0.24235139787197113, "learning_rate": 8.992474939526172e-07, "loss": 0.00317235, "memory(GiB)": 15.03, "step": 23090, "train_speed(iter/s)": 1.471176 }, { "acc": 0.99980164, "epoch": 40.76787290379524, "grad_norm": 0.012069562450051308, "learning_rate": 8.975776940018191e-07, "loss": 0.0026146, "memory(GiB)": 15.03, "step": 23095, "train_speed(iter/s)": 1.471176 }, { "acc": 1.0, "epoch": 40.77669902912621, "grad_norm": 0.006741008255630732, "learning_rate": 8.959092946472521e-07, "loss": 0.00017752, "memory(GiB)": 15.03, "step": 23100, "train_speed(iter/s)": 1.471177 }, { "acc": 0.99991436, "epoch": 40.78552515445719, "grad_norm": 0.004685512278228998, "learning_rate": 8.942422964584502e-07, "loss": 0.00435976, "memory(GiB)": 15.03, "step": 23105, "train_speed(iter/s)": 1.471181 }, { "acc": 1.0, "epoch": 40.794351279788174, "grad_norm": 0.21697168052196503, "learning_rate": 8.925767000044685e-07, "loss": 0.00444048, "memory(GiB)": 15.03, "step": 23110, "train_speed(iter/s)": 1.471183 }, { "acc": 0.99959078, "epoch": 40.80317740511915, "grad_norm": 0.2844371795654297, "learning_rate": 8.909125058538833e-07, "loss": 0.00278666, "memory(GiB)": 15.03, "step": 23115, "train_speed(iter/s)": 1.471176 }, { "acc": 0.99981613, "epoch": 40.812003530450134, "grad_norm": 0.0596623532474041, "learning_rate": 8.89249714574794e-07, "loss": 0.00290959, "memory(GiB)": 15.03, "step": 23120, "train_speed(iter/s)": 1.471173 }, { "acc": 1.0, "epoch": 40.82082965578111, "grad_norm": 0.2434006780385971, "learning_rate": 8.875883267348229e-07, "loss": 0.00244864, "memory(GiB)": 15.03, "step": 23125, "train_speed(iter/s)": 1.471172 }, { "acc": 1.0, "epoch": 40.829655781112095, "grad_norm": 0.03048471361398697, "learning_rate": 8.859283429011054e-07, "loss": 0.0037706, "memory(GiB)": 15.03, "step": 23130, "train_speed(iter/s)": 1.471183 }, { "acc": 1.0, "epoch": 40.83848190644307, "grad_norm": 0.10248848795890808, "learning_rate": 8.842697636403072e-07, "loss": 0.00332084, "memory(GiB)": 15.03, "step": 23135, "train_speed(iter/s)": 1.471185 }, { "acc": 1.0, "epoch": 40.84730803177405, "grad_norm": 0.3020898103713989, "learning_rate": 8.826125895186065e-07, "loss": 0.00287627, "memory(GiB)": 15.03, "step": 23140, "train_speed(iter/s)": 1.47119 }, { "acc": 1.0, "epoch": 40.85613415710503, "grad_norm": 0.4388090968132019, "learning_rate": 8.809568211017097e-07, "loss": 0.00332495, "memory(GiB)": 15.03, "step": 23145, "train_speed(iter/s)": 1.471194 }, { "acc": 1.0, "epoch": 40.86496028243601, "grad_norm": 0.014238474890589714, "learning_rate": 8.793024589548342e-07, "loss": 0.00222517, "memory(GiB)": 15.03, "step": 23150, "train_speed(iter/s)": 1.47121 }, { "acc": 0.99990158, "epoch": 40.87378640776699, "grad_norm": 0.016723381355404854, "learning_rate": 8.776495036427256e-07, "loss": 0.00372299, "memory(GiB)": 15.03, "step": 23155, "train_speed(iter/s)": 1.471201 }, { "acc": 1.0, "epoch": 40.88261253309797, "grad_norm": 0.2239764779806137, "learning_rate": 8.759979557296439e-07, "loss": 0.00403912, "memory(GiB)": 15.03, "step": 23160, "train_speed(iter/s)": 1.471204 }, { "acc": 0.99969511, "epoch": 40.89143865842895, "grad_norm": 0.07422420382499695, "learning_rate": 8.743478157793739e-07, "loss": 0.00651209, "memory(GiB)": 15.03, "step": 23165, "train_speed(iter/s)": 1.471207 }, { "acc": 1.0, "epoch": 40.90026478375993, "grad_norm": 0.004689340945333242, "learning_rate": 8.726990843552117e-07, "loss": 0.00515894, "memory(GiB)": 15.03, "step": 23170, "train_speed(iter/s)": 1.4712 }, { "acc": 1.0, "epoch": 40.90909090909091, "grad_norm": 0.394588828086853, "learning_rate": 8.710517620199796e-07, "loss": 0.00407185, "memory(GiB)": 15.03, "step": 23175, "train_speed(iter/s)": 1.471218 }, { "acc": 0.99921875, "epoch": 40.91791703442189, "grad_norm": 0.06808420270681381, "learning_rate": 8.694058493360179e-07, "loss": 0.00309132, "memory(GiB)": 15.03, "step": 23180, "train_speed(iter/s)": 1.471233 }, { "acc": 0.9998311, "epoch": 40.92674315975287, "grad_norm": 0.008534879423677921, "learning_rate": 8.677613468651835e-07, "loss": 0.00357867, "memory(GiB)": 15.03, "step": 23185, "train_speed(iter/s)": 1.471252 }, { "acc": 0.99970856, "epoch": 40.93556928508385, "grad_norm": 0.48032817244529724, "learning_rate": 8.66118255168853e-07, "loss": 0.00578483, "memory(GiB)": 15.03, "step": 23190, "train_speed(iter/s)": 1.471262 }, { "acc": 1.0, "epoch": 40.94439541041483, "grad_norm": 0.06408099830150604, "learning_rate": 8.644765748079195e-07, "loss": 0.00167373, "memory(GiB)": 15.03, "step": 23195, "train_speed(iter/s)": 1.471259 }, { "acc": 0.99980774, "epoch": 40.953221535745804, "grad_norm": 0.41166672110557556, "learning_rate": 8.628363063427981e-07, "loss": 0.00808945, "memory(GiB)": 15.03, "step": 23200, "train_speed(iter/s)": 1.471274 }, { "acc": 0.99939747, "epoch": 40.96204766107679, "grad_norm": 0.5339998006820679, "learning_rate": 8.6119745033342e-07, "loss": 0.00689065, "memory(GiB)": 15.03, "step": 23205, "train_speed(iter/s)": 1.471292 }, { "acc": 1.0, "epoch": 40.970873786407765, "grad_norm": 0.25688043236732483, "learning_rate": 8.595600073392336e-07, "loss": 0.00203428, "memory(GiB)": 15.03, "step": 23210, "train_speed(iter/s)": 1.471305 }, { "acc": 0.99958467, "epoch": 40.97969991173875, "grad_norm": 0.004618323408067226, "learning_rate": 8.57923977919206e-07, "loss": 0.01176367, "memory(GiB)": 15.03, "step": 23215, "train_speed(iter/s)": 1.471301 }, { "acc": 1.0, "epoch": 40.988526037069725, "grad_norm": 0.016799770295619965, "learning_rate": 8.562893626318187e-07, "loss": 0.00427396, "memory(GiB)": 15.03, "step": 23220, "train_speed(iter/s)": 1.471304 }, { "acc": 1.0, "epoch": 40.99735216240071, "grad_norm": 0.002302925568073988, "learning_rate": 8.54656162035077e-07, "loss": 0.00275408, "memory(GiB)": 15.03, "step": 23225, "train_speed(iter/s)": 1.471293 }, { "acc": 1.0, "epoch": 41.006178287731686, "grad_norm": 0.5590141415596008, "learning_rate": 8.530243766864966e-07, "loss": 0.00467947, "memory(GiB)": 15.03, "step": 23230, "train_speed(iter/s)": 1.47125 }, { "acc": 0.9994401, "epoch": 41.01500441306266, "grad_norm": 0.5662695169448853, "learning_rate": 8.513940071431138e-07, "loss": 0.01223036, "memory(GiB)": 15.03, "step": 23235, "train_speed(iter/s)": 1.471255 }, { "acc": 1.0, "epoch": 41.023830538393646, "grad_norm": 0.024362662807106972, "learning_rate": 8.497650539614784e-07, "loss": 0.00091087, "memory(GiB)": 15.03, "step": 23240, "train_speed(iter/s)": 1.47125 }, { "acc": 0.99952221, "epoch": 41.03265666372462, "grad_norm": 0.24616631865501404, "learning_rate": 8.481375176976612e-07, "loss": 0.00391077, "memory(GiB)": 15.03, "step": 23245, "train_speed(iter/s)": 1.471254 }, { "acc": 1.0, "epoch": 41.04148278905561, "grad_norm": 0.003720010630786419, "learning_rate": 8.465113989072484e-07, "loss": 0.00484893, "memory(GiB)": 15.03, "step": 23250, "train_speed(iter/s)": 1.471262 }, { "acc": 1.0, "epoch": 41.050308914386584, "grad_norm": 0.1292356252670288, "learning_rate": 8.44886698145337e-07, "loss": 0.00192664, "memory(GiB)": 15.03, "step": 23255, "train_speed(iter/s)": 1.471263 }, { "acc": 1.0, "epoch": 41.05913503971757, "grad_norm": 0.3466140329837799, "learning_rate": 8.432634159665466e-07, "loss": 0.00484016, "memory(GiB)": 15.03, "step": 23260, "train_speed(iter/s)": 1.471252 }, { "acc": 1.0, "epoch": 41.067961165048544, "grad_norm": 0.4249177873134613, "learning_rate": 8.416415529250074e-07, "loss": 0.00229066, "memory(GiB)": 15.03, "step": 23265, "train_speed(iter/s)": 1.471255 }, { "acc": 1.0, "epoch": 41.07678729037952, "grad_norm": 0.015525617636740208, "learning_rate": 8.40021109574372e-07, "loss": 0.00336149, "memory(GiB)": 15.03, "step": 23270, "train_speed(iter/s)": 1.471265 }, { "acc": 0.99973402, "epoch": 41.085613415710505, "grad_norm": 0.15645131468772888, "learning_rate": 8.384020864677981e-07, "loss": 0.00504438, "memory(GiB)": 15.03, "step": 23275, "train_speed(iter/s)": 1.471258 }, { "acc": 1.0, "epoch": 41.09443954104148, "grad_norm": 0.03865910321474075, "learning_rate": 8.367844841579686e-07, "loss": 0.00116087, "memory(GiB)": 15.03, "step": 23280, "train_speed(iter/s)": 1.471255 }, { "acc": 1.0, "epoch": 41.103265666372465, "grad_norm": 0.400810569524765, "learning_rate": 8.35168303197075e-07, "loss": 0.00513112, "memory(GiB)": 15.03, "step": 23285, "train_speed(iter/s)": 1.47125 }, { "acc": 1.0, "epoch": 41.11209179170344, "grad_norm": 0.5516843199729919, "learning_rate": 8.335535441368282e-07, "loss": 0.00773003, "memory(GiB)": 15.03, "step": 23290, "train_speed(iter/s)": 1.471253 }, { "acc": 0.99969511, "epoch": 41.12091791703442, "grad_norm": 0.24110665917396545, "learning_rate": 8.319402075284474e-07, "loss": 0.00680232, "memory(GiB)": 15.03, "step": 23295, "train_speed(iter/s)": 1.471256 }, { "acc": 1.0, "epoch": 41.1297440423654, "grad_norm": 0.006096712313592434, "learning_rate": 8.303282939226729e-07, "loss": 0.00189313, "memory(GiB)": 15.03, "step": 23300, "train_speed(iter/s)": 1.47126 }, { "acc": 1.0, "epoch": 41.13857016769638, "grad_norm": 0.06887287646532059, "learning_rate": 8.287178038697567e-07, "loss": 0.00671918, "memory(GiB)": 15.03, "step": 23305, "train_speed(iter/s)": 1.471272 }, { "acc": 1.0, "epoch": 41.14739629302736, "grad_norm": 0.24914850294589996, "learning_rate": 8.271087379194634e-07, "loss": 0.00254153, "memory(GiB)": 15.03, "step": 23310, "train_speed(iter/s)": 1.471304 }, { "acc": 1.0, "epoch": 41.15622241835834, "grad_norm": 0.06740475445985794, "learning_rate": 8.255010966210732e-07, "loss": 0.00302083, "memory(GiB)": 15.03, "step": 23315, "train_speed(iter/s)": 1.471304 }, { "acc": 0.99979506, "epoch": 41.16504854368932, "grad_norm": 0.005010309163480997, "learning_rate": 8.238948805233781e-07, "loss": 0.00130796, "memory(GiB)": 15.03, "step": 23320, "train_speed(iter/s)": 1.471295 }, { "acc": 1.0, "epoch": 41.1738746690203, "grad_norm": 0.3361108601093292, "learning_rate": 8.222900901746859e-07, "loss": 0.00611987, "memory(GiB)": 15.03, "step": 23325, "train_speed(iter/s)": 1.471306 }, { "acc": 0.99980469, "epoch": 41.18270079435128, "grad_norm": 0.7493156790733337, "learning_rate": 8.206867261228179e-07, "loss": 0.00902247, "memory(GiB)": 15.03, "step": 23330, "train_speed(iter/s)": 1.471296 }, { "acc": 1.0, "epoch": 41.19152691968226, "grad_norm": 0.32777395844459534, "learning_rate": 8.190847889151061e-07, "loss": 0.00504223, "memory(GiB)": 15.03, "step": 23335, "train_speed(iter/s)": 1.471297 }, { "acc": 1.0, "epoch": 41.20035304501324, "grad_norm": 0.01686910167336464, "learning_rate": 8.174842790983972e-07, "loss": 0.0015819, "memory(GiB)": 15.03, "step": 23340, "train_speed(iter/s)": 1.47132 }, { "acc": 1.0, "epoch": 41.20917917034422, "grad_norm": 0.01002488937228918, "learning_rate": 8.158851972190478e-07, "loss": 0.00337695, "memory(GiB)": 15.03, "step": 23345, "train_speed(iter/s)": 1.471322 }, { "acc": 1.0, "epoch": 41.2180052956752, "grad_norm": 0.15879970788955688, "learning_rate": 8.14287543822933e-07, "loss": 0.00301958, "memory(GiB)": 15.03, "step": 23350, "train_speed(iter/s)": 1.471332 }, { "acc": 1.0, "epoch": 41.22683142100618, "grad_norm": 0.1845499575138092, "learning_rate": 8.126913194554345e-07, "loss": 0.00305028, "memory(GiB)": 15.03, "step": 23355, "train_speed(iter/s)": 1.471339 }, { "acc": 1.0, "epoch": 41.23565754633716, "grad_norm": 0.0054604290053248405, "learning_rate": 8.110965246614484e-07, "loss": 0.00255483, "memory(GiB)": 15.03, "step": 23360, "train_speed(iter/s)": 1.471335 }, { "acc": 1.0, "epoch": 41.244483671668135, "grad_norm": 0.1741165667772293, "learning_rate": 8.095031599853814e-07, "loss": 0.00509855, "memory(GiB)": 15.03, "step": 23365, "train_speed(iter/s)": 1.471334 }, { "acc": 0.99980164, "epoch": 41.25330979699912, "grad_norm": 0.020652009174227715, "learning_rate": 8.079112259711547e-07, "loss": 0.00692733, "memory(GiB)": 15.03, "step": 23370, "train_speed(iter/s)": 1.471332 }, { "acc": 0.99987116, "epoch": 41.262135922330096, "grad_norm": 0.05841609090566635, "learning_rate": 8.063207231622023e-07, "loss": 0.00493858, "memory(GiB)": 15.03, "step": 23375, "train_speed(iter/s)": 1.471326 }, { "acc": 1.0, "epoch": 41.27096204766108, "grad_norm": 0.004602496046572924, "learning_rate": 8.047316521014617e-07, "loss": 0.00331248, "memory(GiB)": 15.03, "step": 23380, "train_speed(iter/s)": 1.471324 }, { "acc": 0.99949999, "epoch": 41.279788172992056, "grad_norm": 0.34368187189102173, "learning_rate": 8.03144013331391e-07, "loss": 0.01310305, "memory(GiB)": 15.03, "step": 23385, "train_speed(iter/s)": 1.471332 }, { "acc": 1.0, "epoch": 41.28861429832303, "grad_norm": 0.3794126510620117, "learning_rate": 8.015578073939527e-07, "loss": 0.00297563, "memory(GiB)": 15.03, "step": 23390, "train_speed(iter/s)": 1.47134 }, { "acc": 1.0, "epoch": 41.29744042365402, "grad_norm": 0.17052899301052094, "learning_rate": 7.99973034830627e-07, "loss": 0.00229856, "memory(GiB)": 15.03, "step": 23395, "train_speed(iter/s)": 1.47134 }, { "acc": 1.0, "epoch": 41.30626654898499, "grad_norm": 0.28022727370262146, "learning_rate": 7.98389696182395e-07, "loss": 0.00351454, "memory(GiB)": 15.03, "step": 23400, "train_speed(iter/s)": 1.471351 }, { "acc": 1.0, "epoch": 41.31509267431598, "grad_norm": 0.11856618523597717, "learning_rate": 7.968077919897578e-07, "loss": 0.00427395, "memory(GiB)": 15.03, "step": 23405, "train_speed(iter/s)": 1.471372 }, { "acc": 0.99981613, "epoch": 41.323918799646954, "grad_norm": 0.41403400897979736, "learning_rate": 7.952273227927203e-07, "loss": 0.00909861, "memory(GiB)": 15.03, "step": 23410, "train_speed(iter/s)": 1.471372 }, { "acc": 1.0, "epoch": 41.33274492497794, "grad_norm": 0.2107604742050171, "learning_rate": 7.936482891308048e-07, "loss": 0.00196992, "memory(GiB)": 15.03, "step": 23415, "train_speed(iter/s)": 1.471371 }, { "acc": 1.0, "epoch": 41.341571050308914, "grad_norm": 0.022970497608184814, "learning_rate": 7.920706915430326e-07, "loss": 0.00337293, "memory(GiB)": 15.03, "step": 23420, "train_speed(iter/s)": 1.471387 }, { "acc": 1.0, "epoch": 41.35039717563989, "grad_norm": 0.050354160368442535, "learning_rate": 7.90494530567945e-07, "loss": 0.0023334, "memory(GiB)": 15.03, "step": 23425, "train_speed(iter/s)": 1.471411 }, { "acc": 1.0, "epoch": 41.359223300970875, "grad_norm": 0.17931373417377472, "learning_rate": 7.889198067435896e-07, "loss": 0.0017529, "memory(GiB)": 15.03, "step": 23430, "train_speed(iter/s)": 1.47141 }, { "acc": 1.0, "epoch": 41.36804942630185, "grad_norm": 0.2206866443157196, "learning_rate": 7.873465206075206e-07, "loss": 0.001793, "memory(GiB)": 15.03, "step": 23435, "train_speed(iter/s)": 1.471411 }, { "acc": 0.99973955, "epoch": 41.376875551632835, "grad_norm": 0.20924092829227448, "learning_rate": 7.857746726968066e-07, "loss": 0.00505227, "memory(GiB)": 15.03, "step": 23440, "train_speed(iter/s)": 1.471424 }, { "acc": 1.0, "epoch": 41.38570167696381, "grad_norm": 0.38303449749946594, "learning_rate": 7.842042635480183e-07, "loss": 0.00354341, "memory(GiB)": 15.03, "step": 23445, "train_speed(iter/s)": 1.471431 }, { "acc": 0.99978809, "epoch": 41.394527802294796, "grad_norm": 0.17432458698749542, "learning_rate": 7.826352936972425e-07, "loss": 0.00365316, "memory(GiB)": 15.03, "step": 23450, "train_speed(iter/s)": 1.471422 }, { "acc": 0.99970417, "epoch": 41.40335392762577, "grad_norm": 0.648347020149231, "learning_rate": 7.81067763680069e-07, "loss": 0.00398634, "memory(GiB)": 15.03, "step": 23455, "train_speed(iter/s)": 1.471423 }, { "acc": 1.0, "epoch": 41.41218005295675, "grad_norm": 0.18981975317001343, "learning_rate": 7.795016740316021e-07, "loss": 0.00382851, "memory(GiB)": 15.03, "step": 23460, "train_speed(iter/s)": 1.471417 }, { "acc": 0.99984179, "epoch": 41.42100617828773, "grad_norm": 0.023099420592188835, "learning_rate": 7.779370252864457e-07, "loss": 0.00385736, "memory(GiB)": 15.03, "step": 23465, "train_speed(iter/s)": 1.471428 }, { "acc": 1.0, "epoch": 41.42983230361871, "grad_norm": 0.2524597942829132, "learning_rate": 7.763738179787203e-07, "loss": 0.00128417, "memory(GiB)": 15.03, "step": 23470, "train_speed(iter/s)": 1.471418 }, { "acc": 1.0, "epoch": 41.438658428949694, "grad_norm": 0.006127600092440844, "learning_rate": 7.748120526420516e-07, "loss": 0.00340062, "memory(GiB)": 15.03, "step": 23475, "train_speed(iter/s)": 1.471405 }, { "acc": 1.0, "epoch": 41.44748455428067, "grad_norm": 0.349833607673645, "learning_rate": 7.732517298095711e-07, "loss": 0.00331091, "memory(GiB)": 15.03, "step": 23480, "train_speed(iter/s)": 1.471415 }, { "acc": 1.0, "epoch": 41.45631067961165, "grad_norm": 0.04757366701960564, "learning_rate": 7.716928500139193e-07, "loss": 0.00270359, "memory(GiB)": 15.03, "step": 23485, "train_speed(iter/s)": 1.471427 }, { "acc": 1.0, "epoch": 41.46513680494263, "grad_norm": 0.0062155709601938725, "learning_rate": 7.701354137872427e-07, "loss": 0.00209329, "memory(GiB)": 15.03, "step": 23490, "train_speed(iter/s)": 1.471422 }, { "acc": 1.0, "epoch": 41.47396293027361, "grad_norm": 0.009990843012928963, "learning_rate": 7.685794216611981e-07, "loss": 0.00275192, "memory(GiB)": 15.03, "step": 23495, "train_speed(iter/s)": 1.471429 }, { "acc": 0.99951925, "epoch": 41.48278905560459, "grad_norm": 0.00443090358749032, "learning_rate": 7.670248741669499e-07, "loss": 0.00327605, "memory(GiB)": 15.03, "step": 23500, "train_speed(iter/s)": 1.47144 }, { "acc": 1.0, "epoch": 41.49161518093557, "grad_norm": 0.4557376503944397, "learning_rate": 7.654717718351611e-07, "loss": 0.00697248, "memory(GiB)": 15.03, "step": 23505, "train_speed(iter/s)": 1.471454 }, { "acc": 0.99989033, "epoch": 41.50044130626655, "grad_norm": 0.10992980003356934, "learning_rate": 7.639201151960121e-07, "loss": 0.00260106, "memory(GiB)": 15.03, "step": 23510, "train_speed(iter/s)": 1.471473 }, { "acc": 1.0, "epoch": 41.50926743159753, "grad_norm": 0.020183760672807693, "learning_rate": 7.623699047791824e-07, "loss": 0.00438965, "memory(GiB)": 15.03, "step": 23515, "train_speed(iter/s)": 1.471492 }, { "acc": 1.0, "epoch": 41.518093556928505, "grad_norm": 0.2739025056362152, "learning_rate": 7.608211411138633e-07, "loss": 0.00327671, "memory(GiB)": 15.03, "step": 23520, "train_speed(iter/s)": 1.471504 }, { "acc": 0.99899998, "epoch": 41.52691968225949, "grad_norm": 0.2105398178100586, "learning_rate": 7.592738247287454e-07, "loss": 0.01082763, "memory(GiB)": 15.03, "step": 23525, "train_speed(iter/s)": 1.471513 }, { "acc": 1.0, "epoch": 41.535745807590466, "grad_norm": 0.3190975785255432, "learning_rate": 7.577279561520323e-07, "loss": 0.00440538, "memory(GiB)": 15.03, "step": 23530, "train_speed(iter/s)": 1.471519 }, { "acc": 1.0, "epoch": 41.54457193292145, "grad_norm": 0.26541975140571594, "learning_rate": 7.56183535911428e-07, "loss": 0.00517194, "memory(GiB)": 15.03, "step": 23535, "train_speed(iter/s)": 1.471538 }, { "acc": 1.0, "epoch": 41.55339805825243, "grad_norm": 0.3041074573993683, "learning_rate": 7.546405645341452e-07, "loss": 0.00338853, "memory(GiB)": 15.03, "step": 23540, "train_speed(iter/s)": 1.471541 }, { "acc": 0.99958534, "epoch": 41.56222418358341, "grad_norm": 0.30411121249198914, "learning_rate": 7.530990425469042e-07, "loss": 0.00511123, "memory(GiB)": 15.03, "step": 23545, "train_speed(iter/s)": 1.471545 }, { "acc": 1.0, "epoch": 41.57105030891439, "grad_norm": 0.39682653546333313, "learning_rate": 7.515589704759221e-07, "loss": 0.00269219, "memory(GiB)": 15.03, "step": 23550, "train_speed(iter/s)": 1.471554 }, { "acc": 0.99990158, "epoch": 41.579876434245364, "grad_norm": 0.12884953618049622, "learning_rate": 7.5002034884693e-07, "loss": 0.00418131, "memory(GiB)": 15.03, "step": 23555, "train_speed(iter/s)": 1.471566 }, { "acc": 1.0, "epoch": 41.58870255957635, "grad_norm": 0.00968797318637371, "learning_rate": 7.484831781851581e-07, "loss": 0.00086055, "memory(GiB)": 15.03, "step": 23560, "train_speed(iter/s)": 1.471565 }, { "acc": 0.99975491, "epoch": 41.597528684907324, "grad_norm": 0.025553135201334953, "learning_rate": 7.469474590153471e-07, "loss": 0.00459515, "memory(GiB)": 15.03, "step": 23565, "train_speed(iter/s)": 1.47157 }, { "acc": 0.99978809, "epoch": 41.60635481023831, "grad_norm": 0.18790391087532043, "learning_rate": 7.454131918617335e-07, "loss": 0.00405204, "memory(GiB)": 15.03, "step": 23570, "train_speed(iter/s)": 1.471567 }, { "acc": 0.99897823, "epoch": 41.615180935569285, "grad_norm": 0.0033775237388908863, "learning_rate": 7.438803772480671e-07, "loss": 0.00585417, "memory(GiB)": 15.03, "step": 23575, "train_speed(iter/s)": 1.47157 }, { "acc": 1.0, "epoch": 41.62400706090026, "grad_norm": 0.1795273721218109, "learning_rate": 7.423490156975961e-07, "loss": 0.00091751, "memory(GiB)": 15.03, "step": 23580, "train_speed(iter/s)": 1.471575 }, { "acc": 1.0, "epoch": 41.632833186231245, "grad_norm": 0.005459378473460674, "learning_rate": 7.408191077330767e-07, "loss": 0.00238809, "memory(GiB)": 15.03, "step": 23585, "train_speed(iter/s)": 1.471571 }, { "acc": 1.0, "epoch": 41.64165931156222, "grad_norm": 0.2583361566066742, "learning_rate": 7.392906538767633e-07, "loss": 0.00455483, "memory(GiB)": 15.03, "step": 23590, "train_speed(iter/s)": 1.471574 }, { "acc": 0.99940338, "epoch": 41.650485436893206, "grad_norm": 0.486500084400177, "learning_rate": 7.377636546504185e-07, "loss": 0.01403852, "memory(GiB)": 15.03, "step": 23595, "train_speed(iter/s)": 1.471565 }, { "acc": 1.0, "epoch": 41.65931156222418, "grad_norm": 0.22925715148448944, "learning_rate": 7.362381105753101e-07, "loss": 0.00236303, "memory(GiB)": 15.03, "step": 23600, "train_speed(iter/s)": 1.471555 }, { "acc": 1.0, "epoch": 41.668137687555166, "grad_norm": 0.02645830251276493, "learning_rate": 7.347140221722029e-07, "loss": 0.00321397, "memory(GiB)": 15.03, "step": 23605, "train_speed(iter/s)": 1.471566 }, { "acc": 1.0, "epoch": 41.67696381288614, "grad_norm": 0.1783907115459442, "learning_rate": 7.331913899613701e-07, "loss": 0.00221253, "memory(GiB)": 15.03, "step": 23610, "train_speed(iter/s)": 1.471566 }, { "acc": 1.0, "epoch": 41.68578993821712, "grad_norm": 0.4787992238998413, "learning_rate": 7.316702144625837e-07, "loss": 0.00534058, "memory(GiB)": 15.03, "step": 23615, "train_speed(iter/s)": 1.471569 }, { "acc": 1.0, "epoch": 41.694616063548104, "grad_norm": 0.45373913645744324, "learning_rate": 7.30150496195122e-07, "loss": 0.00270491, "memory(GiB)": 15.03, "step": 23620, "train_speed(iter/s)": 1.471591 }, { "acc": 1.0, "epoch": 41.70344218887908, "grad_norm": 0.23355762660503387, "learning_rate": 7.286322356777668e-07, "loss": 0.00319904, "memory(GiB)": 15.03, "step": 23625, "train_speed(iter/s)": 1.471578 }, { "acc": 1.0, "epoch": 41.712268314210064, "grad_norm": 0.09707964956760406, "learning_rate": 7.271154334287947e-07, "loss": 0.00571588, "memory(GiB)": 15.03, "step": 23630, "train_speed(iter/s)": 1.471583 }, { "acc": 0.99974318, "epoch": 41.72109443954104, "grad_norm": 0.25582441687583923, "learning_rate": 7.256000899659941e-07, "loss": 0.00356159, "memory(GiB)": 15.03, "step": 23635, "train_speed(iter/s)": 1.471579 }, { "acc": 1.0, "epoch": 41.729920564872025, "grad_norm": 0.12326301634311676, "learning_rate": 7.240862058066479e-07, "loss": 0.00514306, "memory(GiB)": 15.03, "step": 23640, "train_speed(iter/s)": 1.471599 }, { "acc": 1.0, "epoch": 41.738746690203, "grad_norm": 0.009313741698861122, "learning_rate": 7.225737814675466e-07, "loss": 0.00486409, "memory(GiB)": 15.03, "step": 23645, "train_speed(iter/s)": 1.471586 }, { "acc": 1.0, "epoch": 41.74757281553398, "grad_norm": 0.09649302065372467, "learning_rate": 7.210628174649785e-07, "loss": 0.00311451, "memory(GiB)": 15.03, "step": 23650, "train_speed(iter/s)": 1.471585 }, { "acc": 1.0, "epoch": 41.75639894086496, "grad_norm": 0.06239388510584831, "learning_rate": 7.19553314314735e-07, "loss": 0.00221443, "memory(GiB)": 15.03, "step": 23655, "train_speed(iter/s)": 1.471594 }, { "acc": 0.9997159, "epoch": 41.76522506619594, "grad_norm": 0.06244469806551933, "learning_rate": 7.180452725321062e-07, "loss": 0.00458892, "memory(GiB)": 15.03, "step": 23660, "train_speed(iter/s)": 1.471597 }, { "acc": 1.0, "epoch": 41.77405119152692, "grad_norm": 0.012689444236457348, "learning_rate": 7.165386926318883e-07, "loss": 0.00102092, "memory(GiB)": 15.03, "step": 23665, "train_speed(iter/s)": 1.4716 }, { "acc": 1.0, "epoch": 41.7828773168579, "grad_norm": 0.02884645387530327, "learning_rate": 7.150335751283767e-07, "loss": 0.00186917, "memory(GiB)": 15.03, "step": 23670, "train_speed(iter/s)": 1.471597 }, { "acc": 1.0, "epoch": 41.791703442188876, "grad_norm": 0.0019530907738953829, "learning_rate": 7.13529920535363e-07, "loss": 0.00204163, "memory(GiB)": 15.03, "step": 23675, "train_speed(iter/s)": 1.471606 }, { "acc": 1.0, "epoch": 41.80052956751986, "grad_norm": 0.002377680968493223, "learning_rate": 7.120277293661457e-07, "loss": 0.00262044, "memory(GiB)": 15.03, "step": 23680, "train_speed(iter/s)": 1.471602 }, { "acc": 1.0, "epoch": 41.809355692850836, "grad_norm": 0.027350381016731262, "learning_rate": 7.105270021335192e-07, "loss": 0.00271371, "memory(GiB)": 15.03, "step": 23685, "train_speed(iter/s)": 1.471611 }, { "acc": 1.0, "epoch": 41.81818181818182, "grad_norm": 0.004101388156414032, "learning_rate": 7.090277393497834e-07, "loss": 0.00152088, "memory(GiB)": 15.03, "step": 23690, "train_speed(iter/s)": 1.471618 }, { "acc": 1.0, "epoch": 41.8270079435128, "grad_norm": 0.13565710186958313, "learning_rate": 7.075299415267306e-07, "loss": 0.00093136, "memory(GiB)": 15.03, "step": 23695, "train_speed(iter/s)": 1.471609 }, { "acc": 1.0, "epoch": 41.83583406884378, "grad_norm": 0.01897389069199562, "learning_rate": 7.060336091756603e-07, "loss": 0.0057242, "memory(GiB)": 15.03, "step": 23700, "train_speed(iter/s)": 1.471612 }, { "acc": 0.99975491, "epoch": 41.84466019417476, "grad_norm": 0.08325564861297607, "learning_rate": 7.045387428073666e-07, "loss": 0.00403437, "memory(GiB)": 15.03, "step": 23705, "train_speed(iter/s)": 1.471611 }, { "acc": 0.99910717, "epoch": 41.853486319505734, "grad_norm": 0.5049504041671753, "learning_rate": 7.030453429321492e-07, "loss": 0.00831214, "memory(GiB)": 15.03, "step": 23710, "train_speed(iter/s)": 1.471628 }, { "acc": 1.0, "epoch": 41.86231244483672, "grad_norm": 0.2511325776576996, "learning_rate": 7.015534100597987e-07, "loss": 0.00460547, "memory(GiB)": 15.03, "step": 23715, "train_speed(iter/s)": 1.471632 }, { "acc": 1.0, "epoch": 41.871138570167695, "grad_norm": 0.5835056304931641, "learning_rate": 7.00062944699611e-07, "loss": 0.00304268, "memory(GiB)": 15.03, "step": 23720, "train_speed(iter/s)": 1.471633 }, { "acc": 1.0, "epoch": 41.87996469549868, "grad_norm": 0.15616166591644287, "learning_rate": 6.985739473603817e-07, "loss": 0.00835739, "memory(GiB)": 15.03, "step": 23725, "train_speed(iter/s)": 1.471626 }, { "acc": 1.0, "epoch": 41.888790820829655, "grad_norm": 0.09350085258483887, "learning_rate": 6.970864185504012e-07, "loss": 0.00083187, "memory(GiB)": 15.03, "step": 23730, "train_speed(iter/s)": 1.471628 }, { "acc": 1.0, "epoch": 41.89761694616064, "grad_norm": 0.015569414012134075, "learning_rate": 6.956003587774617e-07, "loss": 0.00290987, "memory(GiB)": 15.03, "step": 23735, "train_speed(iter/s)": 1.471632 }, { "acc": 0.99991436, "epoch": 41.906443071491616, "grad_norm": 0.005674362648278475, "learning_rate": 6.941157685488506e-07, "loss": 0.00223654, "memory(GiB)": 15.03, "step": 23740, "train_speed(iter/s)": 1.47164 }, { "acc": 1.0, "epoch": 41.91526919682259, "grad_norm": 0.23422442376613617, "learning_rate": 6.926326483713568e-07, "loss": 0.00210184, "memory(GiB)": 15.03, "step": 23745, "train_speed(iter/s)": 1.471644 }, { "acc": 1.0, "epoch": 41.924095322153576, "grad_norm": 0.16852502524852753, "learning_rate": 6.911509987512685e-07, "loss": 0.00506312, "memory(GiB)": 15.03, "step": 23750, "train_speed(iter/s)": 1.471637 }, { "acc": 0.99951382, "epoch": 41.93292144748455, "grad_norm": 0.463561087846756, "learning_rate": 6.896708201943683e-07, "loss": 0.01062514, "memory(GiB)": 15.03, "step": 23755, "train_speed(iter/s)": 1.471639 }, { "acc": 1.0, "epoch": 41.94174757281554, "grad_norm": 0.008234679698944092, "learning_rate": 6.881921132059383e-07, "loss": 0.00094006, "memory(GiB)": 15.03, "step": 23760, "train_speed(iter/s)": 1.471657 }, { "acc": 1.0, "epoch": 41.95057369814651, "grad_norm": 0.007516379933804274, "learning_rate": 6.867148782907561e-07, "loss": 0.00376324, "memory(GiB)": 15.03, "step": 23765, "train_speed(iter/s)": 1.471646 }, { "acc": 1.0, "epoch": 41.95939982347749, "grad_norm": 0.28626424074172974, "learning_rate": 6.852391159531017e-07, "loss": 0.00214473, "memory(GiB)": 15.03, "step": 23770, "train_speed(iter/s)": 1.471659 }, { "acc": 1.0, "epoch": 41.968225948808474, "grad_norm": 0.23607602715492249, "learning_rate": 6.837648266967482e-07, "loss": 0.00213545, "memory(GiB)": 15.03, "step": 23775, "train_speed(iter/s)": 1.471659 }, { "acc": 1.0, "epoch": 41.97705207413945, "grad_norm": 0.03941928595304489, "learning_rate": 6.822920110249674e-07, "loss": 0.00264063, "memory(GiB)": 15.03, "step": 23780, "train_speed(iter/s)": 1.471673 }, { "acc": 1.0, "epoch": 41.985878199470434, "grad_norm": 0.036726515740156174, "learning_rate": 6.808206694405268e-07, "loss": 0.002325, "memory(GiB)": 15.03, "step": 23785, "train_speed(iter/s)": 1.47169 }, { "acc": 1.0, "epoch": 41.99470432480141, "grad_norm": 0.2322227954864502, "learning_rate": 6.793508024456922e-07, "loss": 0.00218844, "memory(GiB)": 15.03, "step": 23790, "train_speed(iter/s)": 1.471705 }, { "acc": 1.0, "epoch": 42.003530450132395, "grad_norm": 0.015744715929031372, "learning_rate": 6.778824105422281e-07, "loss": 0.00186546, "memory(GiB)": 15.03, "step": 23795, "train_speed(iter/s)": 1.471661 }, { "acc": 0.99976416, "epoch": 42.01235657546337, "grad_norm": 0.3135804533958435, "learning_rate": 6.764154942313883e-07, "loss": 0.00274195, "memory(GiB)": 15.03, "step": 23800, "train_speed(iter/s)": 1.471651 }, { "acc": 1.0, "epoch": 42.02118270079435, "grad_norm": 0.13022711873054504, "learning_rate": 6.74950054013931e-07, "loss": 0.00530087, "memory(GiB)": 15.03, "step": 23805, "train_speed(iter/s)": 1.471639 }, { "acc": 0.99963379, "epoch": 42.03000882612533, "grad_norm": 0.1699727177619934, "learning_rate": 6.734860903901044e-07, "loss": 0.00486158, "memory(GiB)": 15.03, "step": 23810, "train_speed(iter/s)": 1.471646 }, { "acc": 1.0, "epoch": 42.03883495145631, "grad_norm": 0.187603160738945, "learning_rate": 6.720236038596588e-07, "loss": 0.00111162, "memory(GiB)": 15.03, "step": 23815, "train_speed(iter/s)": 1.471651 }, { "acc": 0.99974995, "epoch": 42.04766107678729, "grad_norm": 0.30725526809692383, "learning_rate": 6.705625949218326e-07, "loss": 0.00602349, "memory(GiB)": 15.03, "step": 23820, "train_speed(iter/s)": 1.471652 }, { "acc": 0.99970093, "epoch": 42.05648720211827, "grad_norm": 0.23547124862670898, "learning_rate": 6.691030640753661e-07, "loss": 0.00326789, "memory(GiB)": 15.03, "step": 23825, "train_speed(iter/s)": 1.471649 }, { "acc": 1.0, "epoch": 42.06531332744925, "grad_norm": 0.33213239908218384, "learning_rate": 6.676450118184917e-07, "loss": 0.00630966, "memory(GiB)": 15.03, "step": 23830, "train_speed(iter/s)": 1.471665 }, { "acc": 1.0, "epoch": 42.07413945278023, "grad_norm": 0.18297189474105835, "learning_rate": 6.661884386489409e-07, "loss": 0.0011953, "memory(GiB)": 15.03, "step": 23835, "train_speed(iter/s)": 1.471662 }, { "acc": 1.0, "epoch": 42.08296557811121, "grad_norm": 0.29775941371917725, "learning_rate": 6.647333450639338e-07, "loss": 0.00243474, "memory(GiB)": 15.03, "step": 23840, "train_speed(iter/s)": 1.471665 }, { "acc": 1.0, "epoch": 42.09179170344219, "grad_norm": 0.2721579372882843, "learning_rate": 6.632797315601903e-07, "loss": 0.00496092, "memory(GiB)": 15.03, "step": 23845, "train_speed(iter/s)": 1.471673 }, { "acc": 1.0, "epoch": 42.10061782877317, "grad_norm": 0.13324567675590515, "learning_rate": 6.618275986339265e-07, "loss": 0.00749469, "memory(GiB)": 15.03, "step": 23850, "train_speed(iter/s)": 1.471681 }, { "acc": 1.0, "epoch": 42.10944395410415, "grad_norm": 0.43877077102661133, "learning_rate": 6.603769467808482e-07, "loss": 0.00423615, "memory(GiB)": 15.03, "step": 23855, "train_speed(iter/s)": 1.471689 }, { "acc": 0.99969234, "epoch": 42.11827007943513, "grad_norm": 0.2852659523487091, "learning_rate": 6.589277764961584e-07, "loss": 0.00394518, "memory(GiB)": 15.03, "step": 23860, "train_speed(iter/s)": 1.471694 }, { "acc": 1.0, "epoch": 42.127096204766104, "grad_norm": 0.32282754778862, "learning_rate": 6.574800882745522e-07, "loss": 0.00372673, "memory(GiB)": 15.03, "step": 23865, "train_speed(iter/s)": 1.47169 }, { "acc": 1.0, "epoch": 42.13592233009709, "grad_norm": 0.1054244115948677, "learning_rate": 6.560338826102227e-07, "loss": 0.00561543, "memory(GiB)": 15.03, "step": 23870, "train_speed(iter/s)": 1.471692 }, { "acc": 1.0, "epoch": 42.144748455428065, "grad_norm": 0.07698088139295578, "learning_rate": 6.545891599968549e-07, "loss": 0.00342409, "memory(GiB)": 15.03, "step": 23875, "train_speed(iter/s)": 1.471699 }, { "acc": 1.0, "epoch": 42.15357458075905, "grad_norm": 0.5493290424346924, "learning_rate": 6.531459209276261e-07, "loss": 0.0064137, "memory(GiB)": 15.03, "step": 23880, "train_speed(iter/s)": 1.471698 }, { "acc": 0.99956894, "epoch": 42.162400706090025, "grad_norm": 0.3763701319694519, "learning_rate": 6.517041658952092e-07, "loss": 0.00920883, "memory(GiB)": 15.03, "step": 23885, "train_speed(iter/s)": 1.471703 }, { "acc": 1.0, "epoch": 42.17122683142101, "grad_norm": 0.011917307041585445, "learning_rate": 6.502638953917678e-07, "loss": 0.00138537, "memory(GiB)": 15.03, "step": 23890, "train_speed(iter/s)": 1.471715 }, { "acc": 1.0, "epoch": 42.180052956751986, "grad_norm": 0.01040063239634037, "learning_rate": 6.488251099089632e-07, "loss": 0.00222001, "memory(GiB)": 15.03, "step": 23895, "train_speed(iter/s)": 1.471718 }, { "acc": 0.99974995, "epoch": 42.18887908208296, "grad_norm": 0.48209595680236816, "learning_rate": 6.473878099379456e-07, "loss": 0.00525221, "memory(GiB)": 15.03, "step": 23900, "train_speed(iter/s)": 1.471725 }, { "acc": 1.0, "epoch": 42.19770520741395, "grad_norm": 0.007447109557688236, "learning_rate": 6.459519959693601e-07, "loss": 0.0022248, "memory(GiB)": 15.03, "step": 23905, "train_speed(iter/s)": 1.471729 }, { "acc": 1.0, "epoch": 42.20653133274492, "grad_norm": 0.004549568053334951, "learning_rate": 6.445176684933429e-07, "loss": 0.00459374, "memory(GiB)": 15.03, "step": 23910, "train_speed(iter/s)": 1.471731 }, { "acc": 1.0, "epoch": 42.21535745807591, "grad_norm": 0.09793488681316376, "learning_rate": 6.430848279995249e-07, "loss": 0.00345301, "memory(GiB)": 15.03, "step": 23915, "train_speed(iter/s)": 1.471713 }, { "acc": 1.0, "epoch": 42.224183583406884, "grad_norm": 0.0323311872780323, "learning_rate": 6.416534749770307e-07, "loss": 0.00492655, "memory(GiB)": 15.03, "step": 23920, "train_speed(iter/s)": 1.471707 }, { "acc": 1.0, "epoch": 42.23300970873787, "grad_norm": 0.03076418675482273, "learning_rate": 6.40223609914471e-07, "loss": 0.00207517, "memory(GiB)": 15.03, "step": 23925, "train_speed(iter/s)": 1.471706 }, { "acc": 1.0, "epoch": 42.241835834068844, "grad_norm": 0.015894319862127304, "learning_rate": 6.38795233299955e-07, "loss": 0.0031035, "memory(GiB)": 15.03, "step": 23930, "train_speed(iter/s)": 1.471707 }, { "acc": 0.99967613, "epoch": 42.25066195939982, "grad_norm": 0.22890831530094147, "learning_rate": 6.373683456210792e-07, "loss": 0.00236179, "memory(GiB)": 15.03, "step": 23935, "train_speed(iter/s)": 1.471716 }, { "acc": 1.0, "epoch": 42.259488084730805, "grad_norm": 0.6520618796348572, "learning_rate": 6.359429473649375e-07, "loss": 0.00558726, "memory(GiB)": 15.03, "step": 23940, "train_speed(iter/s)": 1.471722 }, { "acc": 0.99969511, "epoch": 42.26831421006178, "grad_norm": 0.10773996263742447, "learning_rate": 6.345190390181068e-07, "loss": 0.0052292, "memory(GiB)": 15.03, "step": 23945, "train_speed(iter/s)": 1.471728 }, { "acc": 0.99978809, "epoch": 42.277140335392765, "grad_norm": 0.4877638816833496, "learning_rate": 6.330966210666639e-07, "loss": 0.00524827, "memory(GiB)": 15.03, "step": 23950, "train_speed(iter/s)": 1.471743 }, { "acc": 1.0, "epoch": 42.28596646072374, "grad_norm": 0.24321885406970978, "learning_rate": 6.316756939961715e-07, "loss": 0.00313001, "memory(GiB)": 15.03, "step": 23955, "train_speed(iter/s)": 1.471743 }, { "acc": 1.0, "epoch": 42.29479258605472, "grad_norm": 0.15615519881248474, "learning_rate": 6.302562582916877e-07, "loss": 0.00343781, "memory(GiB)": 15.03, "step": 23960, "train_speed(iter/s)": 1.471749 }, { "acc": 1.0, "epoch": 42.3036187113857, "grad_norm": 0.351227343082428, "learning_rate": 6.28838314437755e-07, "loss": 0.00220932, "memory(GiB)": 15.03, "step": 23965, "train_speed(iter/s)": 1.471738 }, { "acc": 1.0, "epoch": 42.31244483671668, "grad_norm": 0.0017670626984909177, "learning_rate": 6.27421862918412e-07, "loss": 0.00701407, "memory(GiB)": 15.03, "step": 23970, "train_speed(iter/s)": 1.471746 }, { "acc": 0.99949989, "epoch": 42.32127096204766, "grad_norm": 0.33147481083869934, "learning_rate": 6.260069042171886e-07, "loss": 0.00601492, "memory(GiB)": 15.03, "step": 23975, "train_speed(iter/s)": 1.471747 }, { "acc": 1.0, "epoch": 42.33009708737864, "grad_norm": 0.004935340024530888, "learning_rate": 6.245934388171002e-07, "loss": 0.00166495, "memory(GiB)": 15.03, "step": 23980, "train_speed(iter/s)": 1.471746 }, { "acc": 0.99982643, "epoch": 42.338923212709624, "grad_norm": 0.29320228099823, "learning_rate": 6.231814672006579e-07, "loss": 0.00472351, "memory(GiB)": 15.03, "step": 23985, "train_speed(iter/s)": 1.471763 }, { "acc": 1.0, "epoch": 42.3477493380406, "grad_norm": 0.4470735788345337, "learning_rate": 6.21770989849857e-07, "loss": 0.00460431, "memory(GiB)": 15.03, "step": 23990, "train_speed(iter/s)": 1.471766 }, { "acc": 0.9997159, "epoch": 42.35657546337158, "grad_norm": 0.29988157749176025, "learning_rate": 6.203620072461868e-07, "loss": 0.00247554, "memory(GiB)": 15.03, "step": 23995, "train_speed(iter/s)": 1.471782 }, { "acc": 1.0, "epoch": 42.36540158870256, "grad_norm": 0.011232148855924606, "learning_rate": 6.189545198706272e-07, "loss": 0.00054733, "memory(GiB)": 15.03, "step": 24000, "train_speed(iter/s)": 1.471783 }, { "acc": 1.0, "epoch": 42.37422771403354, "grad_norm": 0.317590594291687, "learning_rate": 6.175485282036446e-07, "loss": 0.00510471, "memory(GiB)": 15.03, "step": 24005, "train_speed(iter/s)": 1.471788 }, { "acc": 1.0, "epoch": 42.38305383936452, "grad_norm": 0.017734723165631294, "learning_rate": 6.16144032725196e-07, "loss": 0.00323341, "memory(GiB)": 15.03, "step": 24010, "train_speed(iter/s)": 1.471791 }, { "acc": 1.0, "epoch": 42.3918799646955, "grad_norm": 0.05036448687314987, "learning_rate": 6.147410339147265e-07, "loss": 0.0016286, "memory(GiB)": 15.03, "step": 24015, "train_speed(iter/s)": 1.471798 }, { "acc": 0.99949999, "epoch": 42.40070609002648, "grad_norm": 0.4896076023578644, "learning_rate": 6.133395322511748e-07, "loss": 0.0062457, "memory(GiB)": 15.03, "step": 24020, "train_speed(iter/s)": 1.471804 }, { "acc": 0.99965277, "epoch": 42.40953221535746, "grad_norm": 0.5706002712249756, "learning_rate": 6.119395282129621e-07, "loss": 0.0107984, "memory(GiB)": 15.03, "step": 24025, "train_speed(iter/s)": 1.471801 }, { "acc": 1.0, "epoch": 42.418358340688435, "grad_norm": 0.010719685815274715, "learning_rate": 6.105410222780035e-07, "loss": 0.00321561, "memory(GiB)": 15.03, "step": 24030, "train_speed(iter/s)": 1.4718 }, { "acc": 1.0, "epoch": 42.42718446601942, "grad_norm": 0.24735760688781738, "learning_rate": 6.091440149236989e-07, "loss": 0.00236239, "memory(GiB)": 15.03, "step": 24035, "train_speed(iter/s)": 1.471787 }, { "acc": 1.0, "epoch": 42.436010591350396, "grad_norm": 0.006364793051034212, "learning_rate": 6.077485066269387e-07, "loss": 0.00241072, "memory(GiB)": 15.03, "step": 24040, "train_speed(iter/s)": 1.47178 }, { "acc": 1.0, "epoch": 42.44483671668138, "grad_norm": 0.49737292528152466, "learning_rate": 6.063544978641046e-07, "loss": 0.00332587, "memory(GiB)": 15.03, "step": 24045, "train_speed(iter/s)": 1.471772 }, { "acc": 0.99980774, "epoch": 42.453662842012356, "grad_norm": 0.21464569866657257, "learning_rate": 6.049619891110585e-07, "loss": 0.00199664, "memory(GiB)": 15.03, "step": 24050, "train_speed(iter/s)": 1.471783 }, { "acc": 0.99978065, "epoch": 42.46248896734333, "grad_norm": 0.16389888525009155, "learning_rate": 6.035709808431585e-07, "loss": 0.00339934, "memory(GiB)": 15.03, "step": 24055, "train_speed(iter/s)": 1.471795 }, { "acc": 1.0, "epoch": 42.47131509267432, "grad_norm": 0.7087053060531616, "learning_rate": 6.021814735352437e-07, "loss": 0.00559037, "memory(GiB)": 15.03, "step": 24060, "train_speed(iter/s)": 1.471798 }, { "acc": 1.0, "epoch": 42.48014121800529, "grad_norm": 0.2616870403289795, "learning_rate": 6.007934676616482e-07, "loss": 0.00353842, "memory(GiB)": 15.03, "step": 24065, "train_speed(iter/s)": 1.471804 }, { "acc": 0.99969511, "epoch": 42.48896734333628, "grad_norm": 0.46164703369140625, "learning_rate": 5.994069636961846e-07, "loss": 0.00963016, "memory(GiB)": 15.03, "step": 24070, "train_speed(iter/s)": 1.471807 }, { "acc": 1.0, "epoch": 42.497793468667254, "grad_norm": 0.26520466804504395, "learning_rate": 5.980219621121609e-07, "loss": 0.00375747, "memory(GiB)": 15.03, "step": 24075, "train_speed(iter/s)": 1.47182 }, { "acc": 0.99986115, "epoch": 42.50661959399824, "grad_norm": 0.28252309560775757, "learning_rate": 5.966384633823664e-07, "loss": 0.0032887, "memory(GiB)": 15.03, "step": 24080, "train_speed(iter/s)": 1.471814 }, { "acc": 1.0, "epoch": 42.515445719329215, "grad_norm": 0.1609584391117096, "learning_rate": 5.95256467979081e-07, "loss": 0.00157876, "memory(GiB)": 15.03, "step": 24085, "train_speed(iter/s)": 1.471793 }, { "acc": 1.0, "epoch": 42.52427184466019, "grad_norm": 0.5697582364082336, "learning_rate": 5.938759763740731e-07, "loss": 0.00620191, "memory(GiB)": 15.03, "step": 24090, "train_speed(iter/s)": 1.471792 }, { "acc": 1.0, "epoch": 42.533097969991175, "grad_norm": 0.0028928820975124836, "learning_rate": 5.924969890385902e-07, "loss": 0.00246752, "memory(GiB)": 15.03, "step": 24095, "train_speed(iter/s)": 1.471797 }, { "acc": 0.9998106, "epoch": 42.54192409532215, "grad_norm": 0.1283050924539566, "learning_rate": 5.911195064433746e-07, "loss": 0.00385113, "memory(GiB)": 15.03, "step": 24100, "train_speed(iter/s)": 1.471787 }, { "acc": 0.99981613, "epoch": 42.550750220653136, "grad_norm": 0.013390269130468369, "learning_rate": 5.897435290586487e-07, "loss": 0.00515033, "memory(GiB)": 15.03, "step": 24105, "train_speed(iter/s)": 1.471771 }, { "acc": 0.99981346, "epoch": 42.55957634598411, "grad_norm": 0.185637965798378, "learning_rate": 5.883690573541265e-07, "loss": 0.00308403, "memory(GiB)": 15.03, "step": 24110, "train_speed(iter/s)": 1.471767 }, { "acc": 0.99980164, "epoch": 42.568402471315096, "grad_norm": 0.13529251515865326, "learning_rate": 5.869960917990024e-07, "loss": 0.00256787, "memory(GiB)": 15.03, "step": 24115, "train_speed(iter/s)": 1.471767 }, { "acc": 1.0, "epoch": 42.57722859664607, "grad_norm": 0.024165058508515358, "learning_rate": 5.856246328619606e-07, "loss": 0.00189903, "memory(GiB)": 15.03, "step": 24120, "train_speed(iter/s)": 1.471757 }, { "acc": 1.0, "epoch": 42.58605472197705, "grad_norm": 0.047990452498197556, "learning_rate": 5.842546810111713e-07, "loss": 0.0025186, "memory(GiB)": 15.03, "step": 24125, "train_speed(iter/s)": 1.471775 }, { "acc": 1.0, "epoch": 42.59488084730803, "grad_norm": 0.05029277503490448, "learning_rate": 5.828862367142879e-07, "loss": 0.00259045, "memory(GiB)": 15.03, "step": 24130, "train_speed(iter/s)": 1.471792 }, { "acc": 1.0, "epoch": 42.60370697263901, "grad_norm": 0.235254168510437, "learning_rate": 5.8151930043845e-07, "loss": 0.00237834, "memory(GiB)": 15.03, "step": 24135, "train_speed(iter/s)": 1.471792 }, { "acc": 0.99970236, "epoch": 42.612533097969994, "grad_norm": 0.5013272166252136, "learning_rate": 5.801538726502808e-07, "loss": 0.00596214, "memory(GiB)": 15.03, "step": 24140, "train_speed(iter/s)": 1.471794 }, { "acc": 0.99975014, "epoch": 42.62135922330097, "grad_norm": 0.5011602640151978, "learning_rate": 5.787899538158934e-07, "loss": 0.00667371, "memory(GiB)": 15.03, "step": 24145, "train_speed(iter/s)": 1.471799 }, { "acc": 1.0, "epoch": 42.63018534863195, "grad_norm": 0.04506147652864456, "learning_rate": 5.774275444008811e-07, "loss": 0.00283433, "memory(GiB)": 15.03, "step": 24150, "train_speed(iter/s)": 1.471796 }, { "acc": 1.0, "epoch": 42.63901147396293, "grad_norm": 0.5952819585800171, "learning_rate": 5.760666448703237e-07, "loss": 0.00368549, "memory(GiB)": 15.03, "step": 24155, "train_speed(iter/s)": 1.471795 }, { "acc": 1.0, "epoch": 42.64783759929391, "grad_norm": 0.0034609674476087093, "learning_rate": 5.747072556887845e-07, "loss": 0.00766354, "memory(GiB)": 15.03, "step": 24160, "train_speed(iter/s)": 1.471796 }, { "acc": 0.99986553, "epoch": 42.65666372462489, "grad_norm": 0.005603192839771509, "learning_rate": 5.733493773203121e-07, "loss": 0.00332313, "memory(GiB)": 15.03, "step": 24165, "train_speed(iter/s)": 1.471777 }, { "acc": 1.0, "epoch": 42.66548984995587, "grad_norm": 0.19017180800437927, "learning_rate": 5.719930102284427e-07, "loss": 0.00337595, "memory(GiB)": 15.03, "step": 24170, "train_speed(iter/s)": 1.471782 }, { "acc": 1.0, "epoch": 42.67431597528685, "grad_norm": 0.18839018046855927, "learning_rate": 5.70638154876189e-07, "loss": 0.00502708, "memory(GiB)": 15.03, "step": 24175, "train_speed(iter/s)": 1.471788 }, { "acc": 1.0, "epoch": 42.68314210061783, "grad_norm": 0.31737589836120605, "learning_rate": 5.692848117260542e-07, "loss": 0.00181503, "memory(GiB)": 15.03, "step": 24180, "train_speed(iter/s)": 1.471795 }, { "acc": 0.99972219, "epoch": 42.691968225948806, "grad_norm": 0.48472264409065247, "learning_rate": 5.679329812400214e-07, "loss": 0.00573693, "memory(GiB)": 15.03, "step": 24185, "train_speed(iter/s)": 1.471801 }, { "acc": 0.99963236, "epoch": 42.70079435127979, "grad_norm": 0.20010630786418915, "learning_rate": 5.665826638795612e-07, "loss": 0.01155209, "memory(GiB)": 15.03, "step": 24190, "train_speed(iter/s)": 1.471812 }, { "acc": 1.0, "epoch": 42.709620476610766, "grad_norm": 0.2860701382160187, "learning_rate": 5.652338601056235e-07, "loss": 0.00233324, "memory(GiB)": 15.03, "step": 24195, "train_speed(iter/s)": 1.471816 }, { "acc": 1.0, "epoch": 42.71844660194175, "grad_norm": 0.655714750289917, "learning_rate": 5.638865703786449e-07, "loss": 0.01073838, "memory(GiB)": 15.03, "step": 24200, "train_speed(iter/s)": 1.471837 }, { "acc": 1.0, "epoch": 42.72727272727273, "grad_norm": 0.24085047841072083, "learning_rate": 5.625407951585414e-07, "loss": 0.00248976, "memory(GiB)": 15.03, "step": 24205, "train_speed(iter/s)": 1.471846 }, { "acc": 1.0, "epoch": 42.73609885260371, "grad_norm": 0.5423467755317688, "learning_rate": 5.611965349047157e-07, "loss": 0.00352725, "memory(GiB)": 15.03, "step": 24210, "train_speed(iter/s)": 1.471846 }, { "acc": 1.0, "epoch": 42.74492497793469, "grad_norm": 0.015501591376960278, "learning_rate": 5.598537900760541e-07, "loss": 0.001038, "memory(GiB)": 15.03, "step": 24215, "train_speed(iter/s)": 1.471829 }, { "acc": 1.0, "epoch": 42.753751103265664, "grad_norm": 0.37811753153800964, "learning_rate": 5.585125611309189e-07, "loss": 0.00188228, "memory(GiB)": 15.03, "step": 24220, "train_speed(iter/s)": 1.471839 }, { "acc": 0.99941101, "epoch": 42.76257722859665, "grad_norm": 0.06510193645954132, "learning_rate": 5.571728485271631e-07, "loss": 0.00442246, "memory(GiB)": 15.03, "step": 24225, "train_speed(iter/s)": 1.471853 }, { "acc": 1.0, "epoch": 42.771403353927624, "grad_norm": 0.17622646689414978, "learning_rate": 5.558346527221153e-07, "loss": 0.0026866, "memory(GiB)": 15.03, "step": 24230, "train_speed(iter/s)": 1.471872 }, { "acc": 1.0, "epoch": 42.78022947925861, "grad_norm": 0.005589916370809078, "learning_rate": 5.54497974172594e-07, "loss": 0.00163842, "memory(GiB)": 15.03, "step": 24235, "train_speed(iter/s)": 1.471859 }, { "acc": 0.99978065, "epoch": 42.789055604589585, "grad_norm": 0.18560273945331573, "learning_rate": 5.531628133348895e-07, "loss": 0.00485748, "memory(GiB)": 15.03, "step": 24240, "train_speed(iter/s)": 1.471863 }, { "acc": 0.99968748, "epoch": 42.79788172992056, "grad_norm": 0.5643466114997864, "learning_rate": 5.51829170664783e-07, "loss": 0.00603951, "memory(GiB)": 15.03, "step": 24245, "train_speed(iter/s)": 1.471878 }, { "acc": 0.99984179, "epoch": 42.806707855251545, "grad_norm": 0.27268967032432556, "learning_rate": 5.504970466175347e-07, "loss": 0.0031678, "memory(GiB)": 15.03, "step": 24250, "train_speed(iter/s)": 1.471869 }, { "acc": 1.0, "epoch": 42.81553398058252, "grad_norm": 0.5259931683540344, "learning_rate": 5.491664416478844e-07, "loss": 0.00297854, "memory(GiB)": 15.03, "step": 24255, "train_speed(iter/s)": 1.47189 }, { "acc": 0.99975967, "epoch": 42.824360105913506, "grad_norm": 0.10487742722034454, "learning_rate": 5.478373562100551e-07, "loss": 0.00476313, "memory(GiB)": 15.03, "step": 24260, "train_speed(iter/s)": 1.471895 }, { "acc": 0.9998106, "epoch": 42.83318623124448, "grad_norm": 0.3861006796360016, "learning_rate": 5.465097907577501e-07, "loss": 0.00778672, "memory(GiB)": 15.03, "step": 24265, "train_speed(iter/s)": 1.471897 }, { "acc": 0.99981346, "epoch": 42.84201235657547, "grad_norm": 0.4787944555282593, "learning_rate": 5.451837457441554e-07, "loss": 0.00724489, "memory(GiB)": 15.03, "step": 24270, "train_speed(iter/s)": 1.471907 }, { "acc": 1.0, "epoch": 42.85083848190644, "grad_norm": 0.3863280713558197, "learning_rate": 5.438592216219376e-07, "loss": 0.00783265, "memory(GiB)": 15.03, "step": 24275, "train_speed(iter/s)": 1.471904 }, { "acc": 1.0, "epoch": 42.85966460723742, "grad_norm": 0.27761510014533997, "learning_rate": 5.425362188432425e-07, "loss": 0.00362938, "memory(GiB)": 15.03, "step": 24280, "train_speed(iter/s)": 1.4719 }, { "acc": 1.0, "epoch": 42.868490732568404, "grad_norm": 0.38375166058540344, "learning_rate": 5.412147378596968e-07, "loss": 0.00298903, "memory(GiB)": 15.03, "step": 24285, "train_speed(iter/s)": 1.4719 }, { "acc": 1.0, "epoch": 42.87731685789938, "grad_norm": 0.018607428297400475, "learning_rate": 5.398947791224097e-07, "loss": 0.0002666, "memory(GiB)": 15.03, "step": 24290, "train_speed(iter/s)": 1.471899 }, { "acc": 1.0, "epoch": 42.886142983230364, "grad_norm": 0.6946424841880798, "learning_rate": 5.385763430819705e-07, "loss": 0.00585412, "memory(GiB)": 15.03, "step": 24295, "train_speed(iter/s)": 1.471898 }, { "acc": 0.9997159, "epoch": 42.89496910856134, "grad_norm": 0.3373645544052124, "learning_rate": 5.372594301884467e-07, "loss": 0.0051316, "memory(GiB)": 15.03, "step": 24300, "train_speed(iter/s)": 1.471899 }, { "acc": 1.0, "epoch": 42.903795233892325, "grad_norm": 0.021355172619223595, "learning_rate": 5.359440408913877e-07, "loss": 0.00098886, "memory(GiB)": 15.03, "step": 24305, "train_speed(iter/s)": 1.471892 }, { "acc": 0.99970236, "epoch": 42.9126213592233, "grad_norm": 0.19133859872817993, "learning_rate": 5.346301756398201e-07, "loss": 0.00358951, "memory(GiB)": 15.03, "step": 24310, "train_speed(iter/s)": 1.471889 }, { "acc": 1.0, "epoch": 42.92144748455428, "grad_norm": 0.34636780619621277, "learning_rate": 5.333178348822542e-07, "loss": 0.00260328, "memory(GiB)": 15.03, "step": 24315, "train_speed(iter/s)": 1.471904 }, { "acc": 1.0, "epoch": 42.93027360988526, "grad_norm": 0.3617970943450928, "learning_rate": 5.320070190666779e-07, "loss": 0.00257472, "memory(GiB)": 15.03, "step": 24320, "train_speed(iter/s)": 1.471899 }, { "acc": 1.0, "epoch": 42.93909973521624, "grad_norm": 0.03686940670013428, "learning_rate": 5.306977286405579e-07, "loss": 0.00494577, "memory(GiB)": 15.03, "step": 24325, "train_speed(iter/s)": 1.471902 }, { "acc": 1.0, "epoch": 42.94792586054722, "grad_norm": 0.13913972675800323, "learning_rate": 5.293899640508397e-07, "loss": 0.00371744, "memory(GiB)": 15.03, "step": 24330, "train_speed(iter/s)": 1.471915 }, { "acc": 0.99975491, "epoch": 42.9567519858782, "grad_norm": 0.22040492296218872, "learning_rate": 5.280837257439499e-07, "loss": 0.00494213, "memory(GiB)": 15.03, "step": 24335, "train_speed(iter/s)": 1.471915 }, { "acc": 1.0, "epoch": 42.965578111209176, "grad_norm": 0.01642647013068199, "learning_rate": 5.26779014165796e-07, "loss": 0.00121004, "memory(GiB)": 15.03, "step": 24340, "train_speed(iter/s)": 1.471909 }, { "acc": 1.0, "epoch": 42.97440423654016, "grad_norm": 0.42815256118774414, "learning_rate": 5.254758297617565e-07, "loss": 0.00656587, "memory(GiB)": 15.03, "step": 24345, "train_speed(iter/s)": 1.471911 }, { "acc": 1.0, "epoch": 42.983230361871136, "grad_norm": 0.5431821346282959, "learning_rate": 5.241741729766975e-07, "loss": 0.00534479, "memory(GiB)": 15.03, "step": 24350, "train_speed(iter/s)": 1.471896 }, { "acc": 1.0, "epoch": 42.99205648720212, "grad_norm": 0.14571033418178558, "learning_rate": 5.228740442549575e-07, "loss": 0.00323928, "memory(GiB)": 15.03, "step": 24355, "train_speed(iter/s)": 1.471893 }, { "acc": 0.9998457, "epoch": 43.0008826125331, "grad_norm": 0.29329779744148254, "learning_rate": 5.215754440403587e-07, "loss": 0.0051532, "memory(GiB)": 15.03, "step": 24360, "train_speed(iter/s)": 1.471857 }, { "acc": 0.9997159, "epoch": 43.00970873786408, "grad_norm": 0.30985739827156067, "learning_rate": 5.202783727761938e-07, "loss": 0.00751359, "memory(GiB)": 15.03, "step": 24365, "train_speed(iter/s)": 1.471873 }, { "acc": 0.99975491, "epoch": 43.01853486319506, "grad_norm": 0.007734738755971193, "learning_rate": 5.189828309052425e-07, "loss": 0.00703726, "memory(GiB)": 15.03, "step": 24370, "train_speed(iter/s)": 1.471873 }, { "acc": 1.0, "epoch": 43.027360988526034, "grad_norm": 0.27276283502578735, "learning_rate": 5.176888188697548e-07, "loss": 0.00345855, "memory(GiB)": 15.03, "step": 24375, "train_speed(iter/s)": 1.471866 }, { "acc": 1.0, "epoch": 43.03618711385702, "grad_norm": 0.019281111657619476, "learning_rate": 5.163963371114651e-07, "loss": 0.00062425, "memory(GiB)": 15.03, "step": 24380, "train_speed(iter/s)": 1.471857 }, { "acc": 1.0, "epoch": 43.045013239187995, "grad_norm": 0.3589235842227936, "learning_rate": 5.151053860715788e-07, "loss": 0.0015711, "memory(GiB)": 15.03, "step": 24385, "train_speed(iter/s)": 1.471877 }, { "acc": 1.0, "epoch": 43.05383936451898, "grad_norm": 0.26923590898513794, "learning_rate": 5.13815966190784e-07, "loss": 0.00375959, "memory(GiB)": 15.03, "step": 24390, "train_speed(iter/s)": 1.471869 }, { "acc": 1.0, "epoch": 43.062665489849955, "grad_norm": 0.022759901359677315, "learning_rate": 5.125280779092443e-07, "loss": 0.00253901, "memory(GiB)": 15.03, "step": 24395, "train_speed(iter/s)": 1.471883 }, { "acc": 0.99962959, "epoch": 43.07149161518094, "grad_norm": 0.17498885095119476, "learning_rate": 5.112417216665997e-07, "loss": 0.00435847, "memory(GiB)": 15.03, "step": 24400, "train_speed(iter/s)": 1.471893 }, { "acc": 1.0, "epoch": 43.080317740511916, "grad_norm": 0.3935859203338623, "learning_rate": 5.099568979019683e-07, "loss": 0.00270307, "memory(GiB)": 15.03, "step": 24405, "train_speed(iter/s)": 1.471885 }, { "acc": 1.0, "epoch": 43.08914386584289, "grad_norm": 0.30087077617645264, "learning_rate": 5.086736070539437e-07, "loss": 0.00201353, "memory(GiB)": 15.03, "step": 24410, "train_speed(iter/s)": 1.471893 }, { "acc": 1.0, "epoch": 43.097969991173876, "grad_norm": 0.4221856892108917, "learning_rate": 5.073918495605978e-07, "loss": 0.00348621, "memory(GiB)": 15.03, "step": 24415, "train_speed(iter/s)": 1.471892 }, { "acc": 1.0, "epoch": 43.10679611650485, "grad_norm": 0.005810107104480267, "learning_rate": 5.061116258594795e-07, "loss": 0.0020407, "memory(GiB)": 15.03, "step": 24420, "train_speed(iter/s)": 1.471891 }, { "acc": 1.0, "epoch": 43.11562224183584, "grad_norm": 0.14516299962997437, "learning_rate": 5.048329363876122e-07, "loss": 0.00364409, "memory(GiB)": 15.03, "step": 24425, "train_speed(iter/s)": 1.471896 }, { "acc": 1.0, "epoch": 43.12444836716681, "grad_norm": 0.38540858030319214, "learning_rate": 5.035557815814958e-07, "loss": 0.00306929, "memory(GiB)": 15.03, "step": 24430, "train_speed(iter/s)": 1.471895 }, { "acc": 1.0, "epoch": 43.13327449249779, "grad_norm": 0.0037584705278277397, "learning_rate": 5.022801618771067e-07, "loss": 0.00512815, "memory(GiB)": 15.03, "step": 24435, "train_speed(iter/s)": 1.471878 }, { "acc": 1.0, "epoch": 43.142100617828774, "grad_norm": 0.0031765643507242203, "learning_rate": 5.010060777098987e-07, "loss": 0.00393665, "memory(GiB)": 15.03, "step": 24440, "train_speed(iter/s)": 1.471888 }, { "acc": 0.99981613, "epoch": 43.15092674315975, "grad_norm": 0.15071386098861694, "learning_rate": 4.997335295147998e-07, "loss": 0.00434655, "memory(GiB)": 15.03, "step": 24445, "train_speed(iter/s)": 1.471895 }, { "acc": 0.99978809, "epoch": 43.159752868490735, "grad_norm": 0.06754852831363678, "learning_rate": 4.984625177262146e-07, "loss": 0.00484316, "memory(GiB)": 15.03, "step": 24450, "train_speed(iter/s)": 1.471893 }, { "acc": 1.0, "epoch": 43.16857899382171, "grad_norm": 0.0668192058801651, "learning_rate": 4.971930427780201e-07, "loss": 0.00493442, "memory(GiB)": 15.03, "step": 24455, "train_speed(iter/s)": 1.4719 }, { "acc": 1.0, "epoch": 43.177405119152695, "grad_norm": 0.004147069063037634, "learning_rate": 4.959251051035738e-07, "loss": 0.0019103, "memory(GiB)": 15.03, "step": 24460, "train_speed(iter/s)": 1.471907 }, { "acc": 1.0, "epoch": 43.18623124448367, "grad_norm": 0.22913344204425812, "learning_rate": 4.946587051357073e-07, "loss": 0.00310496, "memory(GiB)": 15.03, "step": 24465, "train_speed(iter/s)": 1.471914 }, { "acc": 1.0, "epoch": 43.19505736981465, "grad_norm": 0.11371398717164993, "learning_rate": 4.933938433067219e-07, "loss": 0.00239207, "memory(GiB)": 15.03, "step": 24470, "train_speed(iter/s)": 1.47192 }, { "acc": 1.0, "epoch": 43.20388349514563, "grad_norm": 0.36479413509368896, "learning_rate": 4.921305200484012e-07, "loss": 0.00235415, "memory(GiB)": 15.03, "step": 24475, "train_speed(iter/s)": 1.471925 }, { "acc": 1.0, "epoch": 43.21270962047661, "grad_norm": 0.06875593215227127, "learning_rate": 4.908687357919973e-07, "loss": 0.00360762, "memory(GiB)": 15.03, "step": 24480, "train_speed(iter/s)": 1.471929 }, { "acc": 0.9998311, "epoch": 43.22153574580759, "grad_norm": 0.16959308087825775, "learning_rate": 4.89608490968244e-07, "loss": 0.00484145, "memory(GiB)": 15.03, "step": 24485, "train_speed(iter/s)": 1.471927 }, { "acc": 0.99969511, "epoch": 43.23036187113857, "grad_norm": 0.003672570688650012, "learning_rate": 4.883497860073402e-07, "loss": 0.00493631, "memory(GiB)": 15.03, "step": 24490, "train_speed(iter/s)": 1.471941 }, { "acc": 1.0, "epoch": 43.23918799646955, "grad_norm": 0.23420043289661407, "learning_rate": 4.870926213389688e-07, "loss": 0.0017152, "memory(GiB)": 15.03, "step": 24495, "train_speed(iter/s)": 1.471933 }, { "acc": 1.0, "epoch": 43.24801412180053, "grad_norm": 0.13908414542675018, "learning_rate": 4.85836997392279e-07, "loss": 0.00187968, "memory(GiB)": 15.03, "step": 24500, "train_speed(iter/s)": 1.471934 }, { "acc": 1.0, "epoch": 43.25684024713151, "grad_norm": 0.008355307392776012, "learning_rate": 4.845829145959017e-07, "loss": 0.00365787, "memory(GiB)": 15.03, "step": 24505, "train_speed(iter/s)": 1.471945 }, { "acc": 0.99984179, "epoch": 43.26566637246249, "grad_norm": 0.09325806051492691, "learning_rate": 4.833303733779327e-07, "loss": 0.00398287, "memory(GiB)": 15.03, "step": 24510, "train_speed(iter/s)": 1.471945 }, { "acc": 1.0, "epoch": 43.27449249779347, "grad_norm": 0.002255092142149806, "learning_rate": 4.820793741659489e-07, "loss": 0.00266357, "memory(GiB)": 15.03, "step": 24515, "train_speed(iter/s)": 1.471937 }, { "acc": 1.0, "epoch": 43.28331862312445, "grad_norm": 0.18007299304008484, "learning_rate": 4.808299173869994e-07, "loss": 0.00955372, "memory(GiB)": 15.03, "step": 24520, "train_speed(iter/s)": 1.47195 }, { "acc": 1.0, "epoch": 43.29214474845543, "grad_norm": 0.0031202249228954315, "learning_rate": 4.795820034676033e-07, "loss": 0.00601385, "memory(GiB)": 15.03, "step": 24525, "train_speed(iter/s)": 1.47195 }, { "acc": 0.9998106, "epoch": 43.300970873786405, "grad_norm": 0.10853568464517593, "learning_rate": 4.783356328337588e-07, "loss": 0.00411711, "memory(GiB)": 15.03, "step": 24530, "train_speed(iter/s)": 1.471964 }, { "acc": 0.99986706, "epoch": 43.30979699911739, "grad_norm": 0.06130742281675339, "learning_rate": 4.770908059109292e-07, "loss": 0.00236419, "memory(GiB)": 15.03, "step": 24535, "train_speed(iter/s)": 1.471964 }, { "acc": 1.0, "epoch": 43.318623124448365, "grad_norm": 0.23869335651397705, "learning_rate": 4.7584752312405796e-07, "loss": 0.00581637, "memory(GiB)": 15.03, "step": 24540, "train_speed(iter/s)": 1.47198 }, { "acc": 1.0, "epoch": 43.32744924977935, "grad_norm": 0.09922749549150467, "learning_rate": 4.7460578489756004e-07, "loss": 0.00398948, "memory(GiB)": 15.03, "step": 24545, "train_speed(iter/s)": 1.471992 }, { "acc": 1.0, "epoch": 43.336275375110326, "grad_norm": 0.004540830850601196, "learning_rate": 4.733655916553208e-07, "loss": 0.00574694, "memory(GiB)": 15.03, "step": 24550, "train_speed(iter/s)": 1.472002 }, { "acc": 1.0, "epoch": 43.34510150044131, "grad_norm": 0.0030718897469341755, "learning_rate": 4.721269438206996e-07, "loss": 0.00467249, "memory(GiB)": 15.03, "step": 24555, "train_speed(iter/s)": 1.472016 }, { "acc": 1.0, "epoch": 43.353927625772286, "grad_norm": 0.0633729100227356, "learning_rate": 4.7088984181652717e-07, "loss": 0.00339558, "memory(GiB)": 15.03, "step": 24560, "train_speed(iter/s)": 1.472014 }, { "acc": 0.99956894, "epoch": 43.36275375110326, "grad_norm": 0.01330020185559988, "learning_rate": 4.6965428606510953e-07, "loss": 0.00768645, "memory(GiB)": 15.03, "step": 24565, "train_speed(iter/s)": 1.472029 }, { "acc": 1.0, "epoch": 43.37157987643425, "grad_norm": 0.19564923644065857, "learning_rate": 4.684202769882214e-07, "loss": 0.00315042, "memory(GiB)": 15.03, "step": 24570, "train_speed(iter/s)": 1.472024 }, { "acc": 1.0, "epoch": 43.38040600176522, "grad_norm": 0.17605485022068024, "learning_rate": 4.6718781500711143e-07, "loss": 0.00385233, "memory(GiB)": 15.03, "step": 24575, "train_speed(iter/s)": 1.472037 }, { "acc": 1.0, "epoch": 43.38923212709621, "grad_norm": 0.07948023825883865, "learning_rate": 4.659569005424989e-07, "loss": 0.00654928, "memory(GiB)": 15.03, "step": 24580, "train_speed(iter/s)": 1.472016 }, { "acc": 1.0, "epoch": 43.398058252427184, "grad_norm": 0.21827948093414307, "learning_rate": 4.647275340145755e-07, "loss": 0.00578467, "memory(GiB)": 15.03, "step": 24585, "train_speed(iter/s)": 1.472013 }, { "acc": 1.0, "epoch": 43.40688437775817, "grad_norm": 0.09780970215797424, "learning_rate": 4.634997158430075e-07, "loss": 0.00138345, "memory(GiB)": 15.03, "step": 24590, "train_speed(iter/s)": 1.472003 }, { "acc": 1.0, "epoch": 43.415710503089144, "grad_norm": 0.0023677160497754812, "learning_rate": 4.622734464469248e-07, "loss": 0.00125379, "memory(GiB)": 15.03, "step": 24595, "train_speed(iter/s)": 1.471984 }, { "acc": 1.0, "epoch": 43.42453662842012, "grad_norm": 0.4954034984111786, "learning_rate": 4.61048726244938e-07, "loss": 0.00298897, "memory(GiB)": 15.03, "step": 24600, "train_speed(iter/s)": 1.471981 }, { "acc": 1.0, "epoch": 43.433362753751105, "grad_norm": 0.09713775664567947, "learning_rate": 4.598255556551207e-07, "loss": 0.00143053, "memory(GiB)": 15.03, "step": 24605, "train_speed(iter/s)": 1.471985 }, { "acc": 1.0, "epoch": 43.44218887908208, "grad_norm": 0.0028047477826476097, "learning_rate": 4.5860393509502563e-07, "loss": 0.00118908, "memory(GiB)": 15.03, "step": 24610, "train_speed(iter/s)": 1.471984 }, { "acc": 0.99979506, "epoch": 43.451015004413065, "grad_norm": 0.24780765175819397, "learning_rate": 4.573838649816667e-07, "loss": 0.00559202, "memory(GiB)": 15.03, "step": 24615, "train_speed(iter/s)": 1.471977 }, { "acc": 0.99960938, "epoch": 43.45984112974404, "grad_norm": 0.010267210192978382, "learning_rate": 4.5616534573153745e-07, "loss": 0.0048087, "memory(GiB)": 15.03, "step": 24620, "train_speed(iter/s)": 1.471992 }, { "acc": 1.0, "epoch": 43.46866725507502, "grad_norm": 0.22001604735851288, "learning_rate": 4.549483777605964e-07, "loss": 0.00181405, "memory(GiB)": 15.03, "step": 24625, "train_speed(iter/s)": 1.472006 }, { "acc": 0.99982872, "epoch": 43.477493380406, "grad_norm": 0.22055664658546448, "learning_rate": 4.537329614842751e-07, "loss": 0.00351488, "memory(GiB)": 15.03, "step": 24630, "train_speed(iter/s)": 1.472006 }, { "acc": 0.99962606, "epoch": 43.48631950573698, "grad_norm": 0.2644612491130829, "learning_rate": 4.525190973174768e-07, "loss": 0.00381115, "memory(GiB)": 15.03, "step": 24635, "train_speed(iter/s)": 1.472021 }, { "acc": 0.9998106, "epoch": 43.49514563106796, "grad_norm": 0.41778281331062317, "learning_rate": 4.5130678567456917e-07, "loss": 0.00424187, "memory(GiB)": 15.03, "step": 24640, "train_speed(iter/s)": 1.472023 }, { "acc": 0.99975967, "epoch": 43.50397175639894, "grad_norm": 0.29499390721321106, "learning_rate": 4.500960269693965e-07, "loss": 0.00289532, "memory(GiB)": 15.03, "step": 24645, "train_speed(iter/s)": 1.472022 }, { "acc": 1.0, "epoch": 43.512797881729924, "grad_norm": 0.3142264187335968, "learning_rate": 4.4888682161526827e-07, "loss": 0.00534432, "memory(GiB)": 15.03, "step": 24650, "train_speed(iter/s)": 1.472017 }, { "acc": 0.99986706, "epoch": 43.5216240070609, "grad_norm": 0.37316522002220154, "learning_rate": 4.4767917002496815e-07, "loss": 0.0048906, "memory(GiB)": 15.03, "step": 24655, "train_speed(iter/s)": 1.472018 }, { "acc": 1.0, "epoch": 43.53045013239188, "grad_norm": 0.14388485252857208, "learning_rate": 4.4647307261074286e-07, "loss": 0.00257055, "memory(GiB)": 15.03, "step": 24660, "train_speed(iter/s)": 1.472024 }, { "acc": 1.0, "epoch": 43.53927625772286, "grad_norm": 0.2059125304222107, "learning_rate": 4.452685297843146e-07, "loss": 0.00242572, "memory(GiB)": 15.03, "step": 24665, "train_speed(iter/s)": 1.472033 }, { "acc": 1.0, "epoch": 43.54810238305384, "grad_norm": 0.3023088276386261, "learning_rate": 4.4406554195687317e-07, "loss": 0.00502383, "memory(GiB)": 15.03, "step": 24670, "train_speed(iter/s)": 1.472033 }, { "acc": 1.0, "epoch": 43.55692850838482, "grad_norm": 0.23390939831733704, "learning_rate": 4.4286410953907693e-07, "loss": 0.0021347, "memory(GiB)": 15.03, "step": 24675, "train_speed(iter/s)": 1.47205 }, { "acc": 0.99976416, "epoch": 43.5657546337158, "grad_norm": 0.24901646375656128, "learning_rate": 4.416642329410534e-07, "loss": 0.00510521, "memory(GiB)": 15.03, "step": 24680, "train_speed(iter/s)": 1.472067 }, { "acc": 1.0, "epoch": 43.57458075904678, "grad_norm": 0.034361451864242554, "learning_rate": 4.4046591257239773e-07, "loss": 0.0057832, "memory(GiB)": 15.03, "step": 24685, "train_speed(iter/s)": 1.47207 }, { "acc": 0.99986115, "epoch": 43.58340688437776, "grad_norm": 0.26715004444122314, "learning_rate": 4.392691488421774e-07, "loss": 0.00432682, "memory(GiB)": 15.03, "step": 24690, "train_speed(iter/s)": 1.472071 }, { "acc": 1.0, "epoch": 43.592233009708735, "grad_norm": 0.3244686424732208, "learning_rate": 4.380739421589247e-07, "loss": 0.00439004, "memory(GiB)": 15.03, "step": 24695, "train_speed(iter/s)": 1.47207 }, { "acc": 0.9998457, "epoch": 43.60105913503972, "grad_norm": 0.278507798910141, "learning_rate": 4.3688029293064307e-07, "loss": 0.00391162, "memory(GiB)": 15.03, "step": 24700, "train_speed(iter/s)": 1.472057 }, { "acc": 0.99980164, "epoch": 43.609885260370696, "grad_norm": 0.09304408729076385, "learning_rate": 4.356882015648017e-07, "loss": 0.00276484, "memory(GiB)": 15.03, "step": 24705, "train_speed(iter/s)": 1.472057 }, { "acc": 1.0, "epoch": 43.61871138570168, "grad_norm": 0.4619811773300171, "learning_rate": 4.344976684683401e-07, "loss": 0.00678189, "memory(GiB)": 15.03, "step": 24710, "train_speed(iter/s)": 1.472058 }, { "acc": 1.0, "epoch": 43.627537511032656, "grad_norm": 0.009750180877745152, "learning_rate": 4.333086940476679e-07, "loss": 0.00379238, "memory(GiB)": 15.03, "step": 24715, "train_speed(iter/s)": 1.472052 }, { "acc": 1.0, "epoch": 43.63636363636363, "grad_norm": 0.32609760761260986, "learning_rate": 4.321212787086555e-07, "loss": 0.00357064, "memory(GiB)": 15.03, "step": 24720, "train_speed(iter/s)": 1.472047 }, { "acc": 1.0, "epoch": 43.64518976169462, "grad_norm": 0.283220112323761, "learning_rate": 4.309354228566486e-07, "loss": 0.00320231, "memory(GiB)": 15.03, "step": 24725, "train_speed(iter/s)": 1.472065 }, { "acc": 1.0, "epoch": 43.654015887025594, "grad_norm": 0.28440922498703003, "learning_rate": 4.297511268964556e-07, "loss": 0.00175428, "memory(GiB)": 15.03, "step": 24730, "train_speed(iter/s)": 1.472084 }, { "acc": 1.0, "epoch": 43.66284201235658, "grad_norm": 0.2733234763145447, "learning_rate": 4.2856839123235646e-07, "loss": 0.00358209, "memory(GiB)": 15.03, "step": 24735, "train_speed(iter/s)": 1.47209 }, { "acc": 1.0, "epoch": 43.671668137687554, "grad_norm": 0.015781771391630173, "learning_rate": 4.273872162680952e-07, "loss": 0.00307341, "memory(GiB)": 15.03, "step": 24740, "train_speed(iter/s)": 1.4721 }, { "acc": 1.0, "epoch": 43.68049426301854, "grad_norm": 0.5776798129081726, "learning_rate": 4.262076024068844e-07, "loss": 0.00561214, "memory(GiB)": 15.03, "step": 24745, "train_speed(iter/s)": 1.472096 }, { "acc": 1.0, "epoch": 43.689320388349515, "grad_norm": 0.29485461115837097, "learning_rate": 4.250295500514022e-07, "loss": 0.00510826, "memory(GiB)": 15.03, "step": 24750, "train_speed(iter/s)": 1.47209 }, { "acc": 1.0, "epoch": 43.69814651368049, "grad_norm": 0.47074341773986816, "learning_rate": 4.2385305960379617e-07, "loss": 0.00582542, "memory(GiB)": 15.03, "step": 24755, "train_speed(iter/s)": 1.472087 }, { "acc": 0.9998457, "epoch": 43.706972639011475, "grad_norm": 0.5341167449951172, "learning_rate": 4.22678131465682e-07, "loss": 0.00688915, "memory(GiB)": 15.03, "step": 24760, "train_speed(iter/s)": 1.472086 }, { "acc": 1.0, "epoch": 43.71579876434245, "grad_norm": 0.04431227594614029, "learning_rate": 4.2150476603813537e-07, "loss": 0.00508141, "memory(GiB)": 15.03, "step": 24765, "train_speed(iter/s)": 1.472092 }, { "acc": 1.0, "epoch": 43.724624889673436, "grad_norm": 0.09694565832614899, "learning_rate": 4.203329637217053e-07, "loss": 0.00362323, "memory(GiB)": 15.03, "step": 24770, "train_speed(iter/s)": 1.472098 }, { "acc": 0.99981346, "epoch": 43.73345101500441, "grad_norm": 0.04214610531926155, "learning_rate": 4.19162724916404e-07, "loss": 0.00123167, "memory(GiB)": 15.03, "step": 24775, "train_speed(iter/s)": 1.472104 }, { "acc": 1.0, "epoch": 43.74227714033539, "grad_norm": 0.005462651140987873, "learning_rate": 4.179940500217118e-07, "loss": 0.00598625, "memory(GiB)": 15.03, "step": 24780, "train_speed(iter/s)": 1.472088 }, { "acc": 1.0, "epoch": 43.75110326566637, "grad_norm": 0.26448705792427063, "learning_rate": 4.16826939436572e-07, "loss": 0.00621228, "memory(GiB)": 15.03, "step": 24785, "train_speed(iter/s)": 1.472087 }, { "acc": 1.0, "epoch": 43.75992939099735, "grad_norm": 0.21484357118606567, "learning_rate": 4.1566139355939663e-07, "loss": 0.00113797, "memory(GiB)": 15.03, "step": 24790, "train_speed(iter/s)": 1.472082 }, { "acc": 1.0, "epoch": 43.76875551632833, "grad_norm": 0.006031593773514032, "learning_rate": 4.1449741278806415e-07, "loss": 0.00238945, "memory(GiB)": 15.03, "step": 24795, "train_speed(iter/s)": 1.472077 }, { "acc": 1.0, "epoch": 43.77758164165931, "grad_norm": 0.009653680957853794, "learning_rate": 4.133349975199164e-07, "loss": 0.00517376, "memory(GiB)": 15.03, "step": 24800, "train_speed(iter/s)": 1.472067 }, { "acc": 0.99960938, "epoch": 43.786407766990294, "grad_norm": 0.3631892204284668, "learning_rate": 4.1217414815176186e-07, "loss": 0.0062146, "memory(GiB)": 15.03, "step": 24805, "train_speed(iter/s)": 1.472063 }, { "acc": 1.0, "epoch": 43.79523389232127, "grad_norm": 0.039065323770046234, "learning_rate": 4.110148650798738e-07, "loss": 0.00100642, "memory(GiB)": 15.03, "step": 24810, "train_speed(iter/s)": 1.472072 }, { "acc": 0.99984179, "epoch": 43.80406001765225, "grad_norm": 0.014737566001713276, "learning_rate": 4.0985714869999345e-07, "loss": 0.00190343, "memory(GiB)": 15.03, "step": 24815, "train_speed(iter/s)": 1.472068 }, { "acc": 1.0, "epoch": 43.81288614298323, "grad_norm": 0.49501582980155945, "learning_rate": 4.087009994073244e-07, "loss": 0.00358898, "memory(GiB)": 15.03, "step": 24820, "train_speed(iter/s)": 1.472048 }, { "acc": 1.0, "epoch": 43.82171226831421, "grad_norm": 0.06800848245620728, "learning_rate": 4.0754641759653533e-07, "loss": 0.00133246, "memory(GiB)": 15.03, "step": 24825, "train_speed(iter/s)": 1.472035 }, { "acc": 1.0, "epoch": 43.83053839364519, "grad_norm": 0.1781284064054489, "learning_rate": 4.0639340366176104e-07, "loss": 0.00216615, "memory(GiB)": 15.03, "step": 24830, "train_speed(iter/s)": 1.472041 }, { "acc": 1.0, "epoch": 43.83936451897617, "grad_norm": 0.18582499027252197, "learning_rate": 4.0524195799660007e-07, "loss": 0.00394409, "memory(GiB)": 15.03, "step": 24835, "train_speed(iter/s)": 1.472045 }, { "acc": 0.99980164, "epoch": 43.84819064430715, "grad_norm": 0.01343527901917696, "learning_rate": 4.0409208099411857e-07, "loss": 0.00621221, "memory(GiB)": 15.03, "step": 24840, "train_speed(iter/s)": 1.47204 }, { "acc": 1.0, "epoch": 43.85701676963813, "grad_norm": 0.1796160191297531, "learning_rate": 4.0294377304684337e-07, "loss": 0.00206851, "memory(GiB)": 15.03, "step": 24845, "train_speed(iter/s)": 1.472063 }, { "acc": 1.0, "epoch": 43.865842894969106, "grad_norm": 0.18713070452213287, "learning_rate": 4.017970345467677e-07, "loss": 0.00192449, "memory(GiB)": 15.03, "step": 24850, "train_speed(iter/s)": 1.472056 }, { "acc": 1.0, "epoch": 43.87466902030009, "grad_norm": 0.004123594146221876, "learning_rate": 4.0065186588534707e-07, "loss": 0.0053217, "memory(GiB)": 15.03, "step": 24855, "train_speed(iter/s)": 1.472062 }, { "acc": 0.9997282, "epoch": 43.883495145631066, "grad_norm": 0.5342593193054199, "learning_rate": 3.995082674535046e-07, "loss": 0.00860782, "memory(GiB)": 15.03, "step": 24860, "train_speed(iter/s)": 1.472066 }, { "acc": 1.0, "epoch": 43.89232127096205, "grad_norm": 0.014267977327108383, "learning_rate": 3.98366239641624e-07, "loss": 0.00177219, "memory(GiB)": 15.03, "step": 24865, "train_speed(iter/s)": 1.47208 }, { "acc": 0.99982586, "epoch": 43.90114739629303, "grad_norm": 0.04471032693982124, "learning_rate": 3.972257828395555e-07, "loss": 0.0022651, "memory(GiB)": 15.03, "step": 24870, "train_speed(iter/s)": 1.472085 }, { "acc": 1.0, "epoch": 43.90997352162401, "grad_norm": 0.016300857067108154, "learning_rate": 3.9608689743660924e-07, "loss": 0.00222061, "memory(GiB)": 15.03, "step": 24875, "train_speed(iter/s)": 1.472083 }, { "acc": 1.0, "epoch": 43.91879964695499, "grad_norm": 0.05482291430234909, "learning_rate": 3.9494958382156367e-07, "loss": 0.00521299, "memory(GiB)": 15.03, "step": 24880, "train_speed(iter/s)": 1.472093 }, { "acc": 1.0, "epoch": 43.927625772285964, "grad_norm": 0.23630110919475555, "learning_rate": 3.9381384238265943e-07, "loss": 0.0022431, "memory(GiB)": 15.03, "step": 24885, "train_speed(iter/s)": 1.472101 }, { "acc": 0.99986706, "epoch": 43.93645189761695, "grad_norm": 0.2167433202266693, "learning_rate": 3.9267967350759585e-07, "loss": 0.00404702, "memory(GiB)": 15.03, "step": 24890, "train_speed(iter/s)": 1.472108 }, { "acc": 0.99975491, "epoch": 43.945278022947925, "grad_norm": 0.30550694465637207, "learning_rate": 3.915470775835422e-07, "loss": 0.00328674, "memory(GiB)": 15.03, "step": 24895, "train_speed(iter/s)": 1.4721 }, { "acc": 0.99921875, "epoch": 43.95410414827891, "grad_norm": 0.10034029930830002, "learning_rate": 3.9041605499712624e-07, "loss": 0.00640923, "memory(GiB)": 15.03, "step": 24900, "train_speed(iter/s)": 1.472111 }, { "acc": 0.99970236, "epoch": 43.962930273609885, "grad_norm": 0.335548996925354, "learning_rate": 3.892866061344419e-07, "loss": 0.00307435, "memory(GiB)": 15.03, "step": 24905, "train_speed(iter/s)": 1.472136 }, { "acc": 1.0, "epoch": 43.97175639894086, "grad_norm": 0.20551176369190216, "learning_rate": 3.881587313810408e-07, "loss": 0.00128191, "memory(GiB)": 15.03, "step": 24910, "train_speed(iter/s)": 1.472145 }, { "acc": 1.0, "epoch": 43.980582524271846, "grad_norm": 0.14892730116844177, "learning_rate": 3.870324311219429e-07, "loss": 0.00246983, "memory(GiB)": 15.03, "step": 24915, "train_speed(iter/s)": 1.47215 }, { "acc": 1.0, "epoch": 43.98940864960282, "grad_norm": 0.005147302057594061, "learning_rate": 3.859077057416283e-07, "loss": 0.00620047, "memory(GiB)": 15.03, "step": 24920, "train_speed(iter/s)": 1.472151 }, { "acc": 1.0, "epoch": 43.998234774933806, "grad_norm": 0.14910398423671722, "learning_rate": 3.847845556240387e-07, "loss": 0.00326029, "memory(GiB)": 15.03, "step": 24925, "train_speed(iter/s)": 1.47216 }, { "acc": 1.0, "epoch": 44.00706090026478, "grad_norm": 0.2131347358226776, "learning_rate": 3.836629811525796e-07, "loss": 0.00368113, "memory(GiB)": 15.03, "step": 24930, "train_speed(iter/s)": 1.472135 }, { "acc": 1.0, "epoch": 44.01588702559577, "grad_norm": 0.42958903312683105, "learning_rate": 3.825429827101158e-07, "loss": 0.00473756, "memory(GiB)": 15.03, "step": 24935, "train_speed(iter/s)": 1.47214 }, { "acc": 1.0, "epoch": 44.02471315092674, "grad_norm": 0.22899048030376434, "learning_rate": 3.814245606789788e-07, "loss": 0.00372403, "memory(GiB)": 15.03, "step": 24940, "train_speed(iter/s)": 1.472148 }, { "acc": 0.99984179, "epoch": 44.03353927625772, "grad_norm": 0.15684926509857178, "learning_rate": 3.8030771544095833e-07, "loss": 0.00454332, "memory(GiB)": 15.03, "step": 24945, "train_speed(iter/s)": 1.472146 }, { "acc": 1.0, "epoch": 44.042365401588704, "grad_norm": 0.20964913070201874, "learning_rate": 3.7919244737730683e-07, "loss": 0.00252106, "memory(GiB)": 15.03, "step": 24950, "train_speed(iter/s)": 1.472163 }, { "acc": 0.99978065, "epoch": 44.05119152691968, "grad_norm": 0.18083994090557098, "learning_rate": 3.7807875686873774e-07, "loss": 0.006074, "memory(GiB)": 15.03, "step": 24955, "train_speed(iter/s)": 1.472166 }, { "acc": 1.0, "epoch": 44.060017652250664, "grad_norm": 0.11980843544006348, "learning_rate": 3.7696664429542664e-07, "loss": 0.00566126, "memory(GiB)": 15.03, "step": 24960, "train_speed(iter/s)": 1.472178 }, { "acc": 0.99960938, "epoch": 44.06884377758164, "grad_norm": 0.038321226835250854, "learning_rate": 3.7585611003701195e-07, "loss": 0.00468912, "memory(GiB)": 15.03, "step": 24965, "train_speed(iter/s)": 1.472189 }, { "acc": 1.0, "epoch": 44.077669902912625, "grad_norm": 0.4748092293739319, "learning_rate": 3.7474715447259064e-07, "loss": 0.00286511, "memory(GiB)": 15.03, "step": 24970, "train_speed(iter/s)": 1.472193 }, { "acc": 0.99962997, "epoch": 44.0864960282436, "grad_norm": 0.3623943626880646, "learning_rate": 3.736397779807217e-07, "loss": 0.00578825, "memory(GiB)": 15.03, "step": 24975, "train_speed(iter/s)": 1.472186 }, { "acc": 1.0, "epoch": 44.09532215357458, "grad_norm": 0.3601240813732147, "learning_rate": 3.7253398093942524e-07, "loss": 0.00378655, "memory(GiB)": 15.03, "step": 24980, "train_speed(iter/s)": 1.472192 }, { "acc": 1.0, "epoch": 44.10414827890556, "grad_norm": 0.27173760533332825, "learning_rate": 3.714297637261833e-07, "loss": 0.00246612, "memory(GiB)": 15.03, "step": 24985, "train_speed(iter/s)": 1.472176 }, { "acc": 1.0, "epoch": 44.11297440423654, "grad_norm": 0.4819261133670807, "learning_rate": 3.7032712671793714e-07, "loss": 0.00484061, "memory(GiB)": 15.03, "step": 24990, "train_speed(iter/s)": 1.472185 }, { "acc": 1.0, "epoch": 44.12180052956752, "grad_norm": 0.007975260727107525, "learning_rate": 3.692260702910886e-07, "loss": 0.00125232, "memory(GiB)": 15.03, "step": 24995, "train_speed(iter/s)": 1.472195 }, { "acc": 1.0, "epoch": 44.1306266548985, "grad_norm": 0.1953269988298416, "learning_rate": 3.681265948214994e-07, "loss": 0.00333067, "memory(GiB)": 15.03, "step": 25000, "train_speed(iter/s)": 1.472189 }, { "acc": 1.0, "epoch": 44.139452780229476, "grad_norm": 0.004499353934079409, "learning_rate": 3.6702870068449444e-07, "loss": 0.00261692, "memory(GiB)": 15.03, "step": 25005, "train_speed(iter/s)": 1.472193 }, { "acc": 1.0, "epoch": 44.14827890556046, "grad_norm": 0.11580200493335724, "learning_rate": 3.6593238825485705e-07, "loss": 0.00218447, "memory(GiB)": 15.03, "step": 25010, "train_speed(iter/s)": 1.472204 }, { "acc": 1.0, "epoch": 44.15710503089144, "grad_norm": 0.2588663697242737, "learning_rate": 3.648376579068287e-07, "loss": 0.0081229, "memory(GiB)": 15.03, "step": 25015, "train_speed(iter/s)": 1.472202 }, { "acc": 0.99973402, "epoch": 44.16593115622242, "grad_norm": 0.05932711809873581, "learning_rate": 3.6374451001411427e-07, "loss": 0.0021394, "memory(GiB)": 15.03, "step": 25020, "train_speed(iter/s)": 1.472218 }, { "acc": 1.0, "epoch": 44.1747572815534, "grad_norm": 0.3170051574707031, "learning_rate": 3.6265294494987494e-07, "loss": 0.0022979, "memory(GiB)": 15.03, "step": 25025, "train_speed(iter/s)": 1.472215 }, { "acc": 1.0, "epoch": 44.18358340688438, "grad_norm": 0.21543829143047333, "learning_rate": 3.615629630867361e-07, "loss": 0.00585484, "memory(GiB)": 15.03, "step": 25030, "train_speed(iter/s)": 1.472229 }, { "acc": 1.0, "epoch": 44.19240953221536, "grad_norm": 0.059361547231674194, "learning_rate": 3.604745647967766e-07, "loss": 0.00148668, "memory(GiB)": 15.03, "step": 25035, "train_speed(iter/s)": 1.472227 }, { "acc": 0.99991436, "epoch": 44.201235657546334, "grad_norm": 0.02246857061982155, "learning_rate": 3.593877504515405e-07, "loss": 0.00169694, "memory(GiB)": 15.03, "step": 25040, "train_speed(iter/s)": 1.472219 }, { "acc": 1.0, "epoch": 44.21006178287732, "grad_norm": 0.006394298747181892, "learning_rate": 3.583025204220282e-07, "loss": 0.00207271, "memory(GiB)": 15.03, "step": 25045, "train_speed(iter/s)": 1.472215 }, { "acc": 1.0, "epoch": 44.218887908208295, "grad_norm": 0.4989708960056305, "learning_rate": 3.5721887507870033e-07, "loss": 0.00439272, "memory(GiB)": 15.03, "step": 25050, "train_speed(iter/s)": 1.472227 }, { "acc": 1.0, "epoch": 44.22771403353928, "grad_norm": 0.28531113266944885, "learning_rate": 3.5613681479147505e-07, "loss": 0.00285121, "memory(GiB)": 15.03, "step": 25055, "train_speed(iter/s)": 1.472241 }, { "acc": 1.0, "epoch": 44.236540158870255, "grad_norm": 0.13382308185100555, "learning_rate": 3.5505633992973043e-07, "loss": 0.00283632, "memory(GiB)": 15.03, "step": 25060, "train_speed(iter/s)": 1.47226 }, { "acc": 1.0, "epoch": 44.24536628420124, "grad_norm": 0.2143760770559311, "learning_rate": 3.5397745086230443e-07, "loss": 0.00651058, "memory(GiB)": 15.03, "step": 25065, "train_speed(iter/s)": 1.472246 }, { "acc": 1.0, "epoch": 44.254192409532216, "grad_norm": 0.05110679566860199, "learning_rate": 3.5290014795749133e-07, "loss": 0.00123486, "memory(GiB)": 15.03, "step": 25070, "train_speed(iter/s)": 1.472241 }, { "acc": 1.0, "epoch": 44.26301853486319, "grad_norm": 0.03796418756246567, "learning_rate": 3.518244315830472e-07, "loss": 0.00049478, "memory(GiB)": 15.03, "step": 25075, "train_speed(iter/s)": 1.472242 }, { "acc": 1.0, "epoch": 44.271844660194176, "grad_norm": 0.20688584446907043, "learning_rate": 3.50750302106181e-07, "loss": 0.00376081, "memory(GiB)": 15.03, "step": 25080, "train_speed(iter/s)": 1.472246 }, { "acc": 0.99984179, "epoch": 44.28067078552515, "grad_norm": 0.21242666244506836, "learning_rate": 3.496777598935662e-07, "loss": 0.00359253, "memory(GiB)": 15.03, "step": 25085, "train_speed(iter/s)": 1.472219 }, { "acc": 1.0, "epoch": 44.28949691085614, "grad_norm": 0.004581824876368046, "learning_rate": 3.486068053113321e-07, "loss": 0.00261208, "memory(GiB)": 15.03, "step": 25090, "train_speed(iter/s)": 1.472218 }, { "acc": 1.0, "epoch": 44.298323036187114, "grad_norm": 0.11504954844713211, "learning_rate": 3.47537438725064e-07, "loss": 0.00089321, "memory(GiB)": 15.03, "step": 25095, "train_speed(iter/s)": 1.47222 }, { "acc": 0.99986706, "epoch": 44.30714916151809, "grad_norm": 0.39441317319869995, "learning_rate": 3.464696604998085e-07, "loss": 0.00442707, "memory(GiB)": 15.03, "step": 25100, "train_speed(iter/s)": 1.472212 }, { "acc": 1.0, "epoch": 44.315975286849074, "grad_norm": 0.12095576524734497, "learning_rate": 3.4540347100006586e-07, "loss": 0.00187878, "memory(GiB)": 15.03, "step": 25105, "train_speed(iter/s)": 1.472217 }, { "acc": 1.0, "epoch": 44.32480141218005, "grad_norm": 0.010540344752371311, "learning_rate": 3.4433887058979847e-07, "loss": 0.00368336, "memory(GiB)": 15.03, "step": 25110, "train_speed(iter/s)": 1.472217 }, { "acc": 1.0, "epoch": 44.333627537511035, "grad_norm": 0.17724531888961792, "learning_rate": 3.432758596324242e-07, "loss": 0.00229654, "memory(GiB)": 15.03, "step": 25115, "train_speed(iter/s)": 1.472224 }, { "acc": 1.0, "epoch": 44.34245366284201, "grad_norm": 0.3318701684474945, "learning_rate": 3.4221443849081716e-07, "loss": 0.00338937, "memory(GiB)": 15.03, "step": 25120, "train_speed(iter/s)": 1.47224 }, { "acc": 1.0, "epoch": 44.351279788172995, "grad_norm": 0.17705075442790985, "learning_rate": 3.411546075273101e-07, "loss": 0.00280203, "memory(GiB)": 15.03, "step": 25125, "train_speed(iter/s)": 1.472241 }, { "acc": 1.0, "epoch": 44.36010591350397, "grad_norm": 0.3663312494754791, "learning_rate": 3.4009636710369333e-07, "loss": 0.00304877, "memory(GiB)": 15.03, "step": 25130, "train_speed(iter/s)": 1.472246 }, { "acc": 1.0, "epoch": 44.36893203883495, "grad_norm": 0.016424402594566345, "learning_rate": 3.390397175812147e-07, "loss": 0.00626453, "memory(GiB)": 15.03, "step": 25135, "train_speed(iter/s)": 1.472241 }, { "acc": 1.0, "epoch": 44.37775816416593, "grad_norm": 0.35656148195266724, "learning_rate": 3.379846593205751e-07, "loss": 0.00600343, "memory(GiB)": 15.03, "step": 25140, "train_speed(iter/s)": 1.472233 }, { "acc": 1.0, "epoch": 44.38658428949691, "grad_norm": 0.29874125123023987, "learning_rate": 3.369311926819381e-07, "loss": 0.00547158, "memory(GiB)": 15.03, "step": 25145, "train_speed(iter/s)": 1.472222 }, { "acc": 0.99980469, "epoch": 44.39541041482789, "grad_norm": 0.005099767353385687, "learning_rate": 3.3587931802491794e-07, "loss": 0.00779343, "memory(GiB)": 15.03, "step": 25150, "train_speed(iter/s)": 1.472223 }, { "acc": 1.0, "epoch": 44.40423654015887, "grad_norm": 0.3713805377483368, "learning_rate": 3.3482903570859185e-07, "loss": 0.00632032, "memory(GiB)": 15.03, "step": 25155, "train_speed(iter/s)": 1.472218 }, { "acc": 0.99933109, "epoch": 44.413062665489846, "grad_norm": 0.33464932441711426, "learning_rate": 3.3378034609148665e-07, "loss": 0.00539458, "memory(GiB)": 15.03, "step": 25160, "train_speed(iter/s)": 1.472211 }, { "acc": 1.0, "epoch": 44.42188879082083, "grad_norm": 0.13520990312099457, "learning_rate": 3.327332495315908e-07, "loss": 0.0047124, "memory(GiB)": 15.03, "step": 25165, "train_speed(iter/s)": 1.472226 }, { "acc": 1.0, "epoch": 44.43071491615181, "grad_norm": 0.079896941781044, "learning_rate": 3.316877463863453e-07, "loss": 0.00073701, "memory(GiB)": 15.03, "step": 25170, "train_speed(iter/s)": 1.472237 }, { "acc": 1.0, "epoch": 44.43954104148279, "grad_norm": 0.5580422878265381, "learning_rate": 3.3064383701265054e-07, "loss": 0.00753371, "memory(GiB)": 15.03, "step": 25175, "train_speed(iter/s)": 1.47224 }, { "acc": 1.0, "epoch": 44.44836716681377, "grad_norm": 0.6137739419937134, "learning_rate": 3.2960152176686197e-07, "loss": 0.00584498, "memory(GiB)": 15.03, "step": 25180, "train_speed(iter/s)": 1.472233 }, { "acc": 0.99974995, "epoch": 44.45719329214475, "grad_norm": 0.9532871246337891, "learning_rate": 3.285608010047872e-07, "loss": 0.01288429, "memory(GiB)": 15.03, "step": 25185, "train_speed(iter/s)": 1.472242 }, { "acc": 1.0, "epoch": 44.46601941747573, "grad_norm": 0.037250153720378876, "learning_rate": 3.2752167508169474e-07, "loss": 0.0018697, "memory(GiB)": 15.03, "step": 25190, "train_speed(iter/s)": 1.472256 }, { "acc": 1.0, "epoch": 44.474845542806705, "grad_norm": 0.0131763881072402, "learning_rate": 3.264841443523048e-07, "loss": 0.00497968, "memory(GiB)": 15.03, "step": 25195, "train_speed(iter/s)": 1.472251 }, { "acc": 1.0, "epoch": 44.48367166813769, "grad_norm": 0.08135177195072174, "learning_rate": 3.2544820917079637e-07, "loss": 0.00509778, "memory(GiB)": 15.03, "step": 25200, "train_speed(iter/s)": 1.472258 }, { "acc": 0.99989033, "epoch": 44.492497793468665, "grad_norm": 0.32129502296447754, "learning_rate": 3.2441386989080013e-07, "loss": 0.00444234, "memory(GiB)": 15.03, "step": 25205, "train_speed(iter/s)": 1.472255 }, { "acc": 1.0, "epoch": 44.50132391879965, "grad_norm": 0.407539427280426, "learning_rate": 3.2338112686540497e-07, "loss": 0.00496018, "memory(GiB)": 15.03, "step": 25210, "train_speed(iter/s)": 1.472252 }, { "acc": 1.0, "epoch": 44.510150044130626, "grad_norm": 0.26162439584732056, "learning_rate": 3.2234998044715346e-07, "loss": 0.00353146, "memory(GiB)": 15.03, "step": 25215, "train_speed(iter/s)": 1.472272 }, { "acc": 1.0, "epoch": 44.51897616946161, "grad_norm": 0.2167697548866272, "learning_rate": 3.2132043098804455e-07, "loss": 0.00220127, "memory(GiB)": 15.03, "step": 25220, "train_speed(iter/s)": 1.472273 }, { "acc": 0.99978809, "epoch": 44.527802294792586, "grad_norm": 0.3441566824913025, "learning_rate": 3.2029247883952904e-07, "loss": 0.00524512, "memory(GiB)": 15.03, "step": 25225, "train_speed(iter/s)": 1.472257 }, { "acc": 1.0, "epoch": 44.53662842012356, "grad_norm": 0.004987304098904133, "learning_rate": 3.19266124352515e-07, "loss": 0.00495827, "memory(GiB)": 15.03, "step": 25230, "train_speed(iter/s)": 1.472256 }, { "acc": 1.0, "epoch": 44.54545454545455, "grad_norm": 0.051060717552900314, "learning_rate": 3.1824136787736483e-07, "loss": 0.00401204, "memory(GiB)": 15.03, "step": 25235, "train_speed(iter/s)": 1.472266 }, { "acc": 1.0, "epoch": 44.55428067078552, "grad_norm": 0.2309550791978836, "learning_rate": 3.1721820976389554e-07, "loss": 0.00123563, "memory(GiB)": 15.03, "step": 25240, "train_speed(iter/s)": 1.472257 }, { "acc": 1.0, "epoch": 44.56310679611651, "grad_norm": 0.3192908465862274, "learning_rate": 3.1619665036137703e-07, "loss": 0.00556969, "memory(GiB)": 15.03, "step": 25245, "train_speed(iter/s)": 1.472276 }, { "acc": 1.0, "epoch": 44.571932921447484, "grad_norm": 0.014068477787077427, "learning_rate": 3.1517669001853393e-07, "loss": 0.00201891, "memory(GiB)": 15.03, "step": 25250, "train_speed(iter/s)": 1.472279 }, { "acc": 1.0, "epoch": 44.58075904677847, "grad_norm": 0.36714428663253784, "learning_rate": 3.141583290835467e-07, "loss": 0.00501347, "memory(GiB)": 15.03, "step": 25255, "train_speed(iter/s)": 1.472265 }, { "acc": 1.0, "epoch": 44.589585172109444, "grad_norm": 0.12007353454828262, "learning_rate": 3.1314156790405015e-07, "loss": 0.00243951, "memory(GiB)": 15.03, "step": 25260, "train_speed(iter/s)": 1.472257 }, { "acc": 1.0, "epoch": 44.59841129744042, "grad_norm": 0.2300254851579666, "learning_rate": 3.1212640682712764e-07, "loss": 0.00265799, "memory(GiB)": 15.03, "step": 25265, "train_speed(iter/s)": 1.472266 }, { "acc": 1.0, "epoch": 44.607237422771405, "grad_norm": 0.14560580253601074, "learning_rate": 3.1111284619932383e-07, "loss": 0.00228397, "memory(GiB)": 15.03, "step": 25270, "train_speed(iter/s)": 1.472282 }, { "acc": 0.99986706, "epoch": 44.61606354810238, "grad_norm": 0.008475994691252708, "learning_rate": 3.101008863666307e-07, "loss": 0.0019422, "memory(GiB)": 15.03, "step": 25275, "train_speed(iter/s)": 1.472302 }, { "acc": 1.0, "epoch": 44.624889673433366, "grad_norm": 0.09930496662855148, "learning_rate": 3.090905276744978e-07, "loss": 0.00490355, "memory(GiB)": 15.03, "step": 25280, "train_speed(iter/s)": 1.472303 }, { "acc": 1.0, "epoch": 44.63371579876434, "grad_norm": 0.4677456021308899, "learning_rate": 3.080817704678271e-07, "loss": 0.00767011, "memory(GiB)": 15.03, "step": 25285, "train_speed(iter/s)": 1.472308 }, { "acc": 1.0, "epoch": 44.64254192409532, "grad_norm": 0.03771212324500084, "learning_rate": 3.0707461509097305e-07, "loss": 0.00202214, "memory(GiB)": 15.03, "step": 25290, "train_speed(iter/s)": 1.472307 }, { "acc": 1.0, "epoch": 44.6513680494263, "grad_norm": 0.3243062496185303, "learning_rate": 3.0606906188774286e-07, "loss": 0.00854157, "memory(GiB)": 15.03, "step": 25295, "train_speed(iter/s)": 1.472319 }, { "acc": 1.0, "epoch": 44.66019417475728, "grad_norm": 0.04436984658241272, "learning_rate": 3.0506511120139814e-07, "loss": 0.00112519, "memory(GiB)": 15.03, "step": 25300, "train_speed(iter/s)": 1.472326 }, { "acc": 0.9998106, "epoch": 44.66902030008826, "grad_norm": 0.2137315273284912, "learning_rate": 3.040627633746555e-07, "loss": 0.00392568, "memory(GiB)": 15.03, "step": 25305, "train_speed(iter/s)": 1.472345 }, { "acc": 1.0, "epoch": 44.67784642541924, "grad_norm": 0.3002799451351166, "learning_rate": 3.0306201874967803e-07, "loss": 0.00324691, "memory(GiB)": 15.03, "step": 25310, "train_speed(iter/s)": 1.472348 }, { "acc": 1.0, "epoch": 44.686672550750224, "grad_norm": 0.09556324779987335, "learning_rate": 3.0206287766808776e-07, "loss": 0.00216026, "memory(GiB)": 15.03, "step": 25315, "train_speed(iter/s)": 1.472345 }, { "acc": 1.0, "epoch": 44.6954986760812, "grad_norm": 0.21044185757637024, "learning_rate": 3.0106534047095664e-07, "loss": 0.00375651, "memory(GiB)": 15.03, "step": 25320, "train_speed(iter/s)": 1.472334 }, { "acc": 1.0, "epoch": 44.70432480141218, "grad_norm": 0.0290613304823637, "learning_rate": 3.000694074988104e-07, "loss": 0.00154494, "memory(GiB)": 15.03, "step": 25325, "train_speed(iter/s)": 1.472338 }, { "acc": 1.0, "epoch": 44.71315092674316, "grad_norm": 0.370931476354599, "learning_rate": 2.990750790916235e-07, "loss": 0.00376043, "memory(GiB)": 15.03, "step": 25330, "train_speed(iter/s)": 1.472355 }, { "acc": 1.0, "epoch": 44.72197705207414, "grad_norm": 0.3473140001296997, "learning_rate": 2.9808235558882787e-07, "loss": 0.00384775, "memory(GiB)": 15.03, "step": 25335, "train_speed(iter/s)": 1.472364 }, { "acc": 1.0, "epoch": 44.73080317740512, "grad_norm": 0.24921512603759766, "learning_rate": 2.970912373293045e-07, "loss": 0.00171404, "memory(GiB)": 15.03, "step": 25340, "train_speed(iter/s)": 1.472357 }, { "acc": 0.99918633, "epoch": 44.7396293027361, "grad_norm": 0.34293732047080994, "learning_rate": 2.9610172465138667e-07, "loss": 0.00960125, "memory(GiB)": 15.03, "step": 25345, "train_speed(iter/s)": 1.47236 }, { "acc": 0.99977684, "epoch": 44.748455428067075, "grad_norm": 0.20790821313858032, "learning_rate": 2.9511381789286037e-07, "loss": 0.00498253, "memory(GiB)": 15.03, "step": 25350, "train_speed(iter/s)": 1.47236 }, { "acc": 0.99980774, "epoch": 44.75728155339806, "grad_norm": 0.2610751986503601, "learning_rate": 2.941275173909616e-07, "loss": 0.00446962, "memory(GiB)": 15.03, "step": 25355, "train_speed(iter/s)": 1.472351 }, { "acc": 1.0, "epoch": 44.766107678729036, "grad_norm": 0.22565065324306488, "learning_rate": 2.931428234823811e-07, "loss": 0.00147442, "memory(GiB)": 15.03, "step": 25360, "train_speed(iter/s)": 1.472369 }, { "acc": 1.0, "epoch": 44.77493380406002, "grad_norm": 0.15433485805988312, "learning_rate": 2.9215973650325926e-07, "loss": 0.0019322, "memory(GiB)": 15.03, "step": 25365, "train_speed(iter/s)": 1.472359 }, { "acc": 0.99975491, "epoch": 44.783759929390996, "grad_norm": 0.39850425720214844, "learning_rate": 2.9117825678918676e-07, "loss": 0.00727159, "memory(GiB)": 15.03, "step": 25370, "train_speed(iter/s)": 1.472361 }, { "acc": 1.0, "epoch": 44.79258605472198, "grad_norm": 0.023130837827920914, "learning_rate": 2.90198384675207e-07, "loss": 0.00086896, "memory(GiB)": 15.03, "step": 25375, "train_speed(iter/s)": 1.472355 }, { "acc": 1.0, "epoch": 44.80141218005296, "grad_norm": 0.18929478526115417, "learning_rate": 2.892201204958157e-07, "loss": 0.00724961, "memory(GiB)": 15.03, "step": 25380, "train_speed(iter/s)": 1.472342 }, { "acc": 0.99986115, "epoch": 44.81023830538393, "grad_norm": 0.13292233645915985, "learning_rate": 2.8824346458495837e-07, "loss": 0.00332226, "memory(GiB)": 15.03, "step": 25385, "train_speed(iter/s)": 1.472352 }, { "acc": 0.99921875, "epoch": 44.81906443071492, "grad_norm": 0.02271365560591221, "learning_rate": 2.87268417276031e-07, "loss": 0.00356631, "memory(GiB)": 15.03, "step": 25390, "train_speed(iter/s)": 1.472343 }, { "acc": 1.0, "epoch": 44.827890556045894, "grad_norm": 0.564680814743042, "learning_rate": 2.8629497890188194e-07, "loss": 0.00407994, "memory(GiB)": 15.03, "step": 25395, "train_speed(iter/s)": 1.472365 }, { "acc": 0.9998106, "epoch": 44.83671668137688, "grad_norm": 0.04891844838857651, "learning_rate": 2.853231497948082e-07, "loss": 0.00182047, "memory(GiB)": 15.03, "step": 25400, "train_speed(iter/s)": 1.472362 }, { "acc": 1.0, "epoch": 44.845542806707854, "grad_norm": 0.19876104593276978, "learning_rate": 2.8435293028655994e-07, "loss": 0.00664229, "memory(GiB)": 15.03, "step": 25405, "train_speed(iter/s)": 1.472378 }, { "acc": 1.0, "epoch": 44.85436893203884, "grad_norm": 0.2041526883840561, "learning_rate": 2.83384320708337e-07, "loss": 0.00465676, "memory(GiB)": 15.03, "step": 25410, "train_speed(iter/s)": 1.472401 }, { "acc": 1.0, "epoch": 44.863195057369815, "grad_norm": 0.36328792572021484, "learning_rate": 2.824173213907879e-07, "loss": 0.00828966, "memory(GiB)": 15.03, "step": 25415, "train_speed(iter/s)": 1.472401 }, { "acc": 1.0, "epoch": 44.87202118270079, "grad_norm": 0.4005582630634308, "learning_rate": 2.8145193266401273e-07, "loss": 0.00462528, "memory(GiB)": 15.03, "step": 25420, "train_speed(iter/s)": 1.4724 }, { "acc": 0.99961109, "epoch": 44.880847308031775, "grad_norm": 0.7198876142501831, "learning_rate": 2.8048815485756323e-07, "loss": 0.01157408, "memory(GiB)": 15.03, "step": 25425, "train_speed(iter/s)": 1.472394 }, { "acc": 1.0, "epoch": 44.88967343336275, "grad_norm": 0.2200309783220291, "learning_rate": 2.795259883004405e-07, "loss": 0.00198442, "memory(GiB)": 15.03, "step": 25430, "train_speed(iter/s)": 1.472406 }, { "acc": 0.99989033, "epoch": 44.898499558693736, "grad_norm": 0.003840730292722583, "learning_rate": 2.785654333210923e-07, "loss": 0.00363751, "memory(GiB)": 15.03, "step": 25435, "train_speed(iter/s)": 1.472416 }, { "acc": 0.99986115, "epoch": 44.90732568402471, "grad_norm": 0.6242836713790894, "learning_rate": 2.7760649024742185e-07, "loss": 0.00818277, "memory(GiB)": 15.03, "step": 25440, "train_speed(iter/s)": 1.472429 }, { "acc": 1.0, "epoch": 44.916151809355696, "grad_norm": 0.35498374700546265, "learning_rate": 2.766491594067767e-07, "loss": 0.0044413, "memory(GiB)": 15.03, "step": 25445, "train_speed(iter/s)": 1.472428 }, { "acc": 1.0, "epoch": 44.92497793468667, "grad_norm": 0.3103071451187134, "learning_rate": 2.7569344112596004e-07, "loss": 0.00205838, "memory(GiB)": 15.03, "step": 25450, "train_speed(iter/s)": 1.472431 }, { "acc": 1.0, "epoch": 44.93380406001765, "grad_norm": 0.051164254546165466, "learning_rate": 2.747393357312171e-07, "loss": 0.00246765, "memory(GiB)": 15.03, "step": 25455, "train_speed(iter/s)": 1.472444 }, { "acc": 0.99970236, "epoch": 44.942630185348634, "grad_norm": 0.2979535162448883, "learning_rate": 2.737868435482486e-07, "loss": 0.00342275, "memory(GiB)": 15.03, "step": 25460, "train_speed(iter/s)": 1.472465 }, { "acc": 1.0, "epoch": 44.95145631067961, "grad_norm": 0.4884444773197174, "learning_rate": 2.7283596490220303e-07, "loss": 0.00441424, "memory(GiB)": 15.03, "step": 25465, "train_speed(iter/s)": 1.47248 }, { "acc": 0.99976416, "epoch": 44.960282436010594, "grad_norm": 0.23496882617473602, "learning_rate": 2.7188670011767715e-07, "loss": 0.00436586, "memory(GiB)": 15.03, "step": 25470, "train_speed(iter/s)": 1.472486 }, { "acc": 0.9998457, "epoch": 44.96910856134157, "grad_norm": 0.3752577006816864, "learning_rate": 2.709390495187171e-07, "loss": 0.00376737, "memory(GiB)": 15.03, "step": 25475, "train_speed(iter/s)": 1.472495 }, { "acc": 1.0, "epoch": 44.97793468667255, "grad_norm": 0.23564057052135468, "learning_rate": 2.699930134288173e-07, "loss": 0.00192256, "memory(GiB)": 15.03, "step": 25480, "train_speed(iter/s)": 1.472496 }, { "acc": 1.0, "epoch": 44.98676081200353, "grad_norm": 0.03638656437397003, "learning_rate": 2.6904859217092317e-07, "loss": 0.00346003, "memory(GiB)": 15.03, "step": 25485, "train_speed(iter/s)": 1.472504 }, { "acc": 1.0, "epoch": 44.99558693733451, "grad_norm": 0.006170360837131739, "learning_rate": 2.681057860674268e-07, "loss": 0.00136624, "memory(GiB)": 15.03, "step": 25490, "train_speed(iter/s)": 1.472499 }, { "acc": 1.0, "epoch": 45.00441306266549, "grad_norm": 0.11999630928039551, "learning_rate": 2.671645954401707e-07, "loss": 0.00231271, "memory(GiB)": 15.03, "step": 25495, "train_speed(iter/s)": 1.472438 }, { "acc": 1.0, "epoch": 45.01323918799647, "grad_norm": 0.24248522520065308, "learning_rate": 2.6622502061044305e-07, "loss": 0.00189334, "memory(GiB)": 15.03, "step": 25500, "train_speed(iter/s)": 1.472447 }, { "acc": 1.0, "epoch": 45.02206531332745, "grad_norm": 0.35558417439460754, "learning_rate": 2.6528706189898343e-07, "loss": 0.00353588, "memory(GiB)": 15.03, "step": 25505, "train_speed(iter/s)": 1.47244 }, { "acc": 1.0, "epoch": 45.03089143865843, "grad_norm": 0.18268819153308868, "learning_rate": 2.6435071962597923e-07, "loss": 0.00290073, "memory(GiB)": 15.03, "step": 25510, "train_speed(iter/s)": 1.472438 }, { "acc": 1.0, "epoch": 45.039717563989406, "grad_norm": 0.21828635036945343, "learning_rate": 2.6341599411106496e-07, "loss": 0.00545499, "memory(GiB)": 15.03, "step": 25515, "train_speed(iter/s)": 1.472437 }, { "acc": 1.0, "epoch": 45.04854368932039, "grad_norm": 0.016691097989678383, "learning_rate": 2.62482885673323e-07, "loss": 0.00336325, "memory(GiB)": 15.03, "step": 25520, "train_speed(iter/s)": 1.472436 }, { "acc": 1.0, "epoch": 45.057369814651366, "grad_norm": 0.008348474279046059, "learning_rate": 2.615513946312843e-07, "loss": 0.00541567, "memory(GiB)": 15.03, "step": 25525, "train_speed(iter/s)": 1.472442 }, { "acc": 1.0, "epoch": 45.06619593998235, "grad_norm": 0.19785919785499573, "learning_rate": 2.606215213029295e-07, "loss": 0.00152765, "memory(GiB)": 15.03, "step": 25530, "train_speed(iter/s)": 1.472442 }, { "acc": 1.0, "epoch": 45.07502206531333, "grad_norm": 0.1596856266260147, "learning_rate": 2.596932660056839e-07, "loss": 0.00166305, "memory(GiB)": 15.03, "step": 25535, "train_speed(iter/s)": 1.472448 }, { "acc": 1.0, "epoch": 45.083848190644304, "grad_norm": 0.10172059386968613, "learning_rate": 2.5876662905642243e-07, "loss": 0.00075324, "memory(GiB)": 15.03, "step": 25540, "train_speed(iter/s)": 1.472472 }, { "acc": 0.9998106, "epoch": 45.09267431597529, "grad_norm": 0.3058759868144989, "learning_rate": 2.5784161077146533e-07, "loss": 0.01008508, "memory(GiB)": 15.03, "step": 25545, "train_speed(iter/s)": 1.472472 }, { "acc": 1.0, "epoch": 45.101500441306264, "grad_norm": 0.06555098295211792, "learning_rate": 2.569182114665839e-07, "loss": 0.00302366, "memory(GiB)": 15.03, "step": 25550, "train_speed(iter/s)": 1.472463 }, { "acc": 1.0, "epoch": 45.11032656663725, "grad_norm": 0.012238557450473309, "learning_rate": 2.559964314569956e-07, "loss": 0.00201939, "memory(GiB)": 15.03, "step": 25555, "train_speed(iter/s)": 1.472473 }, { "acc": 1.0, "epoch": 45.119152691968225, "grad_norm": 0.043585751205682755, "learning_rate": 2.550762710573612e-07, "loss": 0.00162298, "memory(GiB)": 15.03, "step": 25560, "train_speed(iter/s)": 1.472478 }, { "acc": 0.99986115, "epoch": 45.12797881729921, "grad_norm": 0.31941232085227966, "learning_rate": 2.5415773058179455e-07, "loss": 0.00622718, "memory(GiB)": 15.03, "step": 25565, "train_speed(iter/s)": 1.47249 }, { "acc": 1.0, "epoch": 45.136804942630185, "grad_norm": 0.35015568137168884, "learning_rate": 2.5324081034385237e-07, "loss": 0.00232034, "memory(GiB)": 15.03, "step": 25570, "train_speed(iter/s)": 1.472501 }, { "acc": 1.0, "epoch": 45.14563106796116, "grad_norm": 0.1453854739665985, "learning_rate": 2.5232551065654074e-07, "loss": 0.00273767, "memory(GiB)": 15.03, "step": 25575, "train_speed(iter/s)": 1.472505 }, { "acc": 1.0, "epoch": 45.154457193292146, "grad_norm": 0.20474329590797424, "learning_rate": 2.5141183183230963e-07, "loss": 0.00067496, "memory(GiB)": 15.03, "step": 25580, "train_speed(iter/s)": 1.472513 }, { "acc": 1.0, "epoch": 45.16328331862312, "grad_norm": 0.321094810962677, "learning_rate": 2.504997741830582e-07, "loss": 0.0049335, "memory(GiB)": 15.03, "step": 25585, "train_speed(iter/s)": 1.472511 }, { "acc": 1.0, "epoch": 45.172109443954106, "grad_norm": 0.12401662766933441, "learning_rate": 2.4958933802013247e-07, "loss": 0.00083113, "memory(GiB)": 15.03, "step": 25590, "train_speed(iter/s)": 1.472526 }, { "acc": 0.99974995, "epoch": 45.18093556928508, "grad_norm": 0.1771753579378128, "learning_rate": 2.486805236543232e-07, "loss": 0.0109859, "memory(GiB)": 15.03, "step": 25595, "train_speed(iter/s)": 1.472532 }, { "acc": 1.0, "epoch": 45.18976169461607, "grad_norm": 0.20581570267677307, "learning_rate": 2.477733313958684e-07, "loss": 0.00209534, "memory(GiB)": 15.03, "step": 25600, "train_speed(iter/s)": 1.472542 }, { "acc": 1.0, "epoch": 45.19858781994704, "grad_norm": 0.19846057891845703, "learning_rate": 2.468677615544516e-07, "loss": 0.00359423, "memory(GiB)": 15.03, "step": 25605, "train_speed(iter/s)": 1.472547 }, { "acc": 1.0, "epoch": 45.20741394527802, "grad_norm": 0.21668051183223724, "learning_rate": 2.4596381443920456e-07, "loss": 0.00473334, "memory(GiB)": 15.03, "step": 25610, "train_speed(iter/s)": 1.472554 }, { "acc": 1.0, "epoch": 45.216240070609004, "grad_norm": 0.21069982647895813, "learning_rate": 2.4506149035870183e-07, "loss": 0.00175129, "memory(GiB)": 15.03, "step": 25615, "train_speed(iter/s)": 1.472569 }, { "acc": 1.0, "epoch": 45.22506619593998, "grad_norm": 0.286691278219223, "learning_rate": 2.4416078962096836e-07, "loss": 0.00594479, "memory(GiB)": 15.03, "step": 25620, "train_speed(iter/s)": 1.472569 }, { "acc": 1.0, "epoch": 45.233892321270964, "grad_norm": 0.10905555635690689, "learning_rate": 2.4326171253346907e-07, "loss": 0.00609207, "memory(GiB)": 15.03, "step": 25625, "train_speed(iter/s)": 1.472568 }, { "acc": 1.0, "epoch": 45.24271844660194, "grad_norm": 0.2703552842140198, "learning_rate": 2.423642594031195e-07, "loss": 0.00254078, "memory(GiB)": 15.03, "step": 25630, "train_speed(iter/s)": 1.472575 }, { "acc": 1.0, "epoch": 45.251544571932925, "grad_norm": 0.41207772493362427, "learning_rate": 2.414684305362805e-07, "loss": 0.00583719, "memory(GiB)": 15.03, "step": 25635, "train_speed(iter/s)": 1.472584 }, { "acc": 1.0, "epoch": 45.2603706972639, "grad_norm": 0.1890016496181488, "learning_rate": 2.405742262387552e-07, "loss": 0.00454516, "memory(GiB)": 15.03, "step": 25640, "train_speed(iter/s)": 1.472595 }, { "acc": 1.0, "epoch": 45.26919682259488, "grad_norm": 0.039728280156850815, "learning_rate": 2.3968164681579504e-07, "loss": 0.00110571, "memory(GiB)": 15.03, "step": 25645, "train_speed(iter/s)": 1.472602 }, { "acc": 1.0, "epoch": 45.27802294792586, "grad_norm": 0.01993120275437832, "learning_rate": 2.387906925720946e-07, "loss": 0.0035611, "memory(GiB)": 15.03, "step": 25650, "train_speed(iter/s)": 1.472593 }, { "acc": 0.99921875, "epoch": 45.28684907325684, "grad_norm": 0.4373582601547241, "learning_rate": 2.3790136381179738e-07, "loss": 0.00741903, "memory(GiB)": 15.03, "step": 25655, "train_speed(iter/s)": 1.472598 }, { "acc": 1.0, "epoch": 45.29567519858782, "grad_norm": 0.20121733844280243, "learning_rate": 2.370136608384874e-07, "loss": 0.00208522, "memory(GiB)": 15.03, "step": 25660, "train_speed(iter/s)": 1.472595 }, { "acc": 1.0, "epoch": 45.3045013239188, "grad_norm": 0.22681479156017303, "learning_rate": 2.3612758395519697e-07, "loss": 0.00460652, "memory(GiB)": 15.03, "step": 25665, "train_speed(iter/s)": 1.472599 }, { "acc": 1.0, "epoch": 45.313327449249776, "grad_norm": 0.006244304124265909, "learning_rate": 2.35243133464401e-07, "loss": 0.00166066, "memory(GiB)": 15.03, "step": 25670, "train_speed(iter/s)": 1.472605 }, { "acc": 0.99980774, "epoch": 45.32215357458076, "grad_norm": 0.04498986154794693, "learning_rate": 2.3436030966802113e-07, "loss": 0.00317962, "memory(GiB)": 15.03, "step": 25675, "train_speed(iter/s)": 1.472611 }, { "acc": 1.0, "epoch": 45.33097969991174, "grad_norm": 0.13168898224830627, "learning_rate": 2.3347911286742506e-07, "loss": 0.00196143, "memory(GiB)": 15.03, "step": 25680, "train_speed(iter/s)": 1.472617 }, { "acc": 1.0, "epoch": 45.33980582524272, "grad_norm": 0.0020824798848479986, "learning_rate": 2.3259954336341918e-07, "loss": 0.00394457, "memory(GiB)": 15.03, "step": 25685, "train_speed(iter/s)": 1.472603 }, { "acc": 1.0, "epoch": 45.3486319505737, "grad_norm": 0.3782796859741211, "learning_rate": 2.317216014562606e-07, "loss": 0.0059939, "memory(GiB)": 15.03, "step": 25690, "train_speed(iter/s)": 1.472602 }, { "acc": 1.0, "epoch": 45.35745807590468, "grad_norm": 0.15017665922641754, "learning_rate": 2.3084528744564723e-07, "loss": 0.00286669, "memory(GiB)": 15.03, "step": 25695, "train_speed(iter/s)": 1.4726 }, { "acc": 1.0, "epoch": 45.36628420123566, "grad_norm": 0.26303863525390625, "learning_rate": 2.2997060163072442e-07, "loss": 0.00266709, "memory(GiB)": 15.03, "step": 25700, "train_speed(iter/s)": 1.472592 }, { "acc": 1.0, "epoch": 45.375110326566634, "grad_norm": 0.059114836156368256, "learning_rate": 2.290975443100767e-07, "loss": 0.00586463, "memory(GiB)": 15.03, "step": 25705, "train_speed(iter/s)": 1.472596 }, { "acc": 1.0, "epoch": 45.38393645189762, "grad_norm": 0.2795005738735199, "learning_rate": 2.2822611578173808e-07, "loss": 0.00731531, "memory(GiB)": 15.03, "step": 25710, "train_speed(iter/s)": 1.472589 }, { "acc": 1.0, "epoch": 45.392762577228595, "grad_norm": 0.23160094022750854, "learning_rate": 2.273563163431836e-07, "loss": 0.00623483, "memory(GiB)": 15.03, "step": 25715, "train_speed(iter/s)": 1.472595 }, { "acc": 0.99986706, "epoch": 45.40158870255958, "grad_norm": 0.12124837934970856, "learning_rate": 2.2648814629133268e-07, "loss": 0.00331905, "memory(GiB)": 15.03, "step": 25720, "train_speed(iter/s)": 1.472604 }, { "acc": 1.0, "epoch": 45.410414827890556, "grad_norm": 0.10599515587091446, "learning_rate": 2.2562160592254975e-07, "loss": 0.0023449, "memory(GiB)": 15.03, "step": 25725, "train_speed(iter/s)": 1.4726 }, { "acc": 1.0, "epoch": 45.41924095322153, "grad_norm": 0.003425831440836191, "learning_rate": 2.247566955326396e-07, "loss": 0.00357288, "memory(GiB)": 15.03, "step": 25730, "train_speed(iter/s)": 1.472601 }, { "acc": 1.0, "epoch": 45.428067078552516, "grad_norm": 0.42046865820884705, "learning_rate": 2.2389341541685496e-07, "loss": 0.00283714, "memory(GiB)": 15.03, "step": 25735, "train_speed(iter/s)": 1.472607 }, { "acc": 1.0, "epoch": 45.43689320388349, "grad_norm": 0.5261374115943909, "learning_rate": 2.2303176586988773e-07, "loss": 0.0081896, "memory(GiB)": 15.03, "step": 25740, "train_speed(iter/s)": 1.47262 }, { "acc": 0.99921875, "epoch": 45.44571932921448, "grad_norm": 0.23094680905342102, "learning_rate": 2.2217174718587825e-07, "loss": 0.00313357, "memory(GiB)": 15.03, "step": 25745, "train_speed(iter/s)": 1.472622 }, { "acc": 1.0, "epoch": 45.45454545454545, "grad_norm": 0.3946073651313782, "learning_rate": 2.2131335965840445e-07, "loss": 0.00353223, "memory(GiB)": 15.03, "step": 25750, "train_speed(iter/s)": 1.472624 }, { "acc": 1.0, "epoch": 45.46337157987644, "grad_norm": 0.030101364478468895, "learning_rate": 2.2045660358049157e-07, "loss": 0.00136173, "memory(GiB)": 15.03, "step": 25755, "train_speed(iter/s)": 1.472638 }, { "acc": 0.99970236, "epoch": 45.472197705207414, "grad_norm": 0.2660312354564667, "learning_rate": 2.196014792446063e-07, "loss": 0.0077932, "memory(GiB)": 15.03, "step": 25760, "train_speed(iter/s)": 1.472636 }, { "acc": 1.0, "epoch": 45.48102383053839, "grad_norm": 0.2919526696205139, "learning_rate": 2.187479869426593e-07, "loss": 0.00382812, "memory(GiB)": 15.03, "step": 25765, "train_speed(iter/s)": 1.472654 }, { "acc": 1.0, "epoch": 45.489849955869374, "grad_norm": 0.26213881373405457, "learning_rate": 2.1789612696600287e-07, "loss": 0.00318991, "memory(GiB)": 15.03, "step": 25770, "train_speed(iter/s)": 1.47266 }, { "acc": 0.99945116, "epoch": 45.49867608120035, "grad_norm": 0.264875203371048, "learning_rate": 2.170458996054318e-07, "loss": 0.00662476, "memory(GiB)": 15.03, "step": 25775, "train_speed(iter/s)": 1.472674 }, { "acc": 1.0, "epoch": 45.507502206531335, "grad_norm": 0.29808446764945984, "learning_rate": 2.1619730515118612e-07, "loss": 0.00212242, "memory(GiB)": 15.03, "step": 25780, "train_speed(iter/s)": 1.472679 }, { "acc": 1.0, "epoch": 45.51632833186231, "grad_norm": 0.6508634090423584, "learning_rate": 2.153503438929456e-07, "loss": 0.00732072, "memory(GiB)": 15.03, "step": 25785, "train_speed(iter/s)": 1.472676 }, { "acc": 1.0, "epoch": 45.525154457193295, "grad_norm": 0.08308485150337219, "learning_rate": 2.145050161198334e-07, "loss": 0.00127605, "memory(GiB)": 15.03, "step": 25790, "train_speed(iter/s)": 1.472669 }, { "acc": 1.0, "epoch": 45.53398058252427, "grad_norm": 0.022680021822452545, "learning_rate": 2.1366132212041587e-07, "loss": 0.00266585, "memory(GiB)": 15.03, "step": 25795, "train_speed(iter/s)": 1.472678 }, { "acc": 0.99963236, "epoch": 45.54280670785525, "grad_norm": 0.03549691289663315, "learning_rate": 2.1281926218270053e-07, "loss": 0.00294051, "memory(GiB)": 15.03, "step": 25800, "train_speed(iter/s)": 1.472677 }, { "acc": 1.0, "epoch": 45.55163283318623, "grad_norm": 0.22643376886844635, "learning_rate": 2.1197883659413915e-07, "loss": 0.00187838, "memory(GiB)": 15.03, "step": 25805, "train_speed(iter/s)": 1.472678 }, { "acc": 1.0, "epoch": 45.56045895851721, "grad_norm": 0.014850606210529804, "learning_rate": 2.1114004564162193e-07, "loss": 0.00367903, "memory(GiB)": 15.03, "step": 25810, "train_speed(iter/s)": 1.472679 }, { "acc": 1.0, "epoch": 45.56928508384819, "grad_norm": 0.19709129631519318, "learning_rate": 2.103028896114845e-07, "loss": 0.00303807, "memory(GiB)": 15.03, "step": 25815, "train_speed(iter/s)": 1.472685 }, { "acc": 1.0, "epoch": 45.57811120917917, "grad_norm": 0.043587107211351395, "learning_rate": 2.0946736878950235e-07, "loss": 0.00064413, "memory(GiB)": 15.03, "step": 25820, "train_speed(iter/s)": 1.472681 }, { "acc": 1.0, "epoch": 45.586937334510154, "grad_norm": 0.4036567211151123, "learning_rate": 2.0863348346089388e-07, "loss": 0.00233302, "memory(GiB)": 15.03, "step": 25825, "train_speed(iter/s)": 1.472677 }, { "acc": 1.0, "epoch": 45.59576345984113, "grad_norm": 0.01587153971195221, "learning_rate": 2.0780123391031948e-07, "loss": 0.00300159, "memory(GiB)": 15.03, "step": 25830, "train_speed(iter/s)": 1.472687 }, { "acc": 1.0, "epoch": 45.60458958517211, "grad_norm": 0.004949828144162893, "learning_rate": 2.0697062042187909e-07, "loss": 0.00395119, "memory(GiB)": 15.03, "step": 25835, "train_speed(iter/s)": 1.47269 }, { "acc": 0.99975491, "epoch": 45.61341571050309, "grad_norm": 0.06677082180976868, "learning_rate": 2.0614164327911638e-07, "loss": 0.00322084, "memory(GiB)": 15.03, "step": 25840, "train_speed(iter/s)": 1.472677 }, { "acc": 0.99986706, "epoch": 45.62224183583407, "grad_norm": 0.00848627183586359, "learning_rate": 2.05314302765015e-07, "loss": 0.00138832, "memory(GiB)": 15.03, "step": 25845, "train_speed(iter/s)": 1.472682 }, { "acc": 0.99982872, "epoch": 45.63106796116505, "grad_norm": 0.005639645271003246, "learning_rate": 2.0448859916200183e-07, "loss": 0.00229898, "memory(GiB)": 15.03, "step": 25850, "train_speed(iter/s)": 1.472687 }, { "acc": 1.0, "epoch": 45.63989408649603, "grad_norm": 0.16320177912712097, "learning_rate": 2.036645327519416e-07, "loss": 0.0034201, "memory(GiB)": 15.03, "step": 25855, "train_speed(iter/s)": 1.472683 }, { "acc": 1.0, "epoch": 45.648720211827005, "grad_norm": 0.1514512300491333, "learning_rate": 2.0284210381614381e-07, "loss": 0.00456712, "memory(GiB)": 15.03, "step": 25860, "train_speed(iter/s)": 1.472697 }, { "acc": 1.0, "epoch": 45.65754633715799, "grad_norm": 0.18559317290782928, "learning_rate": 2.0202131263535645e-07, "loss": 0.0024342, "memory(GiB)": 15.03, "step": 25865, "train_speed(iter/s)": 1.472705 }, { "acc": 1.0, "epoch": 45.666372462488965, "grad_norm": 0.1588459610939026, "learning_rate": 2.0120215948977004e-07, "loss": 0.00309297, "memory(GiB)": 15.03, "step": 25870, "train_speed(iter/s)": 1.472723 }, { "acc": 0.99969511, "epoch": 45.67519858781995, "grad_norm": 0.1508217602968216, "learning_rate": 2.0038464465901404e-07, "loss": 0.00625733, "memory(GiB)": 15.03, "step": 25875, "train_speed(iter/s)": 1.47272 }, { "acc": 1.0, "epoch": 45.684024713150926, "grad_norm": 0.2903580367565155, "learning_rate": 1.9956876842216065e-07, "loss": 0.00289949, "memory(GiB)": 15.03, "step": 25880, "train_speed(iter/s)": 1.472715 }, { "acc": 1.0, "epoch": 45.69285083848191, "grad_norm": 0.21580585837364197, "learning_rate": 1.9875453105772252e-07, "loss": 0.00277564, "memory(GiB)": 15.03, "step": 25885, "train_speed(iter/s)": 1.47272 }, { "acc": 1.0, "epoch": 45.701676963812886, "grad_norm": 0.38582348823547363, "learning_rate": 1.9794193284365115e-07, "loss": 0.002517, "memory(GiB)": 15.03, "step": 25890, "train_speed(iter/s)": 1.47273 }, { "acc": 1.0, "epoch": 45.71050308914386, "grad_norm": 0.22123445570468903, "learning_rate": 1.971309740573402e-07, "loss": 0.00411297, "memory(GiB)": 15.03, "step": 25895, "train_speed(iter/s)": 1.472734 }, { "acc": 0.99924173, "epoch": 45.71932921447485, "grad_norm": 0.834667980670929, "learning_rate": 1.9632165497562225e-07, "loss": 0.01121516, "memory(GiB)": 15.03, "step": 25900, "train_speed(iter/s)": 1.472745 }, { "acc": 1.0, "epoch": 45.728155339805824, "grad_norm": 0.00438265036791563, "learning_rate": 1.9551397587477193e-07, "loss": 0.00357911, "memory(GiB)": 15.03, "step": 25905, "train_speed(iter/s)": 1.472739 }, { "acc": 1.0, "epoch": 45.73698146513681, "grad_norm": 0.018230680376291275, "learning_rate": 1.947079370305022e-07, "loss": 0.00074794, "memory(GiB)": 15.03, "step": 25910, "train_speed(iter/s)": 1.472747 }, { "acc": 1.0, "epoch": 45.745807590467784, "grad_norm": 0.20467394590377808, "learning_rate": 1.9390353871796768e-07, "loss": 0.00154872, "memory(GiB)": 15.03, "step": 25915, "train_speed(iter/s)": 1.472756 }, { "acc": 0.99989033, "epoch": 45.75463371579876, "grad_norm": 0.5822111368179321, "learning_rate": 1.931007812117612e-07, "loss": 0.00640854, "memory(GiB)": 15.03, "step": 25920, "train_speed(iter/s)": 1.472756 }, { "acc": 1.0, "epoch": 45.763459841129745, "grad_norm": 0.15342067182064056, "learning_rate": 1.922996647859173e-07, "loss": 0.0025257, "memory(GiB)": 15.03, "step": 25925, "train_speed(iter/s)": 1.472766 }, { "acc": 1.0, "epoch": 45.77228596646072, "grad_norm": 0.2585427463054657, "learning_rate": 1.9150018971390974e-07, "loss": 0.0043318, "memory(GiB)": 15.03, "step": 25930, "train_speed(iter/s)": 1.472762 }, { "acc": 1.0, "epoch": 45.781112091791705, "grad_norm": 0.30012035369873047, "learning_rate": 1.9070235626865188e-07, "loss": 0.00422052, "memory(GiB)": 15.03, "step": 25935, "train_speed(iter/s)": 1.472747 }, { "acc": 1.0, "epoch": 45.78993821712268, "grad_norm": 0.02117210254073143, "learning_rate": 1.8990616472249525e-07, "loss": 0.00270519, "memory(GiB)": 15.03, "step": 25940, "train_speed(iter/s)": 1.472746 }, { "acc": 1.0, "epoch": 45.798764342453666, "grad_norm": 0.1190142035484314, "learning_rate": 1.8911161534723302e-07, "loss": 0.00317184, "memory(GiB)": 15.03, "step": 25945, "train_speed(iter/s)": 1.472732 }, { "acc": 1.0, "epoch": 45.80759046778464, "grad_norm": 0.17478777468204498, "learning_rate": 1.883187084140971e-07, "loss": 0.00510912, "memory(GiB)": 15.03, "step": 25950, "train_speed(iter/s)": 1.472738 }, { "acc": 1.0, "epoch": 45.81641659311562, "grad_norm": 0.19394271075725555, "learning_rate": 1.8752744419375847e-07, "loss": 0.01019152, "memory(GiB)": 15.03, "step": 25955, "train_speed(iter/s)": 1.472744 }, { "acc": 1.0, "epoch": 45.8252427184466, "grad_norm": 0.14546862244606018, "learning_rate": 1.867378229563267e-07, "loss": 0.00274201, "memory(GiB)": 15.03, "step": 25960, "train_speed(iter/s)": 1.472744 }, { "acc": 1.0, "epoch": 45.83406884377758, "grad_norm": 0.007580151781439781, "learning_rate": 1.8594984497135312e-07, "loss": 0.00037204, "memory(GiB)": 15.03, "step": 25965, "train_speed(iter/s)": 1.472737 }, { "acc": 1.0, "epoch": 45.84289496910856, "grad_norm": 0.2504122257232666, "learning_rate": 1.8516351050782402e-07, "loss": 0.00410482, "memory(GiB)": 15.03, "step": 25970, "train_speed(iter/s)": 1.472745 }, { "acc": 1.0, "epoch": 45.85172109443954, "grad_norm": 0.294889360666275, "learning_rate": 1.843788198341688e-07, "loss": 0.002289, "memory(GiB)": 15.03, "step": 25975, "train_speed(iter/s)": 1.472735 }, { "acc": 1.0, "epoch": 45.860547219770524, "grad_norm": 0.004440551158040762, "learning_rate": 1.8359577321825262e-07, "loss": 0.00117391, "memory(GiB)": 15.03, "step": 25980, "train_speed(iter/s)": 1.472743 }, { "acc": 1.0, "epoch": 45.8693733451015, "grad_norm": 0.005198409780859947, "learning_rate": 1.8281437092738093e-07, "loss": 0.00194862, "memory(GiB)": 15.03, "step": 25985, "train_speed(iter/s)": 1.472739 }, { "acc": 1.0, "epoch": 45.87819947043248, "grad_norm": 0.011020172387361526, "learning_rate": 1.8203461322829703e-07, "loss": 0.00113945, "memory(GiB)": 15.03, "step": 25990, "train_speed(iter/s)": 1.472743 }, { "acc": 1.0, "epoch": 45.88702559576346, "grad_norm": 0.11962919682264328, "learning_rate": 1.8125650038718467e-07, "loss": 0.00168952, "memory(GiB)": 15.03, "step": 25995, "train_speed(iter/s)": 1.472746 }, { "acc": 1.0, "epoch": 45.89585172109444, "grad_norm": 0.45423775911331177, "learning_rate": 1.8048003266966316e-07, "loss": 0.00535099, "memory(GiB)": 15.03, "step": 26000, "train_speed(iter/s)": 1.472747 }, { "acc": 1.0, "epoch": 45.90467784642542, "grad_norm": 0.21663276851177216, "learning_rate": 1.7970521034079225e-07, "loss": 0.00280319, "memory(GiB)": 15.03, "step": 26005, "train_speed(iter/s)": 1.47275 }, { "acc": 1.0, "epoch": 45.9135039717564, "grad_norm": 0.15893404185771942, "learning_rate": 1.7893203366507051e-07, "loss": 0.00179795, "memory(GiB)": 15.03, "step": 26010, "train_speed(iter/s)": 1.472758 }, { "acc": 1.0, "epoch": 45.92233009708738, "grad_norm": 0.4288867712020874, "learning_rate": 1.7816050290643322e-07, "loss": 0.00215731, "memory(GiB)": 15.03, "step": 26015, "train_speed(iter/s)": 1.472758 }, { "acc": 1.0, "epoch": 45.93115622241836, "grad_norm": 0.004884024150669575, "learning_rate": 1.7739061832825448e-07, "loss": 0.00422913, "memory(GiB)": 15.03, "step": 26020, "train_speed(iter/s)": 1.472743 }, { "acc": 1.0, "epoch": 45.939982347749336, "grad_norm": 0.04657462611794472, "learning_rate": 1.7662238019334552e-07, "loss": 0.00246952, "memory(GiB)": 15.03, "step": 26025, "train_speed(iter/s)": 1.472744 }, { "acc": 1.0, "epoch": 45.94880847308032, "grad_norm": 0.06281819939613342, "learning_rate": 1.7585578876395753e-07, "loss": 0.00052853, "memory(GiB)": 15.03, "step": 26030, "train_speed(iter/s)": 1.472747 }, { "acc": 1.0, "epoch": 45.957634598411296, "grad_norm": 0.22546100616455078, "learning_rate": 1.7509084430177715e-07, "loss": 0.00642797, "memory(GiB)": 15.03, "step": 26035, "train_speed(iter/s)": 1.472758 }, { "acc": 1.0, "epoch": 45.96646072374228, "grad_norm": 0.00953519344329834, "learning_rate": 1.7432754706793225e-07, "loss": 0.0046061, "memory(GiB)": 15.03, "step": 26040, "train_speed(iter/s)": 1.472752 }, { "acc": 1.0, "epoch": 45.97528684907326, "grad_norm": 0.1654912382364273, "learning_rate": 1.7356589732298328e-07, "loss": 0.00547193, "memory(GiB)": 15.03, "step": 26045, "train_speed(iter/s)": 1.472768 }, { "acc": 0.99989033, "epoch": 45.98411297440423, "grad_norm": 0.0097175482660532, "learning_rate": 1.728058953269328e-07, "loss": 0.00361102, "memory(GiB)": 15.03, "step": 26050, "train_speed(iter/s)": 1.472762 }, { "acc": 1.0, "epoch": 45.99293909973522, "grad_norm": 0.31917130947113037, "learning_rate": 1.720475413392197e-07, "loss": 0.00294621, "memory(GiB)": 15.03, "step": 26055, "train_speed(iter/s)": 1.472766 }, { "acc": 1.0, "epoch": 46.001765225066194, "grad_norm": 0.12616144120693207, "learning_rate": 1.7129083561871925e-07, "loss": 0.00261651, "memory(GiB)": 15.03, "step": 26060, "train_speed(iter/s)": 1.472731 }, { "acc": 0.99963236, "epoch": 46.01059135039718, "grad_norm": 0.2459304928779602, "learning_rate": 1.7053577842374449e-07, "loss": 0.00462145, "memory(GiB)": 15.03, "step": 26065, "train_speed(iter/s)": 1.472748 }, { "acc": 1.0, "epoch": 46.019417475728154, "grad_norm": 0.005016319453716278, "learning_rate": 1.697823700120463e-07, "loss": 0.00342354, "memory(GiB)": 15.03, "step": 26070, "train_speed(iter/s)": 1.472754 }, { "acc": 1.0, "epoch": 46.02824360105914, "grad_norm": 0.26521894335746765, "learning_rate": 1.6903061064081257e-07, "loss": 0.00476645, "memory(GiB)": 15.03, "step": 26075, "train_speed(iter/s)": 1.472757 }, { "acc": 1.0, "epoch": 46.037069726390115, "grad_norm": 0.2867639362812042, "learning_rate": 1.6828050056666738e-07, "loss": 0.0012755, "memory(GiB)": 15.03, "step": 26080, "train_speed(iter/s)": 1.472765 }, { "acc": 1.0, "epoch": 46.04589585172109, "grad_norm": 0.010929723270237446, "learning_rate": 1.6753204004567303e-07, "loss": 0.00284404, "memory(GiB)": 15.03, "step": 26085, "train_speed(iter/s)": 1.472754 }, { "acc": 0.99969511, "epoch": 46.054721977052075, "grad_norm": 0.36931049823760986, "learning_rate": 1.667852293333274e-07, "loss": 0.0053918, "memory(GiB)": 15.03, "step": 26090, "train_speed(iter/s)": 1.472756 }, { "acc": 1.0, "epoch": 46.06354810238305, "grad_norm": 0.4534231424331665, "learning_rate": 1.6604006868456597e-07, "loss": 0.00513418, "memory(GiB)": 15.03, "step": 26095, "train_speed(iter/s)": 1.472761 }, { "acc": 1.0, "epoch": 46.072374227714036, "grad_norm": 0.07832935452461243, "learning_rate": 1.6529655835376263e-07, "loss": 0.00147809, "memory(GiB)": 15.03, "step": 26100, "train_speed(iter/s)": 1.472763 }, { "acc": 1.0, "epoch": 46.08120035304501, "grad_norm": 0.2483227401971817, "learning_rate": 1.6455469859472393e-07, "loss": 0.00237603, "memory(GiB)": 15.03, "step": 26105, "train_speed(iter/s)": 1.472753 }, { "acc": 1.0, "epoch": 46.09002647837599, "grad_norm": 0.26165831089019775, "learning_rate": 1.63814489660697e-07, "loss": 0.00511836, "memory(GiB)": 15.03, "step": 26110, "train_speed(iter/s)": 1.472757 }, { "acc": 1.0, "epoch": 46.09885260370697, "grad_norm": 0.015528191812336445, "learning_rate": 1.6307593180436153e-07, "loss": 0.00190999, "memory(GiB)": 15.03, "step": 26115, "train_speed(iter/s)": 1.47275 }, { "acc": 1.0, "epoch": 46.10767872903795, "grad_norm": 0.18580830097198486, "learning_rate": 1.623390252778385e-07, "loss": 0.00346077, "memory(GiB)": 15.03, "step": 26120, "train_speed(iter/s)": 1.472756 }, { "acc": 1.0, "epoch": 46.116504854368934, "grad_norm": 0.12516289949417114, "learning_rate": 1.616037703326798e-07, "loss": 0.00075923, "memory(GiB)": 15.03, "step": 26125, "train_speed(iter/s)": 1.472772 }, { "acc": 1.0, "epoch": 46.12533097969991, "grad_norm": 0.3254794180393219, "learning_rate": 1.608701672198773e-07, "loss": 0.00315262, "memory(GiB)": 15.03, "step": 26130, "train_speed(iter/s)": 1.47277 }, { "acc": 0.99953117, "epoch": 46.134157105030894, "grad_norm": 0.1606682538986206, "learning_rate": 1.6013821618985787e-07, "loss": 0.0062887, "memory(GiB)": 15.03, "step": 26135, "train_speed(iter/s)": 1.47278 }, { "acc": 1.0, "epoch": 46.14298323036187, "grad_norm": 0.2567594647407532, "learning_rate": 1.5940791749248492e-07, "loss": 0.00277272, "memory(GiB)": 15.03, "step": 26140, "train_speed(iter/s)": 1.47277 }, { "acc": 1.0, "epoch": 46.15180935569285, "grad_norm": 0.32183894515037537, "learning_rate": 1.5867927137705586e-07, "loss": 0.00399786, "memory(GiB)": 15.03, "step": 26145, "train_speed(iter/s)": 1.472764 }, { "acc": 1.0, "epoch": 46.16063548102383, "grad_norm": 0.04274705424904823, "learning_rate": 1.5795227809230553e-07, "loss": 0.00070626, "memory(GiB)": 15.03, "step": 26150, "train_speed(iter/s)": 1.472764 }, { "acc": 1.0, "epoch": 46.16946160635481, "grad_norm": 0.22527843713760376, "learning_rate": 1.5722693788640578e-07, "loss": 0.00367592, "memory(GiB)": 15.03, "step": 26155, "train_speed(iter/s)": 1.472767 }, { "acc": 0.99989033, "epoch": 46.17828773168579, "grad_norm": 0.15681494772434235, "learning_rate": 1.5650325100696136e-07, "loss": 0.00124043, "memory(GiB)": 15.03, "step": 26160, "train_speed(iter/s)": 1.472758 }, { "acc": 1.0, "epoch": 46.18711385701677, "grad_norm": 0.006624070927500725, "learning_rate": 1.55781217701015e-07, "loss": 0.00169138, "memory(GiB)": 15.03, "step": 26165, "train_speed(iter/s)": 1.472766 }, { "acc": 1.0, "epoch": 46.19593998234775, "grad_norm": 0.0014729831600561738, "learning_rate": 1.5506083821504325e-07, "loss": 0.00226453, "memory(GiB)": 15.03, "step": 26170, "train_speed(iter/s)": 1.472774 }, { "acc": 1.0, "epoch": 46.20476610767873, "grad_norm": 0.23500466346740723, "learning_rate": 1.543421127949586e-07, "loss": 0.00148696, "memory(GiB)": 15.03, "step": 26175, "train_speed(iter/s)": 1.472789 }, { "acc": 1.0, "epoch": 46.213592233009706, "grad_norm": 0.184756800532341, "learning_rate": 1.5362504168610975e-07, "loss": 0.00220537, "memory(GiB)": 15.03, "step": 26180, "train_speed(iter/s)": 1.472796 }, { "acc": 1.0, "epoch": 46.22241835834069, "grad_norm": 0.004839166067540646, "learning_rate": 1.5290962513328024e-07, "loss": 0.00240651, "memory(GiB)": 15.03, "step": 26185, "train_speed(iter/s)": 1.472793 }, { "acc": 1.0, "epoch": 46.23124448367167, "grad_norm": 0.20647643506526947, "learning_rate": 1.5219586338068858e-07, "loss": 0.00151595, "memory(GiB)": 15.03, "step": 26190, "train_speed(iter/s)": 1.472798 }, { "acc": 1.0, "epoch": 46.24007060900265, "grad_norm": 0.1255221664905548, "learning_rate": 1.514837566719872e-07, "loss": 0.0038084, "memory(GiB)": 15.03, "step": 26195, "train_speed(iter/s)": 1.472793 }, { "acc": 1.0, "epoch": 46.24889673433363, "grad_norm": 0.31003087759017944, "learning_rate": 1.5077330525026675e-07, "loss": 0.00392174, "memory(GiB)": 15.03, "step": 26200, "train_speed(iter/s)": 1.47281 }, { "acc": 1.0, "epoch": 46.257722859664604, "grad_norm": 0.0035357861779630184, "learning_rate": 1.500645093580495e-07, "loss": 0.00195791, "memory(GiB)": 15.03, "step": 26205, "train_speed(iter/s)": 1.472809 }, { "acc": 0.99921875, "epoch": 46.26654898499559, "grad_norm": 0.3127444088459015, "learning_rate": 1.4935736923729379e-07, "loss": 0.00310437, "memory(GiB)": 15.03, "step": 26210, "train_speed(iter/s)": 1.472819 }, { "acc": 1.0, "epoch": 46.275375110326564, "grad_norm": 0.16274523735046387, "learning_rate": 1.486518851293935e-07, "loss": 0.00349395, "memory(GiB)": 15.03, "step": 26215, "train_speed(iter/s)": 1.472824 }, { "acc": 1.0, "epoch": 46.28420123565755, "grad_norm": 0.10861328989267349, "learning_rate": 1.479480572751763e-07, "loss": 0.00129762, "memory(GiB)": 15.03, "step": 26220, "train_speed(iter/s)": 1.472833 }, { "acc": 0.99986706, "epoch": 46.293027360988525, "grad_norm": 0.12886610627174377, "learning_rate": 1.472458859149054e-07, "loss": 0.00463428, "memory(GiB)": 15.03, "step": 26225, "train_speed(iter/s)": 1.472833 }, { "acc": 1.0, "epoch": 46.30185348631951, "grad_norm": 0.10401330143213272, "learning_rate": 1.4654537128827677e-07, "loss": 0.00139727, "memory(GiB)": 15.03, "step": 26230, "train_speed(iter/s)": 1.472839 }, { "acc": 1.0, "epoch": 46.310679611650485, "grad_norm": 0.01154229138046503, "learning_rate": 1.458465136344235e-07, "loss": 0.00273097, "memory(GiB)": 15.03, "step": 26235, "train_speed(iter/s)": 1.472831 }, { "acc": 0.99986115, "epoch": 46.31950573698146, "grad_norm": 0.13413307070732117, "learning_rate": 1.451493131919092e-07, "loss": 0.00368473, "memory(GiB)": 15.03, "step": 26240, "train_speed(iter/s)": 1.472835 }, { "acc": 1.0, "epoch": 46.328331862312446, "grad_norm": 0.017574530094861984, "learning_rate": 1.444537701987369e-07, "loss": 0.00124823, "memory(GiB)": 15.03, "step": 26245, "train_speed(iter/s)": 1.472842 }, { "acc": 1.0, "epoch": 46.33715798764342, "grad_norm": 0.07565005868673325, "learning_rate": 1.4375988489233848e-07, "loss": 0.0063324, "memory(GiB)": 15.03, "step": 26250, "train_speed(iter/s)": 1.472841 }, { "acc": 1.0, "epoch": 46.345984112974406, "grad_norm": 0.0011532179778441787, "learning_rate": 1.4306765750958349e-07, "loss": 0.0011541, "memory(GiB)": 15.03, "step": 26255, "train_speed(iter/s)": 1.472844 }, { "acc": 1.0, "epoch": 46.35481023830538, "grad_norm": 0.1496943086385727, "learning_rate": 1.4237708828677487e-07, "loss": 0.0024228, "memory(GiB)": 15.03, "step": 26260, "train_speed(iter/s)": 1.472859 }, { "acc": 1.0, "epoch": 46.36363636363637, "grad_norm": 0.3060230314731598, "learning_rate": 1.416881774596482e-07, "loss": 0.00483852, "memory(GiB)": 15.03, "step": 26265, "train_speed(iter/s)": 1.472847 }, { "acc": 0.99969511, "epoch": 46.372462488967344, "grad_norm": 0.3966377079486847, "learning_rate": 1.410009252633757e-07, "loss": 0.00929793, "memory(GiB)": 15.03, "step": 26270, "train_speed(iter/s)": 1.47285 }, { "acc": 1.0, "epoch": 46.38128861429832, "grad_norm": 0.10316675156354904, "learning_rate": 1.4031533193255962e-07, "loss": 0.00082849, "memory(GiB)": 15.03, "step": 26275, "train_speed(iter/s)": 1.472852 }, { "acc": 0.99963236, "epoch": 46.390114739629304, "grad_norm": 0.31194186210632324, "learning_rate": 1.396313977012387e-07, "loss": 0.00595188, "memory(GiB)": 15.03, "step": 26280, "train_speed(iter/s)": 1.472868 }, { "acc": 1.0, "epoch": 46.39894086496028, "grad_norm": 0.21470196545124054, "learning_rate": 1.3894912280288392e-07, "loss": 0.00224226, "memory(GiB)": 15.03, "step": 26285, "train_speed(iter/s)": 1.472863 }, { "acc": 1.0, "epoch": 46.407766990291265, "grad_norm": 0.004771051928400993, "learning_rate": 1.3826850747040186e-07, "loss": 0.00166244, "memory(GiB)": 15.03, "step": 26290, "train_speed(iter/s)": 1.472862 }, { "acc": 1.0, "epoch": 46.41659311562224, "grad_norm": 0.011021166108548641, "learning_rate": 1.3758955193612945e-07, "loss": 0.00205576, "memory(GiB)": 15.03, "step": 26295, "train_speed(iter/s)": 1.47286 }, { "acc": 0.99989033, "epoch": 46.42541924095322, "grad_norm": 0.11834952980279922, "learning_rate": 1.369122564318392e-07, "loss": 0.00261629, "memory(GiB)": 15.03, "step": 26300, "train_speed(iter/s)": 1.472857 }, { "acc": 1.0, "epoch": 46.4342453662842, "grad_norm": 0.0062039257027208805, "learning_rate": 1.3623662118873753e-07, "loss": 0.00397273, "memory(GiB)": 15.03, "step": 26305, "train_speed(iter/s)": 1.472865 }, { "acc": 1.0, "epoch": 46.44307149161518, "grad_norm": 0.5406511425971985, "learning_rate": 1.3556264643746178e-07, "loss": 0.00411097, "memory(GiB)": 15.03, "step": 26310, "train_speed(iter/s)": 1.472873 }, { "acc": 1.0, "epoch": 46.45189761694616, "grad_norm": 0.003147884737700224, "learning_rate": 1.348903324080844e-07, "loss": 0.00238574, "memory(GiB)": 15.03, "step": 26315, "train_speed(iter/s)": 1.472866 }, { "acc": 1.0, "epoch": 46.46072374227714, "grad_norm": 0.03833026811480522, "learning_rate": 1.3421967933010938e-07, "loss": 0.00488354, "memory(GiB)": 15.03, "step": 26320, "train_speed(iter/s)": 1.472878 }, { "acc": 1.0, "epoch": 46.46954986760812, "grad_norm": 0.154514342546463, "learning_rate": 1.3355068743247565e-07, "loss": 0.00341437, "memory(GiB)": 15.03, "step": 26325, "train_speed(iter/s)": 1.472884 }, { "acc": 1.0, "epoch": 46.4783759929391, "grad_norm": 0.17437677085399628, "learning_rate": 1.3288335694355323e-07, "loss": 0.00287782, "memory(GiB)": 15.03, "step": 26330, "train_speed(iter/s)": 1.472874 }, { "acc": 0.99975491, "epoch": 46.487202118270076, "grad_norm": 0.22981272637844086, "learning_rate": 1.32217688091146e-07, "loss": 0.00452423, "memory(GiB)": 15.03, "step": 26335, "train_speed(iter/s)": 1.472874 }, { "acc": 0.99978065, "epoch": 46.49602824360106, "grad_norm": 0.21726162731647491, "learning_rate": 1.3155368110248942e-07, "loss": 0.00404661, "memory(GiB)": 15.03, "step": 26340, "train_speed(iter/s)": 1.472883 }, { "acc": 1.0, "epoch": 46.50485436893204, "grad_norm": 0.002072380157187581, "learning_rate": 1.3089133620425395e-07, "loss": 0.0014298, "memory(GiB)": 15.03, "step": 26345, "train_speed(iter/s)": 1.472875 }, { "acc": 1.0, "epoch": 46.51368049426302, "grad_norm": 0.0029808899853378534, "learning_rate": 1.3023065362254167e-07, "loss": 0.00250068, "memory(GiB)": 15.03, "step": 26350, "train_speed(iter/s)": 1.472874 }, { "acc": 1.0, "epoch": 46.522506619594, "grad_norm": 0.3714197874069214, "learning_rate": 1.2957163358288512e-07, "loss": 0.0058896, "memory(GiB)": 15.03, "step": 26355, "train_speed(iter/s)": 1.472866 }, { "acc": 1.0, "epoch": 46.53133274492498, "grad_norm": 0.1626017987728119, "learning_rate": 1.289142763102524e-07, "loss": 0.00401087, "memory(GiB)": 15.03, "step": 26360, "train_speed(iter/s)": 1.472868 }, { "acc": 1.0, "epoch": 46.54015887025596, "grad_norm": 0.15963225066661835, "learning_rate": 1.2825858202904156e-07, "loss": 0.0026899, "memory(GiB)": 15.03, "step": 26365, "train_speed(iter/s)": 1.472869 }, { "acc": 1.0, "epoch": 46.548984995586935, "grad_norm": 0.01743023842573166, "learning_rate": 1.2760455096308445e-07, "loss": 0.00226868, "memory(GiB)": 15.03, "step": 26370, "train_speed(iter/s)": 1.47287 }, { "acc": 1.0, "epoch": 46.55781112091792, "grad_norm": 0.14015352725982666, "learning_rate": 1.269521833356457e-07, "loss": 0.00410861, "memory(GiB)": 15.03, "step": 26375, "train_speed(iter/s)": 1.47287 }, { "acc": 0.99982872, "epoch": 46.566637246248895, "grad_norm": 0.024496247991919518, "learning_rate": 1.2630147936941937e-07, "loss": 0.00751588, "memory(GiB)": 15.03, "step": 26380, "train_speed(iter/s)": 1.472878 }, { "acc": 1.0, "epoch": 46.57546337157988, "grad_norm": 0.14374598860740662, "learning_rate": 1.2565243928653494e-07, "loss": 0.0035938, "memory(GiB)": 15.03, "step": 26385, "train_speed(iter/s)": 1.472875 }, { "acc": 1.0, "epoch": 46.584289496910856, "grad_norm": 0.23012909293174744, "learning_rate": 1.2500506330855077e-07, "loss": 0.0020125, "memory(GiB)": 15.03, "step": 26390, "train_speed(iter/s)": 1.472872 }, { "acc": 1.0, "epoch": 46.59311562224183, "grad_norm": 0.17432722449302673, "learning_rate": 1.2435935165646135e-07, "loss": 0.00417042, "memory(GiB)": 15.03, "step": 26395, "train_speed(iter/s)": 1.472869 }, { "acc": 0.99989033, "epoch": 46.601941747572816, "grad_norm": 0.11057006567716599, "learning_rate": 1.237153045506866e-07, "loss": 0.00716905, "memory(GiB)": 15.03, "step": 26400, "train_speed(iter/s)": 1.472865 }, { "acc": 1.0, "epoch": 46.61076787290379, "grad_norm": 0.20394280552864075, "learning_rate": 1.230729222110853e-07, "loss": 0.00245496, "memory(GiB)": 15.03, "step": 26405, "train_speed(iter/s)": 1.472878 }, { "acc": 1.0, "epoch": 46.61959399823478, "grad_norm": 0.29059749841690063, "learning_rate": 1.2243220485694235e-07, "loss": 0.00221893, "memory(GiB)": 15.03, "step": 26410, "train_speed(iter/s)": 1.472892 }, { "acc": 0.99949999, "epoch": 46.62842012356575, "grad_norm": 0.0023446334525942802, "learning_rate": 1.217931527069792e-07, "loss": 0.0146283, "memory(GiB)": 15.03, "step": 26415, "train_speed(iter/s)": 1.472888 }, { "acc": 1.0, "epoch": 46.63724624889674, "grad_norm": 0.16120092570781708, "learning_rate": 1.2115576597934288e-07, "loss": 0.00253683, "memory(GiB)": 15.03, "step": 26420, "train_speed(iter/s)": 1.472891 }, { "acc": 1.0, "epoch": 46.646072374227714, "grad_norm": 0.10408823937177658, "learning_rate": 1.2052004489161697e-07, "loss": 0.00263235, "memory(GiB)": 15.03, "step": 26425, "train_speed(iter/s)": 1.472891 }, { "acc": 1.0, "epoch": 46.65489849955869, "grad_norm": 0.17858663201332092, "learning_rate": 1.1988598966081564e-07, "loss": 0.00310006, "memory(GiB)": 15.03, "step": 26430, "train_speed(iter/s)": 1.472901 }, { "acc": 1.0, "epoch": 46.663724624889674, "grad_norm": 0.22342076897621155, "learning_rate": 1.1925360050338238e-07, "loss": 0.00446793, "memory(GiB)": 15.03, "step": 26435, "train_speed(iter/s)": 1.472895 }, { "acc": 1.0, "epoch": 46.67255075022065, "grad_norm": 0.29034966230392456, "learning_rate": 1.1862287763519352e-07, "loss": 0.00212067, "memory(GiB)": 15.03, "step": 26440, "train_speed(iter/s)": 1.472899 }, { "acc": 1.0, "epoch": 46.681376875551635, "grad_norm": 0.01720135286450386, "learning_rate": 1.1799382127155465e-07, "loss": 0.00561916, "memory(GiB)": 15.03, "step": 26445, "train_speed(iter/s)": 1.472896 }, { "acc": 0.9998106, "epoch": 46.69020300088261, "grad_norm": 0.014557735994458199, "learning_rate": 1.173664316272059e-07, "loss": 0.00604643, "memory(GiB)": 15.03, "step": 26450, "train_speed(iter/s)": 1.472897 }, { "acc": 0.99981613, "epoch": 46.699029126213595, "grad_norm": 0.07199928164482117, "learning_rate": 1.1674070891631558e-07, "loss": 0.0043939, "memory(GiB)": 15.03, "step": 26455, "train_speed(iter/s)": 1.4729 }, { "acc": 1.0, "epoch": 46.70785525154457, "grad_norm": 0.09974480420351028, "learning_rate": 1.1611665335248426e-07, "loss": 0.00111002, "memory(GiB)": 15.03, "step": 26460, "train_speed(iter/s)": 1.472918 }, { "acc": 1.0, "epoch": 46.71668137687555, "grad_norm": 0.22534283995628357, "learning_rate": 1.1549426514874184e-07, "loss": 0.0029215, "memory(GiB)": 15.03, "step": 26465, "train_speed(iter/s)": 1.472915 }, { "acc": 1.0, "epoch": 46.72550750220653, "grad_norm": 0.001067623496055603, "learning_rate": 1.1487354451755103e-07, "loss": 0.00125242, "memory(GiB)": 15.03, "step": 26470, "train_speed(iter/s)": 1.472917 }, { "acc": 1.0, "epoch": 46.73433362753751, "grad_norm": 0.004810499958693981, "learning_rate": 1.1425449167080501e-07, "loss": 0.00235401, "memory(GiB)": 15.03, "step": 26475, "train_speed(iter/s)": 1.47292 }, { "acc": 1.0, "epoch": 46.74315975286849, "grad_norm": 0.006728155072778463, "learning_rate": 1.1363710681982636e-07, "loss": 0.00300269, "memory(GiB)": 15.03, "step": 26480, "train_speed(iter/s)": 1.472928 }, { "acc": 1.0, "epoch": 46.75198587819947, "grad_norm": 0.7062762379646301, "learning_rate": 1.1302139017536925e-07, "loss": 0.00452645, "memory(GiB)": 15.03, "step": 26485, "train_speed(iter/s)": 1.472941 }, { "acc": 1.0, "epoch": 46.76081200353045, "grad_norm": 0.0030524800531566143, "learning_rate": 1.1240734194761791e-07, "loss": 0.00260951, "memory(GiB)": 15.03, "step": 26490, "train_speed(iter/s)": 1.472951 }, { "acc": 1.0, "epoch": 46.76963812886143, "grad_norm": 0.05864041671156883, "learning_rate": 1.1179496234618697e-07, "loss": 0.00259028, "memory(GiB)": 15.03, "step": 26495, "train_speed(iter/s)": 1.47296 }, { "acc": 0.99960938, "epoch": 46.77846425419241, "grad_norm": 0.5440627932548523, "learning_rate": 1.1118425158012278e-07, "loss": 0.00643306, "memory(GiB)": 15.03, "step": 26500, "train_speed(iter/s)": 1.472971 }, { "acc": 1.0, "epoch": 46.78729037952339, "grad_norm": 0.16813616454601288, "learning_rate": 1.1057520985789931e-07, "loss": 0.00297689, "memory(GiB)": 15.03, "step": 26505, "train_speed(iter/s)": 1.47297 }, { "acc": 1.0, "epoch": 46.79611650485437, "grad_norm": 0.0050158752128481865, "learning_rate": 1.0996783738742394e-07, "loss": 0.00161405, "memory(GiB)": 15.03, "step": 26510, "train_speed(iter/s)": 1.472959 }, { "acc": 1.0, "epoch": 46.80494263018535, "grad_norm": 0.2368452250957489, "learning_rate": 1.0936213437603172e-07, "loss": 0.0020684, "memory(GiB)": 15.03, "step": 26515, "train_speed(iter/s)": 1.472972 }, { "acc": 1.0, "epoch": 46.81376875551633, "grad_norm": 0.24485228955745697, "learning_rate": 1.0875810103048989e-07, "loss": 0.00142952, "memory(GiB)": 15.03, "step": 26520, "train_speed(iter/s)": 1.472966 }, { "acc": 1.0, "epoch": 46.822594880847305, "grad_norm": 0.27822214365005493, "learning_rate": 1.0815573755699343e-07, "loss": 0.00083717, "memory(GiB)": 15.03, "step": 26525, "train_speed(iter/s)": 1.472966 }, { "acc": 0.99972219, "epoch": 46.83142100617829, "grad_norm": 0.5624222159385681, "learning_rate": 1.0755504416116893e-07, "loss": 0.00695815, "memory(GiB)": 15.03, "step": 26530, "train_speed(iter/s)": 1.472975 }, { "acc": 1.0, "epoch": 46.840247131509265, "grad_norm": 0.14205066859722137, "learning_rate": 1.0695602104807185e-07, "loss": 0.00142685, "memory(GiB)": 15.03, "step": 26535, "train_speed(iter/s)": 1.472966 }, { "acc": 1.0, "epoch": 46.84907325684025, "grad_norm": 0.18173156678676605, "learning_rate": 1.0635866842218981e-07, "loss": 0.00167983, "memory(GiB)": 15.03, "step": 26540, "train_speed(iter/s)": 1.472965 }, { "acc": 1.0, "epoch": 46.857899382171226, "grad_norm": 0.008739609271287918, "learning_rate": 1.0576298648743597e-07, "loss": 0.00095198, "memory(GiB)": 15.03, "step": 26545, "train_speed(iter/s)": 1.472983 }, { "acc": 1.0, "epoch": 46.86672550750221, "grad_norm": 0.1818826049566269, "learning_rate": 1.0516897544715675e-07, "loss": 0.00179532, "memory(GiB)": 15.03, "step": 26550, "train_speed(iter/s)": 1.472984 }, { "acc": 1.0, "epoch": 46.87555163283319, "grad_norm": 0.014557402580976486, "learning_rate": 1.0457663550412743e-07, "loss": 0.00184595, "memory(GiB)": 15.03, "step": 26555, "train_speed(iter/s)": 1.472974 }, { "acc": 1.0, "epoch": 46.88437775816416, "grad_norm": 0.1852833330631256, "learning_rate": 1.0398596686055213e-07, "loss": 0.00428436, "memory(GiB)": 15.03, "step": 26560, "train_speed(iter/s)": 1.472962 }, { "acc": 1.0, "epoch": 46.89320388349515, "grad_norm": 0.11803708970546722, "learning_rate": 1.0339696971806493e-07, "loss": 0.00191591, "memory(GiB)": 15.03, "step": 26565, "train_speed(iter/s)": 1.472967 }, { "acc": 1.0, "epoch": 46.902030008826124, "grad_norm": 0.006371091585606337, "learning_rate": 1.0280964427772822e-07, "loss": 0.00301904, "memory(GiB)": 15.03, "step": 26570, "train_speed(iter/s)": 1.472983 }, { "acc": 0.99986553, "epoch": 46.91085613415711, "grad_norm": 0.5500329732894897, "learning_rate": 1.0222399074003598e-07, "loss": 0.00437379, "memory(GiB)": 15.03, "step": 26575, "train_speed(iter/s)": 1.472982 }, { "acc": 1.0, "epoch": 46.919682259488084, "grad_norm": 0.0035752567928284407, "learning_rate": 1.016400093049094e-07, "loss": 0.00118094, "memory(GiB)": 15.03, "step": 26580, "train_speed(iter/s)": 1.472977 }, { "acc": 1.0, "epoch": 46.92850838481906, "grad_norm": 0.0032317843288183212, "learning_rate": 1.0105770017170018e-07, "loss": 0.00313343, "memory(GiB)": 15.03, "step": 26585, "train_speed(iter/s)": 1.472977 }, { "acc": 0.99972219, "epoch": 46.937334510150045, "grad_norm": 0.29639768600463867, "learning_rate": 1.004770635391872e-07, "loss": 0.00259052, "memory(GiB)": 15.03, "step": 26590, "train_speed(iter/s)": 1.472988 }, { "acc": 1.0, "epoch": 46.94616063548102, "grad_norm": 0.01818390004336834, "learning_rate": 9.989809960558148e-08, "loss": 0.00192245, "memory(GiB)": 15.03, "step": 26595, "train_speed(iter/s)": 1.472998 }, { "acc": 1.0, "epoch": 46.954986760812005, "grad_norm": 0.14172838628292084, "learning_rate": 9.93208085685202e-08, "loss": 0.00189466, "memory(GiB)": 15.03, "step": 26600, "train_speed(iter/s)": 1.473004 }, { "acc": 1.0, "epoch": 46.96381288614298, "grad_norm": 0.24976351857185364, "learning_rate": 9.87451906250715e-08, "loss": 0.00266692, "memory(GiB)": 15.03, "step": 26605, "train_speed(iter/s)": 1.473004 }, { "acc": 1.0, "epoch": 46.972639011473966, "grad_norm": 0.37163418531417847, "learning_rate": 9.817124597173077e-08, "loss": 0.0050816, "memory(GiB)": 15.03, "step": 26610, "train_speed(iter/s)": 1.473018 }, { "acc": 1.0, "epoch": 46.98146513680494, "grad_norm": 0.03416039049625397, "learning_rate": 9.759897480442281e-08, "loss": 0.00325378, "memory(GiB)": 15.03, "step": 26615, "train_speed(iter/s)": 1.47302 }, { "acc": 1.0, "epoch": 46.99029126213592, "grad_norm": 0.2522742450237274, "learning_rate": 9.702837731850177e-08, "loss": 0.0063197, "memory(GiB)": 15.03, "step": 26620, "train_speed(iter/s)": 1.473023 }, { "acc": 1.0, "epoch": 46.9991173874669, "grad_norm": 0.011740664951503277, "learning_rate": 9.645945370875071e-08, "loss": 0.00184639, "memory(GiB)": 15.03, "step": 26625, "train_speed(iter/s)": 1.473015 }, { "acc": 1.0, "epoch": 47.00794351279788, "grad_norm": 0.14531362056732178, "learning_rate": 9.58922041693787e-08, "loss": 0.00110758, "memory(GiB)": 15.03, "step": 26630, "train_speed(iter/s)": 1.472971 }, { "acc": 1.0, "epoch": 47.016769638128864, "grad_norm": 0.0016841908218339086, "learning_rate": 9.532662889402647e-08, "loss": 0.00274472, "memory(GiB)": 15.03, "step": 26635, "train_speed(iter/s)": 1.472986 }, { "acc": 1.0, "epoch": 47.02559576345984, "grad_norm": 0.006933190859854221, "learning_rate": 9.476272807576136e-08, "loss": 0.00194391, "memory(GiB)": 15.03, "step": 26640, "train_speed(iter/s)": 1.473006 }, { "acc": 1.0, "epoch": 47.034421888790824, "grad_norm": 0.31645718216896057, "learning_rate": 9.420050190708069e-08, "loss": 0.00346589, "memory(GiB)": 15.03, "step": 26645, "train_speed(iter/s)": 1.473023 }, { "acc": 1.0, "epoch": 47.0432480141218, "grad_norm": 0.009442285634577274, "learning_rate": 9.363995057990729e-08, "loss": 0.00372572, "memory(GiB)": 15.03, "step": 26650, "train_speed(iter/s)": 1.473029 }, { "acc": 0.99975491, "epoch": 47.05207413945278, "grad_norm": 0.03485236316919327, "learning_rate": 9.30810742855956e-08, "loss": 0.00531134, "memory(GiB)": 15.03, "step": 26655, "train_speed(iter/s)": 1.473027 }, { "acc": 0.99986706, "epoch": 47.06090026478376, "grad_norm": 0.304684042930603, "learning_rate": 9.252387321492558e-08, "loss": 0.00428751, "memory(GiB)": 15.03, "step": 26660, "train_speed(iter/s)": 1.473024 }, { "acc": 1.0, "epoch": 47.06972639011474, "grad_norm": 0.08893228322267532, "learning_rate": 9.196834755810826e-08, "loss": 0.00112254, "memory(GiB)": 15.03, "step": 26665, "train_speed(iter/s)": 1.473014 }, { "acc": 1.0, "epoch": 47.07855251544572, "grad_norm": 0.22652678191661835, "learning_rate": 9.141449750477852e-08, "loss": 0.0019451, "memory(GiB)": 15.03, "step": 26670, "train_speed(iter/s)": 1.473031 }, { "acc": 1.0, "epoch": 47.0873786407767, "grad_norm": 0.020408980548381805, "learning_rate": 9.086232324400342e-08, "loss": 0.00294371, "memory(GiB)": 15.03, "step": 26675, "train_speed(iter/s)": 1.473045 }, { "acc": 1.0, "epoch": 47.096204766107675, "grad_norm": 0.00819114688783884, "learning_rate": 9.031182496427555e-08, "loss": 0.00306746, "memory(GiB)": 15.03, "step": 26680, "train_speed(iter/s)": 1.473047 }, { "acc": 1.0, "epoch": 47.10503089143866, "grad_norm": 0.09762845933437347, "learning_rate": 8.97630028535169e-08, "loss": 0.00327378, "memory(GiB)": 15.03, "step": 26685, "train_speed(iter/s)": 1.473056 }, { "acc": 1.0, "epoch": 47.113857016769636, "grad_norm": 0.011004800908267498, "learning_rate": 8.921585709907548e-08, "loss": 0.0010765, "memory(GiB)": 15.03, "step": 26690, "train_speed(iter/s)": 1.473053 }, { "acc": 1.0, "epoch": 47.12268314210062, "grad_norm": 0.38738906383514404, "learning_rate": 8.867038788772765e-08, "loss": 0.00470481, "memory(GiB)": 15.03, "step": 26695, "train_speed(iter/s)": 1.47305 }, { "acc": 1.0, "epoch": 47.131509267431596, "grad_norm": 0.14383654296398163, "learning_rate": 8.812659540567863e-08, "loss": 0.00137737, "memory(GiB)": 15.03, "step": 26700, "train_speed(iter/s)": 1.473051 }, { "acc": 1.0, "epoch": 47.14033539276258, "grad_norm": 0.3842445909976959, "learning_rate": 8.758447983856022e-08, "loss": 0.00537832, "memory(GiB)": 15.03, "step": 26705, "train_speed(iter/s)": 1.473053 }, { "acc": 1.0, "epoch": 47.14916151809356, "grad_norm": 0.010189858265221119, "learning_rate": 8.704404137143254e-08, "loss": 0.00203278, "memory(GiB)": 15.03, "step": 26710, "train_speed(iter/s)": 1.473049 }, { "acc": 1.0, "epoch": 47.15798764342453, "grad_norm": 0.11686785519123077, "learning_rate": 8.650528018878117e-08, "loss": 0.004479, "memory(GiB)": 15.03, "step": 26715, "train_speed(iter/s)": 1.473055 }, { "acc": 1.0, "epoch": 47.16681376875552, "grad_norm": 0.5285544395446777, "learning_rate": 8.596819647452121e-08, "loss": 0.00513964, "memory(GiB)": 15.03, "step": 26720, "train_speed(iter/s)": 1.473061 }, { "acc": 1.0, "epoch": 47.175639894086494, "grad_norm": 0.40149301290512085, "learning_rate": 8.543279041199595e-08, "loss": 0.00394234, "memory(GiB)": 15.03, "step": 26725, "train_speed(iter/s)": 1.473054 }, { "acc": 1.0, "epoch": 47.18446601941748, "grad_norm": 0.020375670865178108, "learning_rate": 8.489906218397256e-08, "loss": 0.00047338, "memory(GiB)": 15.03, "step": 26730, "train_speed(iter/s)": 1.473059 }, { "acc": 1.0, "epoch": 47.193292144748455, "grad_norm": 0.38913071155548096, "learning_rate": 8.436701197264874e-08, "loss": 0.00345652, "memory(GiB)": 15.03, "step": 26735, "train_speed(iter/s)": 1.473059 }, { "acc": 1.0, "epoch": 47.20211827007944, "grad_norm": 0.8714988231658936, "learning_rate": 8.38366399596477e-08, "loss": 0.00276513, "memory(GiB)": 15.03, "step": 26740, "train_speed(iter/s)": 1.47306 }, { "acc": 1.0, "epoch": 47.210944395410415, "grad_norm": 0.19199033081531525, "learning_rate": 8.330794632601981e-08, "loss": 0.00369646, "memory(GiB)": 15.03, "step": 26745, "train_speed(iter/s)": 1.473049 }, { "acc": 1.0, "epoch": 47.21977052074139, "grad_norm": 0.056635938584804535, "learning_rate": 8.278093125224376e-08, "loss": 0.00253485, "memory(GiB)": 15.03, "step": 26750, "train_speed(iter/s)": 1.473049 }, { "acc": 1.0, "epoch": 47.228596646072376, "grad_norm": 0.4173620045185089, "learning_rate": 8.225559491822261e-08, "loss": 0.00529503, "memory(GiB)": 15.03, "step": 26755, "train_speed(iter/s)": 1.473063 }, { "acc": 1.0, "epoch": 47.23742277140335, "grad_norm": 0.005296208430081606, "learning_rate": 8.173193750329e-08, "loss": 0.00469217, "memory(GiB)": 15.03, "step": 26760, "train_speed(iter/s)": 1.473064 }, { "acc": 1.0, "epoch": 47.246248896734336, "grad_norm": 0.004345441237092018, "learning_rate": 8.120995918620334e-08, "loss": 0.00255713, "memory(GiB)": 15.03, "step": 26765, "train_speed(iter/s)": 1.47308 }, { "acc": 1.0, "epoch": 47.25507502206531, "grad_norm": 0.0023781489580869675, "learning_rate": 8.068966014514953e-08, "loss": 0.00386355, "memory(GiB)": 15.03, "step": 26770, "train_speed(iter/s)": 1.473075 }, { "acc": 1.0, "epoch": 47.26390114739629, "grad_norm": 0.014511221088469028, "learning_rate": 8.017104055773812e-08, "loss": 0.00220456, "memory(GiB)": 15.03, "step": 26775, "train_speed(iter/s)": 1.473075 }, { "acc": 1.0, "epoch": 47.27272727272727, "grad_norm": 0.25516384840011597, "learning_rate": 7.965410060101033e-08, "loss": 0.00383869, "memory(GiB)": 15.03, "step": 26780, "train_speed(iter/s)": 1.473091 }, { "acc": 1.0, "epoch": 47.28155339805825, "grad_norm": 0.16115789115428925, "learning_rate": 7.913884045143016e-08, "loss": 0.0023836, "memory(GiB)": 15.03, "step": 26785, "train_speed(iter/s)": 1.473093 }, { "acc": 1.0, "epoch": 47.290379523389234, "grad_norm": 0.33181050419807434, "learning_rate": 7.862526028489149e-08, "loss": 0.0023119, "memory(GiB)": 15.03, "step": 26790, "train_speed(iter/s)": 1.473104 }, { "acc": 1.0, "epoch": 47.29920564872021, "grad_norm": 0.002675761468708515, "learning_rate": 7.811336027671045e-08, "loss": 0.00087895, "memory(GiB)": 15.03, "step": 26795, "train_speed(iter/s)": 1.473122 }, { "acc": 1.0, "epoch": 47.308031774051194, "grad_norm": 0.01615305244922638, "learning_rate": 7.760314060163306e-08, "loss": 0.00528506, "memory(GiB)": 15.03, "step": 26800, "train_speed(iter/s)": 1.473127 }, { "acc": 0.99986115, "epoch": 47.31685789938217, "grad_norm": 0.21001294255256653, "learning_rate": 7.709460143383148e-08, "loss": 0.00410699, "memory(GiB)": 15.03, "step": 26805, "train_speed(iter/s)": 1.473128 }, { "acc": 0.99969511, "epoch": 47.32568402471315, "grad_norm": 0.1026991605758667, "learning_rate": 7.65877429469028e-08, "loss": 0.00517211, "memory(GiB)": 15.03, "step": 26810, "train_speed(iter/s)": 1.473113 }, { "acc": 1.0, "epoch": 47.33451015004413, "grad_norm": 0.25819605588912964, "learning_rate": 7.608256531387132e-08, "loss": 0.00256314, "memory(GiB)": 15.03, "step": 26815, "train_speed(iter/s)": 1.473128 }, { "acc": 1.0, "epoch": 47.34333627537511, "grad_norm": 0.26540201902389526, "learning_rate": 7.557906870718679e-08, "loss": 0.00410912, "memory(GiB)": 15.03, "step": 26820, "train_speed(iter/s)": 1.473112 }, { "acc": 1.0, "epoch": 47.35216240070609, "grad_norm": 0.3447560966014862, "learning_rate": 7.507725329872622e-08, "loss": 0.0020496, "memory(GiB)": 15.03, "step": 26825, "train_speed(iter/s)": 1.473108 }, { "acc": 1.0, "epoch": 47.36098852603707, "grad_norm": 0.010331607423722744, "learning_rate": 7.457711925979096e-08, "loss": 0.00106623, "memory(GiB)": 15.03, "step": 26830, "train_speed(iter/s)": 1.473117 }, { "acc": 1.0, "epoch": 47.36981465136805, "grad_norm": 0.006212849635630846, "learning_rate": 7.407866676111129e-08, "loss": 0.00080528, "memory(GiB)": 15.03, "step": 26835, "train_speed(iter/s)": 1.473118 }, { "acc": 1.0, "epoch": 47.37864077669903, "grad_norm": 0.10219041258096695, "learning_rate": 7.358189597284012e-08, "loss": 0.00260388, "memory(GiB)": 15.03, "step": 26840, "train_speed(iter/s)": 1.473138 }, { "acc": 1.0, "epoch": 47.387466902030006, "grad_norm": 0.3406679034233093, "learning_rate": 7.308680706455877e-08, "loss": 0.00272309, "memory(GiB)": 15.03, "step": 26845, "train_speed(iter/s)": 1.473144 }, { "acc": 1.0, "epoch": 47.39629302736099, "grad_norm": 0.036309614777565, "learning_rate": 7.25934002052729e-08, "loss": 0.00529073, "memory(GiB)": 15.03, "step": 26850, "train_speed(iter/s)": 1.473154 }, { "acc": 0.99921875, "epoch": 47.40511915269197, "grad_norm": 0.7638635635375977, "learning_rate": 7.210167556341534e-08, "loss": 0.00347959, "memory(GiB)": 15.03, "step": 26855, "train_speed(iter/s)": 1.47316 }, { "acc": 1.0, "epoch": 47.41394527802295, "grad_norm": 0.2565583288669586, "learning_rate": 7.161163330684336e-08, "loss": 0.00142536, "memory(GiB)": 15.03, "step": 26860, "train_speed(iter/s)": 1.473156 }, { "acc": 1.0, "epoch": 47.42277140335393, "grad_norm": 0.016027215868234634, "learning_rate": 7.112327360284086e-08, "loss": 0.00140367, "memory(GiB)": 15.03, "step": 26865, "train_speed(iter/s)": 1.473166 }, { "acc": 1.0, "epoch": 47.431597528684904, "grad_norm": 0.23328515887260437, "learning_rate": 7.063659661811667e-08, "loss": 0.00263601, "memory(GiB)": 15.03, "step": 26870, "train_speed(iter/s)": 1.473173 }, { "acc": 1.0, "epoch": 47.44042365401589, "grad_norm": 0.3968731760978699, "learning_rate": 7.015160251880574e-08, "loss": 0.00492551, "memory(GiB)": 15.03, "step": 26875, "train_speed(iter/s)": 1.473174 }, { "acc": 1.0, "epoch": 47.449249779346864, "grad_norm": 0.15027831494808197, "learning_rate": 6.966829147046848e-08, "loss": 0.0031626, "memory(GiB)": 15.03, "step": 26880, "train_speed(iter/s)": 1.473169 }, { "acc": 1.0, "epoch": 47.45807590467785, "grad_norm": 0.2565126419067383, "learning_rate": 6.918666363808976e-08, "loss": 0.00412102, "memory(GiB)": 15.03, "step": 26885, "train_speed(iter/s)": 1.473173 }, { "acc": 1.0, "epoch": 47.466902030008825, "grad_norm": 0.22671495378017426, "learning_rate": 6.870671918608159e-08, "loss": 0.00717496, "memory(GiB)": 15.03, "step": 26890, "train_speed(iter/s)": 1.473163 }, { "acc": 1.0, "epoch": 47.47572815533981, "grad_norm": 0.01320321299135685, "learning_rate": 6.822845827828041e-08, "loss": 0.00259847, "memory(GiB)": 15.03, "step": 26895, "train_speed(iter/s)": 1.473166 }, { "acc": 1.0, "epoch": 47.484554280670785, "grad_norm": 0.517720103263855, "learning_rate": 6.775188107794761e-08, "loss": 0.00886712, "memory(GiB)": 15.03, "step": 26900, "train_speed(iter/s)": 1.473176 }, { "acc": 1.0, "epoch": 47.49338040600176, "grad_norm": 0.3107139468193054, "learning_rate": 6.727698774777123e-08, "loss": 0.00181823, "memory(GiB)": 15.03, "step": 26905, "train_speed(iter/s)": 1.473177 }, { "acc": 1.0, "epoch": 47.502206531332746, "grad_norm": 0.1668396145105362, "learning_rate": 6.680377844986147e-08, "loss": 0.00625396, "memory(GiB)": 15.03, "step": 26910, "train_speed(iter/s)": 1.473168 }, { "acc": 1.0, "epoch": 47.51103265666372, "grad_norm": 0.1990807205438614, "learning_rate": 6.63322533457574e-08, "loss": 0.00081109, "memory(GiB)": 15.03, "step": 26915, "train_speed(iter/s)": 1.473176 }, { "acc": 1.0, "epoch": 47.51985878199471, "grad_norm": 0.5430948138237, "learning_rate": 6.58624125964219e-08, "loss": 0.00848846, "memory(GiB)": 15.03, "step": 26920, "train_speed(iter/s)": 1.473191 }, { "acc": 1.0, "epoch": 47.52868490732568, "grad_norm": 0.0033965723123401403, "learning_rate": 6.539425636224011e-08, "loss": 0.00216965, "memory(GiB)": 15.03, "step": 26925, "train_speed(iter/s)": 1.473193 }, { "acc": 1.0, "epoch": 47.53751103265667, "grad_norm": 0.09635604918003082, "learning_rate": 6.49277848030265e-08, "loss": 0.00258555, "memory(GiB)": 15.03, "step": 26930, "train_speed(iter/s)": 1.473197 }, { "acc": 1.0, "epoch": 47.546337157987644, "grad_norm": 0.3408370912075043, "learning_rate": 6.44629980780172e-08, "loss": 0.00261297, "memory(GiB)": 15.03, "step": 26935, "train_speed(iter/s)": 1.473198 }, { "acc": 1.0, "epoch": 47.55516328331862, "grad_norm": 0.17715026438236237, "learning_rate": 6.399989634587554e-08, "loss": 0.0012833, "memory(GiB)": 15.03, "step": 26940, "train_speed(iter/s)": 1.473188 }, { "acc": 1.0, "epoch": 47.563989408649604, "grad_norm": 0.006403123959898949, "learning_rate": 6.3538479764687e-08, "loss": 0.0035953, "memory(GiB)": 15.03, "step": 26945, "train_speed(iter/s)": 1.473193 }, { "acc": 0.9998106, "epoch": 47.57281553398058, "grad_norm": 0.3102474808692932, "learning_rate": 6.307874849196427e-08, "loss": 0.00445283, "memory(GiB)": 15.03, "step": 26950, "train_speed(iter/s)": 1.473204 }, { "acc": 1.0, "epoch": 47.581641659311565, "grad_norm": 0.37424805760383606, "learning_rate": 6.262070268464334e-08, "loss": 0.00271159, "memory(GiB)": 15.03, "step": 26955, "train_speed(iter/s)": 1.473226 }, { "acc": 1.0, "epoch": 47.59046778464254, "grad_norm": 0.3627254366874695, "learning_rate": 6.216434249908625e-08, "loss": 0.00471603, "memory(GiB)": 15.03, "step": 26960, "train_speed(iter/s)": 1.473231 }, { "acc": 1.0, "epoch": 47.59929390997352, "grad_norm": 0.0331205315887928, "learning_rate": 6.170966809107725e-08, "loss": 0.0040792, "memory(GiB)": 15.03, "step": 26965, "train_speed(iter/s)": 1.473225 }, { "acc": 1.0, "epoch": 47.6081200353045, "grad_norm": 0.17257475852966309, "learning_rate": 6.125667961582661e-08, "loss": 0.00180028, "memory(GiB)": 15.03, "step": 26970, "train_speed(iter/s)": 1.473224 }, { "acc": 1.0, "epoch": 47.61694616063548, "grad_norm": 0.03326547145843506, "learning_rate": 6.080537722797021e-08, "loss": 0.00220985, "memory(GiB)": 15.03, "step": 26975, "train_speed(iter/s)": 1.473232 }, { "acc": 0.99986115, "epoch": 47.62577228596646, "grad_norm": 0.14243143796920776, "learning_rate": 6.035576108156658e-08, "loss": 0.00333641, "memory(GiB)": 15.03, "step": 26980, "train_speed(iter/s)": 1.473232 }, { "acc": 1.0, "epoch": 47.63459841129744, "grad_norm": 0.28388893604278564, "learning_rate": 5.990783133009815e-08, "loss": 0.00287238, "memory(GiB)": 15.03, "step": 26985, "train_speed(iter/s)": 1.473241 }, { "acc": 1.0, "epoch": 47.64342453662842, "grad_norm": 0.5142057538032532, "learning_rate": 5.9461588126473984e-08, "loss": 0.00675119, "memory(GiB)": 15.03, "step": 26990, "train_speed(iter/s)": 1.47323 }, { "acc": 0.99970236, "epoch": 47.6522506619594, "grad_norm": 0.43066757917404175, "learning_rate": 5.9017031623025865e-08, "loss": 0.00796808, "memory(GiB)": 15.03, "step": 26995, "train_speed(iter/s)": 1.473234 }, { "acc": 1.0, "epoch": 47.661076787290376, "grad_norm": 0.0015271470183506608, "learning_rate": 5.857416197150941e-08, "loss": 0.00082881, "memory(GiB)": 15.03, "step": 27000, "train_speed(iter/s)": 1.47324 }, { "acc": 0.99980774, "epoch": 47.66990291262136, "grad_norm": 0.20940959453582764, "learning_rate": 5.8132979323106354e-08, "loss": 0.00399929, "memory(GiB)": 15.03, "step": 27005, "train_speed(iter/s)": 1.473239 }, { "acc": 1.0, "epoch": 47.67872903795234, "grad_norm": 0.30160290002822876, "learning_rate": 5.7693483828419495e-08, "loss": 0.00347737, "memory(GiB)": 15.03, "step": 27010, "train_speed(iter/s)": 1.473231 }, { "acc": 1.0, "epoch": 47.68755516328332, "grad_norm": 0.200776144862175, "learning_rate": 5.7255675637478226e-08, "loss": 0.00298724, "memory(GiB)": 15.03, "step": 27015, "train_speed(iter/s)": 1.47323 }, { "acc": 1.0, "epoch": 47.6963812886143, "grad_norm": 0.17851854860782623, "learning_rate": 5.6819554899735826e-08, "loss": 0.00203778, "memory(GiB)": 15.03, "step": 27020, "train_speed(iter/s)": 1.473247 }, { "acc": 1.0, "epoch": 47.70520741394528, "grad_norm": 0.07262735068798065, "learning_rate": 5.638512176406831e-08, "loss": 0.00415809, "memory(GiB)": 15.03, "step": 27025, "train_speed(iter/s)": 1.473249 }, { "acc": 0.9998106, "epoch": 47.71403353927626, "grad_norm": 0.2154141068458557, "learning_rate": 5.595237637877609e-08, "loss": 0.0054473, "memory(GiB)": 15.03, "step": 27030, "train_speed(iter/s)": 1.473256 }, { "acc": 1.0, "epoch": 47.722859664607235, "grad_norm": 0.4268483817577362, "learning_rate": 5.552131889158344e-08, "loss": 0.00321139, "memory(GiB)": 15.03, "step": 27035, "train_speed(iter/s)": 1.473273 }, { "acc": 0.99980774, "epoch": 47.73168578993822, "grad_norm": 0.01435669045895338, "learning_rate": 5.5091949449638475e-08, "loss": 0.00271834, "memory(GiB)": 15.03, "step": 27040, "train_speed(iter/s)": 1.47327 }, { "acc": 1.0, "epoch": 47.740511915269195, "grad_norm": 0.3092576861381531, "learning_rate": 5.466426819951428e-08, "loss": 0.00462388, "memory(GiB)": 15.03, "step": 27045, "train_speed(iter/s)": 1.473273 }, { "acc": 1.0, "epoch": 47.74933804060018, "grad_norm": 0.06634558737277985, "learning_rate": 5.423827528720503e-08, "loss": 0.00056596, "memory(GiB)": 15.03, "step": 27050, "train_speed(iter/s)": 1.473277 }, { "acc": 1.0, "epoch": 47.758164165931156, "grad_norm": 0.22568997740745544, "learning_rate": 5.3813970858130374e-08, "loss": 0.00302129, "memory(GiB)": 15.03, "step": 27055, "train_speed(iter/s)": 1.47328 }, { "acc": 1.0, "epoch": 47.76699029126213, "grad_norm": 0.07266046851873398, "learning_rate": 5.3391355057133863e-08, "loss": 0.00755166, "memory(GiB)": 15.03, "step": 27060, "train_speed(iter/s)": 1.473275 }, { "acc": 0.99982872, "epoch": 47.775816416593116, "grad_norm": 0.23710238933563232, "learning_rate": 5.2970428028481755e-08, "loss": 0.00959535, "memory(GiB)": 15.03, "step": 27065, "train_speed(iter/s)": 1.473272 }, { "acc": 1.0, "epoch": 47.78464254192409, "grad_norm": 0.2732861638069153, "learning_rate": 5.255118991586307e-08, "loss": 0.00890543, "memory(GiB)": 15.03, "step": 27070, "train_speed(iter/s)": 1.473268 }, { "acc": 1.0, "epoch": 47.79346866725508, "grad_norm": 0.07709257304668427, "learning_rate": 5.213364086239233e-08, "loss": 0.00301626, "memory(GiB)": 15.03, "step": 27075, "train_speed(iter/s)": 1.473264 }, { "acc": 1.0, "epoch": 47.80229479258605, "grad_norm": 0.0344352200627327, "learning_rate": 5.17177810106057e-08, "loss": 0.00251565, "memory(GiB)": 15.03, "step": 27080, "train_speed(iter/s)": 1.473261 }, { "acc": 1.0, "epoch": 47.81112091791704, "grad_norm": 0.24972370266914368, "learning_rate": 5.130361050246431e-08, "loss": 0.00366037, "memory(GiB)": 15.03, "step": 27085, "train_speed(iter/s)": 1.473264 }, { "acc": 1.0, "epoch": 47.819947043248014, "grad_norm": 0.7136586904525757, "learning_rate": 5.0891129479350345e-08, "loss": 0.00358735, "memory(GiB)": 15.03, "step": 27090, "train_speed(iter/s)": 1.473273 }, { "acc": 0.99970236, "epoch": 47.82877316857899, "grad_norm": 0.4227483570575714, "learning_rate": 5.048033808207153e-08, "loss": 0.00868242, "memory(GiB)": 15.03, "step": 27095, "train_speed(iter/s)": 1.473273 }, { "acc": 1.0, "epoch": 47.837599293909975, "grad_norm": 0.1033647432923317, "learning_rate": 5.0071236450857765e-08, "loss": 0.00195802, "memory(GiB)": 15.03, "step": 27100, "train_speed(iter/s)": 1.473259 }, { "acc": 1.0, "epoch": 47.84642541924095, "grad_norm": 0.20788608491420746, "learning_rate": 4.9663824725362254e-08, "loss": 0.0029605, "memory(GiB)": 15.03, "step": 27105, "train_speed(iter/s)": 1.473243 }, { "acc": 1.0, "epoch": 47.855251544571935, "grad_norm": 0.8585655689239502, "learning_rate": 4.925810304466149e-08, "loss": 0.00581486, "memory(GiB)": 15.03, "step": 27110, "train_speed(iter/s)": 1.473252 }, { "acc": 1.0, "epoch": 47.86407766990291, "grad_norm": 0.18774205446243286, "learning_rate": 4.885407154725416e-08, "loss": 0.00442969, "memory(GiB)": 15.03, "step": 27115, "train_speed(iter/s)": 1.473242 }, { "acc": 1.0, "epoch": 47.872903795233896, "grad_norm": 0.6097727417945862, "learning_rate": 4.84517303710639e-08, "loss": 0.00468205, "memory(GiB)": 15.03, "step": 27120, "train_speed(iter/s)": 1.473242 }, { "acc": 1.0, "epoch": 47.88172992056487, "grad_norm": 0.26151037216186523, "learning_rate": 4.8051079653434336e-08, "loss": 0.002892, "memory(GiB)": 15.03, "step": 27125, "train_speed(iter/s)": 1.473237 }, { "acc": 1.0, "epoch": 47.89055604589585, "grad_norm": 0.18570730090141296, "learning_rate": 4.765211953113626e-08, "loss": 0.005313, "memory(GiB)": 15.03, "step": 27130, "train_speed(iter/s)": 1.473244 }, { "acc": 1.0, "epoch": 47.89938217122683, "grad_norm": 0.002474906388670206, "learning_rate": 4.72548501403582e-08, "loss": 0.00162434, "memory(GiB)": 15.03, "step": 27135, "train_speed(iter/s)": 1.473247 }, { "acc": 1.0, "epoch": 47.90820829655781, "grad_norm": 0.0012396823149174452, "learning_rate": 4.6859271616715894e-08, "loss": 0.00530102, "memory(GiB)": 15.03, "step": 27140, "train_speed(iter/s)": 1.473248 }, { "acc": 1.0, "epoch": 47.91703442188879, "grad_norm": 0.37435001134872437, "learning_rate": 4.646538409524612e-08, "loss": 0.00376816, "memory(GiB)": 15.03, "step": 27145, "train_speed(iter/s)": 1.473258 }, { "acc": 1.0, "epoch": 47.92586054721977, "grad_norm": 0.21871733665466309, "learning_rate": 4.607318771040789e-08, "loss": 0.00604336, "memory(GiB)": 15.03, "step": 27150, "train_speed(iter/s)": 1.473248 }, { "acc": 1.0, "epoch": 47.93468667255075, "grad_norm": 0.23082900047302246, "learning_rate": 4.568268259608403e-08, "loss": 0.00880555, "memory(GiB)": 15.03, "step": 27155, "train_speed(iter/s)": 1.473247 }, { "acc": 1.0, "epoch": 47.94351279788173, "grad_norm": 0.17510417103767395, "learning_rate": 4.5293868885579014e-08, "loss": 0.00194181, "memory(GiB)": 15.03, "step": 27160, "train_speed(iter/s)": 1.473248 }, { "acc": 1.0, "epoch": 47.95233892321271, "grad_norm": 0.12352573126554489, "learning_rate": 4.490674671162114e-08, "loss": 0.00190689, "memory(GiB)": 15.03, "step": 27165, "train_speed(iter/s)": 1.473253 }, { "acc": 1.0, "epoch": 47.96116504854369, "grad_norm": 0.33449774980545044, "learning_rate": 4.4521316206360394e-08, "loss": 0.00433835, "memory(GiB)": 15.03, "step": 27170, "train_speed(iter/s)": 1.473254 }, { "acc": 1.0, "epoch": 47.96999117387467, "grad_norm": 0.3485816717147827, "learning_rate": 4.41375775013689e-08, "loss": 0.00366558, "memory(GiB)": 15.03, "step": 27175, "train_speed(iter/s)": 1.473257 }, { "acc": 1.0, "epoch": 47.97881729920565, "grad_norm": 0.17968390882015228, "learning_rate": 4.37555307276421e-08, "loss": 0.00426004, "memory(GiB)": 15.03, "step": 27180, "train_speed(iter/s)": 1.47325 }, { "acc": 1.0, "epoch": 47.98764342453663, "grad_norm": 0.028042936697602272, "learning_rate": 4.3375176015598164e-08, "loss": 0.00101634, "memory(GiB)": 15.03, "step": 27185, "train_speed(iter/s)": 1.473261 }, { "acc": 1.0, "epoch": 47.996469549867605, "grad_norm": 0.0078061786480247974, "learning_rate": 4.2996513495076336e-08, "loss": 0.00386355, "memory(GiB)": 15.03, "step": 27190, "train_speed(iter/s)": 1.473267 }, { "acc": 1.0, "epoch": 48.00529567519859, "grad_norm": 0.13035240769386292, "learning_rate": 4.2619543295339203e-08, "loss": 0.00319789, "memory(GiB)": 15.03, "step": 27195, "train_speed(iter/s)": 1.473227 }, { "acc": 1.0, "epoch": 48.014121800529566, "grad_norm": 0.0028925074730068445, "learning_rate": 4.224426554507147e-08, "loss": 0.00228903, "memory(GiB)": 15.03, "step": 27200, "train_speed(iter/s)": 1.473238 }, { "acc": 1.0, "epoch": 48.02294792586055, "grad_norm": 0.4550168514251709, "learning_rate": 4.1870680372380084e-08, "loss": 0.00733901, "memory(GiB)": 15.03, "step": 27205, "train_speed(iter/s)": 1.473249 }, { "acc": 1.0, "epoch": 48.031774051191526, "grad_norm": 0.02155207097530365, "learning_rate": 4.1498787904794144e-08, "loss": 0.00013895, "memory(GiB)": 15.03, "step": 27210, "train_speed(iter/s)": 1.473248 }, { "acc": 1.0, "epoch": 48.04060017652251, "grad_norm": 0.14932265877723694, "learning_rate": 4.1128588269264394e-08, "loss": 0.00041626, "memory(GiB)": 15.03, "step": 27215, "train_speed(iter/s)": 1.473244 }, { "acc": 1.0, "epoch": 48.04942630185349, "grad_norm": 0.21899236738681793, "learning_rate": 4.0760081592164325e-08, "loss": 0.00214826, "memory(GiB)": 15.03, "step": 27220, "train_speed(iter/s)": 1.473251 }, { "acc": 1.0, "epoch": 48.05825242718446, "grad_norm": 0.14799770712852478, "learning_rate": 4.0393267999290725e-08, "loss": 0.00248232, "memory(GiB)": 15.03, "step": 27225, "train_speed(iter/s)": 1.473261 }, { "acc": 1.0, "epoch": 48.06707855251545, "grad_norm": 0.21420696377754211, "learning_rate": 4.002814761585921e-08, "loss": 0.00272087, "memory(GiB)": 15.03, "step": 27230, "train_speed(iter/s)": 1.473263 }, { "acc": 1.0, "epoch": 48.075904677846424, "grad_norm": 0.298247367143631, "learning_rate": 3.966472056651095e-08, "loss": 0.00223833, "memory(GiB)": 15.03, "step": 27235, "train_speed(iter/s)": 1.473258 }, { "acc": 1.0, "epoch": 48.08473080317741, "grad_norm": 0.05739869922399521, "learning_rate": 3.930298697530596e-08, "loss": 0.00348348, "memory(GiB)": 15.03, "step": 27240, "train_speed(iter/s)": 1.473265 }, { "acc": 1.0, "epoch": 48.093556928508384, "grad_norm": 0.004962578881531954, "learning_rate": 3.8942946965728656e-08, "loss": 0.00359535, "memory(GiB)": 15.03, "step": 27245, "train_speed(iter/s)": 1.473274 }, { "acc": 1.0, "epoch": 48.10238305383936, "grad_norm": 0.46742650866508484, "learning_rate": 3.8584600660683425e-08, "loss": 0.00522551, "memory(GiB)": 15.03, "step": 27250, "train_speed(iter/s)": 1.473274 }, { "acc": 1.0, "epoch": 48.111209179170345, "grad_norm": 0.6633155941963196, "learning_rate": 3.8227948182499045e-08, "loss": 0.00955814, "memory(GiB)": 15.03, "step": 27255, "train_speed(iter/s)": 1.473274 }, { "acc": 1.0, "epoch": 48.12003530450132, "grad_norm": 0.30127444863319397, "learning_rate": 3.787298965292207e-08, "loss": 0.00652489, "memory(GiB)": 15.03, "step": 27260, "train_speed(iter/s)": 1.473283 }, { "acc": 1.0, "epoch": 48.128861429832305, "grad_norm": 0.011070081032812595, "learning_rate": 3.751972519312456e-08, "loss": 0.0044845, "memory(GiB)": 15.03, "step": 27265, "train_speed(iter/s)": 1.473278 }, { "acc": 1.0, "epoch": 48.13768755516328, "grad_norm": 0.23844455182552338, "learning_rate": 3.7168154923698517e-08, "loss": 0.00372004, "memory(GiB)": 15.03, "step": 27270, "train_speed(iter/s)": 1.473287 }, { "acc": 1.0, "epoch": 48.146513680494266, "grad_norm": 0.39040419459342957, "learning_rate": 3.681827896465761e-08, "loss": 0.00760888, "memory(GiB)": 15.03, "step": 27275, "train_speed(iter/s)": 1.473297 }, { "acc": 1.0, "epoch": 48.15533980582524, "grad_norm": 0.0018555813003331423, "learning_rate": 3.647009743543823e-08, "loss": 0.00136869, "memory(GiB)": 15.03, "step": 27280, "train_speed(iter/s)": 1.473289 }, { "acc": 1.0, "epoch": 48.16416593115622, "grad_norm": 0.10097295045852661, "learning_rate": 3.612361045489673e-08, "loss": 0.00573644, "memory(GiB)": 15.03, "step": 27285, "train_speed(iter/s)": 1.473288 }, { "acc": 1.0, "epoch": 48.1729920564872, "grad_norm": 0.01528849732130766, "learning_rate": 3.577881814131167e-08, "loss": 0.00237932, "memory(GiB)": 15.03, "step": 27290, "train_speed(iter/s)": 1.473285 }, { "acc": 0.99921875, "epoch": 48.18181818181818, "grad_norm": 0.349522203207016, "learning_rate": 3.543572061238434e-08, "loss": 0.00521769, "memory(GiB)": 15.03, "step": 27295, "train_speed(iter/s)": 1.4733 }, { "acc": 1.0, "epoch": 48.190644307149164, "grad_norm": 0.1322936862707138, "learning_rate": 3.5094317985235433e-08, "loss": 0.00252812, "memory(GiB)": 15.03, "step": 27300, "train_speed(iter/s)": 1.473303 }, { "acc": 1.0, "epoch": 48.19947043248014, "grad_norm": 0.06871441006660461, "learning_rate": 3.47546103764084e-08, "loss": 0.00137939, "memory(GiB)": 15.03, "step": 27305, "train_speed(iter/s)": 1.473299 }, { "acc": 1.0, "epoch": 48.208296557811124, "grad_norm": 0.22959166765213013, "learning_rate": 3.441659790186776e-08, "loss": 0.00355207, "memory(GiB)": 15.03, "step": 27310, "train_speed(iter/s)": 1.473302 }, { "acc": 1.0, "epoch": 48.2171226831421, "grad_norm": 0.02657429315149784, "learning_rate": 3.408028067699966e-08, "loss": 0.00353836, "memory(GiB)": 15.03, "step": 27315, "train_speed(iter/s)": 1.473316 }, { "acc": 1.0, "epoch": 48.22594880847308, "grad_norm": 0.3366581201553345, "learning_rate": 3.374565881660967e-08, "loss": 0.00247226, "memory(GiB)": 15.03, "step": 27320, "train_speed(iter/s)": 1.473335 }, { "acc": 0.99980774, "epoch": 48.23477493380406, "grad_norm": 0.29561087489128113, "learning_rate": 3.341273243492831e-08, "loss": 0.00409422, "memory(GiB)": 15.03, "step": 27325, "train_speed(iter/s)": 1.473353 }, { "acc": 1.0, "epoch": 48.24360105913504, "grad_norm": 0.3175032138824463, "learning_rate": 3.30815016456033e-08, "loss": 0.00475935, "memory(GiB)": 15.03, "step": 27330, "train_speed(iter/s)": 1.47335 }, { "acc": 1.0, "epoch": 48.25242718446602, "grad_norm": 0.018888285383582115, "learning_rate": 3.275196656170731e-08, "loss": 0.00079427, "memory(GiB)": 15.03, "step": 27335, "train_speed(iter/s)": 1.473353 }, { "acc": 1.0, "epoch": 48.261253309797, "grad_norm": 0.249686598777771, "learning_rate": 3.2424127295730215e-08, "loss": 0.00239216, "memory(GiB)": 15.03, "step": 27340, "train_speed(iter/s)": 1.473351 }, { "acc": 1.0, "epoch": 48.270079435127975, "grad_norm": 0.024157950654625893, "learning_rate": 3.20979839595863e-08, "loss": 0.00091231, "memory(GiB)": 15.03, "step": 27345, "train_speed(iter/s)": 1.473345 }, { "acc": 1.0, "epoch": 48.27890556045896, "grad_norm": 0.06579935550689697, "learning_rate": 3.1773536664609236e-08, "loss": 0.00141802, "memory(GiB)": 15.03, "step": 27350, "train_speed(iter/s)": 1.473358 }, { "acc": 0.99986706, "epoch": 48.287731685789936, "grad_norm": 0.17152801156044006, "learning_rate": 3.145078552155436e-08, "loss": 0.00263688, "memory(GiB)": 15.03, "step": 27355, "train_speed(iter/s)": 1.473359 }, { "acc": 1.0, "epoch": 48.29655781112092, "grad_norm": 0.005439462140202522, "learning_rate": 3.112973064059805e-08, "loss": 0.00018187, "memory(GiB)": 15.03, "step": 27360, "train_speed(iter/s)": 1.47335 }, { "acc": 0.99986706, "epoch": 48.305383936451896, "grad_norm": 0.49568378925323486, "learning_rate": 3.0810372131336676e-08, "loss": 0.00618883, "memory(GiB)": 15.03, "step": 27365, "train_speed(iter/s)": 1.473346 }, { "acc": 1.0, "epoch": 48.31421006178288, "grad_norm": 0.49402952194213867, "learning_rate": 3.049271010278824e-08, "loss": 0.00308902, "memory(GiB)": 15.03, "step": 27370, "train_speed(iter/s)": 1.473358 }, { "acc": 1.0, "epoch": 48.32303618711386, "grad_norm": 0.09120181947946548, "learning_rate": 3.017674466339236e-08, "loss": 0.00320229, "memory(GiB)": 15.03, "step": 27375, "train_speed(iter/s)": 1.473365 }, { "acc": 1.0, "epoch": 48.331862312444834, "grad_norm": 0.013701404444873333, "learning_rate": 2.9862475921008625e-08, "loss": 0.00133483, "memory(GiB)": 15.03, "step": 27380, "train_speed(iter/s)": 1.473379 }, { "acc": 0.99975491, "epoch": 48.34068843777582, "grad_norm": 0.23469869792461395, "learning_rate": 2.9549903982916602e-08, "loss": 0.00324795, "memory(GiB)": 15.03, "step": 27385, "train_speed(iter/s)": 1.473375 }, { "acc": 1.0, "epoch": 48.349514563106794, "grad_norm": 0.1822102814912796, "learning_rate": 2.9239028955818587e-08, "loss": 0.00248073, "memory(GiB)": 15.03, "step": 27390, "train_speed(iter/s)": 1.473382 }, { "acc": 1.0, "epoch": 48.35834068843778, "grad_norm": 0.05703435093164444, "learning_rate": 2.8929850945836293e-08, "loss": 0.00134991, "memory(GiB)": 15.03, "step": 27395, "train_speed(iter/s)": 1.473391 }, { "acc": 1.0, "epoch": 48.367166813768755, "grad_norm": 0.16397328674793243, "learning_rate": 2.8622370058511953e-08, "loss": 0.00296749, "memory(GiB)": 15.03, "step": 27400, "train_speed(iter/s)": 1.473389 }, { "acc": 1.0, "epoch": 48.37599293909974, "grad_norm": 0.003640238894149661, "learning_rate": 2.8316586398809987e-08, "loss": 0.0034408, "memory(GiB)": 15.03, "step": 27405, "train_speed(iter/s)": 1.473393 }, { "acc": 1.0, "epoch": 48.384819064430715, "grad_norm": 0.26396411657333374, "learning_rate": 2.8012500071113118e-08, "loss": 0.00909399, "memory(GiB)": 15.03, "step": 27410, "train_speed(iter/s)": 1.473388 }, { "acc": 1.0, "epoch": 48.39364518976169, "grad_norm": 0.309211403131485, "learning_rate": 2.7710111179226805e-08, "loss": 0.00418693, "memory(GiB)": 15.03, "step": 27415, "train_speed(iter/s)": 1.473385 }, { "acc": 1.0, "epoch": 48.402471315092676, "grad_norm": 0.045151665806770325, "learning_rate": 2.7409419826375923e-08, "loss": 0.00450963, "memory(GiB)": 15.03, "step": 27420, "train_speed(iter/s)": 1.473369 }, { "acc": 1.0, "epoch": 48.41129744042365, "grad_norm": 0.4377155303955078, "learning_rate": 2.7110426115206433e-08, "loss": 0.00615567, "memory(GiB)": 15.03, "step": 27425, "train_speed(iter/s)": 1.473374 }, { "acc": 0.9998106, "epoch": 48.420123565754636, "grad_norm": 0.20013323426246643, "learning_rate": 2.6813130147784242e-08, "loss": 0.00561141, "memory(GiB)": 15.03, "step": 27430, "train_speed(iter/s)": 1.473392 }, { "acc": 1.0, "epoch": 48.42894969108561, "grad_norm": 0.03380619361996651, "learning_rate": 2.6517532025595237e-08, "loss": 0.00292441, "memory(GiB)": 15.03, "step": 27435, "train_speed(iter/s)": 1.473405 }, { "acc": 0.99963236, "epoch": 48.43777581641659, "grad_norm": 0.10415429621934891, "learning_rate": 2.6223631849548048e-08, "loss": 0.00340254, "memory(GiB)": 15.03, "step": 27440, "train_speed(iter/s)": 1.473418 }, { "acc": 1.0, "epoch": 48.44660194174757, "grad_norm": 0.3187817633152008, "learning_rate": 2.5931429719968483e-08, "loss": 0.00381322, "memory(GiB)": 15.03, "step": 27445, "train_speed(iter/s)": 1.47343 }, { "acc": 1.0, "epoch": 48.45542806707855, "grad_norm": 0.004765539430081844, "learning_rate": 2.5640925736605107e-08, "loss": 0.00081959, "memory(GiB)": 15.03, "step": 27450, "train_speed(iter/s)": 1.473424 }, { "acc": 1.0, "epoch": 48.464254192409534, "grad_norm": 0.014734429307281971, "learning_rate": 2.5352119998625894e-08, "loss": 0.00155419, "memory(GiB)": 15.03, "step": 27455, "train_speed(iter/s)": 1.47343 }, { "acc": 1.0, "epoch": 48.47308031774051, "grad_norm": 0.02805662713944912, "learning_rate": 2.5065012604618775e-08, "loss": 0.00343379, "memory(GiB)": 15.03, "step": 27460, "train_speed(iter/s)": 1.473433 }, { "acc": 1.0, "epoch": 48.481906443071495, "grad_norm": 0.21633106470108032, "learning_rate": 2.477960365259332e-08, "loss": 0.0020686, "memory(GiB)": 15.03, "step": 27465, "train_speed(iter/s)": 1.473437 }, { "acc": 1.0, "epoch": 48.49073256840247, "grad_norm": 0.00238782144151628, "learning_rate": 2.4495893239976847e-08, "loss": 0.00350133, "memory(GiB)": 15.03, "step": 27470, "train_speed(iter/s)": 1.473434 }, { "acc": 1.0, "epoch": 48.49955869373345, "grad_norm": 0.22826743125915527, "learning_rate": 2.4213881463619407e-08, "loss": 0.00216118, "memory(GiB)": 15.03, "step": 27475, "train_speed(iter/s)": 1.473437 }, { "acc": 1.0, "epoch": 48.50838481906443, "grad_norm": 0.24841509759426117, "learning_rate": 2.3933568419789362e-08, "loss": 0.00410366, "memory(GiB)": 15.03, "step": 27480, "train_speed(iter/s)": 1.473435 }, { "acc": 1.0, "epoch": 48.51721094439541, "grad_norm": 0.09461859613656998, "learning_rate": 2.3654954204176697e-08, "loss": 0.00334712, "memory(GiB)": 15.03, "step": 27485, "train_speed(iter/s)": 1.473442 }, { "acc": 1.0, "epoch": 48.52603706972639, "grad_norm": 0.12710924446582794, "learning_rate": 2.337803891189026e-08, "loss": 0.00445518, "memory(GiB)": 15.03, "step": 27490, "train_speed(iter/s)": 1.47345 }, { "acc": 0.99908581, "epoch": 48.53486319505737, "grad_norm": 0.1920531541109085, "learning_rate": 2.310282263745942e-08, "loss": 0.00481913, "memory(GiB)": 15.03, "step": 27495, "train_speed(iter/s)": 1.473453 }, { "acc": 1.0, "epoch": 48.54368932038835, "grad_norm": 0.43772202730178833, "learning_rate": 2.282930547483351e-08, "loss": 0.00406149, "memory(GiB)": 15.03, "step": 27500, "train_speed(iter/s)": 1.473462 }, { "acc": 1.0, "epoch": 48.55251544571933, "grad_norm": 0.00637431675568223, "learning_rate": 2.2557487517381834e-08, "loss": 0.00188357, "memory(GiB)": 15.03, "step": 27505, "train_speed(iter/s)": 1.473454 }, { "acc": 1.0, "epoch": 48.561341571050306, "grad_norm": 0.8571632504463196, "learning_rate": 2.228736885789365e-08, "loss": 0.01008157, "memory(GiB)": 15.03, "step": 27510, "train_speed(iter/s)": 1.473461 }, { "acc": 1.0, "epoch": 48.57016769638129, "grad_norm": 0.22728943824768066, "learning_rate": 2.2018949588578198e-08, "loss": 0.00258038, "memory(GiB)": 15.03, "step": 27515, "train_speed(iter/s)": 1.473462 }, { "acc": 0.9998106, "epoch": 48.57899382171227, "grad_norm": 0.24188923835754395, "learning_rate": 2.1752229801064676e-08, "loss": 0.00297276, "memory(GiB)": 15.03, "step": 27520, "train_speed(iter/s)": 1.473469 }, { "acc": 1.0, "epoch": 48.58781994704325, "grad_norm": 0.41818344593048096, "learning_rate": 2.148720958640225e-08, "loss": 0.00417644, "memory(GiB)": 15.03, "step": 27525, "train_speed(iter/s)": 1.473462 }, { "acc": 1.0, "epoch": 48.59664607237423, "grad_norm": 0.29575011134147644, "learning_rate": 2.1223889035058938e-08, "loss": 0.00316861, "memory(GiB)": 15.03, "step": 27530, "train_speed(iter/s)": 1.473464 }, { "acc": 1.0, "epoch": 48.605472197705204, "grad_norm": 0.13011972606182098, "learning_rate": 2.0962268236923846e-08, "loss": 0.00301072, "memory(GiB)": 15.03, "step": 27535, "train_speed(iter/s)": 1.473469 }, { "acc": 1.0, "epoch": 48.61429832303619, "grad_norm": 0.20797358453273773, "learning_rate": 2.070234728130493e-08, "loss": 0.00096777, "memory(GiB)": 15.03, "step": 27540, "train_speed(iter/s)": 1.473464 }, { "acc": 1.0, "epoch": 48.623124448367165, "grad_norm": 0.19996193051338196, "learning_rate": 2.044412625693122e-08, "loss": 0.00663886, "memory(GiB)": 15.03, "step": 27545, "train_speed(iter/s)": 1.473457 }, { "acc": 1.0, "epoch": 48.63195057369815, "grad_norm": 0.3426508605480194, "learning_rate": 2.018760525194893e-08, "loss": 0.00286155, "memory(GiB)": 15.03, "step": 27550, "train_speed(iter/s)": 1.473449 }, { "acc": 0.99989033, "epoch": 48.640776699029125, "grad_norm": 0.3434067666530609, "learning_rate": 1.9932784353926485e-08, "loss": 0.01178176, "memory(GiB)": 15.03, "step": 27555, "train_speed(iter/s)": 1.473442 }, { "acc": 1.0, "epoch": 48.64960282436011, "grad_norm": 0.10613273829221725, "learning_rate": 1.9679663649851145e-08, "loss": 0.00312584, "memory(GiB)": 15.03, "step": 27560, "train_speed(iter/s)": 1.473443 }, { "acc": 1.0, "epoch": 48.658428949691086, "grad_norm": 0.2573290467262268, "learning_rate": 1.9428243226129036e-08, "loss": 0.00523417, "memory(GiB)": 15.03, "step": 27565, "train_speed(iter/s)": 1.473445 }, { "acc": 1.0, "epoch": 48.66725507502206, "grad_norm": 0.0016611798200756311, "learning_rate": 1.9178523168586793e-08, "loss": 0.00180284, "memory(GiB)": 15.03, "step": 27570, "train_speed(iter/s)": 1.473442 }, { "acc": 1.0, "epoch": 48.676081200353046, "grad_norm": 0.09515803307294846, "learning_rate": 1.8930503562469925e-08, "loss": 0.00270649, "memory(GiB)": 15.03, "step": 27575, "train_speed(iter/s)": 1.473447 }, { "acc": 1.0, "epoch": 48.68490732568402, "grad_norm": 0.003010798944160342, "learning_rate": 1.8684184492443898e-08, "loss": 0.00233903, "memory(GiB)": 15.03, "step": 27580, "train_speed(iter/s)": 1.473453 }, { "acc": 1.0, "epoch": 48.69373345101501, "grad_norm": 0.02523341402411461, "learning_rate": 1.843956604259303e-08, "loss": 0.00424252, "memory(GiB)": 15.03, "step": 27585, "train_speed(iter/s)": 1.473469 }, { "acc": 1.0, "epoch": 48.70255957634598, "grad_norm": 0.001322363968938589, "learning_rate": 1.8196648296423287e-08, "loss": 0.00508009, "memory(GiB)": 15.03, "step": 27590, "train_speed(iter/s)": 1.473462 }, { "acc": 1.0, "epoch": 48.71138570167697, "grad_norm": 0.2457403689622879, "learning_rate": 1.7955431336856142e-08, "loss": 0.00337981, "memory(GiB)": 15.03, "step": 27595, "train_speed(iter/s)": 1.473463 }, { "acc": 1.0, "epoch": 48.720211827007944, "grad_norm": 0.009654970839619637, "learning_rate": 1.771591524623637e-08, "loss": 0.00072386, "memory(GiB)": 15.03, "step": 27600, "train_speed(iter/s)": 1.47347 }, { "acc": 1.0, "epoch": 48.72903795233892, "grad_norm": 0.033123888075351715, "learning_rate": 1.747810010632595e-08, "loss": 0.00293902, "memory(GiB)": 15.03, "step": 27605, "train_speed(iter/s)": 1.473476 }, { "acc": 1.0, "epoch": 48.737864077669904, "grad_norm": 0.6313555836677551, "learning_rate": 1.7241985998307362e-08, "loss": 0.00506819, "memory(GiB)": 15.03, "step": 27610, "train_speed(iter/s)": 1.473476 }, { "acc": 1.0, "epoch": 48.74669020300088, "grad_norm": 0.15002696216106415, "learning_rate": 1.7007573002780854e-08, "loss": 0.002694, "memory(GiB)": 15.03, "step": 27615, "train_speed(iter/s)": 1.47348 }, { "acc": 1.0, "epoch": 48.755516328331865, "grad_norm": 0.06876680254936218, "learning_rate": 1.677486119976718e-08, "loss": 0.0012453, "memory(GiB)": 15.03, "step": 27620, "train_speed(iter/s)": 1.473476 }, { "acc": 1.0, "epoch": 48.76434245366284, "grad_norm": 0.30599257349967957, "learning_rate": 1.6543850668706503e-08, "loss": 0.00176478, "memory(GiB)": 15.03, "step": 27625, "train_speed(iter/s)": 1.473473 }, { "acc": 1.0, "epoch": 48.77316857899382, "grad_norm": 0.5574575662612915, "learning_rate": 1.6314541488458414e-08, "loss": 0.00563274, "memory(GiB)": 15.03, "step": 27630, "train_speed(iter/s)": 1.473478 }, { "acc": 1.0, "epoch": 48.7819947043248, "grad_norm": 0.17949672043323517, "learning_rate": 1.6086933737299682e-08, "loss": 0.00634441, "memory(GiB)": 15.03, "step": 27635, "train_speed(iter/s)": 1.473461 }, { "acc": 1.0, "epoch": 48.79082082965578, "grad_norm": 0.005325492937117815, "learning_rate": 1.5861027492928158e-08, "loss": 0.00378915, "memory(GiB)": 15.03, "step": 27640, "train_speed(iter/s)": 1.473457 }, { "acc": 1.0, "epoch": 48.79964695498676, "grad_norm": 0.2550700008869171, "learning_rate": 1.5636822832461654e-08, "loss": 0.00253012, "memory(GiB)": 15.03, "step": 27645, "train_speed(iter/s)": 1.473463 }, { "acc": 1.0, "epoch": 48.80847308031774, "grad_norm": 0.0019582805689424276, "learning_rate": 1.5414319832434623e-08, "loss": 0.00419845, "memory(GiB)": 15.03, "step": 27650, "train_speed(iter/s)": 1.473454 }, { "acc": 1.0, "epoch": 48.81729920564872, "grad_norm": 0.12337636202573776, "learning_rate": 1.5193518568802594e-08, "loss": 0.00501247, "memory(GiB)": 15.03, "step": 27655, "train_speed(iter/s)": 1.473455 }, { "acc": 1.0, "epoch": 48.8261253309797, "grad_norm": 0.14504051208496094, "learning_rate": 1.497441911693884e-08, "loss": 0.00404945, "memory(GiB)": 15.03, "step": 27660, "train_speed(iter/s)": 1.473452 }, { "acc": 1.0, "epoch": 48.83495145631068, "grad_norm": 0.24794644117355347, "learning_rate": 1.4757021551636607e-08, "loss": 0.00457284, "memory(GiB)": 15.03, "step": 27665, "train_speed(iter/s)": 1.47345 }, { "acc": 1.0, "epoch": 48.84377758164166, "grad_norm": 0.09679381549358368, "learning_rate": 1.4541325947108544e-08, "loss": 0.0022186, "memory(GiB)": 15.03, "step": 27670, "train_speed(iter/s)": 1.47346 }, { "acc": 1.0, "epoch": 48.85260370697264, "grad_norm": 0.0023926792200654745, "learning_rate": 1.4327332376985602e-08, "loss": 0.00128813, "memory(GiB)": 15.03, "step": 27675, "train_speed(iter/s)": 1.473458 }, { "acc": 1.0, "epoch": 48.86142983230362, "grad_norm": 0.40454593300819397, "learning_rate": 1.4115040914317041e-08, "loss": 0.00306199, "memory(GiB)": 15.03, "step": 27680, "train_speed(iter/s)": 1.473462 }, { "acc": 1.0, "epoch": 48.8702559576346, "grad_norm": 0.04034668579697609, "learning_rate": 1.3904451631572081e-08, "loss": 0.00254595, "memory(GiB)": 15.03, "step": 27685, "train_speed(iter/s)": 1.473463 }, { "acc": 1.0, "epoch": 48.87908208296558, "grad_norm": 0.4895118474960327, "learning_rate": 1.3695564600639355e-08, "loss": 0.00559993, "memory(GiB)": 15.03, "step": 27690, "train_speed(iter/s)": 1.473469 }, { "acc": 0.99984179, "epoch": 48.88790820829656, "grad_norm": 0.27008259296417236, "learning_rate": 1.3488379892824686e-08, "loss": 0.00353913, "memory(GiB)": 15.03, "step": 27695, "train_speed(iter/s)": 1.473468 }, { "acc": 1.0, "epoch": 48.896734333627535, "grad_norm": 0.031295765191316605, "learning_rate": 1.328289757885442e-08, "loss": 0.00768868, "memory(GiB)": 15.03, "step": 27700, "train_speed(iter/s)": 1.47347 }, { "acc": 1.0, "epoch": 48.90556045895852, "grad_norm": 0.15719276666641235, "learning_rate": 1.3079117728872654e-08, "loss": 0.00431711, "memory(GiB)": 15.03, "step": 27705, "train_speed(iter/s)": 1.473474 }, { "acc": 1.0, "epoch": 48.914386584289495, "grad_norm": 0.5180853009223938, "learning_rate": 1.2877040412443438e-08, "loss": 0.00298726, "memory(GiB)": 15.03, "step": 27710, "train_speed(iter/s)": 1.473487 }, { "acc": 1.0, "epoch": 48.92321270962048, "grad_norm": 0.020695962011814117, "learning_rate": 1.2676665698549138e-08, "loss": 0.00647445, "memory(GiB)": 15.03, "step": 27715, "train_speed(iter/s)": 1.473498 }, { "acc": 1.0, "epoch": 48.932038834951456, "grad_norm": 0.0028065654914826155, "learning_rate": 1.247799365558986e-08, "loss": 0.00148277, "memory(GiB)": 15.03, "step": 27720, "train_speed(iter/s)": 1.473485 }, { "acc": 1.0, "epoch": 48.94086496028243, "grad_norm": 0.34508606791496277, "learning_rate": 1.2281024351386227e-08, "loss": 0.00248789, "memory(GiB)": 15.03, "step": 27725, "train_speed(iter/s)": 1.4735 }, { "acc": 1.0, "epoch": 48.949691085613416, "grad_norm": 0.19850590825080872, "learning_rate": 1.208575785317606e-08, "loss": 0.0012737, "memory(GiB)": 15.03, "step": 27730, "train_speed(iter/s)": 1.4735 }, { "acc": 1.0, "epoch": 48.95851721094439, "grad_norm": 0.03680484741926193, "learning_rate": 1.1892194227617149e-08, "loss": 0.00170614, "memory(GiB)": 15.03, "step": 27735, "train_speed(iter/s)": 1.473502 }, { "acc": 1.0, "epoch": 48.96734333627538, "grad_norm": 0.2784964442253113, "learning_rate": 1.1700333540785018e-08, "loss": 0.0023441, "memory(GiB)": 15.03, "step": 27740, "train_speed(iter/s)": 1.473505 }, { "acc": 0.99963236, "epoch": 48.976169461606354, "grad_norm": 0.033287420868873596, "learning_rate": 1.1510175858174615e-08, "loss": 0.00521335, "memory(GiB)": 15.03, "step": 27745, "train_speed(iter/s)": 1.473511 }, { "acc": 1.0, "epoch": 48.98499558693734, "grad_norm": 0.18654878437519073, "learning_rate": 1.1321721244699183e-08, "loss": 0.0018007, "memory(GiB)": 15.03, "step": 27750, "train_speed(iter/s)": 1.473511 }, { "acc": 1.0, "epoch": 48.993821712268314, "grad_norm": 0.10515152662992477, "learning_rate": 1.1134969764690822e-08, "loss": 0.00242103, "memory(GiB)": 15.03, "step": 27755, "train_speed(iter/s)": 1.473518 }, { "acc": 1.0, "epoch": 49.00264783759929, "grad_norm": 0.2023598700761795, "learning_rate": 1.094992148189938e-08, "loss": 0.00397296, "memory(GiB)": 15.03, "step": 27760, "train_speed(iter/s)": 1.473465 }, { "acc": 1.0, "epoch": 49.011473962930275, "grad_norm": 0.2455327957868576, "learning_rate": 1.0766576459494114e-08, "loss": 0.00106996, "memory(GiB)": 15.03, "step": 27765, "train_speed(iter/s)": 1.473458 }, { "acc": 1.0, "epoch": 49.02030008826125, "grad_norm": 0.10062769800424576, "learning_rate": 1.0584934760063694e-08, "loss": 0.00282226, "memory(GiB)": 15.03, "step": 27770, "train_speed(iter/s)": 1.473465 }, { "acc": 1.0, "epoch": 49.029126213592235, "grad_norm": 0.34597525000572205, "learning_rate": 1.0404996445613429e-08, "loss": 0.00602197, "memory(GiB)": 15.03, "step": 27775, "train_speed(iter/s)": 1.473471 }, { "acc": 1.0, "epoch": 49.03795233892321, "grad_norm": 0.2185078114271164, "learning_rate": 1.0226761577568031e-08, "loss": 0.00263742, "memory(GiB)": 15.03, "step": 27780, "train_speed(iter/s)": 1.473472 }, { "acc": 1.0, "epoch": 49.046778464254196, "grad_norm": 0.15935176610946655, "learning_rate": 1.0050230216771075e-08, "loss": 0.00332736, "memory(GiB)": 15.03, "step": 27785, "train_speed(iter/s)": 1.473494 }, { "acc": 0.99986706, "epoch": 49.05560458958517, "grad_norm": 0.04913777858018875, "learning_rate": 9.875402423483884e-09, "loss": 0.00375265, "memory(GiB)": 15.03, "step": 27790, "train_speed(iter/s)": 1.473509 }, { "acc": 1.0, "epoch": 49.06443071491615, "grad_norm": 0.18512719869613647, "learning_rate": 9.70227825738719e-09, "loss": 0.0010898, "memory(GiB)": 15.03, "step": 27795, "train_speed(iter/s)": 1.473509 }, { "acc": 1.0, "epoch": 49.07325684024713, "grad_norm": 0.0074022733606398106, "learning_rate": 9.530857777579465e-09, "loss": 0.00238163, "memory(GiB)": 15.03, "step": 27800, "train_speed(iter/s)": 1.473519 }, { "acc": 1.0, "epoch": 49.08208296557811, "grad_norm": 0.0909709483385086, "learning_rate": 9.361141042576937e-09, "loss": 0.00296688, "memory(GiB)": 15.03, "step": 27805, "train_speed(iter/s)": 1.473523 }, { "acc": 1.0, "epoch": 49.09090909090909, "grad_norm": 0.2249739170074463, "learning_rate": 9.193128110316351e-09, "loss": 0.00619833, "memory(GiB)": 15.03, "step": 27810, "train_speed(iter/s)": 1.473529 }, { "acc": 1.0, "epoch": 49.09973521624007, "grad_norm": 0.04018590971827507, "learning_rate": 9.026819038150533e-09, "loss": 0.00279366, "memory(GiB)": 15.03, "step": 27815, "train_speed(iter/s)": 1.473526 }, { "acc": 1.0, "epoch": 49.10856134157105, "grad_norm": 0.04074092209339142, "learning_rate": 8.862213882852275e-09, "loss": 0.0054491, "memory(GiB)": 15.03, "step": 27820, "train_speed(iter/s)": 1.473537 }, { "acc": 1.0, "epoch": 49.11738746690203, "grad_norm": 0.20996810495853424, "learning_rate": 8.699312700612118e-09, "loss": 0.00505843, "memory(GiB)": 15.03, "step": 27825, "train_speed(iter/s)": 1.473544 }, { "acc": 1.0, "epoch": 49.12621359223301, "grad_norm": 0.20678095519542694, "learning_rate": 8.538115547038346e-09, "loss": 0.00086992, "memory(GiB)": 15.03, "step": 27830, "train_speed(iter/s)": 1.473535 }, { "acc": 1.0, "epoch": 49.13503971756399, "grad_norm": 0.1395721137523651, "learning_rate": 8.378622477158101e-09, "loss": 0.000832, "memory(GiB)": 15.03, "step": 27835, "train_speed(iter/s)": 1.473538 }, { "acc": 1.0, "epoch": 49.14386584289497, "grad_norm": 0.1602018177509308, "learning_rate": 8.220833545417938e-09, "loss": 0.00204578, "memory(GiB)": 15.03, "step": 27840, "train_speed(iter/s)": 1.473531 }, { "acc": 1.0, "epoch": 49.15269196822595, "grad_norm": 0.1278962939977646, "learning_rate": 8.064748805679936e-09, "loss": 0.00206608, "memory(GiB)": 15.03, "step": 27845, "train_speed(iter/s)": 1.473535 }, { "acc": 1.0, "epoch": 49.16151809355693, "grad_norm": 0.009701074101030827, "learning_rate": 7.910368311227247e-09, "loss": 0.00090784, "memory(GiB)": 15.03, "step": 27850, "train_speed(iter/s)": 1.473534 }, { "acc": 1.0, "epoch": 49.170344218887905, "grad_norm": 0.18429748713970184, "learning_rate": 7.757692114759668e-09, "loss": 0.00184988, "memory(GiB)": 15.03, "step": 27855, "train_speed(iter/s)": 1.473532 }, { "acc": 1.0, "epoch": 49.17917034421889, "grad_norm": 0.23239590227603912, "learning_rate": 7.606720268395844e-09, "loss": 0.00729807, "memory(GiB)": 15.03, "step": 27860, "train_speed(iter/s)": 1.473529 }, { "acc": 1.0, "epoch": 49.187996469549866, "grad_norm": 0.14483843743801117, "learning_rate": 7.457452823671617e-09, "loss": 0.00437462, "memory(GiB)": 15.03, "step": 27865, "train_speed(iter/s)": 1.473525 }, { "acc": 1.0, "epoch": 49.19682259488085, "grad_norm": 0.002335901604965329, "learning_rate": 7.309889831542792e-09, "loss": 0.00285762, "memory(GiB)": 15.03, "step": 27870, "train_speed(iter/s)": 1.473524 }, { "acc": 1.0, "epoch": 49.205648720211826, "grad_norm": 0.1070650964975357, "learning_rate": 7.164031342381257e-09, "loss": 0.00104542, "memory(GiB)": 15.03, "step": 27875, "train_speed(iter/s)": 1.47352 }, { "acc": 1.0, "epoch": 49.21447484554281, "grad_norm": 0.26810789108276367, "learning_rate": 7.019877405978863e-09, "loss": 0.00374083, "memory(GiB)": 15.03, "step": 27880, "train_speed(iter/s)": 1.473524 }, { "acc": 1.0, "epoch": 49.22330097087379, "grad_norm": 0.35275453329086304, "learning_rate": 6.877428071544103e-09, "loss": 0.00333441, "memory(GiB)": 15.03, "step": 27885, "train_speed(iter/s)": 1.47352 }, { "acc": 1.0, "epoch": 49.23212709620476, "grad_norm": 0.20724645256996155, "learning_rate": 6.736683387704876e-09, "loss": 0.00285137, "memory(GiB)": 15.03, "step": 27890, "train_speed(iter/s)": 1.473526 }, { "acc": 1.0, "epoch": 49.24095322153575, "grad_norm": 0.012161986902356148, "learning_rate": 6.597643402506275e-09, "loss": 0.00107777, "memory(GiB)": 15.03, "step": 27895, "train_speed(iter/s)": 1.473537 }, { "acc": 1.0, "epoch": 49.249779346866724, "grad_norm": 0.3925943970680237, "learning_rate": 6.460308163411694e-09, "loss": 0.00215511, "memory(GiB)": 15.03, "step": 27900, "train_speed(iter/s)": 1.473522 }, { "acc": 1.0, "epoch": 49.25860547219771, "grad_norm": 0.09854356199502945, "learning_rate": 6.3246777173028294e-09, "loss": 0.00215505, "memory(GiB)": 15.03, "step": 27905, "train_speed(iter/s)": 1.47353 }, { "acc": 1.0, "epoch": 49.267431597528685, "grad_norm": 0.09925952553749084, "learning_rate": 6.1907521104796755e-09, "loss": 0.00684967, "memory(GiB)": 15.03, "step": 27910, "train_speed(iter/s)": 1.473537 }, { "acc": 1.0, "epoch": 49.27625772285966, "grad_norm": 0.013537640683352947, "learning_rate": 6.058531388658864e-09, "loss": 0.00075895, "memory(GiB)": 15.03, "step": 27915, "train_speed(iter/s)": 1.473532 }, { "acc": 0.99982872, "epoch": 49.285083848190645, "grad_norm": 0.023324165493249893, "learning_rate": 5.928015596976442e-09, "loss": 0.00269681, "memory(GiB)": 15.03, "step": 27920, "train_speed(iter/s)": 1.473526 }, { "acc": 1.0, "epoch": 49.29390997352162, "grad_norm": 0.42523080110549927, "learning_rate": 5.799204779986196e-09, "loss": 0.00513207, "memory(GiB)": 15.03, "step": 27925, "train_speed(iter/s)": 1.473523 }, { "acc": 1.0, "epoch": 49.302736098852606, "grad_norm": 0.06802354007959366, "learning_rate": 5.67209898165911e-09, "loss": 0.00160232, "memory(GiB)": 15.03, "step": 27930, "train_speed(iter/s)": 1.473512 }, { "acc": 1.0, "epoch": 49.31156222418358, "grad_norm": 0.19961269199848175, "learning_rate": 5.546698245385576e-09, "loss": 0.00185129, "memory(GiB)": 15.03, "step": 27935, "train_speed(iter/s)": 1.4735 }, { "acc": 1.0, "epoch": 49.320388349514566, "grad_norm": 0.012644562870264053, "learning_rate": 5.423002613972069e-09, "loss": 0.00648172, "memory(GiB)": 15.03, "step": 27940, "train_speed(iter/s)": 1.473492 }, { "acc": 1.0, "epoch": 49.32921447484554, "grad_norm": 0.005319549702107906, "learning_rate": 5.301012129645027e-09, "loss": 0.00238716, "memory(GiB)": 15.03, "step": 27945, "train_speed(iter/s)": 1.473486 }, { "acc": 1.0, "epoch": 49.33804060017652, "grad_norm": 0.17423951625823975, "learning_rate": 5.180726834047528e-09, "loss": 0.00318049, "memory(GiB)": 15.03, "step": 27950, "train_speed(iter/s)": 1.473496 }, { "acc": 1.0, "epoch": 49.3468667255075, "grad_norm": 0.15507568418979645, "learning_rate": 5.062146768240394e-09, "loss": 0.00345793, "memory(GiB)": 15.03, "step": 27955, "train_speed(iter/s)": 1.473488 }, { "acc": 1.0, "epoch": 49.35569285083848, "grad_norm": 0.016958050429821014, "learning_rate": 4.945271972702749e-09, "loss": 0.00178077, "memory(GiB)": 15.03, "step": 27960, "train_speed(iter/s)": 1.473487 }, { "acc": 1.0, "epoch": 49.364518976169464, "grad_norm": 0.19273635745048523, "learning_rate": 4.830102487332576e-09, "loss": 0.00267298, "memory(GiB)": 15.03, "step": 27965, "train_speed(iter/s)": 1.473492 }, { "acc": 1.0, "epoch": 49.37334510150044, "grad_norm": 0.16701319813728333, "learning_rate": 4.716638351443377e-09, "loss": 0.00161689, "memory(GiB)": 15.03, "step": 27970, "train_speed(iter/s)": 1.473486 }, { "acc": 1.0, "epoch": 49.382171226831424, "grad_norm": 0.5813364386558533, "learning_rate": 4.6048796037691835e-09, "loss": 0.00740779, "memory(GiB)": 15.03, "step": 27975, "train_speed(iter/s)": 1.473494 }, { "acc": 0.99972219, "epoch": 49.3909973521624, "grad_norm": 0.002667654538527131, "learning_rate": 4.494826282459548e-09, "loss": 0.00184862, "memory(GiB)": 15.03, "step": 27980, "train_speed(iter/s)": 1.47349 }, { "acc": 0.99975491, "epoch": 49.39982347749338, "grad_norm": 0.009716748259961605, "learning_rate": 4.386478425083436e-09, "loss": 0.00414917, "memory(GiB)": 15.03, "step": 27985, "train_speed(iter/s)": 1.473491 }, { "acc": 1.0, "epoch": 49.40864960282436, "grad_norm": 0.06062089279294014, "learning_rate": 4.2798360686270055e-09, "loss": 0.00596419, "memory(GiB)": 15.03, "step": 27990, "train_speed(iter/s)": 1.473498 }, { "acc": 1.0, "epoch": 49.41747572815534, "grad_norm": 0.19442470371723175, "learning_rate": 4.174899249494158e-09, "loss": 0.00320259, "memory(GiB)": 15.03, "step": 27995, "train_speed(iter/s)": 1.473477 }, { "acc": 1.0, "epoch": 49.42630185348632, "grad_norm": 0.03712864965200424, "learning_rate": 4.071668003507097e-09, "loss": 0.00228944, "memory(GiB)": 15.03, "step": 28000, "train_speed(iter/s)": 1.473484 }, { "acc": 1.0, "epoch": 49.4351279788173, "grad_norm": 0.23926514387130737, "learning_rate": 3.970142365904664e-09, "loss": 0.00179672, "memory(GiB)": 15.03, "step": 28005, "train_speed(iter/s)": 1.473489 }, { "acc": 0.99970236, "epoch": 49.443954104148276, "grad_norm": 0.22356387972831726, "learning_rate": 3.870322371344556e-09, "loss": 0.00570964, "memory(GiB)": 15.03, "step": 28010, "train_speed(iter/s)": 1.473495 }, { "acc": 1.0, "epoch": 49.45278022947926, "grad_norm": 0.2410331815481186, "learning_rate": 3.772208053902216e-09, "loss": 0.00288165, "memory(GiB)": 15.03, "step": 28015, "train_speed(iter/s)": 1.473492 }, { "acc": 1.0, "epoch": 49.461606354810236, "grad_norm": 0.21994878351688385, "learning_rate": 3.6757994470702775e-09, "loss": 0.00279669, "memory(GiB)": 15.03, "step": 28020, "train_speed(iter/s)": 1.473485 }, { "acc": 1.0, "epoch": 49.47043248014122, "grad_norm": 0.26808470487594604, "learning_rate": 3.581096583759676e-09, "loss": 0.00466066, "memory(GiB)": 15.03, "step": 28025, "train_speed(iter/s)": 1.47348 }, { "acc": 0.99975491, "epoch": 49.4792586054722, "grad_norm": 0.4005672037601471, "learning_rate": 3.488099496297983e-09, "loss": 0.00736423, "memory(GiB)": 15.03, "step": 28030, "train_speed(iter/s)": 1.473488 }, { "acc": 1.0, "epoch": 49.48808473080318, "grad_norm": 0.22806492447853088, "learning_rate": 3.3968082164310725e-09, "loss": 0.00196523, "memory(GiB)": 15.03, "step": 28035, "train_speed(iter/s)": 1.473478 }, { "acc": 0.99963236, "epoch": 49.49691085613416, "grad_norm": 0.19333148002624512, "learning_rate": 3.307222775323672e-09, "loss": 0.00819679, "memory(GiB)": 15.03, "step": 28040, "train_speed(iter/s)": 1.473495 }, { "acc": 1.0, "epoch": 49.505736981465134, "grad_norm": 0.016636576503515244, "learning_rate": 3.219343203556039e-09, "loss": 0.0037946, "memory(GiB)": 15.03, "step": 28045, "train_speed(iter/s)": 1.473496 }, { "acc": 1.0, "epoch": 49.51456310679612, "grad_norm": 0.02881932631134987, "learning_rate": 3.1331695311283935e-09, "loss": 0.00202342, "memory(GiB)": 15.03, "step": 28050, "train_speed(iter/s)": 1.473488 }, { "acc": 1.0, "epoch": 49.523389232127094, "grad_norm": 0.11308465898036957, "learning_rate": 3.048701787456484e-09, "loss": 0.00334465, "memory(GiB)": 15.03, "step": 28055, "train_speed(iter/s)": 1.473496 }, { "acc": 1.0, "epoch": 49.53221535745808, "grad_norm": 0.38919270038604736, "learning_rate": 2.9659400013749146e-09, "loss": 0.00404872, "memory(GiB)": 15.03, "step": 28060, "train_speed(iter/s)": 1.473489 }, { "acc": 1.0, "epoch": 49.541041482789055, "grad_norm": 0.1645839363336563, "learning_rate": 2.884884201136036e-09, "loss": 0.00374094, "memory(GiB)": 15.03, "step": 28065, "train_speed(iter/s)": 1.47349 }, { "acc": 1.0, "epoch": 49.54986760812004, "grad_norm": 0.25143536925315857, "learning_rate": 2.8055344144093896e-09, "loss": 0.00422703, "memory(GiB)": 15.03, "step": 28070, "train_speed(iter/s)": 1.473482 }, { "acc": 1.0, "epoch": 49.558693733451015, "grad_norm": 0.0031383708119392395, "learning_rate": 2.7278906682822636e-09, "loss": 0.00423054, "memory(GiB)": 15.03, "step": 28075, "train_speed(iter/s)": 1.473479 }, { "acc": 0.99986706, "epoch": 49.56751985878199, "grad_norm": 0.502650797367096, "learning_rate": 2.651952989259137e-09, "loss": 0.00641383, "memory(GiB)": 15.03, "step": 28080, "train_speed(iter/s)": 1.473478 }, { "acc": 1.0, "epoch": 49.576345984112976, "grad_norm": 0.27470290660858154, "learning_rate": 2.57772140326279e-09, "loss": 0.00215715, "memory(GiB)": 15.03, "step": 28085, "train_speed(iter/s)": 1.473475 }, { "acc": 1.0, "epoch": 49.58517210944395, "grad_norm": 0.16136325895786285, "learning_rate": 2.5051959356337496e-09, "loss": 0.00263243, "memory(GiB)": 15.03, "step": 28090, "train_speed(iter/s)": 1.473487 }, { "acc": 1.0, "epoch": 49.593998234774936, "grad_norm": 0.31497377157211304, "learning_rate": 2.4343766111286234e-09, "loss": 0.00392734, "memory(GiB)": 15.03, "step": 28095, "train_speed(iter/s)": 1.473483 }, { "acc": 1.0, "epoch": 49.60282436010591, "grad_norm": 0.36288538575172424, "learning_rate": 2.3652634539239856e-09, "loss": 0.00600689, "memory(GiB)": 15.03, "step": 28100, "train_speed(iter/s)": 1.473486 }, { "acc": 1.0, "epoch": 49.61165048543689, "grad_norm": 0.31847083568573, "learning_rate": 2.2978564876119362e-09, "loss": 0.00469697, "memory(GiB)": 15.03, "step": 28105, "train_speed(iter/s)": 1.473491 }, { "acc": 1.0, "epoch": 49.620476610767874, "grad_norm": 0.008568934164941311, "learning_rate": 2.2321557352034322e-09, "loss": 0.00298375, "memory(GiB)": 15.03, "step": 28110, "train_speed(iter/s)": 1.473503 }, { "acc": 1.0, "epoch": 49.62930273609885, "grad_norm": 0.03910712152719498, "learning_rate": 2.168161219125512e-09, "loss": 0.00106041, "memory(GiB)": 15.03, "step": 28115, "train_speed(iter/s)": 1.473508 }, { "acc": 1.0, "epoch": 49.638128861429834, "grad_norm": 0.013815044425427914, "learning_rate": 2.1058729612246234e-09, "loss": 0.00319189, "memory(GiB)": 15.03, "step": 28120, "train_speed(iter/s)": 1.473505 }, { "acc": 1.0, "epoch": 49.64695498676081, "grad_norm": 0.0018319153459742665, "learning_rate": 2.0452909827632976e-09, "loss": 0.00194472, "memory(GiB)": 15.03, "step": 28125, "train_speed(iter/s)": 1.47349 }, { "acc": 1.0, "epoch": 49.655781112091795, "grad_norm": 0.10545600950717926, "learning_rate": 1.9864153044223652e-09, "loss": 0.00180097, "memory(GiB)": 15.03, "step": 28130, "train_speed(iter/s)": 1.473485 }, { "acc": 1.0, "epoch": 49.66460723742277, "grad_norm": 0.6102511286735535, "learning_rate": 1.9292459462998504e-09, "loss": 0.00444908, "memory(GiB)": 15.03, "step": 28135, "train_speed(iter/s)": 1.473487 }, { "acc": 1.0, "epoch": 49.67343336275375, "grad_norm": 0.5306189656257629, "learning_rate": 1.873782927910966e-09, "loss": 0.00449403, "memory(GiB)": 15.03, "step": 28140, "train_speed(iter/s)": 1.473487 }, { "acc": 1.0, "epoch": 49.68225948808473, "grad_norm": 0.16373640298843384, "learning_rate": 1.8200262681897823e-09, "loss": 0.00193115, "memory(GiB)": 15.03, "step": 28145, "train_speed(iter/s)": 1.473485 }, { "acc": 1.0, "epoch": 49.69108561341571, "grad_norm": 0.3162863552570343, "learning_rate": 1.7679759854864514e-09, "loss": 0.00278845, "memory(GiB)": 15.03, "step": 28150, "train_speed(iter/s)": 1.473489 }, { "acc": 1.0, "epoch": 49.69991173874669, "grad_norm": 0.4170714020729065, "learning_rate": 1.7176320975694254e-09, "loss": 0.00600945, "memory(GiB)": 15.03, "step": 28155, "train_speed(iter/s)": 1.47349 }, { "acc": 1.0, "epoch": 49.70873786407767, "grad_norm": 0.41332313418388367, "learning_rate": 1.6689946216237937e-09, "loss": 0.00233527, "memory(GiB)": 15.03, "step": 28160, "train_speed(iter/s)": 1.473484 }, { "acc": 1.0, "epoch": 49.71756398940865, "grad_norm": 0.24288438260555267, "learning_rate": 1.622063574253501e-09, "loss": 0.00386422, "memory(GiB)": 15.03, "step": 28165, "train_speed(iter/s)": 1.473496 }, { "acc": 1.0, "epoch": 49.72639011473963, "grad_norm": 0.07966198027133942, "learning_rate": 1.5768389714785736e-09, "loss": 0.00319909, "memory(GiB)": 15.03, "step": 28170, "train_speed(iter/s)": 1.473499 }, { "acc": 1.0, "epoch": 49.735216240070606, "grad_norm": 0.39523845911026, "learning_rate": 1.5333208287373396e-09, "loss": 0.00221432, "memory(GiB)": 15.03, "step": 28175, "train_speed(iter/s)": 1.473496 }, { "acc": 1.0, "epoch": 49.74404236540159, "grad_norm": 0.30962392687797546, "learning_rate": 1.4915091608853175e-09, "loss": 0.00534396, "memory(GiB)": 15.03, "step": 28180, "train_speed(iter/s)": 1.473488 }, { "acc": 1.0, "epoch": 49.75286849073257, "grad_norm": 0.3923606872558594, "learning_rate": 1.4514039821957724e-09, "loss": 0.00356808, "memory(GiB)": 15.03, "step": 28185, "train_speed(iter/s)": 1.47348 }, { "acc": 1.0, "epoch": 49.76169461606355, "grad_norm": 0.20916616916656494, "learning_rate": 1.4130053063591605e-09, "loss": 0.00314244, "memory(GiB)": 15.03, "step": 28190, "train_speed(iter/s)": 1.473475 }, { "acc": 1.0, "epoch": 49.77052074139453, "grad_norm": 0.3709869980812073, "learning_rate": 1.3763131464831293e-09, "loss": 0.0057277, "memory(GiB)": 15.03, "step": 28195, "train_speed(iter/s)": 1.473488 }, { "acc": 0.99982872, "epoch": 49.779346866725504, "grad_norm": 0.2610364556312561, "learning_rate": 1.3413275150936281e-09, "loss": 0.00213359, "memory(GiB)": 15.03, "step": 28200, "train_speed(iter/s)": 1.473489 }, { "acc": 1.0, "epoch": 49.78817299205649, "grad_norm": 0.021790826693177223, "learning_rate": 1.3080484241332408e-09, "loss": 0.00233174, "memory(GiB)": 15.03, "step": 28205, "train_speed(iter/s)": 1.473481 }, { "acc": 1.0, "epoch": 49.796999117387465, "grad_norm": 0.08752741664648056, "learning_rate": 1.2764758849622995e-09, "loss": 0.00199613, "memory(GiB)": 15.03, "step": 28210, "train_speed(iter/s)": 1.473472 }, { "acc": 1.0, "epoch": 49.80582524271845, "grad_norm": 0.07735937088727951, "learning_rate": 1.2466099083583258e-09, "loss": 0.00306469, "memory(GiB)": 15.03, "step": 28215, "train_speed(iter/s)": 1.473476 }, { "acc": 1.0, "epoch": 49.814651368049425, "grad_norm": 0.18493863940238953, "learning_rate": 1.218450504517144e-09, "loss": 0.00256604, "memory(GiB)": 15.03, "step": 28220, "train_speed(iter/s)": 1.47348 }, { "acc": 1.0, "epoch": 49.82347749338041, "grad_norm": 0.4242521822452545, "learning_rate": 1.1919976830512137e-09, "loss": 0.00214157, "memory(GiB)": 15.03, "step": 28225, "train_speed(iter/s)": 1.473489 }, { "acc": 1.0, "epoch": 49.832303618711386, "grad_norm": 0.1968277245759964, "learning_rate": 1.167251452990741e-09, "loss": 0.00195799, "memory(GiB)": 15.03, "step": 28230, "train_speed(iter/s)": 1.47349 }, { "acc": 1.0, "epoch": 49.84112974404236, "grad_norm": 0.009080996736884117, "learning_rate": 1.1442118227825681e-09, "loss": 0.00082018, "memory(GiB)": 15.03, "step": 28235, "train_speed(iter/s)": 1.473472 }, { "acc": 1.0, "epoch": 49.849955869373346, "grad_norm": 0.07281246781349182, "learning_rate": 1.1228788002923934e-09, "loss": 0.00485443, "memory(GiB)": 15.03, "step": 28240, "train_speed(iter/s)": 1.473468 }, { "acc": 1.0, "epoch": 49.85878199470432, "grad_norm": 0.3094254732131958, "learning_rate": 1.103252392801997e-09, "loss": 0.01000773, "memory(GiB)": 15.03, "step": 28245, "train_speed(iter/s)": 1.473461 }, { "acc": 1.0, "epoch": 49.86760812003531, "grad_norm": 0.24765489995479584, "learning_rate": 1.0853326070114598e-09, "loss": 0.00229386, "memory(GiB)": 15.03, "step": 28250, "train_speed(iter/s)": 1.473452 }, { "acc": 0.99986115, "epoch": 49.87643424536628, "grad_norm": 0.28967341780662537, "learning_rate": 1.0691194490380538e-09, "loss": 0.00405755, "memory(GiB)": 15.03, "step": 28255, "train_speed(iter/s)": 1.473458 }, { "acc": 1.0, "epoch": 49.88526037069727, "grad_norm": 0.14157044887542725, "learning_rate": 1.0546129244162423e-09, "loss": 0.00272448, "memory(GiB)": 15.03, "step": 28260, "train_speed(iter/s)": 1.473456 }, { "acc": 1.0, "epoch": 49.894086496028244, "grad_norm": 0.24315805733203888, "learning_rate": 1.0418130380982353e-09, "loss": 0.00259403, "memory(GiB)": 15.03, "step": 28265, "train_speed(iter/s)": 1.473454 }, { "acc": 1.0, "epoch": 49.90291262135922, "grad_norm": 0.2752399742603302, "learning_rate": 1.0307197944528783e-09, "loss": 0.00435386, "memory(GiB)": 15.03, "step": 28270, "train_speed(iter/s)": 1.473456 }, { "acc": 0.9998106, "epoch": 49.911738746690204, "grad_norm": 0.32812193036079407, "learning_rate": 1.0213331972678733e-09, "loss": 0.00293649, "memory(GiB)": 15.03, "step": 28275, "train_speed(iter/s)": 1.473461 }, { "acc": 1.0, "epoch": 49.92056487202118, "grad_norm": 0.003945101983845234, "learning_rate": 1.013653249747004e-09, "loss": 0.00163138, "memory(GiB)": 15.03, "step": 28280, "train_speed(iter/s)": 1.473465 }, { "acc": 1.0, "epoch": 49.929390997352165, "grad_norm": 0.42058229446411133, "learning_rate": 1.0076799545117997e-09, "loss": 0.00348822, "memory(GiB)": 15.03, "step": 28285, "train_speed(iter/s)": 1.473456 }, { "acc": 1.0, "epoch": 49.93821712268314, "grad_norm": 0.0033747772686183453, "learning_rate": 1.0034133136020912e-09, "loss": 0.00187716, "memory(GiB)": 15.03, "step": 28290, "train_speed(iter/s)": 1.473446 }, { "acc": 1.0, "epoch": 49.94704324801412, "grad_norm": 0.036098405718803406, "learning_rate": 1.0008533284732352e-09, "loss": 0.00264561, "memory(GiB)": 15.03, "step": 28295, "train_speed(iter/s)": 1.473457 }, { "acc": 0.99980774, "epoch": 49.9558693733451, "grad_norm": 0.008809243328869343, "learning_rate": 1e-09, "loss": 0.00395218, "memory(GiB)": 15.03, "step": 28300, "train_speed(iter/s)": 1.473461 }, { "epoch": 49.9558693733451, "eval_acc": 0.7897245563133501, "eval_loss": 1.7421954870224, "eval_runtime": 30.3105, "eval_samples_per_second": 44.044, "eval_steps_per_second": 5.51, "step": 28300 } ], "logging_steps": 5, "max_steps": 28300, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.536908201440051e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }